Spaces:

mgyigit
/

misinfo

Sleeping

App Files Files Community

gyigit committed on Jan 2

Commit

54e8a79

1 Parent(s): b61672d

update

Browse files

Files changed (47) hide show

app.py +446 -0
ckpts/model.pt +3 -0
src/__init__.py +0 -0
src/__pycache__/__init__.cpython-311.pyc +0 -0
src/data_loader/__init__.py +0 -0
src/data_loader/__pycache__/__init__.cpython-311.pyc +0 -0
src/data_loader/__pycache__/download_data.cpython-311.pyc +0 -0
src/data_loader/__pycache__/download_images.cpython-311.pyc +0 -0
src/data_loader/download_data.py +76 -0
src/data_loader/download_data_mocheg.py +71 -0
src/data_loader/download_images.py +168 -0
src/data_loader/preprocess_embeddings.py +129 -0
src/demo/__init__.py +0 -0
src/demo/__pycache__/__init__.cpython-311.pyc +0 -0
src/demo/__pycache__/app.cpython-311.pyc +0 -0
src/demo/app.py +446 -0
src/evidence/__init__.py +0 -0
src/evidence/__pycache__/__init__.cpython-311.pyc +0 -0
src/evidence/__pycache__/corpus_utils.cpython-311.pyc +0 -0
src/evidence/__pycache__/im2im_retrieval.cpython-311.pyc +0 -0
src/evidence/__pycache__/text2text_retrieval.cpython-311.pyc +0 -0
src/evidence/corpus_utils.py +100 -0
src/evidence/im2im_retrieval.py +169 -0
src/evidence/text2text_retrieval.py +203 -0
src/experimental/__init__.py +0 -0
src/experimental/dataset_search.ipynb +0 -0
src/experimental/dataset_stats.ipynb +0 -0
src/experimental/image_captioning.ipynb +96 -0
src/model/__init__.py +0 -0
src/model/__pycache__/__init__.cpython-311.pyc +0 -0
src/model/__pycache__/layers.cpython-311.pyc +0 -0
src/model/__pycache__/model.cpython-311.pyc +0 -0
src/model/dataset.py +164 -0
src/model/layers.py +58 -0
src/model/model.py +432 -0
src/preprocess/__init__.py +0 -0
src/preprocess/__pycache__/__init__.cpython-311.pyc +0 -0
src/preprocess/__pycache__/caption.cpython-311.pyc +0 -0
src/preprocess/__pycache__/preprocess.cpython-311.pyc +0 -0
src/preprocess/caption.py +129 -0
src/preprocess/preprocess.py +82 -0
src/utils/__init__.py +0 -0
src/utils/__pycache__/__init__.cpython-311.pyc +0 -0
src/utils/__pycache__/data_utils.cpython-311.pyc +0 -0
src/utils/__pycache__/path_utils.cpython-311.pyc +0 -0
src/utils/data_utils.py +73 -0
src/utils/path_utils.py +6 -0

app.py ADDED Viewed

	@@ -0,0 +1,446 @@

+import streamlit as st
+from PIL import Image
+from transformers import BlipProcessor, BlipForConditionalGeneration
+import pandas as pd
+import os
+from evaluate import MisinformationPredictor
+from src.evidence.im2im_retrieval import ImageCorpus
+from src.evidence.text2text_retrieval import SemanticSimilarity
+from src.utils.path_utils import get_project_root
+from typing import List, Optional, Tuple
+from dataclasses import dataclass
+# Initialize BLIP model and processor
+processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
+model = BlipForConditionalGeneration.from_pretrained(
+    "Salesforce/blip-image-captioning-large"
+)
+PROJECT_ROOT = get_project_root()
+@dataclass
+class Evidence:
+    """One retrieved evidence item and, once computed, its labels against the claim."""
+    # Numeric part of the evidence identifier, kept as a string.
+    evidence_id: str
+    # Split name as derived by the retrieval helpers (id prefix or image folder name).
+    dataset: str
+    # Value of the "evidence_enriched" CSV column for the matching row.
+    text: Optional[str]
+    # Evidence image converted to RGB, or None when no image could be loaded.
+    image: Optional[Image.Image]
+    # Value of the "evidence_image_caption" CSV column for the matching row.
+    caption: Optional[str]
+    # Image path joined onto PROJECT_ROOT, or None when the row has no image path.
+    image_path: Optional[str]
+    # Per-modality labels in the order text|text, text|image, image|text, image|image.
+    classification_result_all: Optional[Tuple[str, str, str, str]] = None
+    # Single label derived from the four per-modality labels.
+    classification_result_final: Optional[str] = None
+# Label strings used for claim/evidence classification results in this module.
+CLASSIFICATION_CATEGORIES = ["support", "refute", "not_enough_information"]
+def generate_caption(image: Image.Image) -> str:
+    """Generates a caption for a given image."""
+    try:
+        with st.spinner("Generating caption..."):
+            inputs = processor(image, return_tensors="pt")
+            output = model.generate(**inputs)
+            return processor.decode(output[0], skip_special_tokens=True)
+    except Exception as e:
+        st.error(f"Error generating caption: {e}")
+        return ""
+def enrich_text_with_caption(text: str, image_caption: str) -> str:
+    """Appends the image caption to the given text."""
+    if image_caption:
+        return f"{text}. {image_caption}"
+    return text
+@st.cache_data
+def get_train_df():
+    data_dir = os.path.join(PROJECT_ROOT, "data", "preprocessed")
+    train_csv_path = os.path.join(data_dir, "train_enriched.csv")
+    return pd.read_csv(train_csv_path)
+@st.cache_data
+def get_test_df():
+    data_dir = os.path.join(PROJECT_ROOT, "data", "preprocessed")
+    train_csv_path = os.path.join(data_dir, "test_enriched.csv")
+    return pd.read_csv(train_csv_path)
+@st.cache_data
+def get_semantic_similarity(
+    train_embeddings_file: str,
+    test_embeddings_file: str,
+    train_df: pd.DataFrame,
+    test_df: pd.DataFrame,
+):
+    return SemanticSimilarity(
+        train_embeddings_file=train_embeddings_file,
+        test_embeddings_file=test_embeddings_file,
+        train_df=train_df,
+        test_df=test_df,
+    )
+def retrieve_evidences_by_text(
+    query: str,
+    top_k: int = 5,
+) -> List[Evidence]:
+    """
+    Retrieves evidence rows from preloaded embeddings and CSV data using semantic similarity.
+    Args:
+        query (str): The query text to perform the search.
+        top_k (int): Number of top results to retrieve.
+    Returns:
+        List[Evidence]: A list of retrieved evidence objects.
+    """
+    train_embeddings_file = os.path.join(PROJECT_ROOT, "train_embeddings.h5")
+    test_embeddings_file = os.path.join(PROJECT_ROOT, "test_embeddings.h5")
+    similarity = get_semantic_similarity(
+        train_embeddings_file=train_embeddings_file,
+        test_embeddings_file=test_embeddings_file,
+        train_df=get_train_df(),
+        test_df=get_test_df(),
+    )
+    evidences = []
+    try:
+        # Perform semantic search across both train and test datasets
+        results = similarity.search(query=query, top_k=top_k)
+        # Retrieve evidence rows based on the search results
+        for evidence_id, score in results:
+            # Determine whether the ID belongs to train or test set
+            if evidence_id.startswith("train_"):
+                df = similarity.train_csv
+            elif evidence_id.startswith("test_"):
+                df = similarity.test_csv
+            else:
+                continue  # Skip invalid IDs
+            # Extract the row by ID
+            row = df[df["id"] == int(evidence_id.split("_")[1])].iloc[0]
+            evidence_text = row.get("evidence_enriched")
+            evidence_image_caption = row.get("evidence_image_caption")
+            evidence_image_path = row.get("evidence_image")
+            evidence_image = None
+            full_image_path = None
+            # Load the image if a valid path is provided
+            if pd.notna(evidence_image_path):
+                full_image_path = os.path.join(PROJECT_ROOT, evidence_image_path)
+                try:
+                    evidence_image = Image.open(full_image_path).convert("RGB")
+                except Exception as e:
+                    st.error(f"Failed to load image {evidence_image_path}: {e}")
+            evidence_id_number = evidence_id.split("_")[1]
+            evidence_dataset = evidence_id.split("_")[0]
+            # Create an Evidence object
+            evidences.append(
+                Evidence(
+                    text=evidence_text,
+                    image=evidence_image,
+                    caption=evidence_image_caption,
+                    evidence_id=evidence_id_number,
+                    dataset=evidence_dataset,
+                    image_path=full_image_path,
+                )
+            )
+    except Exception as e:
+        st.error(f"Error performing semantic search: {e}")
+    return evidences
+@st.cache_data
+def get_image_corpus(image_features):
+    return ImageCorpus(image_features)
+def retrieve_evidences_by_image(
+    image_path: str,
+    top_k: int = 5,
+) -> List[Evidence]:
+    """
+    Retrieves evidence rows from preloaded embeddings and CSV data using semantic similarity.
+    Args:
+        query (str): The query text to perform the search.
+        top_k (int): Number of top results to retrieve.
+    Returns:
+        List[Evidence]: A list of retrieved evidence objects.
+    """
+    image_features = os.path.join(PROJECT_ROOT, "evidence_features.pkl")
+    image_corpus = get_image_corpus(image_features)
+    evidences = []
+    try:
+        # Perform semantic search across both train and test datasets
+        results = image_corpus.retrieve_similar_images(image_path, top_k=top_k)
+        # Retrieve evidence rows based on the search results
+        for evidence_path, score in results:
+            evidence_id = evidence_path.split("/")[-1]
+            evidence_id_number = evidence_id.split("_")[0]
+            # Determine whether the ID belongs to train or test set
+            if "train" in evidence_path:
+                df = get_train_df()
+            elif "test" in evidence_path:
+                df = get_test_df()
+            else:
+                continue  # Skip invalid IDs
+            # Extract the row by ID
+            row = df[df["id"] == int(evidence_id_number)].iloc[0]
+            evidence_text = row.get("evidence_enriched")
+            evidence_image_caption = row.get("evidence_image_caption")
+            evidence_image_path = row.get("evidence_image")
+            evidence_image = None
+            full_image_path = None
+            # Load the image if a valid path is provided
+            if pd.notna(evidence_image_path):
+                full_image_path = os.path.join(PROJECT_ROOT, evidence_image_path)
+                try:
+                    evidence_image = Image.open(full_image_path).convert("RGB")
+                except Exception as e:
+                    st.error(f"Failed to load image {evidence_image_path}: {e}")
+            # Create an Evidence object
+            evidences.append(
+                Evidence(
+                    text=evidence_text,
+                    image=evidence_image,
+                    caption=evidence_image_caption,
+                    dataset=evidence_path.split("/")[-2],
+                    evidence_id=evidence_id_number,
+                    image_path=full_image_path,
+                )
+            )
+    except Exception as e:
+        st.error(f"Error performing semantic search: {e}")
+    return evidences
+@st.cache_resource
+def get_predictor():
+    """Return the MisinformationPredictor, cached by ``st.cache_resource``.
+    The predictor is constructed with ``model_path="ckpts/model.pt"`` and ``device="cpu"``.
+    """
+    # NOTE(review): the checkpoint path is relative, so it presumably resolves
+    # against the directory the app is launched from — confirm.
+    return MisinformationPredictor(model_path="ckpts/model.pt", device="cpu")
+def classify_evidence(
+    claim_text: str, claim_image_path: str, evidence_text: str, evidence_image_path: str
+) -> Tuple[str, str, str, str]:
+    """Assigns a random classification to each evidence."""
+    predictor = get_predictor()
+    predictions = predictor.evaluate(
+        claim_text, claim_image_path, evidence_text, evidence_image_path
+    )
+    if predictions:
+        return (
+            predictions.get("text_text", "not_enough_information"),
+            predictions.get("text_image", "not_enough_information"),
+            predictions.get("image_text", "not_enough_information"),
+            predictions.get("image_image", "not_enough_information"),
+        )
+    else:
+        return (
+            "not_enough_information",
+            "not_enough_information",
+            "not_enough_information",
+            "not_enough_information",
+        )
+def display_evidence_tab(evidences: List[Evidence], tab_label: str):
+    """Displays evidence in a tabbed format."""
+    with st.container():
+        for index, evidence in enumerate(evidences):
+            with st.container():
+                st.subheader(f"Evidence {index + 1}")
+                st.write(f"Evidence Dataset: {evidence.dataset}")
+                st.write(f"Evidence ID: {evidence.evidence_id}")
+                if evidence.image:
+                    st.image(
+                        evidence.image,
+                        caption="Evidence Image",
+                        use_container_width=True,
+                    )
+                st.text_area(
+                    "Evidence Caption",
+                    value=evidence.caption or "No caption available.",
+                    height=100,
+                    key=f"caption_{tab_label}_{index}",
+                    disabled=True,
+                )
+                st.text_area(
+                    "Evidence Text",
+                    value=evidence.text or "No text available.",
+                    height=100,
+                    key=f"text_{tab_label}_{index}",
+                    disabled=True,
+                )
+                if evidence.classification_result_all:
+                    st.write("**Classification:**")
+                    st.write(f"**text|text:** {evidence.classification_result_all[0]}")
+                    st.write(f"**text|image:** {evidence.classification_result_all[1]}")
+                    st.write(f"**image|text:** {evidence.classification_result_all[2]}")
+                    st.write(
+                        f"**image|image:** {evidence.classification_result_all[3]}"
+                    )
+                    st.write(
+                        f"**Final classification result:** {evidence.classification_result_final}"
+                    )
+def get_final_classification(results: Tuple[str, str, str, str]) -> str:
+    text_text = results[0]
+    text_image = results[1]
+    image_text = results[2]
+    image_image = results[3]
+    # Helper function to determine the final classification based on two inputs
+    def resolve_classification(val1: str, val2: str) -> str:
+        if val1 == val2 and val1 in {"support", "refute"}:
+            return val1
+        if (val1 in {"support", "refute"} and val2 == "not_enough_information") or (
+            val2 in {"support", "refute"} and val1 == "not_enough_information"
+        ):
+            return val1 if val1 != "not_enough_information" else val2
+        return "not_enough_information"
+    # Step 1: Check text_text and image_image
+    final_result = resolve_classification(text_text, image_image)
+    if final_result != "not_enough_information":
+        return final_result
+    # Step 2: Check text_image and image_text
+    final_result = resolve_classification(text_image, image_text)
+    if final_result != "not_enough_information":
+        return final_result
+    # Step 3: If still undetermined, return "not_enough_information"
+    return "not_enough_information"
+def main():
+    st.title("Multimodal Evidence-Based Misinformation Classification")
+    st.write("Upload claims that have image and/or text content to verify.")
+    # File uploader for images
+    uploaded_image = st.file_uploader(
+        "Upload an image (1 max)", type=["jpg", "jpeg", "png"], key="image_uploader"
+    )
+    if uploaded_image:
+        try:
+            image = Image.open(uploaded_image).convert("RGB")
+            st.image(image, caption="Uploaded Image", use_container_width=True)
+        except Exception as e:
+            st.error(f"Failed to display the image: {e}")
+    # Text input field
+    input_text = st.text_area("Enter text (max 4096 characters)", "", max_chars=4096)
+    # Sliders for top_k values
+    col1, col2 = st.columns(2)
+    with col1:
+        top_k_text = st.slider(
+            "Top-k Text Evidences", min_value=1, max_value=5, value=2, key="top_k_text"
+        )
+    with col2:
+        top_k_image = st.slider(
+            "Top-k Image Evidences",
+            min_value=1,
+            max_value=5,
+            value=2,
+            key="top_k_image",
+        )
+    # Generate Enriched Text button
+    if st.button("Verify Claim"):
+        if not uploaded_image and not input_text:
+            st.warning("Please upload an image or enter text.")
+            return
+        progress = st.progress(0)
+        # Step 1: Generate caption
+        progress.progress(10)
+        st.write("### Step 1: Generating caption...")
+        image_caption = ""
+        if uploaded_image:
+            image_caption = generate_caption(image)
+            st.write("**Generated Image Caption:**", image_caption)
+        # Step 2: Enrich text
+        progress.progress(40)
+        st.write("### Step 2: Enriching text...")
+        enriched_text = enrich_text_with_caption(input_text, image_caption)
+        st.write("**Enriched Text:**")
+        st.write(enriched_text)
+        # Step 3: Retrieve evidences by text
+        progress.progress(50)
+        st.write("### Step 3: Retrieving evidences by text...")
+        if input_text:
+            text_evidences = retrieve_evidences_by_text(enriched_text, top_k=top_k_text)
+            st.write(f"Retrieved {len(text_evidences)} text evidences.")
+        else:
+            text_evidences = None
+            st.write("Text modality is missing from the input claim!")
+        # Step 4: Retrieve evidences by image
+        progress.progress(70)
+        st.write("### Step 4: Retrieving evidences by image...")
+        if uploaded_image:
+            image_evidences = retrieve_evidences_by_image(
+                uploaded_image, top_k=top_k_image
+            )
+            st.write(f"Retrieved {len(image_evidences)} image evidences.")
+        else:
+            image_evidences = None
+            st.write("Image modality is missing from the input claim!")
+        # Step 5: Classify evidences
+        progress.progress(90)
+        st.write("### Step 5: Verifying claim with retrieved evidences...")
+        for evidence in (text_evidences or []) + (image_evidences or []):
+            a, b, c, d = classify_evidence(
+                claim_text=enriched_text,
+                claim_image_path=uploaded_image,
+                evidence_text=evidence.text,
+                evidence_image_path=evidence.image_path,
+            )
+            evidence.classification_result_all = a, b, c, d
+            evidence.classification_result_final = get_final_classification(
+                evidence.classification_result_all
+            )
+        # Step 6: Display evidences
+        progress.progress(100)
+        if text_evidences or image_evidences:
+            st.write("## Results")
+            tabs = st.tabs(["Text Evidences", "Image Evidences"])
+            with tabs[0]:
+                if text_evidences:
+                    st.write("### Text Evidences")
+                    display_evidence_tab(text_evidences, "text")
+                else:
+                    st.write("Text modality is missing from the input claim!")
+            with tabs[1]:
+                if image_evidences:
+                    st.write("### Image Evidences")
+                    display_evidence_tab(image_evidences, "image")
+                else:
+                    st.write("Image modality is missing from the input claim!")
+if __name__ == "__main__":
+    main()

ckpts/model.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:15237d481c551aba1df0bae16f0adf43b23ba019e138712010453bda62d39bd0
+size 51850010

src/__init__.py ADDED Viewed

File without changes

src/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (190 Bytes). View file

src/data_loader/__init__.py ADDED Viewed

File without changes

src/data_loader/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (202 Bytes). View file

src/data_loader/__pycache__/download_data.cpython-311.pyc ADDED Viewed

Binary file (4.94 kB). View file

src/data_loader/__pycache__/download_images.cpython-311.pyc ADDED Viewed

Binary file (8.03 kB). View file

src/data_loader/download_data.py ADDED Viewed

	@@ -0,0 +1,76 @@

+import os
+import zipfile
+import gdown
+from getpass import getpass
+import shutil
+from pathlib import Path
+from src.utils.path_utils import get_project_root
+# Constants
+# All paths are anchored at the project root and stored as plain strings.
+PROJECT_ROOT = get_project_root()
+# Where the downloaded, password-protected archive is stored.
+ZIP_FILE_PATH = str(PROJECT_ROOT / "data/raw/factify/factify_data.zip")
+# Final location of the unpacked dataset.
+EXTRACTION_DIR = str(PROJECT_ROOT / "data/raw/factify/extracted")
+# Folder extract_zip() looks for after unpacking and renames to EXTRACTION_DIR.
+TEMP_EXTRACTION_DIR = str(PROJECT_ROOT / "data/raw/factify/public_folder")
+# Google Drive URL handed to gdown.
+GDRIVE_FILE_URL = "https://drive.google.com/uc?id=1ig7XEYU1UKDHrHgDYgqiARWvNdswgFEX"
+def ensure_directories():
+    """Ensure necessary directories exist."""
+    os.makedirs(os.path.dirname(ZIP_FILE_PATH), exist_ok=True)
+def download_zip():
+    """Download the ZIP file if it doesn't already exist."""
+    if os.path.exists(ZIP_FILE_PATH):
+        print(f"Zip file already exists at {ZIP_FILE_PATH}. Skipping download...")
+        return
+    print("Downloading zip file from Google Drive...")
+    gdown.download(GDRIVE_FILE_URL, ZIP_FILE_PATH, quiet=False)
+    print(f"Downloaded zip file to {ZIP_FILE_PATH}")
+def extract_zip():
+    """Extract the ZIP file and handle folder and file renaming."""
+    train_csv_path = os.path.join(EXTRACTION_DIR, "train.csv")
+    if os.path.exists(train_csv_path):
+        print(f"{train_csv_path} already exists. Skipping extraction...")
+        return
+    print("Extracting zip file...")
+    # Get password for the zip file
+    password = getpass("Enter the password for the zip file: ")
+    with zipfile.ZipFile(ZIP_FILE_PATH, "r") as zip_ref:
+        try:
+            zip_ref.extractall(
+                str(PROJECT_ROOT / "data/raw/factify/"), pwd=password.encode()
+            )
+            print(f"Extracted files to temporary folder: {TEMP_EXTRACTION_DIR}")
+        except RuntimeError:
+            print("Incorrect password. Exiting...")
+            exit(1)
+    # Remove existing extracted directory if it exists
+    if os.path.exists(EXTRACTION_DIR):
+        shutil.rmtree(EXTRACTION_DIR)
+        print(f"Removed existing directory: {EXTRACTION_DIR}")
+    # Rename extracted folder
+    if os.path.exists(TEMP_EXTRACTION_DIR):
+        os.rename(TEMP_EXTRACTION_DIR, EXTRACTION_DIR)
+        print(f"Renamed folder {TEMP_EXTRACTION_DIR} to {EXTRACTION_DIR}")
+    # Rename val.csv to test.csv
+    val_csv_path = os.path.join(EXTRACTION_DIR, "val.csv")
+    test_csv_path = os.path.join(EXTRACTION_DIR, "test.csv")
+    if os.path.exists(val_csv_path):
+        os.rename(val_csv_path, test_csv_path)
+        print(f"Renamed {val_csv_path} to {test_csv_path}")
+def main():
+    ensure_directories()
+    download_zip()
+    extract_zip()
+if __name__ == "__main__":
+    main()

src/data_loader/download_data_mocheg.py ADDED Viewed

	@@ -0,0 +1,71 @@

+import os
+import requests
+import tarfile
+from tqdm import tqdm
+# Source URL of the MOCHEG archive.
+DATA_URL: str = (
+    "http://nlplab1.cs.vt.edu/~menglong/project/multimodal/fact_checking/MOCHEG/dataset/latest_dataset/mocheg_with_tweet_2023_03.tar.gz"
+)
+# Directory (relative to the current working directory) that receives the
+# archive and its extracted contents.
+RAW_DATA_DIR: str = "data/raw"
+ARCHIVE_NAME: str = "mocheg_with_tweet_2023_03.tar.gz"
+# Number of bytes requested per streamed chunk while downloading.
+CHUNK_SIZE: int = 16 * 1024 * 1024  # 16 MB
+# Ensure the raw data directory exists
+# NOTE: this runs at import time, not only when the script is executed.
+os.makedirs(RAW_DATA_DIR, exist_ok=True)
+archive_path: str = os.path.join(RAW_DATA_DIR, ARCHIVE_NAME)
+def check_disk_space(required_space_gb: int) -> bool:
+    """Check if there is enough free disk space."""
+    stat = os.statvfs(RAW_DATA_DIR)
+    free_space_gb: float = (stat.f_bavail * stat.f_frsize) / (1024**3)
+    return free_space_gb > required_space_gb
+def download_data() -> None:
+    """Download the data if not already present and extract it."""
+    # Check if the data file already exists
+    if os.path.exists(archive_path):
+        print(f"Data already downloaded at {archive_path}. Skipping download.")
+        return
+    # Ensure enough disk space (approximate)
+    required_space_gb: int = 80  # Adjust based on expected file size + extraction space
+    if not check_disk_space(required_space_gb):
+        print(f"Not enough disk space. At least {required_space_gb} GB required.")
+        return
+    # Download the data in larger chunks
+    print(f"Downloading data from {DATA_URL}...")
+    response = requests.get(DATA_URL, stream=True)
+    response.raise_for_status()  # Ensure the URL is accessible
+    total_size: int = int(response.headers.get("content-length", 0))
+    with open(archive_path, "wb") as file, tqdm(
+        desc=ARCHIVE_NAME,
+        total=total_size,
+        unit="B",
+        unit_scale=True,
+        unit_divisor=1024,
+    ) as progress_bar:
+        for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
+            if chunk:
+                file.write(chunk)
+                progress_bar.update(len(chunk))
+    print(f"Download completed: {archive_path}")
+    # Extract the tar.gz file
+    extract_data(archive_path)
+def extract_data(archive_path: str) -> None:
+    """Extract the downloaded tar.gz file."""
+    print(f"Extracting data from {archive_path}...")
+    with tarfile.open(archive_path, "r:gz") as tar:
+        tar.extractall(path=RAW_DATA_DIR)
+    print(f"Data extracted to {RAW_DATA_DIR}")
+# Script entry point: download (and then extract) the archive.
+if __name__ == "__main__":
+    download_data()

src/data_loader/download_images.py ADDED Viewed

	@@ -0,0 +1,168 @@

+import os
+import argparse
+import pandas as pd
+import requests
+import json
+import io
+from tqdm import tqdm
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from collections import defaultdict
+from PIL import Image
+from src.utils.data_utils import HEADERS
+from src.utils.path_utils import get_project_root
+# Constants
+PROJECT_ROOT = get_project_root()
+# Directory expected to contain the dataset CSV files ("<dataset>.csv").
+EXTRACTION_DIR = str(PROJECT_ROOT / "data/raw/factify/extracted")
+# Root folder for downloaded images; one sub-folder per dataset is created below it.
+IMAGES_DIR = os.path.join(EXTRACTION_DIR, "images")
+def ensure_directories(images_folder):
+    """Ensure the image directory exists.
+    Args:
+        images_folder: Directory to create (including missing parents); an
+            already existing directory is left untouched.
+    """
+    os.makedirs(images_folder, exist_ok=True)
+def download_image(url, save_path):
+    """Download a single image if not already downloaded."""
+    # Check if the image already exists
+    if os.path.exists(save_path):
+        print(f"Image already exists: {save_path}")
+        return True
+    headers = {
+        "User-Agent": (
+            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) "
+            "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"
+        )
+    }
+    try:
+        response = requests.get(url, headers=headers, stream=True, timeout=30)
+        response.raise_for_status()  # Raise an error for HTTP issues
+        img = Image.open(io.BytesIO(response.content))
+        img = img.convert("RGB")  # Ensure the image is in RGB format
+        img.save(save_path)
+        print(f"Downloaded and saved image: {save_path}")
+        return True
+    except Exception as e:
+        print(f"Failed to download image from {url}: {e}")
+        return False
+def process_image(row, images_folder, stats, dataset_name):
+    """Process claim and evidence image downloads."""
+    file_id = str(row["id"])
+    category = row.get("category", "Unknown")
+    claim_image_url = row.get("claim_image", "")
+    evidence_image_url = row.get("evidence_image", "")
+    # Ensure category stats exist
+    stats["categories"].setdefault(
+        category,
+        {
+            "total_claim": 0,
+            "successful_claim": 0,
+            "total_evidence": 0,
+            "successful_evidence": 0,
+        },
+    )
+    stats["categories"][category]["total_claim"] += 1
+    stats["categories"][category]["total_evidence"] += 1
+    # Download claim image
+    if claim_image_url:
+        success = download_image(
+            claim_image_url, os.path.join(images_folder, f"{file_id}_claim.jpg")
+        )
+        if success:
+            stats["successful_claim"] += 1
+            stats["categories"][category]["successful_claim"] += 1
+    # Download evidence image
+    if evidence_image_url:
+        success = download_image(
+            evidence_image_url, os.path.join(images_folder, f"{file_id}_evidence.jpg")
+        )
+        if success:
+            stats["successful_evidence"] += 1
+            stats["categories"][category]["successful_evidence"] += 1
+def download_images(dataset, use_threading):
+    """Download images for the specified dataset (train or test)."""
+    csv_path = os.path.join(EXTRACTION_DIR, f"{dataset}.csv")
+    images_folder = os.path.join(IMAGES_DIR, dataset)
+    stats_file_path = os.path.join(
+        EXTRACTION_DIR, f"{dataset}_image_download_stats.json"
+    )
+    ensure_directories(images_folder)
+    if not os.path.exists(csv_path):
+        print(f"CSV file not found for {dataset}: {csv_path}")
+        return
+    stats = {
+        "successful_claim": 0,
+        "successful_evidence": 0,
+        "categories": defaultdict(
+            lambda: {
+                "total_claim": 0,
+                "successful_claim": 0,
+                "total_evidence": 0,
+                "successful_evidence": 0,
+            }
+        ),
+    }
+    df = pd.read_csv(csv_path, names=HEADERS, header=None, sep="\t", skiprows=1)
+    if use_threading:
+        with ThreadPoolExecutor(max_workers=10) as executor:
+            futures = [
+                executor.submit(process_image, row, images_folder, stats, dataset)
+                for _, row in df.iterrows()
+            ]
+            for _ in tqdm(
+                as_completed(futures),
+                total=len(futures),
+                desc=f"Downloading {dataset} images",
+            ):
+                pass
+    else:
+        for _, row in tqdm(
+            df.iterrows(), total=len(df), desc=f"Downloading {dataset} images"
+        ):
+            process_image(row, images_folder, stats, dataset)
+    with open(stats_file_path, "w") as stats_file:
+        json.dump(stats, stats_file, indent=4)
+    print(f"Image download stats saved to {stats_file_path}")
+def main():
+    parser = argparse.ArgumentParser(description="Download images for Factify dataset.")
+    parser.add_argument(
+        "--dataset",
+        choices=["train", "test"],
+        help="Specify which dataset to download images for (train or test). If not specified, both will be downloaded.",
+    )
+    parser.add_argument(
+        "--use-threading",
+        action="store_true",
+        default=True,
+        help="Enable threading for image downloads (default: True).",
+    )
+    args = parser.parse_args()
+    if args.dataset:
+        # Run for the specified dataset
+        download_images(args.dataset, args.use_threading)
+    else:
+        # Run for both train and test if no dataset is specified
+        print("No dataset specified. Downloading images for both train and test...")
+        for dataset in ["train", "test"]:
+            download_images(dataset, args.use_threading)
+if __name__ == "__main__":
+    main()

src/data_loader/preprocess_embeddings.py ADDED Viewed

	@@ -0,0 +1,129 @@

+import h5py
+import torch
+import logging
+from tqdm import tqdm
+from transformers import AutoTokenizer, AutoModel, Swinv2Model
+logger = logging.getLogger(__name__)
+@torch.no_grad()
+def create_embeddings_h5(input_h5_path, output_h5_path, batch_size=32, device="cuda"):
+    """
+    Pre-compute text and image encoder outputs for every sample of an H5 file.
+    Sample groups are looked up by the string form of their index ("0", "1", ...).
+    For each one the claim and document texts are encoded with DeBERTa-v3-xsmall and
+    the claim and document images with SwinV2; the per-sample ``last_hidden_state``
+    arrays are written, together with the unchanged labels, to a group of the same
+    name in the output file.
+    Args:
+        input_h5_path (str): Path to the input H5 file with raw data.
+        output_h5_path (str): Path of the H5 file to create (opened in "w" mode).
+        batch_size (int): Number of samples encoded per forward pass.
+        device (str): Torch device on which both encoders run.
+    """
+    logger.info(f"Creating embeddings H5 file from {input_h5_path}")
+    # Load the tokenizer and both encoders, then switch the encoders to eval mode
+    text_model_name = "microsoft/deberta-v3-xsmall"
+    tokenizer = AutoTokenizer.from_pretrained(text_model_name)
+    text_encoder = AutoModel.from_pretrained(text_model_name).to(device)
+    image_encoder = Swinv2Model.from_pretrained(
+        "microsoft/swinv2-base-patch4-window8-256"
+    ).to(device)
+    text_encoder.eval()
+    image_encoder.eval()
+    def embed_texts(texts):
+        # Pad/truncate to exactly 512 tokens so every stored array has the same length
+        encoded = tokenizer(
+            texts,
+            truncation=True,
+            padding="max_length",
+            return_tensors="pt",
+            max_length=512,
+        ).to(device)
+        return text_encoder(**encoded).last_hidden_state
+    def embed_images(arrays):
+        pixels = torch.stack([torch.from_numpy(array) for array in arrays]).to(device)
+        return image_encoder(pixels).last_hidden_state
+    with h5py.File(input_h5_path, "r") as in_f, h5py.File(output_h5_path, "w") as out_f:
+        total_samples = len(in_f.keys())
+        for batch_start in tqdm(range(0, total_samples, batch_size)):
+            sample_ids = range(
+                batch_start, min(batch_start + batch_size, total_samples)
+            )
+            samples = [in_f[str(idx)] for idx in sample_ids]
+            labels = [sample["labels"][()] for sample in samples]
+            # Text embeddings with a fixed sequence length
+            claim_text_embeds = embed_texts(
+                [sample["claim"][()].decode() for sample in samples]
+            )
+            doc_text_embeds = embed_texts(
+                [sample["document"][()].decode() for sample in samples]
+            )
+            assert (
+                claim_text_embeds.shape[1] == 512
+            ), f"Unexpected claim text shape: {claim_text_embeds.shape}"
+            assert (
+                doc_text_embeds.shape[1] == 512
+            ), f"Unexpected doc text shape: {doc_text_embeds.shape}"
+            # Image embeddings
+            claim_image_embeds = embed_images(
+                [sample["claim_image"][()] for sample in samples]
+            )
+            doc_image_embeds = embed_images(
+                [sample["document_image"][()] for sample in samples]
+            )
+            # Dataset name -> batched tensor; insertion order fixes the write order
+            batched_outputs = {
+                "claim_text_embeds": claim_text_embeds,
+                "doc_text_embeds": doc_text_embeds,
+                "claim_image_embeds": claim_image_embeds,
+                "doc_image_embeds": doc_image_embeds,
+            }
+            for offset, idx in enumerate(sample_ids):
+                sample_group = out_f.create_group(str(idx))
+                for dataset_name, batch_tensor in batched_outputs.items():
+                    sample_group.create_dataset(
+                        dataset_name, data=batch_tensor[offset].cpu().numpy()
+                    )
+                sample_group.create_dataset("labels", data=labels[offset])
+    logger.info(f"Created embeddings H5 file at {output_h5_path}")
+# Script entry point: build the embeddings file for the training split.
+if __name__ == "__main__":
+    # Set up logging
+    logging.basicConfig(level=logging.INFO)
+    # Example usage
+    # Both paths are relative, so they resolve against the current working directory;
+    # the run targets the first CUDA device ("cuda:0").
+    create_embeddings_h5(
+        input_h5_path="data/preprocessed/train.h5",
+        output_h5_path="data/preprocessed/train_embeddings.h5",
+        batch_size=32,
+        device="cuda:0",
+    )

src/demo/__init__.py ADDED Viewed

File without changes

src/demo/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (195 Bytes). View file

src/demo/__pycache__/app.cpython-311.pyc ADDED Viewed

Binary file (16.5 kB). View file

src/demo/app.py ADDED Viewed

	@@ -0,0 +1,446 @@

+import streamlit as st
+from PIL import Image
+from transformers import BlipProcessor, BlipForConditionalGeneration
+import pandas as pd
+import os
+from evaluate import MisinformationPredictor
+from src.evidence.im2im_retrieval import ImageCorpus
+from src.evidence.text2text_retrieval import SemanticSimilarity
+from src.utils.path_utils import get_project_root
+from typing import List, Optional, Tuple
+from dataclasses import dataclass
+# Initialize BLIP model and processor
+@st.cache_resource
+def _load_blip():
+    """
+    Load the BLIP captioning processor and model once per server process.
+    Streamlit re-executes this script on every user interaction; caching the loaded
+    objects as a resource avoids re-instantiating the checkpoint on each rerun.
+    Returns:
+        Tuple[BlipProcessor, BlipForConditionalGeneration]: The processor and model.
+    """
+    checkpoint = "Salesforce/blip-image-captioning-large"
+    return (
+        BlipProcessor.from_pretrained(checkpoint),
+        BlipForConditionalGeneration.from_pretrained(checkpoint),
+    )
+# Module-level names kept so generate_caption() can keep using them unchanged
+processor, model = _load_blip()
+PROJECT_ROOT = get_project_root()
+# One retrieved piece of evidence, plus the verdicts attached to it once main()
+# has classified it against the claim.
+@dataclass
+class Evidence:
+    # Numeric part of the corpus identifier, kept as a string
+    evidence_id: str
+    # Split the evidence came from ("train"/"test" for text retrieval; the parent
+    # directory name of the feature path for image retrieval)
+    dataset: str
+    # Value of the row's "evidence_enriched" column
+    text: Optional[str]
+    # Evidence image loaded as RGB, or None when missing or unreadable
+    image: Optional[Image.Image]
+    # Value of the row's "evidence_image_caption" column
+    caption: Optional[str]
+    # PROJECT_ROOT joined with the row's "evidence_image" value, or None
+    image_path: Optional[str]
+    # Verdicts ordered as (text|text, text|image, image|text, image|image)
+    classification_result_all: Optional[Tuple[str, str, str, str]] = None
+    # Single verdict produced by get_final_classification()
+    classification_result_final: Optional[str] = None
+# The three verdict labels that classify_evidence() and get_final_classification()
+# work with. NOTE(review): the constant itself is not referenced in the visible part
+# of this module; those functions spell the labels out as literals.
+CLASSIFICATION_CATEGORIES = ["support", "refute", "not_enough_information"]
+def generate_caption(image: Image.Image) -> str:
+    """
+    Caption an image with the module-level BLIP processor and model.
+    Args:
+        image: The image to describe.
+    Returns:
+        str: The decoded caption, or an empty string when captioning fails (the
+        error is reported through ``st.error``).
+    """
+    try:
+        with st.spinner("Generating caption..."):
+            encoded = processor(image, return_tensors="pt")
+            generated_ids = model.generate(**encoded)
+            caption = processor.decode(generated_ids[0], skip_special_tokens=True)
+    except Exception as e:
+        st.error(f"Error generating caption: {e}")
+        return ""
+    return caption
+def enrich_text_with_caption(text: str, image_caption: str) -> str:
+    """
+    Append the image caption to the claim text as an extra sentence.
+    Args:
+        text: The claim text; may be empty for an image-only claim.
+        image_caption: Caption generated for the claim image; may be empty.
+    Returns:
+        str: ``"{text}. {image_caption}"`` when both parts are present, otherwise
+        whichever part is available (``text`` unchanged when there is no caption).
+    """
+    if not image_caption:
+        return text
+    # An image-only claim has no text: return the bare caption instead of ". caption"
+    if not text or not text.strip():
+        return image_caption
+    return f"{text}. {image_caption}"
+@st.cache_data
+def get_train_df():
+    """Read data/preprocessed/train_enriched.csv under PROJECT_ROOT into a DataFrame."""
+    csv_path = os.path.join(
+        PROJECT_ROOT, "data", "preprocessed", "train_enriched.csv"
+    )
+    return pd.read_csv(csv_path)
+@st.cache_data
+def get_test_df():
+    """Read data/preprocessed/test_enriched.csv under PROJECT_ROOT into a DataFrame."""
+    csv_path = os.path.join(
+        PROJECT_ROOT, "data", "preprocessed", "test_enriched.csv"
+    )
+    return pd.read_csv(csv_path)
+# cache_resource (not cache_data): the returned object wraps loaded models and should
+# be shared as-is between reruns rather than serialised and copied on every call.
+@st.cache_resource
+def get_semantic_similarity(
+    train_embeddings_file: str,
+    test_embeddings_file: str,
+    train_df: pd.DataFrame,
+    test_df: pd.DataFrame,
+):
+    """
+    Build (once per distinct argument set) the text retrieval engine.
+    Args:
+        train_embeddings_file: Path to the H5 file with training-set embeddings.
+        test_embeddings_file: Path to the H5 file with test-set embeddings.
+        train_df: Enriched training DataFrame.
+        test_df: Enriched test DataFrame.
+    Returns:
+        SemanticSimilarity: The cached retrieval object.
+    """
+    return SemanticSimilarity(
+        train_embeddings_file=train_embeddings_file,
+        test_embeddings_file=test_embeddings_file,
+        train_df=train_df,
+        test_df=test_df,
+    )
+def retrieve_evidences_by_text(
+    query: str,
+    top_k: int = 5,
+) -> List[Evidence]:
+    """
+    Retrieve evidence rows whose text is semantically similar to the query.
+    A hit that cannot be turned into an Evidence (unknown ID, unreadable row) is
+    reported and skipped, so one bad hit no longer discards the remaining ones.
+    Args:
+        query (str): The query text to perform the search.
+        top_k (int): Number of top results requested from the search.
+    Returns:
+        List[Evidence]: The retrieved evidence objects; empty when the search fails.
+    """
+    similarity = get_semantic_similarity(
+        train_embeddings_file=os.path.join(PROJECT_ROOT, "train_embeddings.h5"),
+        test_embeddings_file=os.path.join(PROJECT_ROOT, "test_embeddings.h5"),
+        train_df=get_train_df(),
+        test_df=get_test_df(),
+    )
+    try:
+        # Perform semantic search across both train and test datasets
+        results = similarity.search(query=query, top_k=top_k)
+    except Exception as e:
+        st.error(f"Error performing semantic search: {e}")
+        return []
+    frames = {"train": similarity.train_csv, "test": similarity.test_csv}
+    evidences = []
+    for hit in results:
+        try:
+            evidence_id, _score = hit
+            # IDs look like "train_ID" / "test_ID"; anything else is skipped
+            evidence_dataset, separator, _ = evidence_id.partition("_")
+            if not separator or evidence_dataset not in frames:
+                continue
+            evidence_id_number = evidence_id.split("_")[1]
+            df = frames[evidence_dataset]
+            matches = df[df["id"] == int(evidence_id_number)]
+            if matches.empty:
+                st.warning(f"No row found for evidence ID {evidence_id}; skipping.")
+                continue
+            row = matches.iloc[0]
+            evidence_image_path = row.get("evidence_image")
+            evidence_image = None
+            full_image_path = None
+            # Load the image if a valid path is provided
+            if pd.notna(evidence_image_path):
+                full_image_path = os.path.join(PROJECT_ROOT, evidence_image_path)
+                try:
+                    evidence_image = Image.open(full_image_path).convert("RGB")
+                except Exception as e:
+                    st.error(f"Failed to load image {evidence_image_path}: {e}")
+            evidences.append(
+                Evidence(
+                    text=row.get("evidence_enriched"),
+                    image=evidence_image,
+                    caption=row.get("evidence_image_caption"),
+                    evidence_id=evidence_id_number,
+                    dataset=evidence_dataset,
+                    image_path=full_image_path,
+                )
+            )
+        except Exception as e:
+            # Keep going: the remaining hits may still be usable
+            st.error(f"Error performing semantic search: {e}")
+    return evidences
+# cache_resource (not cache_data): the corpus object owns a feature-extraction model
+# and should be shared between reruns instead of being serialised and copied.
+@st.cache_resource
+def get_image_corpus(image_features):
+    """
+    Build (once per distinct path) the image retrieval corpus.
+    Args:
+        image_features: Path to the pickle file holding the image feature dictionary.
+    Returns:
+        ImageCorpus: The cached corpus object.
+    """
+    return ImageCorpus(image_features)
+def retrieve_evidences_by_image(
+    image_path: str,
+    top_k: int = 5,
+) -> List[Evidence]:
+    """
+    Retrieve evidence rows whose image is visually similar to the query image.
+    A hit that cannot be turned into an Evidence is reported and skipped, so one bad
+    hit no longer discards the remaining ones.
+    Args:
+        image_path: The query image, in any form accepted by
+            ``ImageCorpus.retrieve_similar_images`` (main() passes the uploaded
+            file object).
+        top_k (int): Number of top results to retrieve.
+    Returns:
+        List[Evidence]: The retrieved evidence objects; empty when the search fails.
+    """
+    image_corpus = get_image_corpus(
+        os.path.join(PROJECT_ROOT, "evidence_features.pkl")
+    )
+    try:
+        results = image_corpus.retrieve_similar_images(image_path, top_k=top_k)
+    except Exception as e:
+        st.error(f"Error performing semantic search: {e}")
+        return []
+    evidences = []
+    for hit in results:
+        try:
+            evidence_path, _score = hit
+            # Corpus keys may have been written with either path separator
+            path_parts = evidence_path.replace("\\", "/").split("/")
+            evidence_id_number = path_parts[-1].split("_")[0]
+            # Determine whether the ID belongs to train or test set
+            if "train" in evidence_path:
+                df = get_train_df()
+            elif "test" in evidence_path:
+                df = get_test_df()
+            else:
+                continue  # Skip invalid IDs
+            matches = df[df["id"] == int(evidence_id_number)]
+            if matches.empty:
+                st.warning(f"No row found for evidence {evidence_path}; skipping.")
+                continue
+            row = matches.iloc[0]
+            evidence_image_path = row.get("evidence_image")
+            evidence_image = None
+            full_image_path = None
+            # Load the image if a valid path is provided
+            if pd.notna(evidence_image_path):
+                full_image_path = os.path.join(PROJECT_ROOT, evidence_image_path)
+                try:
+                    evidence_image = Image.open(full_image_path).convert("RGB")
+                except Exception as e:
+                    st.error(f"Failed to load image {evidence_image_path}: {e}")
+            evidences.append(
+                Evidence(
+                    text=row.get("evidence_enriched"),
+                    image=evidence_image,
+                    caption=row.get("evidence_image_caption"),
+                    dataset=path_parts[-2],
+                    evidence_id=evidence_id_number,
+                    image_path=full_image_path,
+                )
+            )
+        except Exception as e:
+            # Keep going: the remaining hits may still be usable
+            st.error(f"Error performing semantic search: {e}")
+    return evidences
+@st.cache_resource
+def get_predictor():
+    """Create the CPU misinformation predictor from ckpts/model.pt (cached by Streamlit)."""
+    predictor = MisinformationPredictor(model_path="ckpts/model.pt", device="cpu")
+    return predictor
+def classify_evidence(
+    claim_text: str, claim_image_path: str, evidence_text: str, evidence_image_path: str
+) -> Tuple[str, str, str, str]:
+    """
+    Classify one claim/evidence pair with the cached misinformation predictor.
+    Args:
+        claim_text: Text of the claim.
+        claim_image_path: Image of the claim, passed through to the predictor.
+        evidence_text: Text of the evidence.
+        evidence_image_path: Path of the evidence image, passed through to the predictor.
+    Returns:
+        Tuple[str, str, str, str]: Verdicts ordered as (text_text, text_image,
+        image_text, image_image). Any verdict the predictor does not supply, or all
+        four when it returns nothing, is "not_enough_information".
+    """
+    fallback = "not_enough_information"
+    predictions = get_predictor().evaluate(
+        claim_text, claim_image_path, evidence_text, evidence_image_path
+    )
+    # An empty/None result is treated like a result with every key missing
+    predictions = predictions or {}
+    return (
+        predictions.get("text_text", fallback),
+        predictions.get("text_image", fallback),
+        predictions.get("image_text", fallback),
+        predictions.get("image_image", fallback),
+    )
+def display_evidence_tab(evidences: List[Evidence], tab_label: str):
+    """
+    Render every evidence of one tab: header, image, caption, text and verdicts.
+    Args:
+        evidences: The evidence objects to show, in display order.
+        tab_label: Tab name, used to keep the widget keys unique across tabs.
+    """
+    verdict_labels = ("text|text", "text|image", "image|text", "image|image")
+    with st.container():
+        for index, evidence in enumerate(evidences):
+            with st.container():
+                st.subheader(f"Evidence {index + 1}")
+                st.write(f"Evidence Dataset: {evidence.dataset}")
+                st.write(f"Evidence ID: {evidence.evidence_id}")
+                if evidence.image:
+                    st.image(
+                        evidence.image,
+                        caption="Evidence Image",
+                        use_container_width=True,
+                    )
+                # (widget title, displayed value, key prefix) for the two read-only boxes
+                read_only_boxes = (
+                    (
+                        "Evidence Caption",
+                        evidence.caption or "No caption available.",
+                        "caption",
+                    ),
+                    ("Evidence Text", evidence.text or "No text available.", "text"),
+                )
+                for title, shown_value, key_prefix in read_only_boxes:
+                    st.text_area(
+                        title,
+                        value=shown_value,
+                        height=100,
+                        key=f"{key_prefix}_{tab_label}_{index}",
+                        disabled=True,
+                    )
+                verdicts = evidence.classification_result_all
+                if not verdicts:
+                    continue
+                st.write("**Classification:**")
+                for position, label in enumerate(verdict_labels):
+                    st.write(f"**{label}:** {verdicts[position]}")
+                st.write(
+                    f"**Final classification result:** {evidence.classification_result_final}"
+                )
+def get_final_classification(results: Tuple[str, str, str, str]) -> str:
+    """
+    Collapse the four per-modality verdicts into a single label.
+    The (text|text, image|image) pair is consulted first and the
+    (text|image, image|text) pair second. A pair is decisive when both members give
+    the same "support"/"refute" verdict, or when one gives such a verdict and the
+    other is "not_enough_information"; the first decisive pair wins.
+    Args:
+        results: Verdicts ordered as (text|text, text|image, image|text, image|image).
+    Returns:
+        str: "support", "refute" or "not_enough_information".
+    """
+    undecided = "not_enough_information"
+    decisive = {"support", "refute"}
+    def resolve_pair(first: str, second: str) -> str:
+        if first in decisive:
+            # Keep the verdict unless the partner contradicts it
+            if second == first or second == undecided:
+                return first
+            return undecided
+        if first == undecided and second in decisive:
+            return second
+        return undecided
+    verdict_pairs = (
+        (results[0], results[3]),
+        (results[1], results[2]),
+    )
+    for first, second in verdict_pairs:
+        outcome = resolve_pair(first, second)
+        if outcome != undecided:
+            return outcome
+    return undecided
+def main():
+    """
+    Render the Streamlit page and run the claim-verification pipeline.
+    The page collects an optional image, optional text and the two top-k sliders.
+    Pressing "Verify Claim" captions the image, enriches the text with the caption,
+    retrieves evidences by text and by image, classifies every evidence against the
+    claim and shows the results in two tabs. An upload that cannot be decoded is
+    reported and then treated as if no image had been provided.
+    """
+    st.title("Multimodal Evidence-Based Misinformation Classification")
+    st.write("Upload claims that have image and/or text content to verify.")
+    # File uploader for images
+    uploaded_image = st.file_uploader(
+        "Upload an image (1 max)", type=["jpg", "jpeg", "png"], key="image_uploader"
+    )
+    # Stays None when nothing was uploaded or the upload could not be decoded, so the
+    # pipeline below never reads an unbound name
+    image = None
+    if uploaded_image:
+        try:
+            image = Image.open(uploaded_image).convert("RGB")
+            st.image(image, caption="Uploaded Image", use_container_width=True)
+        except Exception as e:
+            st.error(f"Failed to display the image: {e}")
+    has_image = image is not None
+    # Text input field
+    input_text = st.text_area("Enter text (max 4096 characters)", "", max_chars=4096)
+    # Sliders for top_k values
+    col1, col2 = st.columns(2)
+    with col1:
+        top_k_text = st.slider(
+            "Top-k Text Evidences", min_value=1, max_value=5, value=2, key="top_k_text"
+        )
+    with col2:
+        top_k_image = st.slider(
+            "Top-k Image Evidences",
+            min_value=1,
+            max_value=5,
+            value=2,
+            key="top_k_image",
+        )
+    # Nothing more to do until the user starts the verification
+    if not st.button("Verify Claim"):
+        return
+    if not has_image and not input_text:
+        st.warning("Please upload an image or enter text.")
+        return
+    progress = st.progress(0)
+    # Step 1: Generate caption
+    progress.progress(10)
+    st.write("### Step 1: Generating caption...")
+    image_caption = ""
+    if has_image:
+        image_caption = generate_caption(image)
+        st.write("**Generated Image Caption:**", image_caption)
+    # Step 2: Enrich text
+    progress.progress(40)
+    st.write("### Step 2: Enriching text...")
+    enriched_text = enrich_text_with_caption(input_text, image_caption)
+    st.write("**Enriched Text:**")
+    st.write(enriched_text)
+    # Step 3: Retrieve evidences by text
+    progress.progress(50)
+    st.write("### Step 3: Retrieving evidences by text...")
+    if input_text:
+        text_evidences = retrieve_evidences_by_text(enriched_text, top_k=top_k_text)
+        st.write(f"Retrieved {len(text_evidences)} text evidences.")
+    else:
+        text_evidences = None
+        st.write("Text modality is missing from the input claim!")
+    # Step 4: Retrieve evidences by image
+    progress.progress(70)
+    st.write("### Step 4: Retrieving evidences by image...")
+    if has_image:
+        image_evidences = retrieve_evidences_by_image(
+            uploaded_image, top_k=top_k_image
+        )
+        st.write(f"Retrieved {len(image_evidences)} image evidences.")
+    else:
+        image_evidences = None
+        st.write("Image modality is missing from the input claim!")
+    # Step 5: Classify evidences
+    progress.progress(90)
+    st.write("### Step 5: Verifying claim with retrieved evidences...")
+    claim_image = uploaded_image if has_image else None
+    for evidence in (text_evidences or []) + (image_evidences or []):
+        evidence.classification_result_all = classify_evidence(
+            claim_text=enriched_text,
+            claim_image_path=claim_image,
+            evidence_text=evidence.text,
+            evidence_image_path=evidence.image_path,
+        )
+        evidence.classification_result_final = get_final_classification(
+            evidence.classification_result_all
+        )
+    # Step 6: Display evidences
+    progress.progress(100)
+    if text_evidences or image_evidences:
+        st.write("## Results")
+        tabs = st.tabs(["Text Evidences", "Image Evidences"])
+        with tabs[0]:
+            if text_evidences:
+                st.write("### Text Evidences")
+                display_evidence_tab(text_evidences, "text")
+            else:
+                st.write("Text modality is missing from the input claim!")
+        with tabs[1]:
+            if image_evidences:
+                st.write("### Image Evidences")
+                display_evidence_tab(image_evidences, "image")
+            else:
+                st.write("Image modality is missing from the input claim!")
+# Script entry point: main() builds the page when this file is run as a script.
+if __name__ == "__main__":
+    main()

src/evidence/__init__.py ADDED Viewed

File without changes

src/evidence/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (199 Bytes). View file

src/evidence/__pycache__/corpus_utils.cpython-311.pyc ADDED Viewed

Binary file (4.06 kB). View file

src/evidence/__pycache__/im2im_retrieval.cpython-311.pyc ADDED Viewed

Binary file (10.3 kB). View file

src/evidence/__pycache__/text2text_retrieval.cpython-311.pyc ADDED Viewed

Binary file (10.3 kB). View file

src/evidence/corpus_utils.py ADDED Viewed

	@@ -0,0 +1,100 @@

+import os
+import shutil
+from src.utils.path_utils import get_project_root
+def separate_evidence_images(base_dir):
+    """
+    Copy the evidence images of the train and test folders into 'evidence_corpus'.
+    A file counts as an evidence image when the part of its name after the last
+    underscore and before the first following dot is "evidence". Each copy is
+    prefixed with its dataset name (e.g. "train_" or "test_") so that files from
+    the two folders cannot overwrite each other.
+    Args:
+        base_dir (str): The base directory containing the 'train' and 'test' folders;
+            'evidence_corpus' is created inside it if it does not exist.
+    """
+    evidence_corpus_dir = os.path.join(base_dir, "evidence_corpus")
+    # Create the evidence_corpus directory if it doesn't exist
+    os.makedirs(evidence_corpus_dir, exist_ok=True)
+    for dataset in ("train", "test"):
+        dataset_dir = os.path.join(base_dir, dataset)
+        for filename in os.listdir(dataset_dir):
+            if filename.split("_")[-1].split(".")[0] != "evidence":
+                continue
+            shutil.copy(
+                os.path.join(dataset_dir, filename),
+                os.path.join(evidence_corpus_dir, f"{dataset}_{filename}"),
+            )
+    print("All evidence images in the train and test sets have been copied.")
+import pickle
+# File path for the evidence features pickle
+# (a relative path, so it resolves against the current working directory;
+# the __main__ section of this file rebinds the name before use)
+pickle_file_path = "evidence_features.pkl"
+# Function to update the keys in the pickle
+def update_pickle_keys(pickle_file_path, output_pickle_path=None):
+    """
+    Rewrite the keys of a feature pickle to dataset-relative image paths.
+    Every key is reduced to its file name, which must look like "train_REST" or
+    "test_REST"; the new key is data/raw/factify/extracted/images/DATASET/REST
+    joined with the platform's path separator. Values are left untouched.
+    Args:
+        pickle_file_path (str): Pickle file holding a dict of path -> features.
+        output_pickle_path (str, optional): Where to save the result; the input
+            file is overwritten when omitted.
+    Raises:
+        ValueError: If a file name does not start with "test" or "train", or has no
+            underscore separating the dataset prefix from the rest of the name.
+    """
+    # NOTE: pickle can execute arbitrary code on load; only use trusted files here
+    with open(pickle_file_path, "rb") as f:
+        feature_dict = pickle.load(f)
+    updated_dict = {}
+    for old_path, features in feature_dict.items():
+        # Extract the filename (e.g., test_0_evidence.jpg)
+        filename = os.path.basename(old_path)
+        dataset = next(
+            (name for name in ("test", "train") if filename.startswith(name)), None
+        )
+        _, separator, remainder = filename.partition("_")
+        if dataset is None or not separator:
+            raise ValueError(f"Unexpected filename format: {filename}")
+        new_relative_path = os.path.join(
+            "data", "raw", "factify", "extracted", "images", dataset, remainder
+        )
+        updated_dict[new_relative_path] = features
+    # Save the updated dictionary back to a pickle file
+    output_path = output_pickle_path if output_pickle_path else pickle_file_path
+    with open(output_path, "wb") as f:
+        pickle.dump(updated_dict, f)
+    print(f"Updated pickle saved at: {output_path}")
+# Example usage
+if __name__ == "__main__":
+    # Only consumed by the commented-out update_pickle_keys() call at the bottom
+    pickle_file_path = "/evidence_features.pkl"
+    project_root = get_project_root()
+    # Run the function
+    # (base_dir is the images folder that holds the 'train' and 'test' sub-folders)
+    base_dir = os.path.join(
+        project_root, "data", "raw", "factify", "extracted", "images"
+    )
+    separate_evidence_images(base_dir)
+    # Key rewriting is disabled; the path below is machine-specific.
+    # out_pkl_path = "C:\\Users\\defne\\Desktop\\2024-2025FallSemester\\Applied NLP\\multimodal-misinformation-detection\\data\\raw\\factify\\extracted\\images"
+    # update_pickle_keys(pickle_file_path, output_pickle_path=out_pkl_path)

src/evidence/im2im_retrieval.py ADDED Viewed

	@@ -0,0 +1,169 @@

+import os.path
+from torchvision.models import resnet50
+from torchvision.transforms import transforms
+from PIL import Image
+import torch.nn as nn
+import torch
+import pickle
+import matplotlib.pyplot as plt
+from src.utils.path_utils import get_project_root
+class ImageSimilarity:
+    """ResNet-50 feature extractor with a cosine-similarity helper."""
+    def __init__(self):
+        """Build the headless ResNet-50 and the matching input preprocessing."""
+        backbone = resnet50(weights="DEFAULT")
+        # Every child module except the last one, i.e. without the classification layer
+        feature_layers = list(backbone.children())[:-1]
+        self.model = nn.Sequential(*feature_layers)
+        self.model.eval()
+        normalize = transforms.Normalize(
+            mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
+        )
+        self.transform = transforms.Compose(
+            [transforms.Resize((224, 224)), transforms.ToTensor(), normalize]
+        )
+    def extract_features(self, image_stream):
+        """
+        Return the flattened model output for one image.
+        Args:
+            image_stream: Anything ``PIL.Image.open`` accepts (path or file object).
+        """
+        rgb_image = Image.open(image_stream).convert("RGB")
+        batch = self.transform(rgb_image).unsqueeze(0)
+        with torch.no_grad():
+            return self.model(batch).flatten()
+    def similarity(self, features1, features2):
+        """Return the cosine similarity of two 1-D feature tensors as a float."""
+        score = nn.functional.cosine_similarity(
+            features1.unsqueeze(0), features2.unsqueeze(0), dim=1, eps=1e-6
+        )
+        return score.item()
+class ImageCorpus:
+    """Pickle-backed mapping from image path to feature vector, with similarity search."""
+    def __init__(self, feature_corpus_path):
+        """
+        Load the stored features and create the feature extractor.
+        Args:
+            feature_corpus_path: Path of the pickle file holding the feature dict.
+        """
+        self.feature_corpus_path = feature_corpus_path
+        self.feature_dict = self.load_features()
+        self.feature_extractor = ImageSimilarity()
+    def load_features(self):
+        """
+        Load the feature dictionary from ``feature_corpus_path``.
+        Returns:
+            dict: The stored mapping, or an empty dict when the file is empty or
+            cannot be unpickled.
+        Raises:
+            FileNotFoundError: If the pickle file does not exist.
+        """
+        try:
+            with open(self.feature_corpus_path, "rb") as f:
+                # NOTE: pickle can execute arbitrary code; only load trusted corpus files
+                return pickle.load(f)
+        except (EOFError, pickle.UnpicklingError):
+            print(
+                "Warning: Pickle file is empty or corrupted. Initializing empty feature dict."
+            )
+            # Return the empty dict the warning promises (previously None was returned,
+            # which broke every later use of feature_dict)
+            return {}
+    def save_features(self):
+        """Write the current feature dictionary back to ``feature_corpus_path``."""
+        with open(self.feature_corpus_path, "wb") as f:
+            pickle.dump(self.feature_dict, f)
+    def add_image(self, image_path):
+        """Extract features for one image, store them under its path and save."""
+        self.feature_dict[image_path] = self.feature_extractor.extract_features(
+            image_path
+        )
+        self.save_features()
+    def create_feature_corpus(self, image_dir):
+        """
+        Extract and store features for every .png/.jpg/.jpeg file directly inside
+        ``image_dir``, then save the dictionary once at the end.
+        """
+        for image_name in os.listdir(image_dir):
+            image_path = os.path.join(image_dir, image_name)
+            if os.path.isfile(image_path) and image_path.lower().endswith(
+                (".png", ".jpg", ".jpeg")
+            ):
+                features = self.feature_extractor.extract_features(image_path)
+                self.feature_dict[image_path] = features
+        self.save_features()
+    def retrieve_similar_images(self, query_image_path, top_k=50):
+        """
+        Rank the corpus images by cosine similarity to the query image.
+        Args:
+            query_image_path: The query image, as accepted by the feature extractor.
+            top_k: Maximum number of results to return.
+        Returns:
+            list: ``(image_path, score)`` pairs in descending score order. Entries
+            whose score equals an already returned score are dropped, which is how
+            identical images are filtered out.
+        """
+        query_features = self.feature_extractor.extract_features(query_image_path)
+        similarity_scores = {
+            image_name: self.feature_extractor.similarity(
+                query_features, corpus_feature
+            )
+            for image_name, corpus_feature in self.feature_dict.items()
+        }
+        ranked_images = sorted(
+            similarity_scores.items(), key=lambda x: x[1], reverse=True
+        )
+        # Filter out identical images (based on scores)
+        seen_scores = set()
+        filtered_images = []
+        for image_path, score in ranked_images:
+            if score not in seen_scores:
+                seen_scores.add(score)
+                filtered_images.append((image_path, score))
+            if len(filtered_images) == top_k:  # Stop once we have top_k unique images
+                break
+        return filtered_images
+def visualize_retrieved_images(query_image_path, top_retrievals):
+    """
+    Plot the query image above a grid of the retrieved images.
+    Args:
+        query_image_path: Path of the query image.
+        top_retrievals: ``(image_path, score)`` pairs in rank order; each path is
+            resolved against the project root.
+    """
+    cols = 5
+    project_base = get_project_root()
+    query_image = Image.open(query_image_path).convert("RGB")
+    retrieved_images = []
+    for img_path, score in top_retrievals:
+        picture = Image.open(os.path.join(project_base, img_path)).convert("RGB")
+        retrieved_images.append((picture, score))
+    # One row for the query plus one row per started group of five retrievals
+    rows = 2 + (len(retrieved_images) - 1) // 5
+    plt.figure(figsize=(20, rows * 4))
+    # The query image sits in the middle cell of the first row
+    plt.subplot(rows, cols, (cols // 2) + 1)
+    plt.imshow(query_image)
+    plt.title("Query Image", fontsize=12)
+    plt.axis("off")
+    # Retrieved images fill the grid from the second row onwards
+    for rank, (picture, score) in enumerate(retrieved_images, start=1):
+        plt.subplot(rows, cols, cols + rank)
+        plt.imshow(picture)
+        plt.title(f"Rank: {rank}\nScore: {score:.4f}", fontsize=10)
+        plt.axis("off")
+    plt.tight_layout()
+    plt.show()
+# Manual smoke test: load the stored features, retrieve the five corpus images most
+# similar to one training claim image, print them and plot them.
+if __name__ == "__main__":
+    project_root = get_project_root()
+    image_feature = os.path.join(project_root, "evidence_features.pkl")
+    # image_dir is only needed by the commented-out create_feature_corpus() call below
+    image_dir = os.path.join(
+        project_root, "data", "raw", "factify", "extracted", "images", "evidence_corpus"
+    )  # Replace with your base directory path
+    query_image_path = os.path.join(
+        project_root,
+        "data",
+        "raw",
+        "factify",
+        "extracted",
+        "images",
+        "train",
+        "1_claim.jpg",
+    )
+    image_corpus = ImageCorpus(image_feature)
+    # corpus = image_corpus.create_feature_corpus(image_dir)
+    # Print one stored key to show the path format used in the corpus
+    print(list(image_corpus.feature_dict.keys())[0])
+    top_retrievals = image_corpus.retrieve_similar_images(query_image_path, top_k=5)
+    print(top_retrievals)
+    visualize_retrieved_images(query_image_path, top_retrievals)

src/evidence/text2text_retrieval.py ADDED Viewed

	@@ -0,0 +1,203 @@

+import h5py
+from sentence_transformers import SentenceTransformer, CrossEncoder, util
+import os
+import torch
+import pandas as pd
+from src.utils.path_utils import get_project_root
+class SemanticSimilarity:
+    def __init__(
+        self,
+        train_embeddings_file,
+        test_embeddings_file,
+        train_csv_path=None,
+        test_csv_path=None,
+        train_df=None,
+        test_df=None,
+    ):
+        # We use the Bi-Encoder to encode all passages
+        self.bi_encoder = SentenceTransformer("multi-qa-mpnet-base-dot-v1")
+        self.bi_encoder.max_seq_length = 512  # Truncate long passages to 256 tokens
+        self.cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
+        self.train_embeddings, self.train_ids = self._load_embeddings(
+            train_embeddings_file
+        )
+        self.test_embeddings, self.test_ids = self._load_embeddings(
+            test_embeddings_file
+        )
+        # Load corresponding CSV files for enriched evidence
+        self.train_csv = (
+            train_df if train_df is not None else pd.read_csv(train_csv_path)
+        )
+        self.test_csv = test_df if test_df is not None else pd.read_csv(test_csv_path)
+    def _load_embeddings(self, h5_file_path):
+        """
+        Load embeddings and IDs from the HDF5 file
+        """
+        with h5py.File(h5_file_path, "r") as h5_file:
+            embeddings = torch.tensor(h5_file["embeddings"][:], dtype=torch.float16)
+            ids = list(h5_file["ids"][:])  # Retrieve the IDs as a list of strings
+        return embeddings, ids
+    def search(self, query, top_k):
+        ##### Sematic Search #####
+        # Encode the query using the bi-encoder and find potentially relevant passages
+        question_embedding = self.bi_encoder.encode(query, convert_to_tensor=True)
+        question_embedding = question_embedding.to(dtype=torch.float16)
+        # question_embedding = question_embedding
+        hits_train = util.semantic_search(
+            question_embedding, self.train_embeddings, top_k=top_k * 5
+        )
+        hits_train = hits_train[0]  # Get the hits for the first query
+        # print(f"len(hits_train) = {len(hits_train)}")
+        hits_test = util.semantic_search(
+            question_embedding, self.test_embeddings, top_k=top_k * 5
+        )
+        hits_test = hits_test[0]
+        # print(f"len(hits_test): {len(hits_test)}")
+        ##### Re-Ranking #####
+        # Now, score all retrieved passages with the cross_encoder
+        cross_inp_train = [
+            [query, self.train_csv["evidence_enriched"][hit["corpus_id"]]]
+            for hit in hits_train
+        ]
+        cross_scores_train = self.cross_encoder.predict(cross_inp_train)
+        cross_inp_test = [
+            [query, self.test_csv["evidence_enriched"][hit["corpus_id"]]]
+            for hit in hits_test
+        ]
+        cross_scores_test = self.cross_encoder.predict(cross_inp_test)
+        # Sort results by the cross-encoder scores
+        for idx in range(len(cross_scores_train)):
+            hits_train[idx]["cross-score"] = cross_scores_train[idx]
+        for idx in range(len(cross_scores_test)):
+            hits_test[idx]["cross-score"] = cross_scores_test[idx]
+        hits_train_cross_encoder = sorted(
+            hits_train, key=lambda x: x.get("cross-score"), reverse=True
+        )
+        hits_train_cross_encoder = hits_train_cross_encoder[: top_k * 5]
+        hits_test_cross_encoder = sorted(
+            hits_test, key=lambda x: x.get("cross-score"), reverse=True
+        )
+        hits_test_cross_encoder = hits_test_cross_encoder[: top_k * 5]
+        results = [
+            (self.train_ids[hit["corpus_id"]].decode("utf-8"), hit.get("cross-score"))
+            for hit in hits_train_cross_encoder
+        ] + [
+            (self.test_ids[hit["corpus_id"]].decode("utf-8"), hit.get("cross-score"))
+            for hit in hits_test_cross_encoder
+        ]
+        ##### Filter out duplicates based on scores #####
+        unique_scores = set()
+        filtered_results = []
+        # print(results)
+        for id_, score in sorted(results, key=lambda x: x[1], reverse=True):
+            if score not in unique_scores:
+                unique_scores.add(score)
+                filtered_results.append((id_, score))
+            if (
+                len(filtered_results) == top_k
+            ):  # Stop when top_k unique scores are reached
+                break
+        return filtered_results
+class TextCorpus:
+    def __init__(self, data_dir, split):
+        self.bi_encoder = SentenceTransformer("multi-qa-mpnet-base-dot-v1")
+        self.split = split  # train evidences or test evidences
+        self.data_dir = data_dir  # .csv file for enriched train and test is contained.
+    def encode_corpus(self):
+        """
+        Encode the corpus (evidence_enriched column for both train and test) and store the embeddings.
+        """
+        file_path = os.path.join(self.data_dir, f"{self.split}_enriched.csv")
+        df = pd.read_csv(file_path)
+        # Extract the enriched evidence column and ids
+        evidence_enriched = df["evidence_enriched"].tolist()
+        ids = df["id"].tolist()  # Assuming the 'id' column is in the CSV
+        # Encode the evidence using the bi-encoder
+        embeddings = self.bi_encoder.encode(evidence_enriched, convert_to_tensor=True)
+        # Define HDF5 file path
+        h5_file_path = os.path.join(get_project_root(), f"{self.split}_embeddings.h5")
+        with h5py.File(h5_file_path, "w") as h5_file:
+            h5_file.create_dataset(
+                "embeddings", data=embeddings.numpy(), dtype="float16"
+            )
+            h5_file.create_dataset(
+                "ids",
+                data=[f"{self.split}_{id}" for id in ids],
+                dtype=h5py.string_dtype(),
+            )
+        print(f"Embeddings saved to {h5_file_path}")
+if __name__ == "__main__":
+    import time
+    start_time = time.time()
+    project_root = get_project_root()
+    data_dir = os.path.join(project_root, "data", "preprocessed")
+    # query = train_enriched['evidence_enriched'][0]
+    # train_embeddings = os.path.join(get_project_root(), 'train_evidence_embeddings.pkl')
+    # test_embeddings = os.path.join(get_project_root(), 'test_evidence_embeddings.pkl')
+    # semantic = SemanticSimilarity(train_embeddings, test_embeddings)
+    # semantic.search(query, top_k=10)
+    # evidence = TextCorpus(data_dir, 'train')
+    # Define file paths
+    train_csv_path = os.path.join(data_dir, "train_enriched.csv")
+    test_csv_path = os.path.join(data_dir, "test_enriched.csv")
+    train_embeddings_file = os.path.join(project_root, "train_embeddings.h5")
+    test_embeddings_file = os.path.join(project_root, "test_embeddings.h5")
+    # Initialize the SemanticSimilarity class
+    similarity = SemanticSimilarity(
+        train_embeddings_file=train_embeddings_file,
+        test_embeddings_file=test_embeddings_file,
+        train_csv_path=train_csv_path,
+        test_csv_path=test_csv_path,
+    )
+    # Load the first query from train_enriched.csv
+    train_df = pd.read_csv(train_csv_path)
+    first_query = train_df["claim_enriched"].iloc[2]  # Get the first query
+    # Define the number of top-k results to retrieve
+    top_k = 5
+    # Perform the semantic search
+    results = similarity.search(query=first_query, top_k=top_k)
+    finish_time = time.time() - start_time
+    # Display the results
+    print(results)
+    print(f"Finish time: {finish_time}")

src/experimental/__init__.py ADDED Viewed

File without changes

src/experimental/dataset_search.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

src/experimental/dataset_stats.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

src/experimental/image_captioning.ipynb ADDED Viewed

	@@ -0,0 +1,96 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "id": "initial_id",
+   "metadata": {
+    "collapsed": true,
+    "ExecuteTime": {
+     "end_time": "2024-12-14T14:40:23.089485Z",
+     "start_time": "2024-12-14T14:40:22.937392Z"
+    }
+   },
+   "source": [
+    "import pandas as pd\n",
+    "from src.utils.path_utils import get_project_root\n",
+    "\n",
+    "PROJECT_ROOT = get_project_root()"
+   ],
+   "outputs": [],
+   "execution_count": 1
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-12-14T14:46:49.718444Z",
+     "start_time": "2024-12-14T14:46:46.361765Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "import requests\n",
+    "from PIL import Image\n",
+    "from transformers import BlipProcessor, BlipForConditionalGeneration\n",
+    "\n",
+    "processor = BlipProcessor.from_pretrained(\"Salesforce/blip-image-captioning-large\")\n",
+    "model = BlipForConditionalGeneration.from_pretrained(\"Salesforce/blip-image-captioning-large\")\n",
+    "\n",
+    "image = Image.open(f\"{PROJECT_ROOT}/data/scenery_image.jpg\")\n",
+    "\n",
+    "# conditional image captioning\n",
+    "text = \"a photography of\"\n",
+    "inputs = processor(image, text, return_tensors=\"pt\")\n",
+    "\n",
+    "out = model.generate(**inputs)\n",
+    "print(processor.decode(out[0], skip_special_tokens=True))\n",
+    "\n",
+    "# unconditional image captioning\n",
+    "inputs = processor(image, return_tensors=\"pt\")\n",
+    "\n",
+    "out = model.generate(**inputs)\n",
+    "print(processor.decode(out[0], skip_special_tokens=True))\n"
+   ],
+   "id": "80b41a616dbbafd3",
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "a photography of a road leading to mountains with a sunset in the background\n",
+      "arafed road with mountains in the background and a sunset\n"
+     ]
+    }
+   ],
+   "execution_count": 8
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null,
+   "source": "",
+   "id": "983b19a8aa6e4a39"
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

src/model/__init__.py ADDED Viewed

File without changes

src/model/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (197 Bytes). View file

src/model/__pycache__/layers.cpython-311.pyc ADDED Viewed

Binary file (4.08 kB). View file

src/model/__pycache__/model.cpython-311.pyc ADDED Viewed

Binary file (19.7 kB). View file

src/model/dataset.py ADDED Viewed

	@@ -0,0 +1,164 @@

+import os
+import h5py
+import pandas as pd
+import torch
+from torch.utils.data import Dataset, DataLoader
+from PIL import Image
+from torchvision import transforms
+import logging
+import numpy as np
+logger = logging.getLogger(__name__)
+# Define preprocessing transformations: resize, center-crop to 256x256,
+# convert to a tensor and normalize per channel.
+# NOTE(review): the mean is 0.5 per channel while the std values are the usual
+# ImageNet ones (0.229, 0.224, 0.225); the matching ImageNet mean would be
+# (0.485, 0.456, 0.406). Confirm the mix is intended before changing it --
+# H5 files already written with this transform depend on it.
+preprocess = transforms.Compose([
+    transforms.Resize(256),
+    transforms.CenterCrop(256),
+    transforms.ToTensor(),
+    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.229, 0.224, 0.225]),
+])
+# Updated category mapping for multi-label classification
+# Each category maps to (text-text, text-image, image-text, image-image) labels
+# 0: Support, 1: NEI (Not Enough Information), 2: Refute
+category_to_labels = {
+    'Support_Text': [0, 1, 1, 1],        # Support only for text-text, NEI for the other paths
+    'Support_Multimodal': [0, 0, 0, 0],  # Support for all paths
+    'Insufficient_Text': [1, 1, 1, 1],   # NEI for all paths
+    # As written: NEI for the first three paths, Support (0) for image-image.
+    # NOTE(review): the previous comment said "Support for cross-modal paths,
+    # NEI for others", which does not match this vector -- confirm which is intended.
+    'Insufficient_Multimodal': [1, 1, 1, 0],
+    'Refute': [2, 2, 2, 2]              # Refute for all paths
+}
+def prepare_h5_dataset(csv_path, h5_path):
+    """
+    Prepare h5 dataset from CSV file where each index contains complete sample data
+    """
+    # Create output directory if it doesn't exist
+    os.makedirs(os.path.dirname(h5_path), exist_ok=True)
+    # Read CSV file
+    df = pd.read_csv(csv_path, index_col=0)[['claim', 'claim_image', 'evidence', 'evidence_image', 'category']]
+    with h5py.File(h5_path, 'w') as f:
+        # Process each row
+        for idx, (_, row) in enumerate(df.iterrows()):
+            # Create group for this sample
+            sample_group = f.create_group(str(idx))
+            # Store text data
+            sample_group.create_dataset('claim', data=row['claim'])
+            sample_group.create_dataset('document', data=row['evidence'])
+            # Process and store images
+            try:
+                claim_img = Image.open(row['claim_image']).convert('RGB')
+                claim_img_tensor = preprocess(claim_img).numpy()
+            except Exception as e:
+                logger.warning(f"Error processing claim image for idx {idx}: {e}")
+                claim_img_tensor = np.zeros((3, 256, 256), dtype='float32')
+            sample_group.create_dataset('claim_image', data=claim_img_tensor)
+            try:
+                doc_img = Image.open(row['evidence_image']).convert('RGB')
+                doc_img_tensor = preprocess(doc_img).numpy()
+            except Exception as e:
+                logger.warning(f"Error processing evidence image for idx {idx}: {e}")
+                doc_img_tensor = np.zeros((3, 256, 256), dtype='float32')
+            sample_group.create_dataset('document_image', data=doc_img_tensor)
+            # Store multi-path labels
+            labels = category_to_labels.get(row['category'], [1, 1, 1, 1])  # Default to NEI if category not found
+            sample_group.create_dataset('labels', data=np.array(labels, dtype=np.int64))
+    logger.info(f"Created H5 dataset at {h5_path}")
+class MisinformationDataset(Dataset):
+    def __init__(self, csv_path, pre_embed=False):
+        self.csv_path = csv_path
+        self.pre_embed = pre_embed
+        # Derive h5 path from csv path
+        base_path = os.path.splitext(csv_path)[0]
+        self.h5_path = base_path + '_embeddings.h5' if pre_embed else base_path + '.h5'
+        if not os.path.exists(self.h5_path):
+            if pre_embed:
+                raise FileNotFoundError(f"Pre-computed embeddings not found at {self.h5_path}. "
+                                      f"Please run preprocess_embeddings.py first.")
+            logger.info(f"H5 file not found at {self.h5_path}. Creating new H5 dataset...")
+            prepare_h5_dataset(self.csv_path, self.h5_path)
+        self.h5_file = h5py.File(self.h5_path, 'r')
+        self.length = len(self.h5_file.keys())
+    def __len__(self):
+        return self.length
+    def __getitem__(self, idx):
+        sample = self.h5_file[str(idx)]
+        if self.pre_embed:
+            return {
+                'id': str(idx),
+                'claim_text_embeds': torch.from_numpy(sample['claim_text_embeds'][()]),
+                'doc_text_embeds': torch.from_numpy(sample['doc_text_embeds'][()]),
+                'claim_image_embeds': torch.from_numpy(sample['claim_image_embeds'][()]),
+                'doc_image_embeds': torch.from_numpy(sample['doc_image_embeds'][()]),
+                'labels': torch.from_numpy(sample['labels'][()])
+            }
+        else:
+            return {
+                'id': str(idx),
+                'claim': sample['claim'][()].decode(),
+                'claim_image': torch.from_numpy(sample['claim_image'][()]),
+                'document': sample['document'][()].decode(),
+                'document_image': torch.from_numpy(sample['document_image'][()]),
+                'labels': torch.from_numpy(sample['labels'][()])
+            }
+    def __del__(self):
+        if hasattr(self, 'h5_file'):
+            self.h5_file.close()
+def get_dataloader(csv_path, batch_size=32, num_workers=4, shuffle=False, pre_embed=False):
+    dataset = MisinformationDataset(csv_path, pre_embed=pre_embed)
+    dataloader = DataLoader(
+        dataset,
+        batch_size=batch_size,
+        shuffle=shuffle,
+        num_workers=num_workers,
+        pin_memory=True
+    )
+    return dataloader
+if __name__ == "__main__":
+    # Set up logging
+    logging.basicConfig(level=logging.INFO)
+    # Create dataloaders
+    train_loader = get_dataloader('data/preprocessed/train.csv', shuffle=True)
+    #test_loader = get_dataloader('data/preprocessed/test.csv', shuffle=False)
+    # Test dataloaders
+    for batch in train_loader:
+        print("Train batch:")
+        print(f"Batch size: {len(batch['id'])}")
+        print(f"Claim shape: {batch['claim_image'].shape}")
+        print(f"Document image shape: {batch['document_image'].shape}")
+        print(f"Labels shape: {batch['labels'].shape}")  # Should be (batch_size, 4)
+        print(f"Sample labels: {batch['labels'][0]}")  # Show labels for first item
+        break
+    #for batch in test_loader:
+    #    print("\nTest batch:")
+    #    print(f"Batch size: {len(batch['id'])}")
+    #    print(f"Claim shape: {batch['claim_image'].shape}")
+    #    print(f"Document image shape: {batch['document_image'].shape}")
+    #    print(f"Labels shape: {batch['labels'].shape}")
+    #    print(f"Sample labels: {batch['labels'][0]}")
+    #    break

src/model/layers.py ADDED Viewed

	@@ -0,0 +1,58 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+class MLP(nn.Module):
+    """
+    MLP block with GELU activation and dropout.
+    """
+    def __init__(self, embed_dim, mlp_ratio=4.0, dropout=0.1):
+        super().__init__()
+        hidden_dim = int(embed_dim * mlp_ratio)
+        self.net = nn.Sequential(
+            nn.Linear(embed_dim, hidden_dim),
+            nn.GELU(),
+            nn.Dropout(dropout),
+            nn.Linear(hidden_dim, embed_dim),
+            nn.Dropout(dropout)
+        )
+    def forward(self, x):
+        return self.net(x)
+class MultiHeadAttention(nn.Module):
+    """
+    Multi-head attention module with optional fused attention support.
+    """
+    def __init__(self, embed_dim, num_heads, dropout=0.1, fused_attn=False):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.fused_attn = fused_attn
+        self.attn_dropout = nn.Dropout(dropout)
+    def forward(self, Q, K, V, out_proj):
+        B, T, D = Q.shape
+        head_dim = D // self.num_heads
+        Q_ = Q.view(B, T, self.num_heads, head_dim).transpose(1, 2)  # (B, num_heads, T, head_dim)
+        K_ = K.view(B, -1, self.num_heads, head_dim).transpose(1, 2)
+        V_ = V.view(B, -1, self.num_heads, head_dim).transpose(1, 2)
+        if self.fused_attn:
+            context = F.scaled_dot_product_attention(
+                Q_, K_, V_,
+                dropout_p=self.dropout if self.training else 0.0,
+                is_causal=False
+            )
+        else:
+            scores = torch.matmul(Q_, K_.transpose(-1, -2)) / (head_dim ** 0.5)
+            attn_weights = F.softmax(scores, dim=-1)
+            attn_weights = self.attn_dropout(attn_weights)
+            context = torch.matmul(attn_weights, V_)  # (B, num_heads, T, head_dim)
+        context = context.transpose(1, 2).contiguous().view(B, T, D)
+        out = out_proj(context)
+        return out

src/model/model.py ADDED Viewed

	@@ -0,0 +1,432 @@

+import torch
+import torch.nn as nn
+from .layers import MLP, MultiHeadAttention
+class MultiViewClaimRepresentation(nn.Module):
+    """
+    Multi-view claim representation module with transformer-like architecture
+    for self-attention and cross-attention in text and image modalities.
+    """
+    def __init__(self, text_input_dim=384, image_input_dim=1024, embed_dim=512, num_heads=8, dropout=0.1, mlp_ratio=4.0, fused_attn=False):
+        super().__init__()
+        self.text_input_dim = text_input_dim
+        self.image_input_dim = image_input_dim
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.text_proj = nn.Linear(text_input_dim, embed_dim)
+        self.image_proj = nn.Linear(image_input_dim, embed_dim)
+        # Text projections for attention
+        self.text_WQ = nn.Linear(embed_dim, embed_dim)
+        self.text_WK = nn.Linear(embed_dim, embed_dim)
+        self.text_WV = nn.Linear(embed_dim, embed_dim)
+        # Image projections for attention
+        self.image_WQ = nn.Linear(embed_dim, embed_dim)
+        self.image_WK = nn.Linear(embed_dim, embed_dim)
+        self.image_WV = nn.Linear(embed_dim, embed_dim)
+        # Output projections
+        self.text_self_attn_out = nn.Linear(embed_dim, embed_dim)
+        self.image_self_attn_out = nn.Linear(embed_dim, embed_dim)
+        self.text_cross_attn_out = nn.Linear(embed_dim, embed_dim)
+        self.image_cross_attn_out = nn.Linear(embed_dim, embed_dim)
+        # Layer norms
+        self.text_self_ln1 = nn.LayerNorm(embed_dim)
+        self.text_self_ln2 = nn.LayerNorm(embed_dim)
+        self.image_self_ln1 = nn.LayerNorm(embed_dim)
+        self.image_self_ln2 = nn.LayerNorm(embed_dim)
+        self.text_cross_ln1 = nn.LayerNorm(embed_dim)
+        self.text_cross_ln2 = nn.LayerNorm(embed_dim)
+        self.image_cross_ln1 = nn.LayerNorm(embed_dim)
+        self.image_cross_ln2 = nn.LayerNorm(embed_dim)
+        # MLPs
+        self.text_mlp = MLP(embed_dim, mlp_ratio, dropout)
+        self.image_mlp = MLP(embed_dim, mlp_ratio, dropout)
+        # Multi-head attention
+        self.attention = MultiHeadAttention(embed_dim, num_heads, dropout, fused_attn)
+        self.proj_dropout = nn.Dropout(dropout)
+    def forward(self, X_t=None, X_i=None):
+        """
+        Args:
+            X_t (Tensor): Text embeddings of shape (B, L_t, D)
+            X_i (Tensor): Image embeddings of shape (B, L_i, D)
+        Returns:
+            (H_t_fused, H_i_fused):
+                H_t_fused: Text representations with self- and co-attention
+                H_i_fused: Image representations with self- and co-attention
+        """
+        # Project inputs to embedding dimension first
+        if X_t is not None:
+            X_t = self.text_proj(X_t)
+        if X_i is not None:
+            X_i = self.image_proj(X_i)
+        # Pre-compute Q,K,V for both modalities if present
+        text_Q = self.text_WQ(X_t) if X_t is not None else None
+        text_K = self.text_WK(X_t) if X_t is not None else None
+        text_V = self.text_WV(X_t) if X_t is not None else None
+        image_Q = self.image_WQ(X_i) if X_i is not None else None
+        image_K = self.image_WK(X_i) if X_i is not None else None
+        image_V = self.image_WV(X_i) if X_i is not None else None
+        # Unimodal text case
+        if X_t is not None and X_i is None:
+            # Self attention without MLP
+            H_t = X_t + self.attention(text_Q, text_K, text_V, self.text_self_attn_out)
+            H_t = self.text_self_ln1(H_t)
+            # Apply MLP after self attention
+            H_t = H_t + self.text_mlp(H_t)
+            H_t = self.text_self_ln2(H_t)
+            return H_t, None
+        # Unimodal image case
+        if X_i is not None and X_t is None:
+            # Self attention without MLP
+            H_i = X_i + self.attention(image_Q, image_K, image_V, self.image_self_attn_out)
+            H_i = self.image_self_ln1(H_i)
+            # Apply MLP after self attention
+            H_i = H_i + self.image_mlp(H_i)
+            H_i = self.image_self_ln2(H_i)
+            return None, H_i
+        # Multimodal case
+        # Text processing
+        H_t = X_t + self.attention(text_Q, text_K, text_V, self.text_self_attn_out)  # Self attention
+        H_t = self.text_self_ln1(H_t)
+        C_t = H_t + self.attention(H_t, text_K, text_V, self.text_cross_attn_out)  # Cross attention
+        C_t = self.text_cross_ln1(C_t)
+        # Apply MLP after combined attention
+        C_t = C_t + self.text_mlp(C_t)
+        C_t = self.text_cross_ln2(C_t)
+        # Image processing
+        H_i = X_i + self.attention(image_Q, image_K, image_V, self.image_self_attn_out)  # Self attention
+        H_i = self.image_self_ln1(H_i)
+        C_i = H_i + self.attention(H_i, image_K, image_V, self.image_cross_attn_out)  # Cross attention
+        C_i = self.image_cross_ln1(C_i)
+        # Apply MLP after combined attention
+        C_i = C_i + self.image_mlp(C_i)
+        C_i = self.image_cross_ln2(C_i)
+        return C_t, C_i
+class CrossAttentionEvidenceConditioning(nn.Module):
+    """
+    Cross-attention module to condition claim representations
+    on textual and visual evidence.
+    """
+    def __init__(self, text_input_dim=384, image_input_dim=1024, embed_dim=768, num_heads=8, dropout=0.1, mlp_ratio=4.0, fused_attn=False):
+        super().__init__()
+        self.num_heads = num_heads
+        self.embed_dim = embed_dim
+        self.dropout = dropout
+        self.fused_attn = fused_attn
+        # Query projections
+        self.text_WQ = nn.Linear(embed_dim, embed_dim)
+        self.image_WQ = nn.Linear(embed_dim, embed_dim)
+        # Text evidence projections
+        self.text_evidence_key = nn.Linear(text_input_dim, embed_dim)
+        self.text_evidence_value = nn.Linear(text_input_dim, embed_dim)
+        # Image evidence projections
+        self.image_evidence_key = nn.Linear(image_input_dim, embed_dim)
+        self.image_evidence_value = nn.Linear(image_input_dim, embed_dim)
+        # Separate output projections for each attention path
+        self.text_text_out = nn.Linear(embed_dim, embed_dim)
+        self.text_image_out = nn.Linear(embed_dim, embed_dim)
+        self.image_text_out = nn.Linear(embed_dim, embed_dim)
+        self.image_image_out = nn.Linear(embed_dim, embed_dim)
+        # Separate layer norms for each attention path
+        self.text_text_ln1 = nn.LayerNorm(embed_dim)
+        self.text_text_ln2 = nn.LayerNorm(embed_dim)
+        self.text_image_ln1 = nn.LayerNorm(embed_dim)
+        self.text_image_ln2 = nn.LayerNorm(embed_dim)
+        self.image_text_ln1 = nn.LayerNorm(embed_dim)
+        self.image_text_ln2 = nn.LayerNorm(embed_dim)
+        self.image_image_ln1 = nn.LayerNorm(embed_dim)
+        self.image_image_ln2 = nn.LayerNorm(embed_dim)
+        # MLPs
+        self.text_mlp = MLP(embed_dim, mlp_ratio, dropout)
+        self.image_mlp = MLP(embed_dim, mlp_ratio, dropout)
+        # Multi-head attention
+        self.attention = MultiHeadAttention(embed_dim, num_heads, dropout, fused_attn)
+        self.proj_dropout = nn.Dropout(dropout)
+    def forward(self, H_t=None, H_i=None, E_t=None, E_i=None):
+        """
+        Returns:
+            (S_t, S_i): Each contains a tuple of (text_evidence_output, image_evidence_output)
+        """
+        S_t_t, S_t_i = None, None
+        S_i_t, S_i_i = None, None
+        if H_t is not None:
+            # Text-to-text evidence attention
+            S_t_t = self.attention(
+                Q=self.text_WQ(H_t),
+                K=self.text_evidence_key(E_t),
+                V=self.text_evidence_value(E_t),
+                out_proj=self.text_text_out
+            )
+            S_t_t = H_t + S_t_t
+            S_t_t = self.text_text_ln1(S_t_t)
+            S_t_t = S_t_t + self.text_mlp(S_t_t)
+            S_t_t = self.text_text_ln2(S_t_t)
+            # Text-to-image evidence attention
+            S_t_i = self.attention(
+                Q=self.text_WQ(H_t),
+                K=self.image_evidence_key(E_i),
+                V=self.image_evidence_value(E_i),
+                out_proj=self.text_image_out
+            )
+            S_t_i = H_t + S_t_i
+            S_t_i = self.text_image_ln1(S_t_i)
+            S_t_i = S_t_i + self.text_mlp(S_t_i)
+            S_t_i = self.text_image_ln2(S_t_i)
+        if H_i is not None:
+            # Image-to-text evidence attention
+            S_i_t = self.attention(
+                Q=self.image_WQ(H_i),
+                K=self.text_evidence_key(E_t),
+                V=self.text_evidence_value(E_t),
+                out_proj=self.image_text_out
+            )
+            S_i_t = H_i + S_i_t
+            S_i_t = self.image_text_ln1(S_i_t)
+            S_i_t = S_i_t + self.image_mlp(S_i_t)
+            S_i_t = self.image_text_ln2(S_i_t)
+            # Image-to-image evidence attention
+            S_i_i = self.attention(
+                Q=self.image_WQ(H_i),
+                K=self.image_evidence_key(E_i),
+                V=self.image_evidence_value(E_i),
+                out_proj=self.image_image_out
+            )
+            S_i_i = H_i + S_i_i
+            S_i_i = self.image_image_ln1(S_i_i)
+            S_i_i = S_i_i + self.image_mlp(S_i_i)
+            S_i_i = self.image_image_ln2(S_i_i)
+        return (S_t_t, S_t_i), (S_i_t, S_i_i)
+class ClassificationModule(nn.Module):
+    """
+    Classification module that takes final text/image representations
+    and outputs logits for {support, refute, not enough info}
+    for each evidence path.
+    """
+    def __init__(self, embed_dim=768, hidden_dim=256, num_classes=3, dropout=0.1):
+        super().__init__()
+        # MLPs for text representations
+        self.mlp_text_given_text = nn.Sequential(
+            nn.Linear(embed_dim, hidden_dim),
+            nn.ReLU(),
+            nn.Dropout(dropout),
+            nn.Linear(hidden_dim, num_classes)
+        )
+        self.mlp_text_given_image = nn.Sequential(
+            nn.Linear(embed_dim, hidden_dim),
+            nn.ReLU(),
+            nn.Dropout(dropout),
+            nn.Linear(hidden_dim, num_classes)
+        )
+        # MLPs for image representations
+        self.mlp_image_given_text = nn.Sequential(
+            nn.Linear(embed_dim, hidden_dim),
+            nn.ReLU(),
+            nn.Dropout(dropout),
+            nn.Linear(hidden_dim, num_classes)
+        )
+        self.mlp_image_given_image = nn.Sequential(
+            nn.Linear(embed_dim, hidden_dim),
+            nn.ReLU(),
+            nn.Dropout(dropout),
+            nn.Linear(hidden_dim, num_classes)
+        )
+    def forward(self, S_t=None, S_i=None):
+        """
+        Args:
+            S_t: Tuple of (text_given_text, text_given_image) representations
+            S_i: Tuple of (image_given_text, image_given_image) representations
+        Returns:
+            y_t: Tuple of (text_given_text_logits, text_given_image_logits)
+            y_i: Tuple of (image_given_text_logits, image_given_image_logits)
+        """
+        y_t_t, y_t_i = None, None
+        y_i_t, y_i_i = None, None
+        if S_t is not None:
+            S_t_t, S_t_i = S_t
+            if S_t_t is not None:
+                pooled_t_t = S_t_t.mean(dim=1)
+                y_t_t = self.mlp_text_given_text(pooled_t_t)
+            if S_t_i is not None:
+                pooled_t_i = S_t_i.mean(dim=1)
+                y_t_i = self.mlp_text_given_image(pooled_t_i)
+        if S_i is not None:
+            S_i_t, S_i_i = S_i
+            if S_i_t is not None:
+                pooled_i_t = S_i_t.mean(dim=1)
+                y_i_t = self.mlp_image_given_text(pooled_i_t)
+            if S_i_i is not None:
+                pooled_i_i = S_i_i.mean(dim=1)
+                y_i_i = self.mlp_image_given_image(pooled_i_i)
+        return (y_t_t, y_t_i), (y_i_t, y_i_i)
+class MisinformationDetectionModel(nn.Module):
+    """
+    End-to-end model combining:
+    1) Multi-view claim representation
+    2) Cross-attention evidence conditioning
+    3) Classification for each evidence path
+    """
+    def __init__(self,
+                 text_input_dim=384,   # DeBERTa-v3-xsmall hidden size
+                 image_input_dim=1024,  # Swinv2-base hidden size
+                 embed_dim=512,
+                 num_heads=8,
+                 dropout=0.1,
+                 hidden_dim=256,
+                 num_classes=3,
+                 mlp_ratio=4.0,
+                 fused_attn=False):
+        super().__init__()
+        self.representation = MultiViewClaimRepresentation(
+            text_input_dim=text_input_dim,
+            image_input_dim=image_input_dim,
+            embed_dim=embed_dim,
+            num_heads=num_heads,
+            dropout=dropout,
+            mlp_ratio=mlp_ratio,
+            fused_attn=fused_attn
+        )
+        self.cross_attn = CrossAttentionEvidenceConditioning(
+            text_input_dim=text_input_dim,
+            image_input_dim=image_input_dim,
+            embed_dim=embed_dim,
+            num_heads=num_heads,
+            dropout=dropout,
+            mlp_ratio=mlp_ratio,
+            fused_attn=fused_attn
+        )
+        self.classifier = ClassificationModule(
+            embed_dim=embed_dim,
+            hidden_dim=hidden_dim,
+            num_classes=num_classes,
+            dropout=dropout
+        )
+        # Initialize weights
+        self._initialize_weights()
+    def _initialize_weights(self):
+        for module in self.modules():
+            if isinstance(module, nn.Linear):
+                nn.init.xavier_uniform_(module.weight)
+                if module.bias is not None:
+                    nn.init.zeros_(module.bias)
+            elif isinstance(module, nn.LayerNorm):
+                nn.init.ones_(module.weight)
+                nn.init.zeros_(module.bias)
+    def forward(self, X_t=None, X_i=None, E_t=None, E_i=None):
+        """
+        Chain the representation, cross-attention and classification stages.
+        Every argument defaults to None; how a missing input is treated is
+        left to the three submodules.
+        Args:
+            X_t (Tensor): Text claim embeddings (B, L_t, D)
+            X_i (Tensor): Image claim embeddings (B, L_i, D)
+            E_t (Tensor): Text evidence embeddings (B, L_e_t, D)
+            E_i (Tensor): Image evidence embeddings (B, L_e_i, D)
+        Returns:
+            y_t: Tuple of (text_given_text_logits, text_given_image_logits)
+            y_i: Tuple of (image_given_text_logits, image_given_image_logits)
+            Each logit tensor has shape (B, num_classes)
+        """
+        # Stage 1: fused claim representations
+        claim_text, claim_image = self.representation(X_t, X_i)
+        # Stage 2: condition each claim view on each evidence modality
+        text_paths, image_paths = self.cross_attn(claim_text, claim_image, E_t, E_i)
+        S_t_t, S_t_i = text_paths
+        S_i_t, S_i_i = image_paths
+        # Stage 3: one prediction per (claim view, evidence modality) path
+        text_logits, image_logits = self.classifier(
+            S_t=(S_t_t, S_t_i),
+            S_i=(S_i_t, S_i_i)
+        )
+        y_t_t, y_t_i = text_logits
+        y_i_t, y_i_i = image_logits
+        return (y_t_t, y_t_i), (y_i_t, y_i_i)
+if __name__ == "__main__":
+    # Smoke test: push random embeddings through the model and print the outputs.
+    batch_size, embed_dim = 2, 768
+    seq_len_t, seq_len_i = 5, 7
+    evidence_len_t, evidence_len_i = 6, 8
+    # Random stand-ins for the claim and evidence embeddings
+    text_claim = torch.randn(batch_size, seq_len_t, embed_dim)
+    image_claim = torch.randn(batch_size, seq_len_i, embed_dim)
+    text_evidence = torch.randn(batch_size, evidence_len_t, embed_dim)
+    image_evidence = torch.randn(batch_size, evidence_len_i, embed_dim)
+    # Build model
+    model_kwargs = dict(
+        embed_dim=embed_dim,
+        num_heads=8,
+        dropout=0.1,
+        hidden_dim=256,
+        num_classes=3
+    )
+    model = MisinformationDetectionModel(**model_kwargs)
+    # Forward pass (multimodal): all four inputs supplied
+    text_outputs, image_outputs = model(
+        X_t=text_claim,
+        X_i=image_claim,
+        E_t=text_evidence,
+        E_i=image_evidence
+    )
+    y_t_t, y_t_i = text_outputs
+    y_i_t, y_i_i = image_outputs
+    print("Text-Text logits:", y_t_t.shape)      # [B, 3]
+    print("Text-Image logits:", y_t_i.shape)     # [B, 3]
+    print("Image-Text logits:", y_i_t.shape)     # [B, 3]
+    print("Image-Image logits:", y_i_i.shape)    # [B, 3]
+    # Forward pass (unimodal text): image claim and image evidence left as None
+    text_outputs, image_outputs = model(
+        X_t=text_claim,
+        E_t=text_evidence
+    )
+    y_t_t, y_t_i = text_outputs
+    y_i_t, y_i_i = image_outputs
+    print("\nUnimodal Text:")
+    print("Text-Text logits:", y_t_t.shape if y_t_t is not None else None)
+    # The remaining outputs are printed as-is (value or None).
+    print("Text-Image logits:", y_t_i)
+    print("Image-Text logits:", y_i_t)
+    print("Image-Image logits:", y_i_i)

src/preprocess/__init__.py ADDED Viewed

File without changes

src/preprocess/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (201 Bytes). View file

src/preprocess/__pycache__/caption.cpython-311.pyc ADDED Viewed

Binary file (5.88 kB). View file

src/preprocess/__pycache__/preprocess.cpython-311.pyc ADDED Viewed

Binary file (3.74 kB). View file

src/preprocess/caption.py ADDED Viewed

	@@ -0,0 +1,129 @@

+import os
+from typing import Tuple
+import pandas as pd
+from tqdm import tqdm
+from PIL import Image
+from transformers import BlipProcessor, BlipForConditionalGeneration
+from src.utils.path_utils import get_project_root
+# Initialize BLIP model and processor
+# NOTE: both objects are created at import time, so merely importing this
+# module loads the "Salesforce/blip-image-captioning-large" checkpoint.
+processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
+model = BlipForConditionalGeneration.from_pretrained(
+    "Salesforce/blip-image-captioning-large"
+)
+# Image paths passed to generate_caption are joined onto PROJECT_ROOT.
+PROJECT_ROOT = get_project_root()
+# RAW_DIR is defined for reference; nothing else in this module reads it.
+RAW_DIR = PROJECT_ROOT / "data/raw/factify"
+# Directory holding the <split>.csv inputs and the <split>_enriched.csv outputs.
+PROCESSED_DIR = PROJECT_ROOT / "data/preprocessed"
+BATCH_SIZE = 20  # Number of rows to process per batch (the output CSV is saved after each batch)
+def generate_caption(image_path: str) -> str:
+    """
+    Generate a BLIP caption for the image stored at ``image_path``.
+    Args:
+        image_path: Image location relative to ``PROJECT_ROOT``. Missing values
+            (None, NaN or an empty string) mean "no image" and are not opened.
+    Returns:
+        The decoded caption, or an empty string when there is no image or when
+        captioning fails (the error is printed instead of raised).
+    """
+    # Rows without an image carry None/NaN rather than a path: nothing to caption.
+    if not isinstance(image_path, (str, os.PathLike)) or not image_path:
+        return ""
+    try:
+        # The context manager closes the file handle once the RGB copy exists.
+        with Image.open(f"{PROJECT_ROOT}/{image_path}") as raw_image:
+            image = raw_image.convert("RGB")
+        inputs = processor(image, return_tensors="pt")
+        output = model.generate(**inputs)
+        return processor.decode(output[0], skip_special_tokens=True)
+    except Exception as e:
+        # Best effort: one unreadable image must not abort the whole run.
+        print(f"Error processing image {image_path}: {e}")
+        return ""
+def process_image_row(row: pd.Series) -> Tuple[str, str, str, str]:
+    """
+    Caption both images of a row and append each caption to its text.
+    Returns:
+        (claim caption, evidence caption, enriched claim, enriched evidence),
+        where an enriched text is "<text>. <caption>".
+    """
+    # Claim image is captioned first, then the evidence image.
+    caption_for_claim = generate_caption(row["claim_image"])
+    caption_for_evidence = generate_caption(row["evidence_image"])
+    enriched_claim = f"{row['claim']}. {caption_for_claim}"
+    enriched_evidence = f"{row['evidence']}. {caption_for_evidence}"
+    captions = (caption_for_claim, caption_for_evidence)
+    enriched_texts = (enriched_claim, enriched_evidence)
+    return captions + enriched_texts
+def get_last_processed_index(df: pd.DataFrame) -> int:
+    """
+    Return the position of the last row whose evidence_image_caption is not NA.
+    The result is a 0-based row position, independent of the frame's index
+    labels, so it can be used directly with ``range`` and ``iloc``.
+    Returns -1 if no processed rows are found.
+    """
+    # One vectorized pass over the column instead of a scalar lookup per row.
+    is_processed = df["evidence_image_caption"].notna().to_numpy()
+    processed_positions = is_processed.nonzero()[0]
+    if processed_positions.size == 0:
+        return -1
+    return int(processed_positions[-1])
+def _prepare_output_frame(input_df: pd.DataFrame, output_csv: str) -> pd.DataFrame:
+    """Load the checkpoint CSV when it matches the input, otherwise start a fresh output frame."""
+    caption_columns = [
+        "claim_image_caption",
+        "evidence_image_caption",
+        "claim_enriched",
+        "evidence_enriched",
+    ]
+    output_df = None
+    if os.path.exists(output_csv):
+        output_df = pd.read_csv(output_csv)
+        if len(output_df) != len(input_df):
+            print(
+                "Mismatch in input and output CSV lengths. Reinitializing output CSV..."
+            )
+            # A checkpoint of a different length cannot be aligned with the input rows.
+            output_df = None
+    if output_df is None:
+        output_df = input_df.copy()
+        for col in caption_columns:
+            output_df[col] = pd.NA
+    for col in caption_columns:
+        if col not in output_df.columns:
+            output_df[col] = pd.NA
+        # A column read back as all-NaN is float typed; captions are strings.
+        output_df[col] = output_df[col].astype(object)
+    return output_df
+def process_csv(input_csv: str, output_csv: str) -> None:
+    """
+    Caption every row of input_csv and write the enriched rows to output_csv.
+    Work is checkpointed: the output CSV is rewritten after each batch of
+    BATCH_SIZE rows, and a later call resumes after the last row that already
+    has an evidence_image_caption. An existing output CSV whose length differs
+    from the input is discarded and rebuilt from the input.
+    Args:
+        input_csv (str): Path of the preprocessed CSV to read.
+        output_csv (str): Path of the enriched CSV to create or resume.
+    """
+    # Load input DataFrame
+    input_df = pd.read_csv(input_csv)
+    # Initialize or load output DataFrame
+    output_df = _prepare_output_frame(input_df, output_csv)
+    # Find the last processed index
+    last_processed_idx = get_last_processed_index(output_df)
+    print(f"Resuming from index {last_processed_idx + 1}")
+    # Process remaining rows in batches
+    total_rows = len(input_df)
+    with tqdm(total=total_rows, initial=last_processed_idx + 1) as pbar:
+        for idx in range(last_processed_idx + 1, total_rows, BATCH_SIZE):
+            batch_end = min(idx + BATCH_SIZE, total_rows)
+            # Process each row in the batch
+            for row_idx in range(idx, batch_end):
+                # Skip if already processed, but keep the progress bar in step
+                if pd.notna(output_df.at[row_idx, "evidence_image_caption"]):
+                    pbar.update(1)
+                    continue
+                # Process the row
+                claim_cap, evidence_cap, claim_enr, evidence_enr = process_image_row(
+                    input_df.iloc[row_idx]
+                )
+                # Update the output DataFrame
+                output_df.loc[row_idx, "claim_image_caption"] = claim_cap
+                output_df.loc[row_idx, "evidence_image_caption"] = evidence_cap
+                output_df.loc[row_idx, "claim_enriched"] = claim_enr
+                output_df.loc[row_idx, "evidence_enriched"] = evidence_enr
+                pbar.update(1)
+            # Save after each batch
+            output_df.to_csv(output_csv, index=False)
+            print(f"Saved progress at index {batch_end}")
+if __name__ == "__main__":
+    # Enrich each split in turn; stop as soon as a split's input CSV is missing.
+    for name in ["train", "test"]:
+        input_csv, output_csv = (
+            f"{PROCESSED_DIR}/{name}.csv",
+            f"{PROCESSED_DIR}/{name}_enriched.csv",
+        )
+        if os.path.exists(input_csv):
+            process_csv(input_csv, output_csv)
+            print(f"Processing complete. Output saved to {output_csv}")
+        else:
+            raise FileNotFoundError(f"Input CSV file does not exist: {input_csv}")

src/preprocess/preprocess.py ADDED Viewed

	@@ -0,0 +1,82 @@

+import pandas as pd
+from src.utils.data_utils import HEADERS
+from src.utils.path_utils import get_project_root
+# Constants
+# All locations below are derived from the project root directory.
+PROJECT_ROOT = get_project_root()
+# Raw dataset; preprocess_csv reads "extracted/<split>.csv" from here.
+RAW_DIR = PROJECT_ROOT / "data/raw/factify"
+# Destination of the processed "<split>.csv" files.
+PROCESSED_DIR = PROJECT_ROOT / "data/preprocessed"
+# Images are looked up in one subfolder per split under this directory.
+IMAGES_DIR = RAW_DIR / "extracted/images"
+def ensure_directories():
+    """Create the output directory tree if it is not there yet."""
+    # Creates 'data/preprocessed' and any missing parents; a no-op when it already exists.
+    PROCESSED_DIR.mkdir(exist_ok=True, parents=True)
+def preprocess_csv(dataset: str):
+    """
+    Preprocess the given dataset CSV (train or test).
+    Reads the tab-separated raw CSV, keeps the original image columns under
+    "*_original" names, and fills "claim_image" / "evidence_image" with the
+    project-relative path of the local "<id>_claim.jpg" / "<id>_evidence.jpg"
+    file, or None when that file does not exist. The result is written to the
+    preprocessed directory. If the raw CSV is missing, a message is printed
+    and nothing is written.
+    Args:
+        dataset (str): The dataset name ('train' or 'test').
+    """
+    # Paths
+    ensure_directories()
+    csv_path = RAW_DIR / f"extracted/{dataset}.csv"
+    processed_csv_path = PROCESSED_DIR / f"{dataset}.csv"
+    images_folder = IMAGES_DIR / dataset
+    if not csv_path.exists():
+        print(f"Dataset CSV not found: {csv_path}")
+        return
+    # Load the CSV (the file's own header row is skipped in favour of HEADERS)
+    df = pd.read_csv(csv_path, names=HEADERS, header=None, sep="\t", skiprows=1)
+    def local_image_path(file_id, role):
+        """Return the project-relative path of '<file_id>_<role>.jpg' if it exists, else None."""
+        file_path = images_folder / f"{file_id}_{role}.jpg"
+        if file_path.exists():
+            # Use the relative path starting from "data/.."
+            return str(file_path.relative_to(PROJECT_ROOT))
+        return None
+    # Keep the original image values under new names before overwriting the columns.
+    df = df.rename(
+        columns={
+            "claim_image": "claim_image_original",
+            "evidence_image": "evidence_image_original",
+        }
+    )
+    # Only the id is needed to locate a file, so map over that column; unlike a
+    # row-wise apply this also yields a plain (empty) column for an empty CSV.
+    df["claim_image"] = df["id"].map(lambda file_id: local_image_path(file_id, "claim"))
+    df["evidence_image"] = df["id"].map(
+        lambda file_id: local_image_path(file_id, "evidence")
+    )
+    # Save the processed CSV
+    df.to_csv(processed_csv_path, index=False)
+    print(f"Processed {dataset}.csv saved to {processed_csv_path}")
+def main():
+    """Preprocess the train split, then the test split."""
+    splits = ("train", "test")
+    for split in splits:
+        preprocess_csv(split)
+if __name__ == "__main__":
+    main()

src/utils/__init__.py ADDED Viewed

File without changes

src/utils/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (197 Bytes). View file

src/utils/__pycache__/data_utils.cpython-311.pyc ADDED Viewed

Binary file (3.24 kB). View file

src/utils/__pycache__/path_utils.cpython-311.pyc ADDED Viewed

Binary file (538 Bytes). View file

src/utils/data_utils.py ADDED Viewed

	@@ -0,0 +1,73 @@

+import os
+import pandas as pd
+from PIL import Image
+from typing import Dict, Any
+from src.utils.path_utils import get_project_root
+# Constants
+PROJECT_ROOT = get_project_root()
+# Directory that get_preprocessed_data reads "<dataset>.csv" from.
+PREPROCESSED_DIR = PROJECT_ROOT / "data/preprocessed"
+# Column names, in order; src/preprocess/preprocess.py passes this list as
+# `names=` when reading the raw CSV.
+HEADERS = [
+    "id",
+    "claim",
+    "claim_image",
+    "evidence",
+    "evidence_image",
+    "category",
+    "claim_ocr",
+    "evidence_ocr",
+]
+def get_preprocessed_data(dataset: str = "train") -> pd.DataFrame:
+    """
+    Read "<dataset>.csv" from the preprocessed data directory.
+    Args:
+        dataset (str): Either 'train' or 'test'. Defaults to 'train'.
+    Returns:
+        pd.DataFrame: A DataFrame containing the preprocessed data.
+    Raises:
+        FileNotFoundError: If the CSV for the requested dataset does not exist.
+    """
+    csv_path = PREPROCESSED_DIR / f"{dataset}.csv"
+    if csv_path.exists():
+        return pd.read_csv(csv_path)
+    raise FileNotFoundError(f"Preprocessed dataset CSV not found: {csv_path}")
+def _load_rgb_image(image_path: Any, label: str) -> Any:
+    """Open image_path as an RGB PIL image, or return None when that is not possible."""
+    # Missing images come back from the CSV as NaN (a truthy float), which
+    # os.path.exists rejects with TypeError, so only path-like values go on.
+    if not isinstance(image_path, (str, bytes, os.PathLike)) or not image_path:
+        return None
+    if not os.path.exists(image_path):
+        return None
+    try:
+        # convert() returns an independent image, so the file can be closed right away.
+        with Image.open(image_path) as raw_image:
+            return raw_image.convert("RGB")
+    except Exception as e:
+        print(f"Failed to load {label} image from {image_path}: {e}")
+        return None
+def load_images_for_row(row: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Load the claim and evidence images for a given row of data.
+    Paths are used exactly as stored in the row, so relative paths are
+    resolved against the current working directory.
+    Args:
+        row (Dict[str, Any]): A dictionary representing a row of preprocessed data.
+    Returns:
+        Dict[str, Any]: A copy of the row in which "claim_image" and
+        "evidence_image" hold the loaded RGB image, or None when the path is
+        missing, does not exist, or the file cannot be read.
+    """
+    result = row.copy()  # Copy the original row to avoid modifying the input
+    for label in ("claim", "evidence"):
+        key = f"{label}_image"
+        result[key] = _load_rgb_image(row.get(key), label)
+    return result

src/utils/path_utils.py ADDED Viewed

	@@ -0,0 +1,6 @@

+from pathlib import Path
+def get_project_root() -> Path:
+    """Return the directory three levels above this file, which is used as the project root."""
+    root = Path(__file__)
+    # Step up three times: file -> its folder -> parent package -> project root.
+    for _ in range(3):
+        root = root.parent
+    return root