cao-lele committed
Commit 0724c4e · 1 Parent(s): 9619f3f

initial commit

.gitignore ADDED
@@ -0,0 +1,167 @@
+ #custom
+ mme_data/
+
+
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/#use-with-ide
+ .pdm.toml
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ # other
+ .DS_Store
Leaderboard.md ADDED
@@ -0,0 +1,163 @@
+ # 🔥🏅️ GenCeption Leaderboard 🏅️🔥
+
+ Evaluated MLLMs: [ChatGPT-4V](https://cdn.openai.com/papers/GPTV_System_Card.pdf), [mPLUG-Owl2](https://arxiv.org/pdf/2311.04257.pdf), [LLaVA-13B](https://arxiv.org/pdf/2304.08485.pdf), [LLaVA-7B](https://arxiv.org/pdf/2304.08485.pdf)
+
+ <table>
+ <tr><th>Existence</th><th>Count</th></tr>
+ <tr><td>
+
+ | Model | GC@3 |
+ |--|--|
+ | ChatGPT-4V | 0.422 |
+ | mPLUG-Owl2 | 0.323 |
+ | LLaVA-7B | 0.308 |
+ | LLaVA-13B | 0.305 |
+
+ </td><td>
+
+ | Model | GC@3 |
+ |--|--|
+ | ChatGPT-4V | 0.404 |
+ | mPLUG-Owl2 | 0.299 |
+ | LLaVA-13B | 0.294 |
+ | LLaVA-7B | 0.353 |
+
+ </td></tr> </table>
+
+
+ <table>
+ <tr><th>Position</th><th>Color</th></tr>
+ <tr><td>
+
+ | Model | GC@3 |
+ |--|--|
+ | ChatGPT-4V | 0.408 |
+ | mPLUG-Owl2 | 0.306 |
+ | LLaVA-7B | 0.285 |
+ | LLaVA-13B | 0.255 |
+
+ </td><td>
+
+ | Model | GC@3 |
+ |--|--|
+ | ChatGPT-4V | 0.403 |
+ | LLaVA-13B | 0.300 |
+ | mPLUG-Owl2 | 0.290 |
+ | LLaVA-7B | 0.284 |
+
+ </td></tr> </table>
+
+
+ <table>
+ <tr><th>Poster</th><th>Celebrity</th></tr>
+ <tr><td>
+
+ | Model | GC@3 |
+ |--|--|
+ | ChatGPT-4V | 0.324 |
+ | mPLUG-Owl2 | 0.243 |
+ | LLaVA-13B | 0.215 |
+ | LLaVA-7B | 0.214 |
+
+ </td><td>
+
+ | Model | GC@3 |
+ |--|--|
+ | ChatGPT-4V | 0.332 |
+ | mPLUG-Owl2 | 0.232 |
+ | LLaVA-13B | 0.206 |
+ | LLaVA-7B | 0.188 |
+
+ </td></tr> </table>
+
+
+ <table>
+ <tr><th>Scene</th><th>Landmark</th></tr>
+ <tr><td>
+
+ | Model | GC@3 |
+ |--|--|
+ | ChatGPT-4V | 0.393 |
+ | mPLUG-Owl2 | 0.299 |
+ | LLaVA-13B | 0.277 |
+ | LLaVA-7B | 0.266 |
+
+ </td><td>
+
+ | Model | GC@3 |
+ |--|--|
+ | ChatGPT-4V | 0.353 |
+ | mPLUG-Owl2 | 0.275 |
+ | LLaVA-7B | 0.252 |
+ | LLaVA-13B | 0.242 |
+
+ </td></tr> </table>
+
+
+ <table>
+ <tr><th>Artwork</th><th>Commonsense Reasoning</th></tr>
+ <tr><td>
+
+ | Model | GC@3 |
+ |--|--|
+ | ChatGPT-4V | 0.421 |
+ | mPLUG-Owl2 | 0.252 |
+ | LLaVA-13B | 0.212 |
+ | LLaVA-7B | 0.210 |
+
+ </td><td>
+
+ | Model | GC@3 |
+ |--|--|
+ | ChatGPT-4V | 0.471 |
+ | mPLUG-Owl2 | 0.353 |
+ | LLaVA-13B | 0.334 |
+ | LLaVA-7B | 0.294 |
+
+ </td></tr> </table>
+
+
+ <table>
+ <tr><th>Code Reasoning</th><th>Numerical Calculation</th></tr>
+ <tr><td>
+
+ | Model | GC@3 |
+ |--|--|
+ | ChatGPT-4V | 0.193 |
+ | mPLUG-Owl2 | 0.176 |
+ | LLaVA-13B | 0.144 |
+ | LLaVA-7B | 0.107 |
+
+ </td><td>
+
+ | Model | GC@3 |
+ |--|--|
+ | ChatGPT-4V | 0.240 |
+ | LLaVA-13B | 0.195 |
+ | mPLUG-Owl2 | 0.192 |
+ | LLaVA-7B | 0.155 |
+
+ </td></tr> </table>
+
+
+ <table>
+ <tr><th>Text Translation</th><th>OCR</th></tr>
+ <tr><td>
+
+ | Model | GC@3 |
+ |--|--|
+ | ChatGPT-4V | 0.157 |
+ | LLaVA-13B | 0.116 |
+ | LLaVA-7B | 0.111 |
+ | mPLUG-Owl2 | 0.081 |
+
+ </td><td>
+
+ | Model | GC@3 |
+ |--|--|
+ | ChatGPT-4V | 0.393 |
+ | mPLUG-Owl2 | 0.276 |
+ | LLaVA-13B | 0.239 |
+ | LLaVA-7B | 0.222 |
+
+ </td></tr> </table>
README.md ADDED
@@ -0,0 +1,55 @@
+ # GenCeption: Evaluate Multimodal LLMs with Unlabeled Unimodal Data
+
+ <div>
+ <p align="center">
+ <a href="https://github.com/EQTPartners/GenCeption/blob/main/Leaderboard.md">🔥🏅️Leaderboard🏅️🔥</a>&emsp;•&emsp;
+ <a href="#contribute">Contribute</a>&emsp;•&emsp;
+ <a href="https://arxiv.org/abs/2402.14973">Paper</a>&emsp;•&emsp;
+ <a href="#cite-this-work">Citation</a>
+ </p>
+
+ > GenCeption is an annotation-free MLLM (Multimodal Large Language Model) evaluation framework that requires only unimodal data. It assesses inter-modality semantic coherence and inversely reflects a model's inclination to hallucinate.
+
+ ![GenCeption Procedure](figures/genception-correlation.jpeg)
+
+ GenCeption is inspired by the popular multi-player game [DrawCeption](https://wikipedia.org/wiki/drawception). Using the image modality as an example, the process begins with a seed image $\mathbf{X}^{(0)}$ drawn from a unimodal image dataset for the first iteration ($t$=1). The MLLM writes a detailed description of the image, which an image generator then uses to produce $\mathbf{X}^{(t)}$. After $T$ iterations, we compute the GC@T score to measure the MLLM's performance on $\mathbf{X}^{(0)}$.
+
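+ Concretely, `genception/evaluation.py` computes GC@T as the similarity-weighted decay area of the sequence $s^{(1)},\dots,s^{(T)}$, where $s^{(t)}$ is the cosine similarity between the ViT embeddings of $\mathbf{X}^{(t)}$ and the seed image $\mathbf{X}^{(0)}$:
+
+ $$\mathrm{GC@T} = \frac{\sum_{t=1}^{T} t \cdot s^{(t)}}{\sum_{t=1}^{T} t}$$
+
+ Later iterations carry larger weights, so a model whose generations stay semantically close to the seed for longer obtains a higher score.
+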
+ The GenCeption ranking on the [MME](https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/tree/Evaluation) benchmark dataset (without using any labels) shows a strong correlation with other sophisticated benchmarks such as [OpenCompass](https://rank.opencompass.org.cn/leaderboard-multimodal) and [HallusionBench](https://github.com/tianyi-lab/HallusionBench). Moreover, the negative correlation with MME scores suggests that GenCeption measures distinct aspects not covered by MME, even though it uses the same set of samples. For a detailed experimental analysis, please read [our paper](https://arxiv.org/abs/2402.14973).
+
+ We demonstrate below a 5-iteration GenCeption run on a seed image to evaluate 4 VLLMs. Each iteration $t$ shows the generated image $\mathbf{X}^{(t)}$, the description $\mathbf{Q}^{(t)}$ of the preceding image $\mathbf{X}^{(t-1)}$, and the similarity score $s^{(t)}$ relative to $\mathbf{X}^{(0)}$. The GC@5 metric for each VLLM is also presented. Hallucinated elements within descriptions $\mathbf{Q}^{(1)}$ and $\mathbf{Q}^{(2)}$, as compared to the seed image, are marked in <span style="color:red"><u>red underline</u></span>.
+
+ ![GenCeption Example](figures/existence-example.jpeg)
+
+
+ ## Contribute
+ Please **create a PR (Pull Request)** to contribute your results to the [🔥🏅️**Leaderboard**🏅️🔥](https://github.com/EQTPartners/GenCeption/blob/main/Leaderboard.md). Start by creating your virtual environment:
+
+ ```bash
+ conda create --name genception python=3.10 -y
+ conda activate genception
+ pip install -r requirements.txt
+ ```
+
+ For example, to evaluate the mPLUG-Owl2 model, first follow the instructions in the [official mPLUG-Owl2 repository](https://github.com/X-PLUG/mPLUG-Owl/tree/main/mPLUG-Owl2#usage). Then run GenCeption with:
+
+ ```bash
+ bash genception/example_script.sh  # uses the exemplary data in datasets/examples/
+ ```
+
+ This assumes that an `OPENAI_API_KEY` is set as an environment variable. The `model` argument to `experiment.py` in `example_script.sh` can be set to `llava7b`, `llava13b`, `mPLUG`, or `gpt4v`; please adapt it accordingly to evaluate your own MLLM.
+
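+ For instance, here is a minimal sketch that evaluates LLaVA-13B on the bundled example images and then computes its GC@3 (the output folder follows the `results_<model>` naming used by `experiment.py`):
+
+ ```bash
+ python genception/experiment.py --model llava13b --dataset datasets/examples --n_iter 5
+ python genception/evaluation.py --results_path datasets/examples/results_llava13b --t 3
+ ```
+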
+ The MME dataset, of which the image modality was used in our paper, can be obtained as [described here](https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/blob/Evaluation/README.md#our-mllm-works).
+
+ ## Cite This Work
+ ```bibtex
+ @article{cao2023genception,
+   author       = {Lele Cao and
+                   Valentin Buchner and
+                   Zineb Senane and
+                   Fangkai Yang},
+   title        = {{GenCeption}: Evaluate Multimodal LLMs with Unlabeled Unimodal Data},
+   year         = {2023},
+   journal      = {arXiv preprint arXiv:2402.14973},
+   primaryClass = {cs.AI,cs.CL,cs.LG}
+ }
+ ```
datasets/examples/000000061658.jpg ADDED
datasets/examples/000000338560.jpg ADDED
genception/evaluation.py ADDED
@@ -0,0 +1,101 @@
+ import os
+ import json
+ import pickle
+ import numpy as np
+ import argparse
+ from genception.utils import find_files
+
+
+ def read_all_pkl(folder_path: str) -> dict:
+     """
+     Read all the pickle files in the given folder path
+
+     Args:
+         folder_path: str: The path to the folder
+
+     Returns:
+         dict: The dictionary containing the file path as key and the pickle file content as value
+     """
+     result_dict = dict()
+     file_list = find_files(folder_path, {".pkl"})
+     for file_path in file_list:
+         with open(file_path, "rb") as file:
+             result_dict[file_path] = pickle.load(file)
+     return result_dict
+
+
+ def integrated_decay_area(scores: list[float]) -> float:
+     """
+     Calculate the Integrated Decay Area (IDA) for the given scores
+
+     Args:
+         scores: list[float]: The list of scores
+
+     Returns:
+         float: The IDA score
+     """
+     total_area = 0
+
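+     # the score from iteration t is weighted by t, i.e. IDA = sum_t(t * s_t) / sum_t(t)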
+     for i, score in enumerate(scores):
+         total_area += (i + 1) * score
+
+     max_possible_area = sum(range(1, len(scores) + 1))
+     ida = total_area / max_possible_area if max_possible_area else 0
+     return ida
+
+
+ def gc_score(folder_path: str, n_iter: int = None) -> tuple[float, list[float]]:
+     """
+     Calculate the GC@T score for the given folder path
+
+     Args:
+         folder_path: str: The path to the folder
+         n_iter: int: The number of iterations to consider for GC@T score
+
+     Returns:
+         tuple[float, list[float]]: The GC@T score and the list of GC scores for each file
+     """
+     test_data = read_all_pkl(folder_path)
+     all_gc_scores = []
+     for _, value in test_data.items():
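+         # index 0 is the seed image's similarity to itself (1.0); keep only the iteration scores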
+         sim_score = value["cosine_similarities"][1:]
+         if n_iter is None:
+             _gc = integrated_decay_area(sim_score)
+         else:
+             if len(value["cosine_similarities"]) >= n_iter:
+                 _gc = integrated_decay_area(sim_score[:n_iter])
+             else:
+                 continue
+         all_gc_scores.append(_gc)
+     return np.mean(all_gc_scores), all_gc_scores
+
+
+ def main():
+     parser = argparse.ArgumentParser()
+     parser.add_argument(
+         "--results_path",
+         type=str,
+         help="Path to the folder containing the pickle files",
+         required=True,
+     )
+     parser.add_argument(
+         "--t",
+         type=int,
+         help="Number of iterations to consider for GC@T score",
+         required=True,
+     )
+     args = parser.parse_args()
+
+     # calculate GC@T score and save in results directory
+     gc, all_gc_scores = gc_score(args.results_path, args.t)
+     result = {
+         "GC Score": gc,
+         "All GC Scores": all_gc_scores,
+     }
+     results_path = os.path.join(args.results_path, f"GC@{str(args.t)}.json")
+     with open(results_path, "w") as file:
+         json.dump(result, file)
+
+
+ if __name__ == "__main__":
+     main()
genception/example_script.sh ADDED
@@ -0,0 +1,8 @@
+ # run experiment with gpt4v on examples dataset
+ python genception/experiment.py --model gpt4v --dataset datasets/examples
+
+
+ # Calculate GC@T evaluation metric
+ python genception/evaluation.py --results_path datasets/examples/results_gpt4v --t 1
+ python genception/evaluation.py --results_path datasets/examples/results_gpt4v --t 3
+ python genception/evaluation.py --results_path datasets/examples/results_gpt4v --t 5
genception/experiment.py ADDED
@@ -0,0 +1,373 @@
+ import os
+ import torch
+ import base64
+ import pickle
+ import requests
+ import argparse
+ import nltk
+ from nltk.tokenize import word_tokenize
+ from functools import partial
+ from transformers import ViTImageProcessor, ViTModel
+ from transformers import AutoProcessor, LlavaForConditionalGeneration
+ from sklearn.metrics.pairwise import cosine_similarity
+ from PIL import Image
+ import logging
+ from tqdm import tqdm
+ from openai import OpenAI
+ from mplug_owl2.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
+ from mplug_owl2.conversation import conv_templates
+ from mplug_owl2.model.builder import load_pretrained_model
+ from mplug_owl2.mm_utils import (
+     process_images,
+     tokenizer_image_token,
+     get_model_name_from_path,
+     KeywordsStoppingCriteria,
+ )
+ from genception.utils import find_files
+
+ logging.basicConfig(level=logging.INFO)
+ client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+ api_key = client.api_key
+ nltk.download("punkt")
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ torch.backends.cudnn.enabled = False
+
+ # VIT model
+ vit_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
+ vit_model = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
+
+
+ def image_embedding(image_file: str) -> list[float]:
+     """
+     Generates an image embedding using a vit model
+
+     Args:
+         image_file: str: The path to the image file
+
+     Returns:
+         list[float]: The image embedding
+     """
+     image = Image.open(image_file).convert("RGB")
+     inputs = vit_processor(images=image, return_tensors="pt")
+     outputs = vit_model(**inputs)
+ return outputs.last_hidden_state.tolist()[0][0]
54
+
55
+
56
+ def save_image_from_url(url: str, filename: str):
57
+ """
58
+ Save an image from a given URL to a file
59
+
60
+ Args:
61
+ url: str: The URL of the image
62
+ filename: str: The name of the file to save the image to
63
+ """
64
+ response = requests.get(url)
65
+ if response.status_code == 200:
66
+ with open(filename, "wb") as file:
67
+ file.write(response.content)
68
+ else:
69
+ logging.warning(
70
+ f"Failed to download image. Status code: {response.status_code}"
71
+ )
72
+
73
+
74
+ def find_image_files(folder_path: str) -> list[str]:
75
+ image_extensions = {".jpg", ".png"}
76
+ return find_files(folder_path, image_extensions)
77
+
78
+
79
+ def count_words(text):
80
+ words = word_tokenize(text)
81
+ return len(words)
82
+
83
+
84
+ def encode_image_os(image_path: str):
85
+ image = Image.open(image_path).convert("RGB")
86
+ return image
87
+
88
+
89
+ def encode_image_gpt4v(image_path: str):
90
+ with open(image_path, "rb") as image_file:
91
+ return base64.b64encode(image_file.read()).decode("utf-8")
92
+
93
+
94
+ def generate_xt(
95
+ image_desc: str, output_folder: str, i: int, file_name: str, file_extension: str
96
+ ) -> str:
97
+ """
98
+ Generate an image based on a description using dall-e and save it to a file
99
+
100
+ Args:
101
+ image_desc: str: The description of the image
102
+ output_folder: str: The path to the folder to save the image to
103
+ i: int: The iteration number
104
+ file_name: str: The name of the file
105
+ file_extension: str: The extension of the file
106
+
107
+ Returns:
108
+ str: The path to the saved image file
109
+ """
110
+ response = client.images.generate(
111
+ model="dall-e-3",
112
+ prompt="Generate an image that fully and precisely reflects this description: {}".format(
113
+ image_desc
114
+ ),
115
+ size="1024x1024",
116
+ quality="standard",
117
+ n=1,
118
+ )
119
+ new_image_filename = os.path.join(
120
+ output_folder, f"{file_name}_{i}.{file_extension}"
121
+ )
122
+ save_image_from_url(response.data[0].url, new_image_filename)
123
+ return new_image_filename
124
+
125
+
126
+ def get_desc_mPLUG(image, image_processor, lmm_model, tokenizer, prompt):
127
+ """
128
+ Given an image, generate a description using the mPLUG model
129
+
130
+ Args:
131
+ image: Image: The image to describe
132
+ image_processor: callable: The image processor
133
+ lmm_model: The language model
134
+ tokenizer: The tokenizer
135
+ prompt: str: The prompt for the model
136
+
137
+ Returns:
138
+ str: The description of the image
139
+ """
140
+ conv = conv_templates["mplug_owl2"].copy()
141
+ max_edge = max(image.size)
142
+ image = image.resize((max_edge, max_edge))
143
+ image_tensor = process_images([image], image_processor)
144
+ image_tensor = image_tensor.to(lmm_model.device, dtype=torch.float16)
145
+
146
+ inp = DEFAULT_IMAGE_TOKEN + prompt
147
+ conv.append_message(conv.roles[0], inp)
148
+ conv.append_message(conv.roles[1], None)
149
+ prompt = conv.get_prompt()
150
+
151
+ input_ids = (
152
+ tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
153
+ .unsqueeze(0)
154
+ .to(lmm_model.device)
155
+ )
156
+ stop_str = conv.sep2
157
+ keywords = [stop_str]
158
+ stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
159
+ attention_mask = torch.ones_like(input_ids, dtype=torch.long)
160
+
161
+ temperature = 0.001
162
+ max_new_tokens = 512
163
+
164
+ with torch.inference_mode():
165
+ output_ids = lmm_model.generate(
166
+ input_ids,
167
+ images=image_tensor,
168
+ do_sample=True,
169
+ temperature=temperature,
170
+ max_new_tokens=max_new_tokens,
171
+ stopping_criteria=[stopping_criteria],
172
+ attention_mask=attention_mask,
173
+ )
174
+
175
+ image_desc = tokenizer.decode(
176
+ output_ids[0, input_ids.shape[1] :], skip_special_tokens=True
177
+ ).strip()
178
+ return image_desc
179
+
180
+
181
+ def get_desc_llava(image, lmm_processor, lmm_model, prompt):
182
+ """
183
+ Given an image, generate a description using the llava model
184
+
185
+ Args:
186
+ image: Image: The image to describe
187
+ lmm_processor: callable: The language model processor
188
+ lmm_model: The language model
189
+ prompt: str: The prompt for the model
190
+
191
+ Returns:
192
+ str: The description of the image
193
+ """
194
+ inputs = lmm_processor(text=prompt, images=image, return_tensors="pt").to(device)
195
+ outputs = lmm_model.generate(**inputs, max_new_tokens=512, do_sample=False)
196
+ answer = lmm_processor.batch_decode(outputs, skip_special_tokens=True)[0]
197
+ image_desc = answer.split("ASSISTANT:")[1].strip()
198
+ return image_desc
199
+
200
+
201
+ def get_desc_gpt4v(image, prompt):
202
+ """
203
+ Given an image, generate a description using the gpt-4-vision model
204
+
205
+ Args:
206
+ image: Image: The image to describe
207
+ prompt: str: The prompt for the model
208
+
209
+ Returns:
210
+ str: The description of the image
211
+ """
212
+ payload = {
213
+ "model": "gpt-4-vision-preview",
214
+ "messages": [
215
+ {
216
+ "role": "user",
217
+ "content": [
218
+ {
219
+ "type": "text",
220
+ "text": prompt,
221
+ },
222
+ {
223
+ "type": "image_url",
224
+ "image_url": {"url": f"data:image/jpeg;base64,{image}"},
225
+ },
226
+ ],
227
+ }
228
+ ],
229
+ "max_tokens": 512,
230
+ "temperature": 0,
231
+ }
232
+
233
+ headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
234
+
235
+ response = requests.post(
236
+ "https://api.openai.com/v1/chat/completions", headers=headers, json=payload
237
+ )
238
+ image_desc = response.json()["choices"][0]["message"]["content"]
239
+ return image_desc
240
+
241
+
242
+ def test_sample(
243
+ seed_image: str,
244
+ n_iteration: int,
245
+ output_folder: str,
246
+ get_desc_function: callable,
247
+ encode_image_function: callable,
248
+ ):
249
+ """
250
+ Iteratively generates T (n_iterations) descriptions and images based on the seed image
251
+
252
+ Args:
253
+ seed_image: str: The path to the seed image
254
+ n_iteration: int: The number of iterations to perform
255
+ output_folder: str: The path to the folder to save the results
256
+ get_desc_function: callable: The function to generate the description
257
+ encode_image_function: callable: The function to encode the image
258
+ """
259
+ list_of_desc = []
260
+ list_of_image = []
261
+ list_of_image_embedding = [image_embedding(seed_image)]
262
+ list_of_cos_sim = [1.0]
263
+
264
+ current_image_path = seed_image
265
+ current_image_name = os.path.basename(current_image_path)
266
+ file_name, file_extension = current_image_name.split(".")
267
+ logging.debug(f"Image: {current_image_path}")
268
+ pkl_file = os.path.join(output_folder, f"{file_name}_result.pkl")
269
+ if os.path.exists(pkl_file):
270
+ logging.info("Results already exist, skipping")
271
+ return None
272
+
273
+ for i in range(n_iteration):
274
+ # Encode the current image and get the description
275
+ image = encode_image_function(current_image_path)
276
+ image_desc = get_desc_function(image)
277
+ list_of_desc.append(image_desc)
278
+ logging.debug(image_desc)
279
+
280
+ # generate X^t, append image and embedding
281
+ new_image_filename = generate_xt(
282
+ image_desc, output_folder, i, file_name, file_extension
283
+ )
284
+ list_of_image.append(new_image_filename)
285
+ list_of_image_embedding.append(image_embedding(new_image_filename))
286
+
287
+ # Calculate Cosine Sim to original image
288
+ similarity = cosine_similarity(
289
+ [list_of_image_embedding[0]], [list_of_image_embedding[-1]]
290
+ )[0][0]
291
+ list_of_cos_sim.append(similarity)
292
+ logging.info(f"({count_words(image_desc)}, {round(similarity,2)})")
293
+
294
+ # Save checkpoint to avoid losing results
295
+ data_to_save = {
296
+ "descriptions": list_of_desc,
297
+ "images": list_of_image,
298
+ "image_embeddings": list_of_image_embedding,
299
+ "cosine_similarities": list_of_cos_sim,
300
+ }
301
+ with open(pkl_file, "wb") as file:
302
+ pickle.dump(data_to_save, file)
303
+
304
+ # Update current_image_path for the next iteration
305
+ current_image_path = new_image_filename
306
+
307
+ return None
308
+
309
+
310
+ def main():
311
+ parser = argparse.ArgumentParser()
312
+ parser.add_argument("--dataset", type=str, default="mme_data/color")
313
+ parser.add_argument("--model", type=str, default="llava7b")
314
+ parser.add_argument("--n_iter", type=int, default=5)
315
+ args = parser.parse_args()
316
+
317
+ logging.info(args)
318
+
319
+ prompt = "Please write a clear, precise, detailed, and concise description of all elements in the image. Focus on accurately depicting various aspects, including but not limited to the colors, shapes, positions, styles, texts and the relationships between different objects and subjects in the image. Your description should be thorough enough to guide a professional in recreating this image solely based on your textual representation. Remember, only include descriptive texts that directly pertain to the contents of the image. You must complete the description using less than 500 words."
320
+
321
+ if "llava" in args.model:
322
+ lmm_model = LlavaForConditionalGeneration.from_pretrained(
323
+ f"llava-hf/llava-1.5-{args.model[5:]}-hf", load_in_8bit=True
324
+ )
325
+ lmm_processor = AutoProcessor.from_pretrained(
326
+ f"llava-hf/llava-1.5-{args.model[5:]}-hf"
327
+ )
328
+ prompt = f"<image>\nUSER: {prompt}\nASSISTANT:"
329
+ get_desc_function = partial(get_desc_llava, lmm_processor, lmm_model, prompt)
330
+ encode_image_function = encode_image_os
331
+ elif args.model == "mPLUG":
332
+ model_path = "MAGAer13/mplug-owl2-llama2-7b"
333
+ model_name = get_model_name_from_path(model_path)
334
+ tokenizer, lmm_model, image_processor, _ = load_pretrained_model(
335
+ model_path,
336
+ None,
337
+ model_name,
338
+ load_8bit=False,
339
+ load_4bit=False,
340
+ device=device,
341
+ )
342
+ tokenizer.pad_token_id = tokenizer.eos_token_id
343
+ tokenizer.pad_token = tokenizer.eos_token
344
+ get_desc_function = partial(
345
+ get_desc_mPLUG, image_processor, lmm_model, tokenizer, prompt
346
+ )
347
+ encode_image_function = encode_image_os
348
+ elif args.model == "gpt4v":
349
+ get_desc_function = partial(get_desc_gpt4v, prompt=prompt)
350
+ encode_image_function = encode_image_gpt4v
351
+
352
+ output_folder = os.path.join(args.dataset, f"results_{args.model}")
353
+ os.makedirs(output_folder, exist_ok=True)
354
+
355
+ logging.debug("Loaded model. Entered main loop.")
356
+ for img_file in tqdm(find_image_files(args.dataset)):
357
+ try:
358
+ logging.info(img_file)
359
+ test_sample(
360
+ seed_image=img_file,
361
+ n_iteration=args.n_iter,
362
+ output_folder=output_folder,
363
+ get_desc_function=get_desc_function,
364
+ encode_image_function=encode_image_function,
365
+ )
366
+ except Exception as e:
367
+ logging.warning("caught error:")
368
+ logging.warning(e)
369
+ continue
370
+
371
+
372
+ if __name__ == "__main__":
373
+ main()
genception/utils.py ADDED
@@ -0,0 +1,25 @@
+ import os
+
+
+ def find_files(folder_path: str, file_extensions: set) -> list[str]:
+     """
+     Find all files with the given extensions in the given folder path
+
+     Args:
+         folder_path: str: The path to the folder
+         file_extensions: set: The file extensions to look for
+
+     Returns:
+         list[str]: The list of file paths
+     """
+     file_paths = []
+
+     for file in os.listdir(folder_path):
+         if (
+             os.path.isfile(os.path.join(folder_path, file))
+             and os.path.splitext(file)[1].lower() in file_extensions
+         ):
+             absolute_path = os.path.abspath(os.path.join(folder_path, file))
+             file_paths.append(absolute_path)
+
+     return file_paths
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ transformers>=4.37.1
+ pillow
+ requests
+ scikit-learn
+ nltk
+ openai
+ sentencepiece