Spaces:

ucla-contextual
/

contextual_leaderboard

Runtime error

App Files Files Community

Rohan Wadhawan committed on Mar 19, 2024

Commit

e61d9ba

0 Parent(s):

ConTextual Leaderboard setup

Browse files

Files changed (8) hide show

.DS_Store +0 -0
.gitattributes +35 -0
.gitignore +1 -0
README.md +13 -0
app.py +282 -0
content.py +100 -0
requirements.txt +5 -0
scorer.py +50 -0

.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ .DS_Store

README.md ADDED Viewed

	@@ -0,0 +1,13 @@

+---
+title: Contextual Leaderboard
+emoji: 🐨
+colorFrom: purple
+colorTo: blue
+sdk: gradio
+sdk_version: 4.16.0
+app_file: app.py
+pinned: false
+license: mit
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,282 @@

+import os
+import json
+import csv
+import datetime
+from email.utils import parseaddr
+import gradio as gr
+import pandas as pd
+import numpy as np
+from datasets import load_dataset
+from apscheduler.schedulers.background import BackgroundScheduler
+from huggingface_hub import HfApi
+from scorer import instruction_scorer
+from content import format_error, format_warning, format_log, TITLE, INTRODUCTION_TEXT, SUBMISSION_TEXT, CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, model_hyperlink
# Access token read from the TOKEN environment variable (None when it is unset);
# it is passed to every Hub call below.
TOKEN = os.environ.get("TOKEN")

# Hub namespace that owns all repositories used by this app.
OWNER = "ucla-contextual"

# Repository ids, all under OWNER.
TEST_DATASET = f"{OWNER}/contextual_test"
VAL_DATASET = f"{OWNER}/contextual_val"
SUBMISSION_DATASET = f"{OWNER}/submissions_internal"
CONTACT_DATASET = f"{OWNER}/contact_info"
RESULTS_DATASET = f"{OWNER}/results"
LEADERBOARD_PATH = f"{OWNER}/leaderboard"

# Shared Hub client used for uploads and for restarting the Space.
api = HfApi()

# Prefix used in the file names of stored submissions.
YEAR_VERSION = "2024"
def read_json_file(filepath):
    """Load and return the JSON document stored at *filepath*.

    The file is decoded as UTF-8 (the encoding JSON is exchanged in) instead
    of the platform's locale encoding, so non-ASCII content parses the same
    way on every host.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if the content is not valid UTF-8 or not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    with open(filepath, encoding="utf-8") as infile:
        return json.load(infile)
def save_json_file(filepath, data_dict):
    """Serialise *data_dict* as JSON into *filepath*, replacing any existing file.

    The file is written as UTF-8 explicitly so the result does not depend on
    the platform's locale encoding. Returns ``None``.
    """
    with open(filepath, "w", encoding="utf-8") as outfile:
        json.dump(data_dict, outfile)
# Local directory that receives the scored submission files.
os.makedirs("scored", exist_ok=True)

# All datasets are re-downloaded at start-up. `verification_mode="no_checks"`
# is the supported replacement for the deprecated `ignore_verifications=True`.
test_data_files = {"test": "contextual_test.csv"}
test_dataset = load_dataset(
    TEST_DATASET,
    data_files=test_data_files,
    token=TOKEN,
    download_mode="force_redownload",
    verification_mode="no_checks",
)

val_data_files = {"val": "contextual_val.csv"}
val_dataset = load_dataset(
    VAL_DATASET,
    data_files=val_data_files,
    token=TOKEN,
    download_mode="force_redownload",
    verification_mode="no_checks",
)

results_data_files = {"test": "contextual_test_results.csv", "val": "contextual_val_results.csv"}
results = load_dataset(
    RESULTS_DATASET,
    data_files=results_data_files,
    token=TOKEN,
    download_mode="force_redownload",
    verification_mode="no_checks",
)

contacts_data_files = {"contacts": "contacts.csv"}
contact_infos = load_dataset(
    CONTACT_DATASET,
    data_files=contacts_data_files,
    token=TOKEN,
    download_mode="force_redownload",
    verification_mode="no_checks",
)
def get_dataframe_from_results(results, split):
    """Return the leaderboard table for *split*, best overall score first.

    The split is converted to a pandas DataFrame, its 'URL' column is
    removed, and the rows are ordered by the 'All' column in descending order.
    """
    table = results[split].to_pandas()
    table = table.drop(columns=['URL'])
    return table.sort_values(by=["All"], ascending=False)
# pandas views of the datasets loaded at start-up.
test_dataset_dataframe = test_dataset["test"].to_pandas()
val_dataset_dataframe = val_dataset["val"].to_pandas()
contacts_dataframe = contact_infos["contacts"].to_pandas()

# Leaderboard tables shown in the UI (URL column removed, sorted by "All").
val_results_dataframe = get_dataframe_from_results(results=results, split="val")
test_results_dataframe = get_dataframe_from_results(results=results, split="test")


def restart_space():
    """Restart the leaderboard Space through the Hub API."""
    api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)


# Display type per leaderboard column: three markdown columns, then nine numeric ones.
TYPES = ["markdown"] * 3 + ["number"] * 9
def add_new_eval(
    model: str,
    method: str,
    url: str,
    path_to_file: str,
    organisation: str,
    mail: str,
):
    """Validate, score and record one validation-set submission.

    Steps, in order:
      1. Check the form fields and the uploaded JSON file.
      2. Re-download the current validation results and reject the submission
         if the same (model, organisation) pair is already listed.
      3. Upload the raw file and the computed scores to SUBMISSION_DATASET.
      4. Append the scores to the validation results CSV in RESULTS_DATASET.
      5. Append the contact details to the CSV in CONTACT_DATASET.

    The results and contacts tables are re-downloaded at submission time
    rather than taken from the start-up snapshots, so entries added since the
    app started are not overwritten. The results table is used with its 'URL'
    column intact so that stored URLs survive the re-upload.

    Args:
        model: Model name; must equal the single top-level key of the file.
        method: Free-text method description.
        url: Link to information about the model.
        path_to_file: Uploaded file object; its ``.name`` is the local path.
        organisation: Submitting organisation.
        mail: Contact e-mail address.

    Returns:
        An HTML confirmation message built with ``format_log``.

    Raises:
        gr.Error: for any validation failure.
        KeyError: if the scorer meets a category it does not know.
    """
    print("printing all inputs:", model, method, url, path_to_file, organisation, mail)

    # --- form fields -------------------------------------------------------
    if not model:
        print("model none")
        raise gr.Error("Please provide a model name. Field empty!")
    if not method:
        print("method none")
        raise gr.Error("Please provide a method. Field empty!")
    if not organisation:
        print("org none")
        raise gr.Error("Please provide organisation information. Field empty!")

    # Very basic email parsing
    _, parsed_mail = parseaddr(mail or "")
    if "@" not in parsed_mail:
        print("email here")
        raise gr.Error("Please provide a valid email address.")

    # --- submission file ---------------------------------------------------
    if path_to_file is None:
        print("file missing here")
        raise gr.Error("Please attach a file.")
    file_path = path_to_file.name
    try:
        tmp_file_output = read_json_file(file_path)
    except (OSError, ValueError) as err:
        # ValueError covers malformed JSON and undecodable bytes.
        print("file unreadable here")
        raise gr.Error("Submission file format incorrect. Please refer to the format description!") from err
    if not isinstance(tmp_file_output, dict) or len(tmp_file_output) != 1:
        print("file format wrong here")
        raise gr.Error("Submission file format incorrect. Please refer to the format description!")
    tmp_output_key = next(iter(tmp_file_output))
    predictions = tmp_file_output[tmp_output_key]
    if not isinstance(predictions, dict):
        print("file format wrong here")
        raise gr.Error("Submission file format incorrect. Please refer to the format description!")
    if tmp_output_key != model:
        # The scorer looks the predictions up under the model name, so a
        # mismatch would otherwise fail only after the raw file was uploaded.
        print("file key mismatch here")
        raise gr.Error("The top-level key of the submission file must match the model name.")
    if len(predictions) != 100:
        print("file not 100 here")
        raise gr.Error("File must contain exactly 100 predictions.")
    if any(rating not in (0, 1) for rating in predictions.values()):
        print("file ratings wrong here")
        raise gr.Error("Each prediction must be the integer 0 or 1.")

    # --- duplicate check against the *current* results ----------------------
    load_kwargs = dict(token=TOKEN, download_mode="force_redownload", verification_mode="no_checks")
    latest_results = load_dataset(
        RESULTS_DATASET,
        data_files={"val": "contextual_val_results.csv"},
        **load_kwargs,
    )
    val_results = latest_results["val"].to_pandas()
    # Compare (model, organisation) pairs; checking the two columns
    # independently would reject unrelated submissions.
    existing_pairs = {
        (str(m).lower(), str(o).lower())
        for m, o in zip(val_results["Model"], val_results["Organisation"])
    }
    if (model.lower(), organisation.lower()) in existing_pairs:
        print("model org combo here")
        raise gr.Error("This model has been already submitted.")

    # --- save submitted file -------------------------------------------------
    time_atm = datetime.datetime.today()
    api.upload_file(
        repo_id=SUBMISSION_DATASET,
        path_or_fileobj=file_path,
        path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_raw_{time_atm}.json",
        repo_type="dataset",
        token=TOKEN
    )

    # --- compute score -------------------------------------------------------
    scores = instruction_scorer(val_dataset_dataframe, file_path, model)
    # The local file name is built from user text: keep only harmless
    # characters so it cannot escape the "scored" directory.
    safe_name = "".join(
        ch if ch.isalnum() or ch in "-_." else "_"
        for ch in f"{organisation}_{model}"
    )
    scored_path = f"scored/{safe_name}.json"
    save_json_file(scored_path, scores)

    # --- save scored file ----------------------------------------------------
    api.upload_file(
        repo_id=SUBMISSION_DATASET,
        path_or_fileobj=scored_path,
        path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_scored_{time_atm}.json",
        repo_type="dataset",
        token=TOKEN
    )

    # --- actual submission ---------------------------------------------------
    eval_entry = {
        "Model": model,
        "Method": method,
        "Organisation": organisation,
        "URL": url,
        "All": scores["average"],
        "Time": scores["time"],
        "Shopping": scores["shopping"],
        "Navigation": scores["navigation-transportation"],
        "Abstract": scores["abstract"],
        "Application Usage": scores["app"],
        "Web Usage": scores["web"],
        "Infographic": scores["infographics"],
        "Miscellaneous Natural Scenes": scores["misc"]
    }
    val_results = pd.concat([val_results, pd.DataFrame([eval_entry])], ignore_index=True)
    val_results.to_csv('contextual_val_results.csv', index=False)
    api.upload_file(
        repo_id=RESULTS_DATASET,
        path_or_fileobj="contextual_val_results.csv",
        path_in_repo="contextual_val_results.csv",
        repo_type="dataset",
        token=TOKEN
    )

    # --- contact details -----------------------------------------------------
    contact_info = {
        "Model": model,
        "URL": url,
        "Organisation": organisation,
        "Mail": mail,
    }
    latest_contacts = load_dataset(
        CONTACT_DATASET,
        data_files={"contacts": "contacts.csv"},
        **load_kwargs,
    )
    contacts = latest_contacts["contacts"].to_pandas()
    contacts = pd.concat([contacts, pd.DataFrame([contact_info])], ignore_index=True)
    contacts.to_csv('contacts.csv', index=False)
    api.upload_file(
        repo_id=CONTACT_DATASET,
        path_or_fileobj="contacts.csv",
        path_in_repo="contacts.csv",
        repo_type="dataset",
        token=TOKEN
    )
    return format_log(f"Model {model} submitted by {organisation} successfully! \nPlease refresh the val leaderboard, and wait a bit to see the score displayed")
def refresh():
    """Re-download the results dataset and rebuild both leaderboard tables.

    Returns:
        A ``(val_dataframe, test_dataframe)`` tuple, each produced by
        ``get_dataframe_from_results``.
    """
    results_data_files = {"test": "contextual_test_results.csv", "val": "contextual_val_results.csv"}
    # `verification_mode="no_checks"` is the supported replacement for the
    # deprecated `ignore_verifications=True`.
    results = load_dataset(
        RESULTS_DATASET,
        data_files=results_data_files,
        token=TOKEN,
        download_mode="force_redownload",
        verification_mode="no_checks",
    )
    val_results_dataframe = get_dataframe_from_results(results=results, split="val")
    test_results_dataframe = get_dataframe_from_results(results=results, split="test")
    return val_results_dataframe, test_results_dataframe
def upload_file(files):
    """Return the ``.name`` of every object in *files*.

    ``None`` (nothing selected) is treated like an empty list instead of
    raising ``TypeError``.
    """
    if files is None:
        return []
    return [file.name for file in files]
with gr.Blocks() as demo:
    gr.HTML(TITLE)

    # Collapsed informational sections.
    with gr.Row():
        with gr.Accordion("🧐 Introduction", open=False):
            gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
    with gr.Row():
        with gr.Accordion("🎯 Submission Guidelines", open=False):
            gr.Markdown(SUBMISSION_TEXT, elem_classes="markdown-text")
    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.TextArea(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id="citation-button",
            )

    # Read-only leaderboard tables, one tab per split.
    with gr.Tab("Results: Test"):
        leaderboard_table_test = gr.components.Dataframe(
            value=test_results_dataframe,
            datatype=TYPES,
            interactive=False,
            column_widths=["20%"],
        )
    with gr.Tab("Results: Val"):
        leaderboard_table_val = gr.components.Dataframe(
            value=val_results_dataframe,
            datatype=TYPES,
            interactive=False,
            column_widths=["20%"],
        )

    # refresh() returns (val, test), so the outputs are listed in that order.
    refresh_button = gr.Button("Refresh")
    refresh_button.click(
        fn=refresh,
        inputs=[],
        outputs=[leaderboard_table_val, leaderboard_table_test],
    )

    # Submission form; the inputs are passed to add_new_eval in its parameter order.
    with gr.Accordion("Submit a new model for evaluation"):
        with gr.Row():
            with gr.Column():
                model_name_textbox = gr.Textbox(label="Model name", type='text')
                method_textbox = gr.Textbox(label="Method (LMM or Aug LLM or any other)", type='text')
                url_textbox = gr.Textbox(label="URL to model information", type='text')
            with gr.Column():
                organisation = gr.Textbox(label="Organisation", type='text')
                mail = gr.Textbox(label="Contact email (will be stored privately, & used if there is an issue with your submission)", type='email')
                file_output = gr.File()
        submit_button = gr.Button("Submit Eval")
        submission_result = gr.Markdown()
        submission_inputs = [
            model_name_textbox,
            method_textbox,
            url_textbox,
            file_output,
            organisation,
            mail,
        ]
        submit_button.click(
            fn=add_new_eval,
            inputs=submission_inputs,
            outputs=submission_result,
        )

# Restart the Space once an hour (3600 seconds) from a background thread.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=3600)
scheduler.start()

demo.launch(debug=True)

content.py ADDED Viewed

	@@ -0,0 +1,100 @@

+TITLE = """<h1 align="center" id="space-title">ConTextual Leaderboard</h1>"""
+INTRODUCTION_TEXT = """
+Models are becoming quite good at understanding text on its own, but what about text in images, which gives important contextual information? For example, navigating a map, or understanding a meme? The ability to reason about the interactions between the text and visual context in images can power many real-world applications, such as AI assistants, or tools to assist the visually impaired. We refer to these tasks as context-sensitive text-rich visual reasoning tasks.
+At the moment, most evaluations of instruction-tuned large multimodal models (LMMs) focus on testing how well models can respond to human instructions posed as questions or imperative tasks over images… but not how well they understand context-sensitive text-rich scenes! That’s why we created ConTextual, a Context-sensitive Text-rich visuaL reasoning dataset for evaluating LMMs. We also released a leaderboard, so that the community can see for themselves which models are the best at this task. (See our [paper](https://arxiv.org/abs/2401.13311) for more details.)
+## Data
+ConTextual comprises **506 examples covering 8 real-world visual scenarios** - *Time Reading, Shopping, Navigation, Abstract Scenes, Mobile Application, Webpages, Infographics and Miscellaneous Natural Scenes*. Each sample consists of:
+- A text-rich image
+- A human-written instruction (question or imperative task)
+- A human-written reference response
+### Data Access
+ConTextual data can be found on HuggingFace and GitHub.
+- HuggingFace
+    - [Test](https://huggingface.co/datasets/ucla-contextual/contextual_test)
+    - [Val](https://huggingface.co/datasets/ucla-contextual/contextual_val)
+- Github
+    - [Test](https://github.com/rohan598/ConTextual/blob/main/data/contextual_test.csv)
+    - [Val](https://github.com/rohan598/ConTextual/blob/main/data/contextual_val.csv)
+### Data Format
+```
+{
+    "image_url": [string] url to the hosted image,
+    "instruction": [string] instruction text,
+    "response": [string] response text (only provided for samples in the val subset),
+    "category": visual scenario this example belongs to like 'time' and 'shopping' out of 8 possible scenarios in ConTextual
+}
+```
+"""
+SUBMISSION_TEXT = """
+## Submissions
+Only validation results can be submitted here. Scores are expressed as the percentage of correct answers for a given split.
+Submissions made by our team are labelled "ConTextual authors".
+### Validation Results
+To submit your validation results to the leaderboard, you can run our auto-evaluation code (Evaluation Pipeline with GPT4), following the instructions [here](https://github.com/rohan598/ConTextual?tab=readme-ov-file#-evaluation-pipeline-gpt-4).
+We expect submissions to be json format as shown below:
+```
+{"model_name": {"img_url": 1 or 0 as integer}}
+Replace model name with your model name (string)
+Replace img_url with img_url of the instance (string)
+Value for an img url is either 0 or 1 (int)
+There should be 100 predictions, corresponding to the 100 urls of the val set.
+```
+**Please do not utilize the public dev set as part of training data for your models.**
+### Test Results
+Once you are happy with your val results, you can send your model predictions to [rohan](mailto:rwadhawan7@g.ucla.edu) and [hritik](mailto:hbansal@g.ucla.edu).
+Please include in your email
+1) A name for your model.
+2) Organization (affiliation).
+3) (Optionally) GitHub repo or paper link.
+We expect submissions to be json format similar to val set as shown below:
+```
+{"model_name": {"img_url": "predicted response"}}
+Replace model name with your model name (string)
+Replace img_url with img_url of the instance (string)
+Value for an img url is the predicted response for that instance (string)
+There should be 506 predictions, corresponding to the 506 urls of the test set.
+```
+**Please revisit the test leaderboard within 1 to 2 days after sharing your prediction file to view your model scores and ranking on the leaderboard.**
+"""
+CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+CITATION_BUTTON_TEXT = r"""@misc{wadhawan2024contextual,
+      title={ConTextual: Evaluating Context-Sensitive Text-Rich Visual Reasoning in Large Multimodal Models},
+      author={Rohan Wadhawan and Hritik Bansal and Kai-Wei Chang and Nanyun Peng},
+      year={2024},
+      eprint={2401.13311},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV}
+}"""
import html


def _colored_paragraph(color, msg):
    """Return *msg*, HTML-escaped, inside a centred 20px paragraph of *color*."""
    # Messages can embed user-supplied text (model / organisation names), so
    # markup characters are escaped rather than interpolated verbatim.
    text = html.escape(f"{msg}", quote=False)
    return f"<p style='color: {color}; font-size: 20px; text-align: center;'>{text}</p>"


def format_error(msg):
    """Return *msg* as a red, centred HTML paragraph (text is HTML-escaped)."""
    return _colored_paragraph("red", msg)


def format_warning(msg):
    """Return *msg* as an orange, centred HTML paragraph (text is HTML-escaped)."""
    return _colored_paragraph("orange", msg)


def format_log(msg):
    """Return *msg* as a green, centred HTML paragraph (text is HTML-escaped)."""
    return _colored_paragraph("green", msg)


def model_hyperlink(link, model_name):
    """Return an HTML anchor showing *model_name* that opens *link* in a new tab.

    Both values are escaped: *link* for use inside a double-quoted attribute,
    *model_name* for use as element text.
    """
    href = html.escape(f"{link}", quote=True)
    label = html.escape(f"{model_name}", quote=False)
    return f'<a target="_blank" href="{href}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{label}</a>'

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+datasets==2.14.5
+gradio==4.19.2
+huggingface-hub==0.19.3
+numpy==1.24.2
+APScheduler==3.10.1

scorer.py ADDED Viewed

	@@ -0,0 +1,50 @@

+import json
+import re
+import string
+import warnings
+import pandas as pd
+import numpy as np
+import os
def instruction_scorer(data, judgment_file, model_name):
    """Compute per-category and overall accuracy percentages for one model.

    Args:
        data: DataFrame with an 'image_url' and a 'category' column.
        judgment_file: Path to a JSON file shaped like
            ``{model_name: {image_url: rating}}``; ratings are summed, so
            they are expected to be numbers (0 or 1).
        model_name: Key under which the model's ratings are stored.

    Returns:
        A dict with one percentage per category ('time', 'shopping',
        'navigation-transportation', 'abstract', 'app', 'web',
        'infographics', 'stvqa', 'estvqa'), plus:
          * 'misc': the pooled percentage over 'stvqa' and 'estvqa';
          * 'average': the mean of the seven non-VQA categories and 'misc'.
        A category with no rated sample scores 0.0 instead of raising
        ``ZeroDivisionError``. Ratings whose image URL is not in *data* are
        ignored.

    Raises:
        KeyError: if *model_name* is missing from the file, or a rated image
            belongs to a category not listed above.
    """
    category_of = dict(zip(data['image_url'], data['category']))

    with open(judgment_file, 'r', encoding='utf-8') as f:
        judgements = json.load(f)
    model_data = judgements[model_name]

    categories = (
        'time', 'shopping', 'navigation-transportation', 'abstract',
        'app', 'web', 'infographics', 'stvqa', 'estvqa',
    )
    # category -> [sum of ratings, number of rated samples]
    tallies = {name: [0, 0] for name in categories}
    for image_url, rating in model_data.items():
        if image_url not in category_of:
            continue
        tally = tallies[category_of[image_url]]
        tally[0] += rating
        tally[1] += 1

    def _percent(correct, seen):
        return 100 * correct / seen if seen else 0.0

    output_dict = {name: _percent(*tallies[name]) for name in categories}
    # Pool both VQA subsets; the denominator must count each subset once.
    output_dict["misc"] = _percent(
        tallies['stvqa'][0] + tallies['estvqa'][0],
        tallies['stvqa'][1] + tallies['estvqa'][1],
    )
    averaged = (
        "time", "shopping", "navigation-transportation", "abstract",
        "app", "web", "infographics", "misc",
    )
    output_dict["average"] = sum(output_dict[name] for name in averaged) / 8
    return output_dict