Shiyu Zhao committed
Commit c8373c1
Parent: 1d08545

Update space

Files changed (1)
  1. app.py +498 -195
app.py CHANGED
@@ -1,204 +1,507 @@
1
  import gradio as gr
2
- from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
3
  import pandas as pd
4
- from apscheduler.schedulers.background import BackgroundScheduler
5
- from huggingface_hub import snapshot_download
6
-
7
- from src.about import (
8
- CITATION_BUTTON_LABEL,
9
- CITATION_BUTTON_TEXT,
10
- EVALUATION_QUEUE_TEXT,
11
- INTRODUCTION_TEXT,
12
- LLM_BENCHMARKS_TEXT,
13
- TITLE,
14
- )
15
- from src.display.css_html_js import custom_css
16
- from src.display.utils import (
17
- BENCHMARK_COLS,
18
- COLS,
19
- EVAL_COLS,
20
- EVAL_TYPES,
21
- AutoEvalColumn,
22
- ModelType,
23
- fields,
24
- WeightType,
25
- Precision
26
- )
27
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
28
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
29
- from src.submission.submit import add_new_eval
30
-
31
-
32
- def restart_space():
33
- API.restart_space(repo_id=REPO_ID)
34
-
35
- ### Space initialisation
36
- try:
37
- print(EVAL_REQUESTS_PATH)
38
- snapshot_download(
39
- repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
40
- )
41
- except Exception:
42
- restart_space()
43
- try:
44
- print(EVAL_RESULTS_PATH)
45
- snapshot_download(
46
- repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
47
- )
48
- except Exception:
49
- restart_space()
50
-
51
-
52
- LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
53
-
54
- (
55
- finished_eval_queue_df,
56
- running_eval_queue_df,
57
- pending_eval_queue_df,
58
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
59
-
60
- def init_leaderboard(dataframe):
61
- if dataframe is None or dataframe.empty:
62
- raise ValueError("Leaderboard DataFrame is empty or None.")
63
- return Leaderboard(
64
- value=dataframe,
65
- datatype=[c.type for c in fields(AutoEvalColumn)],
66
- select_columns=SelectColumns(
67
- default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
68
- cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
69
- label="Select Columns to Display:",
70
- ),
71
- search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
72
- hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
73
- filter_columns=[
74
- ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
75
- ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
76
- ColumnFilter(
77
- AutoEvalColumn.params.name,
78
- type="slider",
79
- min=0.01,
80
- max=150,
81
- label="Select the number of parameters (B)",
82
- ),
83
- ColumnFilter(
84
- AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
85
- ),
86
- ],
87
- bool_checkboxgroup_label="Hide models",
88
- interactive=False,
 
89
  )
90
 
91
 
92
- demo = gr.Blocks(css=custom_css)
93
- with demo:
94
- gr.HTML(TITLE)
95
- gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
96
-
97
- with gr.Tabs(elem_classes="tab-buttons") as tabs:
98
- with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
99
- leaderboard = init_leaderboard(LEADERBOARD_DF)
100
-
101
- with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
102
- gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
103
-
104
- with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
105
- with gr.Column():
106
- with gr.Row():
107
- gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
108
-
109
- with gr.Column():
110
- with gr.Accordion(
111
- f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
112
- open=False,
113
- ):
114
- with gr.Row():
115
- finished_eval_table = gr.components.Dataframe(
116
- value=finished_eval_queue_df,
117
- headers=EVAL_COLS,
118
- datatype=EVAL_TYPES,
119
- row_count=5,
120
- )
121
- with gr.Accordion(
122
- f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
123
- open=False,
124
- ):
125
- with gr.Row():
126
- running_eval_table = gr.components.Dataframe(
127
- value=running_eval_queue_df,
128
- headers=EVAL_COLS,
129
- datatype=EVAL_TYPES,
130
- row_count=5,
131
- )
132
-
133
- with gr.Accordion(
134
- f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
135
- open=False,
136
- ):
137
- with gr.Row():
138
- pending_eval_table = gr.components.Dataframe(
139
- value=pending_eval_queue_df,
140
- headers=EVAL_COLS,
141
- datatype=EVAL_TYPES,
142
- row_count=5,
143
- )
144
- with gr.Row():
145
- gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
146
-
147
- with gr.Row():
148
- with gr.Column():
149
- model_name_textbox = gr.Textbox(label="Model name")
150
- revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
151
- model_type = gr.Dropdown(
152
- choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
153
- label="Model type",
154
- multiselect=False,
155
- value=None,
156
- interactive=True,
157
- )
158
-
159
- with gr.Column():
160
- precision = gr.Dropdown(
161
- choices=[i.value.name for i in Precision if i != Precision.Unknown],
162
- label="Precision",
163
- multiselect=False,
164
- value="float16",
165
- interactive=True,
166
- )
167
- weight_type = gr.Dropdown(
168
- choices=[i.value.name for i in WeightType],
169
- label="Weights type",
170
- multiselect=False,
171
- value="Original",
172
- interactive=True,
173
- )
174
- base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
175
-
176
- submit_button = gr.Button("Submit Eval")
177
- submission_result = gr.Markdown()
178
- submit_button.click(
179
- add_new_eval,
180
- [
181
- model_name_textbox,
182
- base_model_name_textbox,
183
- revision_name_textbox,
184
- precision,
185
- weight_type,
186
- model_type,
187
- ],
188
- submission_result,
189
- )
190
 
191
  with gr.Row():
192
- with gr.Accordion("📙 Citation", open=False):
193
- citation_button = gr.Textbox(
194
- value=CITATION_BUTTON_TEXT,
195
- label=CITATION_BUTTON_LABEL,
196
- lines=20,
197
- elem_id="citation-button",
198
- show_copy_button=True,
199
  )
200
 
201
- scheduler = BackgroundScheduler()
202
- scheduler.add_job(restart_space, "interval", seconds=1800)
203
- scheduler.start()
204
- demo.queue(default_concurrency_limit=40).launch()
 
1
  import gradio as gr
2
  import pandas as pd
3
+ import numpy as np
4
+ import os
5
+ import re
6
+ from datetime import datetime
7
+ import json
8
+ import torch
9
+ from tqdm import tqdm
10
+ from concurrent.futures import ProcessPoolExecutor, as_completed
11
+
12
+ from stark_qa import load_qa
13
+ from stark_qa.evaluator import Evaluator
14
+
15
+
16
+ def process_single_instance(args):
17
+ idx, eval_csv, qa_dataset, evaluator, eval_metrics = args
18
+ query, query_id, answer_ids, meta_info = qa_dataset[idx]
19
+
20
+ try:
21
+ pred_rank = eval_csv[eval_csv['query_id'] == query_id]['pred_rank'].item()
22
+ except IndexError:
23
+ raise IndexError(f'Error when processing query_id={query_id}, please make sure the predicted results exist for this query.')
24
+ except Exception as e:
25
+ raise RuntimeError(f'Unexpected error occurred while fetching prediction rank for query_id={query_id}: {e}')
26
+
27
+ if isinstance(pred_rank, str):
28
+ try:
29
+ pred_rank = eval(pred_rank)
30
+ except SyntaxError as e:
31
+ raise ValueError(f'Failed to parse pred_rank as a list for query_id={query_id}: {e}')
32
+
33
+ if not isinstance(pred_rank, list):
34
+ raise TypeError(f'Error when processing query_id={query_id}, expected pred_rank to be a list but got {type(pred_rank)}.')
35
+
36
+ pred_dict = {pred_rank[i]: -i for i in range(min(100, len(pred_rank)))}
37
+ answer_ids = torch.LongTensor(answer_ids)
38
+ result = evaluator.evaluate(pred_dict, answer_ids, metrics=eval_metrics)
39
+
40
+ result["idx"], result["query_id"] = idx, query_id
41
+ return result
42
+
43
+
44
+ def compute_metrics(csv_path: str, dataset: str, split: str, num_workers: int = 4):
45
+ candidate_ids_dict = {
46
+ 'amazon': [i for i in range(957192)],
47
+ 'mag': [i for i in range(1172724, 1872968)],
48
+ 'prime': [i for i in range(129375)]
49
+ }
50
+ try:
51
+ eval_csv = pd.read_csv(csv_path)
52
+ if 'query_id' not in eval_csv.columns:
53
+ raise ValueError('No `query_id` column found in the submitted csv.')
54
+ if 'pred_rank' not in eval_csv.columns:
55
+ raise ValueError('No `pred_rank` column found in the submitted csv.')
56
+
57
+ eval_csv = eval_csv[['query_id', 'pred_rank']]
58
+
59
+ if dataset not in candidate_ids_dict:
60
+ raise ValueError(f"Invalid dataset '{dataset}', expected one of {list(candidate_ids_dict.keys())}.")
61
+ if split not in ['test', 'test-0.1', 'human_generated_eval']:
62
+ raise ValueError(f"Invalid split '{split}', expected one of ['test', 'test-0.1', 'human_generated_eval'].")
63
+
64
+ evaluator = Evaluator(candidate_ids_dict[dataset])
65
+ eval_metrics = ['hit@1', 'hit@5', 'recall@20', 'mrr']
66
+ qa_dataset = load_qa(dataset, human_generated_eval=split == 'human_generated_eval')
67
+ split_idx = qa_dataset.get_idx_split()
68
+ all_indices = split_idx[split].tolist()
69
+
70
+ results_list = []
71
+ query_ids = []
72
+
73
+ # Prepare args for each worker
74
+ args = [(idx, eval_csv, qa_dataset, evaluator, eval_metrics) for idx in all_indices]
75
+
76
+ with ProcessPoolExecutor(max_workers=num_workers) as executor:
77
+ futures = [executor.submit(process_single_instance, arg) for arg in args]
78
+ for future in tqdm(as_completed(futures), total=len(futures)):
79
+ result = future.result() # This will raise an error if the worker encountered one
80
+ results_list.append(result)
81
+ query_ids.append(result['query_id'])
82
+
83
+ # Concatenate results and compute final metrics
84
+ eval_csv = pd.concat([eval_csv, pd.DataFrame(results_list)], ignore_index=True)
85
+ final_results = {
86
+ metric: np.mean(eval_csv[eval_csv['query_id'].isin(query_ids)][metric]) for metric in eval_metrics
87
+ }
88
+ return final_results
89
+
90
+ except pd.errors.EmptyDataError:
91
+ return "Error: The CSV file is empty or could not be read. Please check the file and try again."
92
+ except FileNotFoundError:
93
+ return f"Error: The file {csv_path} could not be found. Please check the file path and try again."
94
+ except Exception as error:
95
+ return f"{error}"
96
+
97
+
98
+ # Data dictionaries for leaderboard
99
+ data_synthesized_full = {
100
+ 'Method': ['BM25', 'DPR (roberta)', 'ANCE (roberta)', 'QAGNN (roberta)', 'ada-002', 'voyage-l2-instruct', 'LLM2Vec', 'GritLM-7b', 'multi-ada-002', 'ColBERTv2'],
101
+ 'STARK-AMAZON_Hit@1': [44.94, 15.29, 30.96, 26.56, 39.16, 40.93, 21.74, 42.08, 40.07, 46.10],
102
+ 'STARK-AMAZON_Hit@5': [67.42, 47.93, 51.06, 50.01, 62.73, 64.37, 41.65, 66.87, 64.98, 66.02],
103
+ 'STARK-AMAZON_R@20': [53.77, 44.49, 41.95, 52.05, 53.29, 54.28, 33.22, 56.52, 55.12, 53.44],
104
+ 'STARK-AMAZON_MRR': [55.30, 30.20, 40.66, 37.75, 50.35, 51.60, 31.47, 53.46, 51.55, 55.51],
105
+ 'STARK-MAG_Hit@1': [25.85, 10.51, 21.96, 12.88, 29.08, 30.06, 18.01, 37.90, 25.92, 31.18],
106
+ 'STARK-MAG_Hit@5': [45.25, 35.23, 36.50, 39.01, 49.61, 50.58, 34.85, 56.74, 50.43, 46.42],
107
+ 'STARK-MAG_R@20': [45.69, 42.11, 35.32, 46.97, 48.36, 50.49, 35.46, 46.40, 50.80, 43.94],
108
+ 'STARK-MAG_MRR': [34.91, 21.34, 29.14, 29.12, 38.62, 39.66, 26.10, 47.25, 36.94, 38.39],
109
+ 'STARK-PRIME_Hit@1': [12.75, 4.46, 6.53, 8.85, 12.63, 10.85, 10.10, 15.57, 15.10, 11.75],
110
+ 'STARK-PRIME_Hit@5': [27.92, 21.85, 15.67, 21.35, 31.49, 30.23, 22.49, 33.42, 33.56, 23.85],
111
+ 'STARK-PRIME_R@20': [31.25, 30.13, 16.52, 29.63, 36.00, 37.83, 26.34, 39.09, 38.05, 25.04],
112
+ 'STARK-PRIME_MRR': [19.84, 12.38, 11.05, 14.73, 21.41, 19.99, 16.12, 24.11, 23.49, 17.39]
113
+ }
114
+
115
+ data_synthesized_10 = {
116
+ 'Method': ['BM25', 'DPR (roberta)', 'ANCE (roberta)', 'QAGNN (roberta)', 'ada-002', 'voyage-l2-instruct', 'LLM2Vec', 'GritLM-7b', 'multi-ada-002', 'ColBERTv2', 'Claude3 Reranker', 'GPT4 Reranker'],
117
+ 'STARK-AMAZON_Hit@1': [42.68, 16.46, 30.09, 25.00, 39.02, 43.29, 18.90, 43.29, 40.85, 44.31, 45.49, 44.79],
118
+ 'STARK-AMAZON_Hit@5': [67.07, 50.00, 49.27, 48.17, 64.02, 67.68, 37.80, 71.34, 62.80, 65.24, 71.13, 71.17],
119
+ 'STARK-AMAZON_R@20': [54.48, 42.15, 41.91, 51.65, 49.30, 56.04, 34.73, 56.14, 52.47, 51.00, 53.77, 55.35],
120
+ 'STARK-AMAZON_MRR': [54.02, 30.20, 39.30, 36.87, 50.32, 54.20, 28.76, 55.07, 51.54, 55.07, 55.91, 55.69],
121
+ 'STARK-MAG_Hit@1': [27.81, 11.65, 22.89, 12.03, 28.20, 34.59, 19.17, 38.35, 25.56, 31.58, 36.54, 40.90],
122
+ 'STARK-MAG_Hit@5': [45.48, 36.84, 37.26, 37.97, 52.63, 50.75, 33.46, 58.64, 50.37, 47.36, 53.17, 58.18],
123
+ 'STARK-MAG_R@20': [44.59, 42.30, 44.16, 47.98, 49.25, 50.75, 29.85, 46.38, 53.03, 45.72, 48.36, 48.60],
124
+ 'STARK-MAG_MRR': [35.97, 21.82, 30.00, 28.70, 38.55, 42.90, 26.06, 48.25, 36.82, 38.98, 44.15, 49.00],
125
+ 'STARK-PRIME_Hit@1': [13.93, 5.00, 6.78, 7.14, 15.36, 12.14, 9.29, 16.79, 15.36, 15.00, 17.79, 18.28],
126
+ 'STARK-PRIME_Hit@5': [31.07, 23.57, 16.15, 17.14, 31.07, 31.42, 20.7, 34.29, 32.86, 26.07, 36.90, 37.28],
127
+ 'STARK-PRIME_R@20': [32.84, 30.50, 17.07, 32.95, 37.88, 37.34, 25.54, 41.11, 40.99, 27.78, 35.57, 34.05],
128
+ 'STARK-PRIME_MRR': [21.68, 13.50, 11.42, 16.27, 23.50, 21.23, 15.00, 24.99, 23.70, 19.98, 26.27, 26.55]
129
+ }
130
+
131
+ data_human_generated = {
132
+ 'Method': ['BM25', 'DPR (roberta)', 'ANCE (roberta)', 'QAGNN (roberta)', 'ada-002', 'voyage-l2-instruct', 'LLM2Vec', 'GritLM-7b', 'multi-ada-002', 'ColBERTv2', 'Claude3 Reranker', 'GPT4 Reranker'],
133
+ 'STARK-AMAZON_Hit@1': [27.16, 16.05, 25.93, 22.22, 39.50, 35.80, 29.63, 40.74, 46.91, 33.33, 53.09, 50.62],
134
+ 'STARK-AMAZON_Hit@5': [51.85, 39.51, 54.32, 49.38, 64.19, 62.96, 46.91, 71.60, 72.84, 55.56, 74.07, 75.31],
135
+ 'STARK-AMAZON_R@20': [29.23, 15.23, 23.69, 21.54, 35.46, 33.01, 21.21, 36.30, 40.22, 29.03, 35.46, 35.46],
136
+ 'STARK-AMAZON_MRR': [18.79, 27.21, 37.12, 31.33, 52.65, 47.84, 38.61, 53.21, 58.74, 43.77, 62.11, 61.06],
137
+ 'STARK-MAG_Hit@1': [32.14, 4.72, 25.00, 20.24, 28.57, 22.62, 16.67, 34.52, 23.81, 33.33, 38.10, 36.90],
138
+ 'STARK-MAG_Hit@5': [41.67, 9.52, 30.95, 26.19, 41.67, 36.90, 28.57, 44.04, 41.67, 36.90, 45.24, 46.43],
139
+ 'STARK-MAG_R@20': [32.46, 25.00, 27.24, 28.76, 35.95, 32.44, 21.74, 34.57, 39.85, 30.50, 35.95, 35.95],
140
+ 'STARK-MAG_MRR': [37.42, 7.90, 27.98, 25.53, 35.81, 29.68, 21.59, 38.72, 31.43, 35.97, 42.00, 40.65],
141
+ 'STARK-PRIME_Hit@1': [22.45, 2.04, 7.14, 6.12, 17.35, 16.33, 9.18, 25.51, 24.49, 15.31, 28.57, 28.57],
142
+ 'STARK-PRIME_Hit@5': [41.84, 9.18, 13.27, 13.27, 34.69, 32.65, 21.43, 41.84, 39.80, 26.53, 46.94, 44.90],
143
+ 'STARK-PRIME_R@20': [42.32, 10.69, 11.72, 17.62, 41.09, 39.01, 26.77, 48.10, 47.21, 25.56, 41.61, 41.61],
144
+ 'STARK-PRIME_MRR': [30.37, 7.05, 10.07, 9.39, 26.35, 24.33, 15.24, 34.28, 32.98, 19.67, 36.32, 34.82]
145
+ }
146
+
147
+ # Initialize DataFrames
148
+ df_synthesized_full = pd.DataFrame(data_synthesized_full)
149
+ df_synthesized_10 = pd.DataFrame(data_synthesized_10)
150
+ df_human_generated = pd.DataFrame(data_human_generated)
151
+
152
+ # Model type definitions
153
+ model_types = {
154
+ 'Sparse Retriever': ['BM25'],
155
+ 'Small Dense Retrievers': ['DPR (roberta)', 'ANCE (roberta)', 'QAGNN (roberta)'],
156
+ 'LLM-based Dense Retrievers': ['ada-002', 'voyage-l2-instruct', 'LLM2Vec', 'GritLM-7b'],
157
+ 'Multivector Retrievers': ['multi-ada-002', 'ColBERTv2'],
158
+ 'LLM Rerankers': ['Claude3 Reranker', 'GPT4 Reranker']
159
+ }
160
+
161
+ # Submission form validation functions
162
+ def validate_email(email_str):
163
+ """Validate email format(s)"""
164
+ emails = [e.strip() for e in email_str.split(';')]
165
+ email_pattern = re.compile(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$')
166
+ return all(email_pattern.match(email) for email in emails)
167
+
168
+ def validate_github_url(url):
169
+ """Validate GitHub URL format"""
170
+ github_pattern = re.compile(
171
+ r'^https?:\/\/(?:www\.)?github\.com\/[\w-]+\/[\w.-]+\/?$'
172
  )
173
+ return bool(github_pattern.match(url))
174
 
175
+ def validate_csv(file_obj):
176
+ """Validate CSV file format and content"""
177
+ try:
178
+ df = pd.read_csv(file_obj.name)
179
+ required_cols = ['query_id', 'pred_rank']
180
+
181
+ if not all(col in df.columns for col in required_cols):
182
+ return False, "CSV must contain 'query_id' and 'pred_rank' columns"
183
+
184
+ try:
185
+ first_rank = eval(df['pred_rank'].iloc[0]) if isinstance(df['pred_rank'].iloc[0], str) else df['pred_rank'].iloc[0]
186
+ if not isinstance(first_rank, list) or len(first_rank) < 20:
187
+ return False, "pred_rank must be a list with at least 20 candidates"
188
+ except:
189
+ return False, "Invalid pred_rank format"
190
+
191
+ return True, "Valid CSV file"
192
+ except Exception as e:
193
+ return False, f"Error processing CSV: {str(e)}"
194
 
195
+ def sanitize_name(name):
196
+ """Sanitize name for file system use"""
197
+ return re.sub(r'[^a-zA-Z0-9]', '_', name)
198
+
199
+ def save_submission(submission_data, csv_file):
200
+ """
201
+ Save submission data and CSV file using model_name_team_name format
202
+
203
+ Args:
204
+ submission_data (dict): Metadata and results for the submission
205
+ csv_file: The uploaded CSV file object
206
+ """
207
+ # Create folder name from model name and team name
208
+ model_name_clean = sanitize_name(submission_data['method_name'])
209
+ team_name_clean = sanitize_name(submission_data['team_name'])
210
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
211
+
212
+ # Create folder name: model_name_team_name
213
+ folder_name = f"{model_name_clean}_{team_name_clean}"
214
+ submission_id = f"{folder_name}_{timestamp}"
215
+
216
+ # Create submission directory structure
217
+ base_dir = "submissions"
218
+ submission_dir = os.path.join(base_dir, folder_name)
219
+ os.makedirs(submission_dir, exist_ok=True)
220
+
221
+ # Save CSV file with timestamp to allow multiple submissions
222
+ csv_filename = f"predictions_{timestamp}.csv"
223
+ csv_path = os.path.join(submission_dir, csv_filename)
224
+ if hasattr(csv_file, 'name'):
225
+ with open(csv_file.name, 'rb') as source, open(csv_path, 'wb') as target:
226
+ target.write(source.read())
227
+
228
+ # Add file paths to submission data
229
+ submission_data.update({
230
+ "csv_path": csv_path,
231
+ "submission_id": submission_id,
232
+ "folder_name": folder_name
233
+ })
234
+
235
+ # Save metadata as JSON with timestamp
236
+ metadata_path = os.path.join(submission_dir, f"metadata_{timestamp}.json")
237
+ with open(metadata_path, 'w') as f:
238
+ json.dump(submission_data, f, indent=4)
239
+
240
+ # Update latest.json to track most recent submission
241
+ latest_path = os.path.join(submission_dir, "latest.json")
242
+ with open(latest_path, 'w') as f:
243
+ json.dump({
244
+ "latest_submission": timestamp,
245
+ "status": "pending_review",
246
+ "method_name": submission_data['method_name']
247
+ }, f, indent=4)
248
+
249
+ return submission_id
250
+
251
+ def update_leaderboard_data(submission_data):
252
+ """
253
+ Update leaderboard data with new submission results
254
+ Only uses model name in the displayed table
255
+ """
256
+ global df_synthesized_full, df_synthesized_10, df_human_generated
257
+
258
+ # Determine which DataFrame to update based on split
259
+ split_to_df = {
260
+ 'test': df_synthesized_full,
261
+ 'test-0.1': df_synthesized_10,
262
+ 'human_generated_eval': df_human_generated
263
+ }
264
+
265
+ df_to_update = split_to_df[submission_data['split']]
266
+
267
+ # Prepare new row data
268
+ new_row = {
269
+ 'Method': submission_data['method_name'], # Only use method name in table
270
+ f'STARK-{submission_data["dataset"].upper()}_Hit@1': submission_data['results']['hit@1'],
271
+ f'STARK-{submission_data["dataset"].upper()}_Hit@5': submission_data['results']['hit@5'],
272
+ f'STARK-{submission_data["dataset"].upper()}_R@20': submission_data['results']['recall@20'],
273
+ f'STARK-{submission_data["dataset"].upper()}_MRR': submission_data['results']['mrr']
274
+ }
275
+
276
+ # Check if method already exists
277
+ method_mask = df_to_update['Method'] == submission_data['method_name']
278
+ if method_mask.any():
279
+ # Update existing row
280
+ for col in new_row:
281
+ df_to_update.loc[method_mask, col] = new_row[col]
282
+ else:
283
+ # Add new row
284
+ df_to_update.loc[len(df_to_update)] = new_row
285
+
286
+ def process_submission(
287
+ method_name, team_name, dataset, split, contact_email,
288
+ code_repo, csv_file, model_description, hardware, paper_link
289
+ ):
290
+ """Process and validate submission"""
291
+ try:
292
+ # [Previous validation code remains the same]
293
+
294
+ # Process CSV file through evaluation pipeline
295
+ results = compute_metrics(
296
+ csv_file.name,
297
+ dataset=dataset.lower(),
298
+ split=split,
299
+ num_workers=4
300
+ )
301
+
302
+ if isinstance(results, str) and results.startswith("Error"):
303
+ return f"Evaluation error: {results}"
304
+
305
+ # Prepare submission data
306
+ submission_data = {
307
+ "method_name": method_name,
308
+ "team_name": team_name,
309
+ "dataset": dataset,
310
+ "split": split,
311
+ "contact_email": contact_email,
312
+ "code_repo": code_repo,
313
+ "model_description": model_description,
314
+ "hardware": hardware,
315
+ "paper_link": paper_link,
316
+ "results": results,
317
+ "status": "pending_review",
318
+ "submission_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
319
+ }
320
+
321
+ # Save submission and get ID
322
+ submission_id = save_submission(submission_data, csv_file)
323
+
324
+ # Update leaderboard data if submission is valid
325
+ update_leaderboard_data(submission_data)
326
+
327
+ return f"""
328
+ Submission successful! Your submission ID is: {submission_id}
329
+
330
+ Evaluation Results:
331
+ Hit@1: {results['hit@1']:.2f}
332
+ Hit@5: {results['hit@5']:.2f}
333
+ Recall@20: {results['recall@20']:.2f}
334
+ MRR: {results['mrr']:.2f}
335
+
336
+ Your submission has been saved and is pending review.
337
+ Once approved, your results will appear in the leaderboard under the method name: {method_name}
338
+ """
339
+
340
+ except Exception as e:
341
+ return f"Error processing submission: {str(e)}"
342
+
343
+ def filter_by_model_type(df, selected_types):
344
+ if not selected_types:
345
+ return df.head(0)
346
+ selected_models = [model for type in selected_types for model in model_types[type]]
347
+ return df[df['Method'].isin(selected_models)]
348
+
349
+ def format_dataframe(df, dataset):
350
+ columns = ['Method'] + [col for col in df.columns if dataset in col]
351
+ filtered_df = df[columns].copy()
352
+ filtered_df.columns = [col.split('_')[-1] if '_' in col else col for col in filtered_df.columns]
353
+ filtered_df = filtered_df.sort_values('MRR', ascending=False)
354
+ return filtered_df
355
 
356
+ def update_tables(selected_types):
357
+ filtered_df_full = filter_by_model_type(df_synthesized_full, selected_types)
358
+ filtered_df_10 = filter_by_model_type(df_synthesized_10, selected_types)
359
+ filtered_df_human = filter_by_model_type(df_human_generated, selected_types)
360
+
361
+ outputs = []
362
+ for df in [filtered_df_full, filtered_df_10, filtered_df_human]:
363
+ for dataset in ['AMAZON', 'MAG', 'PRIME']:
364
+ outputs.append(format_dataframe(df, f"STARK-{dataset}"))
365
+
366
+ return outputs
367
+
368
+
369
+ css = """
370
+ table > thead {
371
+ white-space: normal
372
+ }
373
+
374
+ table {
375
+ --cell-width-1: 250px
376
+ }
377
+
378
+ table > tbody > tr > td:nth-child(2) > div {
379
+ overflow-x: auto
380
+ }
381
+
382
+ .tab-nav {
383
+ border-bottom: 1px solid rgba(255, 255, 255, 0.1);
384
+ margin-bottom: 1rem;
385
+ }
386
+ """
387
+
388
+ # Main application
389
+ with gr.Blocks(css=css) as demo:
390
+ gr.Markdown("# Semi-structured Retrieval Benchmark (STaRK) Leaderboard")
391
+ gr.Markdown("Refer to the [STaRK paper](https://arxiv.org/pdf/2404.13207) for details on metrics, tasks and models.")
392
+
393
+ # Model type filter
394
+ model_type_filter = gr.CheckboxGroup(
395
+ choices=list(model_types.keys()),
396
+ value=list(model_types.keys()),
397
+ label="Model types",
398
+ interactive=True
399
+ )
400
+
401
+ # Initialize dataframes list
402
+ all_dfs = []
403
+
404
+ # Create nested tabs structure
405
+ with gr.Tabs() as outer_tabs:
406
+ with gr.TabItem("Synthesized (full)"):
407
+ with gr.Tabs() as inner_tabs1:
408
+ for dataset in ['AMAZON', 'MAG', 'PRIME']:
409
+ with gr.TabItem(dataset):
410
+ all_dfs.append(gr.DataFrame(interactive=False))
411
+
412
+ with gr.TabItem("Synthesized (10%)"):
413
+ with gr.Tabs() as inner_tabs2:
414
+ for dataset in ['AMAZON', 'MAG', 'PRIME']:
415
+ with gr.TabItem(dataset):
416
+ all_dfs.append(gr.DataFrame(interactive=False))
417
+
418
+ with gr.TabItem("Human-Generated"):
419
+ with gr.Tabs() as inner_tabs3:
420
+ for dataset in ['AMAZON', 'MAG', 'PRIME']:
421
+ with gr.TabItem(dataset):
422
+ all_dfs.append(gr.DataFrame(interactive=False))
423
+
424
+ # Submission section
425
+ gr.Markdown("---")
426
+ gr.Markdown("## Submit Your Results")
427
+ gr.Markdown("""
428
+ Submit your results to be included in the leaderboard. Please ensure your submission meets all requirements.
429
+ For questions, contact stark-qa@cs.stanford.edu
430
+ """)
431
+
432
  with gr.Row():
433
+ with gr.Column():
434
+ method_name = gr.Textbox(
435
+ label="Method Name (max 25 chars)*",
436
+ placeholder="e.g., MyRetrievalModel-v1"
437
  )
438
+ team_name = gr.Textbox(
439
+ label="Team Name (max 25 chars)*",
440
+ placeholder="e.g., Stanford NLP"
441
+ )
442
+ dataset = gr.Dropdown(
443
+ choices=["amazon", "mag", "prime"],
444
+ label="Dataset*",
445
+ value="amazon"
446
+ )
447
+ split = gr.Dropdown(
448
+ choices=["test", "test-0.1", "human_generated_eval"],
449
+ label="Split*",
450
+ value="test"
451
+ )
452
+ contact_email = gr.Textbox(
453
+ label="Contact Email(s)*",
454
+ placeholder="email@example.com; another@example.com"
455
+ )
456
+
457
+ with gr.Column():
458
+ code_repo = gr.Textbox(
459
+ label="Code Repository*",
460
+ placeholder="https://github.com/username/repository"
461
+ )
462
+ csv_file = gr.File(
463
+ label="Prediction CSV*",
464
+ file_types=[".csv"]
465
+ )
466
+ model_description = gr.Textbox(
467
+ label="Model Description*",
468
+ lines=3,
469
+ placeholder="Briefly describe how your retriever model works..."
470
+ )
471
+ hardware = gr.Textbox(
472
+ label="Hardware Specifications*",
473
+ placeholder="e.g., 4x NVIDIA A100 80GB"
474
+ )
475
+ paper_link = gr.Textbox(
476
+ label="Paper Link (Optional)",
477
+ placeholder="https://arxiv.org/abs/..."
478
+ )
479
+
480
+ submit_btn = gr.Button("Submit", variant="primary")
481
+ result = gr.Textbox(label="Submission Status", interactive=False)
482
+
483
+ # Set up event handlers
484
+ model_type_filter.change(
485
+ update_tables,
486
+ inputs=[model_type_filter],
487
+ outputs=all_dfs
488
+ )
489
+
490
+ submit_btn.click(
491
+ process_submission,
492
+ inputs=[
493
+ method_name, team_name, dataset, split, contact_email,
494
+ code_repo, csv_file, model_description, hardware, paper_link
495
+ ],
496
+ outputs=result
497
+ )
498
+
499
+ # Initial table update
500
+ demo.load(
501
+ update_tables,
502
+ inputs=[model_type_filter],
503
+ outputs=all_dfs
504
+ )
505
 
506
+ # Launch the application
507
+ demo.launch()
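For reference, a minimal sketch of exercising the helpers added in this commit, assuming the validate_csv and compute_metrics functions defined above are in scope. The file name predictions.csv, the toy query_id/pred_rank values, and the UploadedFile stand-in are illustrative only and not part of the commit.

import pandas as pd

# Build a toy predictions file: one row per query, with pred_rank holding a
# stringified list of at least 20 candidate ids, which is what validate_csv checks for.
toy = pd.DataFrame({
    "query_id": [0, 1],
    "pred_rank": [str(list(range(20))), str(list(range(20, 40)))],
})
toy.to_csv("predictions.csv", index=False)

class UploadedFile:  # hypothetical stand-in for the gr.File object the app passes around
    name = "predictions.csv"

ok, message = validate_csv(UploadedFile())
print(ok, message)  # prints: True Valid CSV file

# With real predictions whose query_id values match the chosen split (and with
# stark_qa installed), the same file can be scored directly:
# results = compute_metrics("predictions.csv", dataset="amazon", split="test", num_workers=4)
# print(results)  # {'hit@1': ..., 'hit@5': ..., 'recall@20': ..., 'mrr': ...}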