haoyang committed
Commit 4b33d2c
1 Parent(s): 943f952

UPDATE LEADERBOARD

.gitignore CHANGED
@@ -5,6 +5,7 @@ __pycache__/
 .ipynb_checkpoints
 *ipynb
 .vscode/
+.DS_Store
 
 gpt_4_evals/
 human_evals/
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title: Demo Leaderboard
+title: NPHardEval Leaderboard
 emoji: 🥇
 colorFrom: green
 colorTo: indigo
app.py CHANGED
@@ -25,21 +25,13 @@ from src.display.utils import (
     WeightType,
     Precision
 )
-from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, TOKEN, QUEUE_REPO, REPO_ID, RESULTS_REPO
+from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, TOKEN, REPO_ID, RESULTS_REPO
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.submission.submit import add_new_eval
 
 
 def restart_space():
     API.restart_space(repo_id=REPO_ID, token=TOKEN)
 
-try:
-    print(EVAL_REQUESTS_PATH)
-    snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
-    )
-except Exception:
-    restart_space()
 try:
     print(EVAL_RESULTS_PATH)
     snapshot_download(
@@ -52,12 +44,6 @@ except Exception:
 raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 leaderboard_df = original_df.copy()
 
-(
-    finished_eval_queue_df,
-    running_eval_queue_df,
-    pending_eval_queue_df,
-) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-
 
 # Searching and filtering
 def update_table(
@@ -240,95 +226,13 @@ with demo:
                     queue=True,
                 )
 
-        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
-            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-            with gr.Column():
-                with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                with gr.Column():
-                    with gr.Accordion(
-                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            finished_eval_table = gr.components.Dataframe(
-                                value=finished_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            running_eval_table = gr.components.Dataframe(
-                                value=running_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-
-                    with gr.Accordion(
-                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            pending_eval_table = gr.components.Dataframe(
-                                value=pending_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-            with gr.Row():
-                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-
+        with gr.TabItem("📈 Metrics", elem_id="llm-benchmark-tab-table", id=2):
             with gr.Row():
                 with gr.Column():
-                    model_name_textbox = gr.Textbox(label="Model name")
-                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                    model_type = gr.Dropdown(
-                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                        label="Model type",
-                        multiselect=False,
-                        value=None,
-                        interactive=True,
-                    )
+                    gr.Image("figures/weighted_accuracy_failed.png", min_width=500)
 
-                with gr.Column():
-                    precision = gr.Dropdown(
-                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                        label="Precision",
-                        multiselect=False,
-                        value="float16",
-                        interactive=True,
-                    )
-                    weight_type = gr.Dropdown(
-                        choices=[i.value.name for i in WeightType],
-                        label="Weights type",
-                        multiselect=False,
-                        value="Original",
-                        interactive=True,
-                    )
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-            submit_button = gr.Button("Submit Eval")
-            submission_result = gr.Markdown()
-            submit_button.click(
-                add_new_eval,
-                [
-                    model_name_textbox,
-                    base_model_name_textbox,
-                    revision_name_textbox,
-                    precision,
-                    weight_type,
-                    model_type,
-                ],
-                submission_result,
-            )
+        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
+            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
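The net effect in app.py: the template's submission queue (the accordion tables and the `add_new_eval` form) is removed, and a static "📈 Metrics" tab is added that simply renders the new figure. A minimal, self-contained sketch of the resulting tab layout (a standalone `gr.Blocks` and a placeholder About string stand in for the Space's full `demo` and `LLM_BENCHMARKS_TEXT`):

```python
import gradio as gr

# Sketch only: the real app builds these tabs inside its existing
# `demo = gr.Blocks(css=custom_css)` context, next to the leaderboard tab.
with gr.Blocks() as demo:
    with gr.Tabs(elem_classes="tab-buttons"):
        with gr.TabItem("📈 Metrics", elem_id="llm-benchmark-tab-table", id=2):
            with gr.Row():
                with gr.Column():
                    # Figure added by this commit; the file must exist locally.
                    gr.Image("figures/weighted_accuracy_failed.png", min_width=500)
        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
            gr.Markdown("About text goes here.")  # LLM_BENCHMARKS_TEXT in the real app

if __name__ == "__main__":
    demo.launch()
```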
figures/weighted_accuracy_failed.png ADDED
src/display/about.py CHANGED
@@ -11,57 +11,112 @@ class Task:
 # Init: to update with your specific keys
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("task_name1", "metric_name", "First task")
-    task1 = Task("task_name2", "metric_name", "Second task")
+    task0 = Task("SAS", "weighted_accuracy", "SAS")
+    task1 = Task("SPP", "weighted_accuracy", "SPP")
+    task2 = Task("EDP", "weighted_accuracy", "EDP")
+    task3 = Task("TSP_D", "weighted_accuracy", "TSP_D")
+    task4 = Task("GCP_D", "weighted_accuracy", "GCP_D")
+    task5 = Task("KSP", "weighted_accuracy", "KSP")
+    task6 = Task("TSP", "weighted_accuracy", "TSP")
+    task7 = Task("GCP", "weighted_accuracy", "GCP")
+    task8 = Task("MSP", "weighted_accuracy", "MSP")
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
+TITLE = """<h1 align="center" id="space-title">NPHardEval leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-Intro text
+NPHardEval serves as a comprehensive benchmark for assessing the reasoning abilities of large language models (LLMs) through the lens of computational complexity classes.
+[Our repository](https://github.com/casmlab/NPHardEval) contains datasets, data generation scripts, and experimental procedures designed to evaluate LLMs on various reasoning tasks.
+In particular, we use three complexity classes to define task complexity in the benchmark: P (polynomial time), NP-complete (nondeterministic polynomial-time complete),
+and NP-hard, which increase in both intrinsic difficulty and the resources needed to solve them. The nine selected problems are:
+1) P problems: Sorted Array Search (SAS), Edit Distance Problem (EDP), and Shortest Path Problem (SPP);
+2) NP-complete problems: Traveling Salesman Problem Decision Version (TSP-D), Graph Coloring Problem Decision Version (GCP-D), and Knapsack Problem (KSP);
+3) NP-hard problems: Traveling Salesman Problem Optimization Version (TSP), Graph Coloring Problem Optimization Version (GCP), and Meeting Scheduling Problem (MSP).
+
+The following Euler diagram shows how these problems relate in terms of computational complexity.
+
+<div align="center">
+<img
+src="https://raw.githubusercontent.com/casmlab/NPHardEval/main/NP-hard.jpg"
+style="width: 50%;"
+alt="Selected problems and the Euler diagram of computational complexity classes"
+>
+</div>
+
+Our benchmark offers several advantages over current benchmarks:
+- Data construction grounded in the established computational complexity hierarchy
+- Automatic checking mechanisms
+- Automatic generation of datapoints
+- Complete focus on reasoning, excluding numerical computation
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
-## How it works
-
-## Reproducibility
-To reproduce our results, here is the commands you can run:
-
-"""
-
-EVALUATION_QUEUE_TEXT = """
-## Some good practices before submitting a model
-
-### 1) Make sure you can load your model and tokenizer using AutoClasses:
-```python
-from transformers import AutoConfig, AutoModel, AutoTokenizer
-config = AutoConfig.from_pretrained("your model name", revision=revision)
-model = AutoModel.from_pretrained("your model name", revision=revision)
-tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
+The paramount importance of complex reasoning in Large Language Models (LLMs) is well recognized,
+especially in their application to intricate decision-making tasks. This underscores the necessity
+of thoroughly investigating LLMs' reasoning capabilities. To this end, various benchmarks have been
+developed to evaluate these capabilities. However, existing benchmarks fall short in providing a
+comprehensive assessment of LLMs' potential in reasoning. Additionally, there is a risk of overfitting,
+as these benchmarks are static and publicly accessible, allowing models to tailor responses to specific
+metrics, thus artificially boosting their performance.
+
+In response, our research introduces 'NPHardEval', a novel benchmark meticulously designed to
+comprehensively evaluate LLMs' reasoning abilities. It comprises a diverse array of 900 algorithmic
+questions, spanning the complexity spectrum up to NP-hard. These questions are strategically selected
+to cover a vast range of complexities, ensuring a thorough evaluation of LLMs' reasoning power. The
+benchmark not only offers insights into the current state of reasoning in LLMs but also establishes
+a basis for comparing LLMs' performance across complexity classes.
+
+Our study marks a significant contribution to understanding LLMs' current reasoning capabilities
+and paves the way for future enhancements. Furthermore, NPHardEval features a dynamic update mechanism,
+refreshing data points monthly. This approach is crucial in reducing the risk of model overfitting,
+leading to a more accurate and dependable evaluation of LLMs' reasoning skills. The benchmark dataset
+and the associated code are accessible at the [NPHardEval GitHub repository](https://github.com/casmlab/NPHardEval).
+
+## Quick Start
+### Environment setup
+```bash
+conda create --name llm_reason python=3.10
+conda activate llm_reason
+git clone https://github.com/casmlab/NPHardEval.git
+pip install -r requirements.txt
 ```
-If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
 
-Note: make sure your model is public!
-Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
+### Set up API keys
+Please set up your API keys in `secrets.txt`. **Please don't upload your keys to any public repository.**
 
-### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
-It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
+### Example Commands
+Let's take the GPT-4 Turbo model (gpt-4-1106-preview) and the GCP task as an example.
 
-### 3) Make sure your model has an open license!
-This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
+For the zeroshot experiment, you can use:
+```bash
+cd run
+cd run_close_zeroshot
+python run_hard_GCP.py gpt-4-1106-preview
+```
 
-### 4) Fill up your model card
-When we add extra information about models to the leaderboard, it will be automatically taken from the model card
+For the fewshot experiment, you can use:
+```bash
+cd run
+cd run_close_fewshot
+python run_hard_GCP.py gpt-4-1106-preview self
+```
+"""
 
-## In case of model failure
-If your model is displayed in the `FAILED` category, its execution stopped.
-Make sure you have followed the above steps first.
-If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
+EVALUATION_QUEUE_TEXT = """
+Currently, we don't support the submission of new evaluations.
 """
 
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 CITATION_BUTTON_TEXT = r"""
+@misc{fan2023nphardeval,
+      title={NPHardEval: Dynamic Benchmark on Reasoning Ability of Large Language Models via Complexity Classes},
+      author={Lizhou Fan and Wenyue Hua and Lingyao Li and Haoyang Ling and Yongfeng Zhang and Libby Hemphill},
+      year={2023},
+      eprint={2312.14890},
+      archivePrefix={arXiv},
+      primaryClass={cs.AI}
+}
 """
src/display/utils.py CHANGED
@@ -27,7 +27,7 @@ auto_eval_column_dict = []
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 #Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Avg ⬆️", "number", True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
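For context on why this one-line rename shows up everywhere in the UI: in the stock template, `auto_eval_column_dict` is turned into a frozen dataclass and the rest of the code reads column labels from it (e.g. `AutoEvalColumn.average.name`). A rough sketch of that pattern, with a trimmed field set and defaults assumed from the template rather than taken from this repo:

```python
from dataclasses import dataclass, make_dataclass

@dataclass
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

auto_eval_column_dict = [
    ["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)],
    ["average", ColumnContent, ColumnContent("Avg ⬆️", "number", True)],
]
# Each [attr_name, type, default] triple becomes a field of a frozen dataclass,
# so columns can be referenced as AutoEvalColumn.average, AutoEvalColumn.model, etc.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
print(AutoEvalColumn.average.name)  # "Avg ⬆️"
```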
src/envs.py CHANGED
@@ -5,10 +5,9 @@ from huggingface_hub import HfApi
 # clone / pull the lmeh eval data
 TOKEN = os.environ.get("TOKEN", None)
 
-OWNER = "demo-leaderboard"
-REPO_ID = f"{OWNER}/leaderboard"
-QUEUE_REPO = f"{OWNER}/requests"
-RESULTS_REPO = f"{OWNER}/results"
+OWNER = "hyfrankl"
+REPO_ID = f"{OWNER}/NPHardEval-leaderboard"
+RESULTS_REPO = f"{OWNER}/NPHardEval-results"
 
 CACHE_PATH=os.getenv("HF_HOME", ".")
 
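With `QUEUE_REPO` gone, the Space only needs the results dataset. A sketch of how these constants are consumed at startup, mirroring the app.py diff above (the `eval-results` directory name and `HfApi` wiring follow the stock template and are assumptions, not part of this commit):

```python
import os

from huggingface_hub import HfApi, snapshot_download

TOKEN = os.environ.get("TOKEN", None)
OWNER = "hyfrankl"
REPO_ID = f"{OWNER}/NPHardEval-leaderboard"
RESULTS_REPO = f"{OWNER}/NPHardEval-results"

CACHE_PATH = os.getenv("HF_HOME", ".")
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")  # assumed template path

API = HfApi(token=TOKEN)

# app.py now only snapshots the results dataset; on failure it restarts the Space.
try:
    snapshot_download(
        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", etag_timeout=30
    )
except Exception:
    API.restart_space(repo_id=REPO_ID, token=TOKEN)
```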
src/leaderboard/read_evals.py CHANGED
@@ -59,10 +59,13 @@ class EvalResult:
             full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
         )
         architecture = "?"
+        model_type = ModelType.from_str(config.get("model_type", "Unknown"))
         if model_config is not None:
             architectures = getattr(model_config, "architectures", None)
             if architectures:
                 architecture = ";".join(architectures)
+            if model_type == ModelType.Unknown:
+                model_type = ModelType.from_str(getattr(model_config, "model_type", "Unknown"))
 
         # Extract results available in this file (some results are split in several files)
         results = {}
@@ -86,7 +89,8 @@
             precision=precision,
             revision= config.get("model_sha", ""),
             still_on_hub=still_on_hub,
-            architecture=architecture
+            architecture=architecture,
+            model_type=model_type,
         )
 
     def update_with_request_file(self, requests_path):
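The new lines give each parsed result a `model_type`: first from the results file's config, then, if that is still `Unknown`, from the Hub model config. A standalone sketch of that fallback (the `ModelType` enum and `from_str` below are simplified stand-ins for `src.display.utils.ModelType`):

```python
from enum import Enum

class ModelType(Enum):
    # Simplified stand-in for src.display.utils.ModelType
    PT = "pretrained"
    FT = "fine-tuned"
    Unknown = "?"

    @staticmethod
    def from_str(text: str) -> "ModelType":
        # Hypothetical matcher; the real template matches on richer metadata.
        for mt in ModelType:
            if mt.name == text or mt.value == str(text):
                return mt
        return ModelType.Unknown

# config comes from the results JSON; model_config from the Hub (may lack model_type).
config = {"model_type": "Unknown"}
model_config = type("Cfg", (), {"model_type": "pretrained"})()

model_type = ModelType.from_str(config.get("model_type", "Unknown"))
if model_type == ModelType.Unknown:
    # Fallback added in this commit: read model_type from the Hub config instead.
    model_type = ModelType.from_str(getattr(model_config, "model_type", "Unknown"))
print(model_type)  # ModelType.PT
```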
src/submission/submit.py CHANGED
@@ -3,7 +3,7 @@ import os
 from datetime import datetime, timezone
 
 from src.display.formatting import styled_error, styled_message, styled_warning
-from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
+from src.envs import API, EVAL_REQUESTS_PATH, TOKEN
 from src.submission.check_validity import (
     already_submitted_models,
     check_model_card,
@@ -22,97 +22,6 @@ def add_new_eval(
     weight_type: str,
     model_type: str,
 ):
-    global REQUESTED_MODELS
-    global USERS_TO_SUBMISSION_DATES
-    if not REQUESTED_MODELS:
-        REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
-
-    user_name = ""
-    model_path = model
-    if "/" in model:
-        user_name = model.split("/")[0]
-        model_path = model.split("/")[1]
-
-    precision = precision.split(" ")[0]
-    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
-
-    if model_type is None or model_type == "":
-        return styled_error("Please select a model type.")
-
-    # Does the model actually exist?
-    if revision == "":
-        revision = "main"
-
-    # Is the model on the hub?
-    if weight_type in ["Delta", "Adapter"]:
-        base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
-        if not base_model_on_hub:
-            return styled_error(f'Base model "{base_model}" {error}')
-
-    if not weight_type == "Adapter":
-        model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, test_tokenizer=True)
-        if not model_on_hub:
-            return styled_error(f'Model "{model}" {error}')
-
-    # Is the model info correctly filled?
-    try:
-        model_info = API.model_info(repo_id=model, revision=revision)
-    except Exception:
-        return styled_error("Could not get your model information. Please fill it up properly.")
-
-    model_size = get_model_size(model_info=model_info, precision=precision)
-
-    # Were the model card and license filled?
-    try:
-        license = model_info.cardData["license"]
-    except Exception:
-        return styled_error("Please select a license for your model")
-
-    modelcard_OK, error_msg = check_model_card(model)
-    if not modelcard_OK:
-        return styled_error(error_msg)
-
-    # Seems good, creating the eval
-    print("Adding new eval")
-
-    eval_entry = {
-        "model": model,
-        "base_model": base_model,
-        "revision": revision,
-        "precision": precision,
-        "weight_type": weight_type,
-        "status": "PENDING",
-        "submitted_time": current_time,
-        "model_type": model_type,
-        "likes": model_info.likes,
-        "params": model_size,
-        "license": license,
-    }
-
-    # Check for duplicate submission
-    if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
-        return styled_warning("This model has been already submitted.")
-
-    print("Creating eval file")
-    OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
-    os.makedirs(OUT_DIR, exist_ok=True)
-    out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
-
-    with open(out_path, "w") as f:
-        f.write(json.dumps(eval_entry))
-
-    print("Uploading eval file")
-    API.upload_file(
-        path_or_fileobj=out_path,
-        path_in_repo=out_path.split("eval-queue/")[1],
-        repo_id=QUEUE_REPO,
-        repo_type="dataset",
-        commit_message=f"Add {model} to eval queue",
-    )
-
-    # Remove the local file
-    os.remove(out_path)
-
     return styled_message(
-        "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
+        "Currently, we don't support adding new evaluations. Please contact the admins for more information."
     )