brunneis committed • Commit 68ed9a2 • 1 parent: 827199d

Clean codebase

app.py CHANGED
@@ -1,5 +1,8 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+ # flake8: noqa E501
+
  import gradio as gr
- import pandas as pd
  from apscheduler.schedulers.background import BackgroundScheduler
  from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns
  from huggingface_hub import snapshot_download
@@ -109,7 +112,7 @@ with demo:
  EVALUATION_SCRIPT,
  elem_classes="markdown-text",
  )
-
+
  with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
  with gr.Column():
  with gr.Row():

src/about.py CHANGED
@@ -1,3 +1,7 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+ # flake8: noqa E501
+
  from dataclasses import dataclass
  from enum import Enum

@@ -75,13 +79,13 @@ If everything is done, check you can launch the EleutherAIHarness on your model
  EVALUATION_SCRIPT = '''
  To evaluate the model you can access the colab notebook at [this link](https://colab.research.google.com/drive/145KAGvgdAb8BrkObUrxAVWBd9EGDqy8N?usp=sharing).

- ## First install the necessary libraries
+ ## First install the necessary libraries

  ```
  pip install accelerate openai anthropic datasets
  ```

- ## Setup your :
+ ## Setup your :
  * OPENAI_API_KEY
  * ANTHROPIC_API_KEY
  * HF_TOKEN

src/display/css_html_js.py CHANGED
@@ -1,5 +1,7 @@
- custom_css = """
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-

+ custom_css = """
  .markdown-text {
  font-size: 16px !important;
  }
@@ -33,12 +35,12 @@ custom_css = """
  background: none;
  border: none;
  }
-
+
  #search-bar {
  padding: 0px;
  }

- /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
+ /* Limit the width of the first AutoEvalColumn */
  table td:first-child,
  table th:first-child {
  max-width: 400px;
@@ -62,44 +64,53 @@ table th:first-child {
  #scale-logo .download {
  display: none;
  }
+
  #filter_type{
  border: 0;
  padding-left: 0;
  padding-top: 0;
  }
+
  #filter_type label {
  display: flex;
  }
+
  #filter_type label > span{
  margin-top: var(--spacing-lg);
  margin-right: 0.5em;
  }
+
  #filter_type label > .wrap{
  width: 103px;
  }
- #filter_type label > .wrap .wrap-inner{
+
+ #filter_type label > .wrap .wrap-inner {
  padding: 2px;
  }
+
  #filter_type label > .wrap .wrap-inner input{
  width: 1px
  }
+
  #filter-columns-type{
  border:0;
  padding:0.5;
  }
+
  #filter-columns-size{
  border:0;
  padding:0.5;
  }
+
  #box-filter > .form{
  border: 0
  }
  """

  get_window_url_params = """
- function(url_params) {
- const params = new URLSearchParams(window.location.search);
- url_params = Object.fromEntries(params);
- return url_params;
- }
- """
+ function(url_params) {
+ const params = new URLSearchParams(window.location.search);
+ url_params = Object.fromEntries(params);
+ return url_params;
+ }
+ """

src/display/formatting.py CHANGED
@@ -1,3 +1,7 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+ # flake8: noqa E501
+
  def model_hyperlink(link, model_name):
  return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'

src/display/utils.py CHANGED
@@ -1,3 +1,7 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+ # flake8: noqa E501
+
  from dataclasses import dataclass, make_dataclass
  from enum import Enum

@@ -7,7 +11,7 @@ from src.about import Tasks


  def fields(raw_class):
- return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
+ return [v for k, v in raw_class.__dict__.items() if not k.startswith("__") and not k.endswith("__")]


  # These classes are for user facing column names,
@@ -23,13 +27,16 @@ class ColumnContent:

  ## Leaderboard columns
  auto_eval_column_dict = []
+
  # Init
  auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
- #Scores
+
+ # Scores
  auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
  for task in Tasks:
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+
  # Model information
  auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
  auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
@@ -44,7 +51,8 @@ auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sh
  # We use make dataclass to dynamically fill the scores from Tasks
  AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

- ## For the queue columns in the submission tab
+
+ # For the queue columns in the submission tab
  @dataclass(frozen=True)
  class EvalQueueColumn: # Queue column
  model = ColumnContent("model", "markdown", True)
@@ -55,7 +63,7 @@ class EvalQueueColumn: # Queue column
  status = ColumnContent("status", "str", True)


- ## All the model information that we might need
+ # All the model information that we might need
  @dataclass
  class ModelDetails:
  name: str
@@ -93,6 +101,7 @@ class WeightType(Enum):
  Original = ModelDetails("Original")
  Delta = ModelDetails("Delta")

+
  class Precision(Enum):
  float16 = ModelDetails("float16")
  bfloat16 = ModelDetails("bfloat16")
@@ -105,6 +114,7 @@ class Precision(Enum):
  return Precision.bfloat16
  return Precision.Unknown

+
  # Column selection
  COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]

@@ -112,4 +122,3 @@ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]

  BENCHMARK_COLS = [t.value.col_name for t in Tasks]
-
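
Note on the `fields()` change above: the slicing checks (`k[:2] != "__"`, `k[-2:] != "__"`) are replaced with the more readable `str.startswith`/`str.endswith`, which filter dunder attributes the same way. A minimal, self-contained sketch of that behaviour (the `Toy` class below is a made-up example, not repo code):

```
# Minimal sketch of the dunder-filtering behaviour; `Toy` is hypothetical.
class Toy:
    visible = 1
    __dunder__ = 2  # class-level dunder attribute, should be skipped


def fields(raw_class):
    # Keep class attributes whose names are not dunder ("__x__") names.
    return [v for k, v in raw_class.__dict__.items() if not k.startswith("__") and not k.endswith("__")]


print(fields(Toy))  # [1] -- only the non-dunder attribute survives
```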
 
src/envs.py CHANGED
@@ -1,12 +1,16 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+ # flake8: noqa E501
+
  import os

  from huggingface_hub import HfApi

  # Info to change for your repository
  # ----------------------------------
- TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
+ TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org

- OWNER = "braindao" # Change to your org - don't forget to create a results and request dataset, with the correct format!
+ OWNER = "braindao" # Change to your org - don't forget to create a results and request dataset, with the correct format!
  # ----------------------------------

  REPO_ID = f"{OWNER}/solidity-leaderboard"
@@ -14,7 +18,7 @@ QUEUE_REPO = f"{OWNER}/requests"
  RESULTS_REPO = f"{OWNER}/results"

  # If you setup a cache later, just change HF_HOME
- CACHE_PATH=os.getenv("HF_HOME", ".")
+ CACHE_PATH = os.getenv("HF_HOME", ".")

  # Local caches
  EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")

src/leaderboard/read_evals.py CHANGED
@@ -1,6 +1,9 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+ # flake8: noqa E501
+
  import glob
  import json
- import math
  import os
  from dataclasses import dataclass

@@ -8,7 +11,7 @@ import dateutil
  import numpy as np

  from src.display.formatting import make_clickable_model
- from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
+ from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType
  from src.submission.check_validity import is_model_on_hub


@@ -16,24 +19,24 @@ from src.submission.check_validity import is_model_on_hub
  class EvalResult:
  """Represents one full evaluation. Built from a combination of the result and request file for a given run.
  """
- eval_name: str # org_model_precision (uid)
- full_model: str # org/model (path on hub)
- org: str
+ eval_name: str # org_model_precision (uid)
+ full_model: str # org/model (path on hub)
+ org: str
  model: str
- revision: str # commit hash, "" if main
+ revision: str # commit hash, "" if main
  results: dict
  precision: Precision = Precision.Unknown
- model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
- weight_type: WeightType = WeightType.Original # Original or Adapter
- architecture: str = "Unknown"
+ model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
+ weight_type: WeightType = WeightType.Original # Original or Adapter
+ architecture: str = "Unknown"
  license: str = "?"
  likes: int = 0
  num_params: int = 0
- date: str = "" # submission date of request file
+ date: str = "" # submission date of request file
  still_on_hub: bool = False

  @classmethod
- def init_from_json_file(self, json_filepath):
+ def init_from_json_file(cls, json_filepath):
  """Inits the result from the specific model result file"""
  with open(json_filepath) as fp:
  data = json.load(fp)
@@ -58,7 +61,10 @@ class EvalResult:
  full_model = "/".join(org_and_model)

  still_on_hub, _, model_config = is_model_on_hub(
- full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
+ full_model,
+ config.get("model_sha", "main"),
+ trust_remote_code=True,
+ test_tokenizer=False,
  )
  architecture = "?"
  if model_config is not None:
@@ -66,12 +72,14 @@ class EvalResult:
  if architectures:
  architecture = ";".join(architectures)

- # Extract results available in this file (some results are split in several files)
+ # Extract results available in this file
+ # (some results are split in several files)
  results = {}
  for task in Tasks:
  task = task.value

- # We average all scores of a given metric (not all metrics are present in all files)
+ # We average all scores of a given metric
+ # (not all metrics are present in all files)
  accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
  if accs.size == 0 or any([acc is None for acc in accs]):
  continue
@@ -79,14 +87,14 @@ class EvalResult:
  mean_acc = np.mean(accs) * 100.0
  results[task.benchmark] = mean_acc

- return self(
+ return cls(
  eval_name=result_key,
  full_model=full_model,
  org=org,
  model=model,
  results=results,
- precision=precision,
- revision= config.get("model_sha", ""),
+ precision=precision,
+ revision=config.get("model_sha", ""),
  still_on_hub=still_on_hub,
  architecture=architecture
  )
@@ -188,7 +196,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
  results = []
  for v in eval_results.values():
  try:
- v.to_dict() # we test if the dict version is complete
+ v.to_dict() # we test if the dict version is complete
  results.append(v)
  except KeyError: # not all eval values present
  continue
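
Note on the `init_from_json_file` change above: a `@classmethod` receives the class itself as its first argument, conventionally named `cls`, and constructing the instance via `cls(...)` also works for subclasses. A minimal sketch of the pattern (the `Point` class below is hypothetical, not repo code):

```
# Minimal sketch of the classmethod-constructor pattern; `Point` is made up.
import json
from dataclasses import dataclass


@dataclass
class Point:
    x: int
    y: int

    @classmethod
    def from_json(cls, raw: str) -> "Point":
        data = json.loads(raw)
        # `cls` is the class object, so subclasses get instances of themselves.
        return cls(x=data["x"], y=data["y"])


print(Point.from_json('{"x": 1, "y": 2}'))  # Point(x=1, y=2)
```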
src/populate.py CHANGED
@@ -1,3 +1,7 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+ # flake8: noqa E501
+
  import json
  import os

src/submission/check_validity.py CHANGED
@@ -1,8 +1,10 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+ # flake8: noqa E501
+
  import json
  import os
- import re
  from collections import defaultdict
- from datetime import datetime, timedelta, timezone

  import huggingface_hub
  from huggingface_hub import ModelCard
@@ -10,6 +12,7 @@ from huggingface_hub.hf_api import ModelInfo
  from transformers import AutoConfig
  from transformers.models.auto.tokenization_auto import AutoTokenizer

+
  def check_model_card(repo_id: str) -> tuple[bool, str]:
  """Checks if the model card and license exist and have been filled"""
  try:
@@ -31,20 +34,21 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:

  return True, ""

+
  def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
  """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
  try:
  config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
  if test_tokenizer:
  try:
- tk = AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
+ _ = AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
  except ValueError as e:
  return (
  False,
  f"uses a tokenizer which is not in a transformers release: {e}",
  None
  )
- except Exception as e:
+ except Exception:
  return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
  return True, None, config

@@ -55,7 +59,7 @@ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_rem
  None
  )

- except Exception as e:
+ except Exception:
  return False, "was not found on hub!", None


@@ -70,10 +74,12 @@ def get_model_size(model_info: ModelInfo, precision: str):
  model_size = size_factor * model_size
  return model_size

+
  def get_model_arch(model_info: ModelInfo):
  """Gets the model architecture from the configuration"""
  return model_info.config.get("architectures", "Unknown")

+
  def already_submitted_models(requested_models_dir: str) -> set[str]:
  """Gather a list of already submitted models to avoid duplicates"""
  depth = 1
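
Note on the exception-handling changes above: when neither the return value nor the exception object is used, binding them to names like `tk` or `e` only creates unused variables, so the commit switches to `_ = ...` and a bare `except Exception:`. A minimal sketch of the same pattern (`load_config` and `can_load` are made-up names, not repo code):

```
# Minimal sketch of the cleanup pattern; `load_config` stands in for a loader call.
def load_config(name: str) -> dict:
    if not name:
        raise ValueError("empty model name")
    return {"model": name}


def can_load(name: str) -> bool:
    try:
        _ = load_config(name)  # result unused: bind to `_` instead of a throwaway name
    except Exception:          # exception object unused: no `as e` needed
        return False
    return True


print(can_load("org/model"), can_load(""))  # True False
```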
src/submission/submit.py CHANGED
@@ -1,19 +1,19 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+ # flake8: noqa E501
+
  import json
  import os
  from datetime import datetime, timezone

  from src.display.formatting import styled_error, styled_message, styled_warning
- from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
- from src.submission.check_validity import (
- already_submitted_models,
- check_model_card,
- get_model_size,
- is_model_on_hub,
- )
+ from src.envs import API, EVAL_REQUESTS_PATH, QUEUE_REPO, TOKEN
+ from src.submission.check_validity import already_submitted_models, check_model_card, get_model_size, is_model_on_hub

  REQUESTED_MODELS = None
  USERS_TO_SUBMISSION_DATES = None

+
  def add_new_eval(
  model: str,
  base_model: str,
@@ -64,12 +64,12 @@ def add_new_eval(

  # Were the model card and license filled?
  try:
- license = model_info.cardData["license"]
+ license_title = model_info.cardData["license"]
  except Exception:
  return styled_error("Please select a license for your model")

- modelcard_OK, error_msg = check_model_card(model)
- if not modelcard_OK:
+ is_model_card_ok, error_msg = check_model_card(model)
+ if not is_model_card_ok:
  return styled_error(error_msg)

  # Seems good, creating the eval
@@ -86,7 +86,7 @@ def add_new_eval(
  "model_type": model_type,
  "likes": model_info.likes,
  "params": model_size,
- "license": license,
+ "license": license_title,
  "private": False,
  }