Lakoc committed b66f230 (parent: 49d6897): v0.0.1

Files changed:
- app.py (+84, -345)
- compare_significance.py (+112, -75)
- content.py (+15, -43)
- model_compare.py (+17, -45)
- server.py (+144, -0)
- tasks_metadata.json (+204, -0)
app.py CHANGED
@@ -1,390 +1,129 @@

Old version (removed lines):
-import glob
-import logging
-import pandas as pd
-from content import (HEADER_MARKDOWN, LEADERBOARD_TAB_TITLE_MARKDOWN, SUBMISSION_TAB_TITLE_MARKDOWN
-import json
-from datetime import datetime
-from pathlib import Path
-from uuid import uuid4
-import time
-import gradio as gr
-from huggingface_hub import HfApi, snapshot_download
-from compare_significance import check_significance, SUPPORTED_METRICS
-from model_compare import ModelCompare
-JSON_DATASET_DIR = Path("../json_dataset")
-JSON_DATASET_DIR.mkdir(parents=True, exist_ok=True)
-JSON_DATASET_PATH = JSON_DATASET_DIR / f"train-{uuid4()}.json"
-api = HfApi()
-ORG = "CZLC"
-REPO = f"{ORG}/LLM_benchmark_data"
-def greet(name: str) -> str:
-    return "Hello " + name + "!"
-DATASET_VERSIONS = ['dev-set-1', 'dev-set-2']
-class LeaderboardServer:
-    def __init__(self, server_address):
-        self.server_address = server_address
-        self.repo_type = "dataset"
-        self.local_leaderboard = snapshot_download(self.server_address, repo_type=self.repo_type, token=HF_TOKEN, local_dir="./")
-        self.submisssion_id_to_file = {}  # Map submission ids to file paths
-    def on_submit(self):
-        self.local_leaderboard = snapshot_download(self.server_address, repo_type=self.repo_type, token=HF_TOKEN, local_dir="./")
-        new_results = []
-        submission_ids = set()
-        # pre-computed ranks
-        with open(os.path.join(self.local_leaderboard, "metadata", "ranks.json")) as ranks_file:
-            ranks = json.load(ranks_file)
-        model_compare = ModelCompare()
-        ranks = model_compare.get_tasks_ranks(ranks)
-        # Models data
-        for submission in glob.glob(os.path.join(self.local_leaderboard, "data") + "/*.json"):
-            data = json.load(open(submission))
-            submission_id = data["metadata"]["model_description"]
-            if submission_id in submission_ids:
-                continue
-            submission_ids.add(submission_id)
-            self.submisssion_id_to_file[submission_id] = submission
-            local_results = {task: list(task_ranks).index(submission_id)+1 for task, task_ranks in ranks.items()}
-            local_results["submission_id"] = submission_id
-            results.append(local_results)
-        dataframe = pd.DataFrame.from_records(results)
-        # Reorder to have the id (model description) first
-        df_order = ["submission_id"] + [col for col in dataframe.columns if col != "submission_id"]
-        dataframe = dataframe[df_order]
-        return dataframe
-    def compute_ranks(self):
-        ''' Compute rankings on every submit '''
-        self.get_leaderboard()
-        ids = list(self.submisssion_id_to_file.keys())
-        rankings = {id: {} for id in ids}
-                rankings[modelA_id][modelB_id] = {
-                    task: data["significant"] for task, data in res.items()
-                }
-                rankings[modelB_id][modelA_id] = {
-                    task: not data["significant"] for task, data in res.items()
-                }
-        return rankings
-    def compare_models(self, modelA, modelB):
-        modelA_path = self.submisssion_id_to_file.get(modelA)
-        modelB_path = self.submisssion_id_to_file.get(modelB)
-        return check_significance(modelA_path, modelB_path)
-    def get_rankings(self):
-        # TODO retrieve saved rankings for models on tasks
-        pass
-    def save_json(self, file, submission_name) -> None:
-        filename = os.path.basename(file)
-        api.upload_file(
-            path_or_fileobj=file,
-            path_in_repo=f"data/{submission_name}_{filename}",
-            repo_id=self.server_address,
-            repo_type=self.repo_type,
-            token=HF_TOKEN,
-        )
-leaderboard_server
-# if __name__ == '__main__':
-    # with gr.Row():
-    #     greet_name = gr.Textbox(label="Name")
-    #     greet_output = gr.Textbox(label="Greetings")
-    #     greet_btn = gr.Button("Greet")
-    #     greet_btn.click(fn=greet, inputs=greet_name, outputs=greet_output).success(
-    #         fn=save_json,
-    #         inputs=[greet_name, greet_output],
-    #         outputs=None,
-    #     )
-        # Leaderboards Tab #
-        ####################
-        def populate_leaderboard(leaderboard_type, dataset_version):
-            gr.Info('Loading leaderboard...')
-            time.sleep(1)
-            leaderboard_df = leaderboard_server.get_leaderboard()
-            # leaderboard_df = lb_server.get_leaderboard(
-            #     submission_type=leaderboard_type, dataset_version=dataset_version)
-            # if leaderboard_df.empty:
-            return leaderboard_df
-            # return leaderboard_df
-        def create_leaderboard_tab(tab_name: str, idx: int, dataset_version_dropdown: gr.Dropdown):
-            # dataset_version = dataset_version_dropdown.value
-            print(f'Creating tab for {tab_name}, idx={idx}, dataset_version={dataset_version_dropdown}')
-            with gr.Tab(id=tab_name, label=tab_name) as leaderboard_tab:
-                leaderboard_table = gr.DataFrame(populate_leaderboard(tab_name, None)) if idx == 0 \
-                    else gr.DataFrame(pd.DataFrame(columns=['No submissions yet']))
-                leaderboard_tab.select(fn=populate_leaderboard,
-                                       inputs=[gr.Text(tab_name, visible=False)],
-                                       outputs=[leaderboard_table])
-            return leaderboard_table
-        def on_dropdown_change():
-            first_tab_name = LEADERBOARD_TYPES[0]
-            leaderboard_server.on_submit()
-            return gr.Tabs(selected=first_tab_name), populate_leaderboard(first_tab_name, None)
-            # with gr.Column():
-            #     dataset_version_drop = gr.Dropdown(choices=DATASET_VERSIONS, multiselect=False,
-            #                                        value=DATASET_VERSIONS[-1], label="Dataset",
-            #                                        interactive=True)
-            # with gr.Column():
-            #     gr.Markdown('')  # Empty column for spacing
-            # with gr.Column():
-            #     gr.Markdown('')  # Empty column for spacing
-            # with gr.Column():
-            #     gr.Markdown('')  # Empty column for spacing
-            # with gr.Row():
-            # with gr.Tabs() as leaderboards_tabs:
-            #     leaderboard_tables_list = []
-            #     for leaderboard_idx, leaderboard_type in enumerate(LEADERBOARD_TYPES):
-            #         l_tab = create_leaderboard_tab(leaderboard_type, leaderboard_idx, None)
-            #         leaderboard_tables_list.append(l_tab)
-            # change the table based on the selected model
-            def on_dropdown_change(model_detail):
-                leaderboard = leaderboard_server.get_leaderboard()
-                return leaderboard[leaderboard["submission_id"] == model_detail]
-            results_table = gr.DataFrame(leaderboard_server.get_leaderboard(), interactive=False, label=None, visible=True)
-            model_detail = gr.Dropdown(choices=list(leaderboard_server.get_leaderboard()["submission_id"]), label="Select model", interactive=True)
-            model_detail_button = gr.Button("Show model detail", interactive=True)
-            model_detail_button.click(
-                fn=on_dropdown_change,
-                inputs=[model_detail],
-                outputs=[results_table]
-            )
-            # results_table.select(fn=on_dropdown_change, inputs=[model_detail], outputs=[results_table])
-            # dataset_version_drop.select(fn=on_dropdown_change, inputs=[dataset_version_drop],
-            #                             outputs=[leaderboards_tabs, leaderboard_tables_list[0]])
-        ##################
-        # Submission Tab #
-        ##################
-                if not team_name or not submission_zip or not submission_type:
-                    raise ValueError('Please fill in all fields')
-                if not os.path.exists(submission_zip):
-                    raise ValueError('File does not exist')
-                # if not submission_zip.endswith('.zip'):
-                #     raise ValueError('File must be a zip')
-                # if not token:
-                #     raise ValueError('Please insert a valid Hugging Face token')
-            def process_submission(team_name, submission, submission_type, description,
-                                   app_state, request: gr.Request):
-                logging.info(f'{team_name}: new submission for track: {submission_type}')
-                try:
-                    token = app_state.get('hf_token')
-                    validate_submission_inputs(team_name, submission, submission_type, token)
-                except ValueError as err:
-                    gr.Warning(str(err))
-                    return
-                # metadata = {'challenge_name': CHALLENGE_NAME,
-                #             "dataset_version": DATASET_VERSIONS[-1],
-                #             'team_name': team_name,
-                #             'submission_type': submission_type,
-                #             'description': description,
-                #             'token': token,
-                #             'file_name': os.path.basename(submission_zip),
-                #             'file_size_mb': os.path.getsize(submission_zip) / 1024 / 1024,
-                #             'ip': request.client.host}
-                leaderboard_server.save_json(submission, team_name)
-                try:
-                    gr.Info('Processing submission...')
-                    # response = lb_server.add_submission(token=token, file_path=submission_zip, metadata=metadata)
-                    # if 'error' in response:
-                    #     gr.Warning(f'Failed to process submission - {response["error"]}')
-                    # else:
-                    gr.Info('Done processing submission')
-                except Exception as e:
-                    gr.Warning(f'Submission failed to upload - {e}')
-            def on_submit_done():
-                on_dropdown_change()
-                leaderboard_server.on_submit()
-                # leaderboard_tab.children[0] = gr.DataFrame(populate_leaderboard(None, None))
-                # leaderboard_tab.render()
-                return gr.update(value='Submit', interactive=True)
-            def show_leaderboard():
-                gr.Info("Loding leaderboard...")
-                return leaderboard_server.get_leaderboard()
-            gr.Markdown(
-                """
-                # Model submission
-                Model can be compared with other models and submitted\n
-                Click **Compare results** to compare your model with other models in the leaderboard\n
-                Click **Submit results** to submit your model to the leaderboard
-                (Comparison by itself is not a submission)
-                """
-            )
-            submission_team_name_tb = gr.Textbox(label='Team Name')
-            # submission_type_radio = gr.Radio(label='Submission Track', choices=LEADERBOARD_TYPES)
-            with gr.Row():
-                hf_token_tb = gr.Textbox(label='Token', type='password')
-                submissions_24h_txt = gr.Textbox(label='Submissions 24h', value='')
-            # Button that triggers shows the current leaderboard
-            show_results_button = gr.Button("Compare results", interactive=True)
-            show_results_button.click(
-                fn=show_leaderboard,
-                outputs=[compare_results_button]
-            )
-            submission_btn = gr.Button(value='Submit results', interactive=True)
-            submission_btn.click(
-                fn=on_submit_pressed,
-                outputs=[submission_btn]
-            ).then(
-                fn=process_submission,
-                inputs=[submission_team_name_tb, submission_file_path, description_tb, app_state]
-            ).then(
-                fn=on_submit_done,
-                outputs=[submission_btn]
-            )
-            # .then(
-            #     fn=on_dropdown_change,
-            #     outputs=[leaderboards_tabs, leaderboard_tables_list[0]]
-            # )
-        # return pd.DataFrame(columns=['Please insert your Hugging Face token'])
-        # # submissions = lb_server.get_submissions_by_hf_token(hf_token=hf_token)
-        # # if submissions.empty:
-        # #     submissions = pd.DataFrame(columns=['No submissions yet'])
-        # # return submissions
-        #
-        # gr.Markdown(MY_SUBMISSIONS_TAB_TITLE_MARKDOWN)
-        # my_submissions_table = gr.DataFrame()
-        #
-        # my_submissions_tab.select(fn=on_my_submissions_tab_select, inputs=[app_state],
-        #                           outputs=[my_submissions_table])
-        # my_submissions_token_tb = gr.Textbox(label='Token', type='password')
-        def on_token_insert(hf_token, app_state):
-            gr.Info(f'Verifying token...')
-                # Invalid token
-                app_state['hf_token'] = None
-                submissions_24h_str = ''
-                team_submissions_df = pd.DataFrame(columns=['Invalid Token'])
-                gr.Warning('Invalid token')
-            # submissions_24h_str = f'{submission_count}/{MAX_SUBMISSIONS_PER_24H}'
-            # team_submissions_df = lb_server.get_submissions_by_hf_token(hf_token=hf_token)
-            # if team_submissions_df.empty:
-            #     team_submissions_df = pd.DataFrame(columns=['No submissions yet'])
-            # gr.Info('Token verified!')

New version (added lines marked "+"):
 import os
 
 import gradio as gr
+import pandas as pd
 from gradio.themes.utils.sizes import text_md
 
+from content import (HEADER_MARKDOWN, LEADERBOARD_TAB_TITLE_MARKDOWN, SUBMISSION_TAB_TITLE_MARKDOWN)
+from server import LeaderboardServer
 
+leaderboard_server = LeaderboardServer()
 
 
+def on_submit_pressed():
+    return gr.update(value='Processing submission...', interactive=False)
 
 
+def validate_submission_inputs(team_name, submission_id, link_to_model, submission_file):
+    if not team_name or not submission_id or not link_to_model or not submission_file:
+        raise ValueError('Please fill in all fields')
+    if not os.path.exists(submission_file):
+        raise ValueError('File does not exist')
 
 
+def process_submission(team_name, submission_id, description, link_to_model, submission_file):
+    try:
+        validate_submission_inputs(team_name, submission_id, link_to_model, submission_file)
+        metadata = {
+            "team_name": team_name,
+            "submission_id": submission_id,
+            "description": description,
+            "link_to_model": link_to_model,
+        }
+        gr.Info('Submission valid, running local tournament...')
 
+        leaderboard_server.prepare_model_for_submission(submission_file, metadata)
+    except ValueError as err:
+        gr.Warning(str(err))
+    return gr.update(visible=False), gr.update(visible=True), gr.update(interactive=True,
+                                                                        visible=True), gr.update(
+        interactive=True, visible=True), gr.update(visible=True), gr.update(
+        value=leaderboard_server.get_leaderboard(leaderboard_server.pre_submit[0]), visible=True)
 
 
+def submit_results():
+    leaderboard_server.save_pre_submit()
+    leaderboard_server.update_leaderboard()
+    gr.Info('Submission successful!')
+    return gr.update(value='Pre-submit model', visible=True, interactive=True), gr.update(
+        visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(
+        visible=False), gr.update(visible=False), gr.DataFrame(
+        value=leaderboard_server.get_leaderboard(), visible=True)
 
 
+def erase_presubmit():
+    leaderboard_server.pre_submit = None
+    return gr.update(value='Pre-submit model', visible=True, interactive=True), gr.update(
+        visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(
+        visible=False), gr.update(visible=False)
 
 
 with (gr.Blocks(theme=gr.themes.Soft(text_size=text_md), css="footer {visibility: hidden}") as main):
     app_state = gr.State({})
 
     with gr.Row():
         with gr.Row():
             gr.Markdown(HEADER_MARKDOWN)
 
     with gr.Row():
         with gr.Tab('Leaderboard') as leaderboards_tab:
+            gr.Markdown(LEADERBOARD_TAB_TITLE_MARKDOWN)
+            results_table = gr.DataFrame(leaderboard_server.get_leaderboard(), interactive=False, label=None,
+                                         visible=True)
 
         with gr.Tab('Submission'):
             with gr.Column():
+                gr.Markdown(SUBMISSION_TAB_TITLE_MARKDOWN)
+                with gr.Row():
+                    submission_team_name_tb = gr.Textbox(label='Team Name')
+                    submission_id_tb = gr.Textbox(label='Submission ID')
 
                 with gr.Row():
                     description_tb = gr.Textbox(label='Description', type='text')
                     link_to_model_tb = gr.Textbox(label='Link to model', type='text')
 
                 submission_file_path = gr.File(label='Upload your results', type='filepath')
+                pre_submission_btn = gr.Button(value='Pre-submit model', interactive=True)
 
+                submit_prompt = gr.Markdown(
+                    """
+                    Do you really want to submit a model? This action is irreversible.
+                    """,
+                    visible=False
+                )
 
+                pre_submit_info = gr.Markdown(
+                    """
+                    This is how the ranking will look after your submission:
+                    """,
+                    visible=False
+                )
 
+                pre_submit_table = gr.DataFrame(pd.DataFrame(), interactive=False, label=None, visible=False)
 
+                submission_btn_yes = gr.Button(value='Submit model', interactive=False, visible=False)
+                submission_btn_no = gr.Button(value='Reverse process', interactive=False, visible=False)
 
+                pre_submission_btn.click(
+                    fn=on_submit_pressed,
+                    outputs=[pre_submission_btn]
+                ).then(
+                    fn=process_submission,
+                    inputs=[submission_team_name_tb, submission_id_tb, description_tb, link_to_model_tb,
+                            submission_file_path],
+                    outputs=[pre_submission_btn, submit_prompt, submission_btn_yes, submission_btn_no, pre_submit_info,
+                             pre_submit_table]
+                )
 
+                submission_btn_yes.click(
+                    fn=submit_results,
+                    outputs=[pre_submission_btn, submission_btn_yes, submission_btn_no, submit_prompt, pre_submit_info,
+                             pre_submit_table, results_table]
+                )
+                submission_btn_no.click(
+                    fn=erase_presubmit,
+                    outputs=[pre_submission_btn, submission_btn_yes, submission_btn_no, submit_prompt, pre_submit_info,
+                             pre_submit_table]
+                )
 
 main.launch()
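The new submission flow above relies on Gradio's gr.update() to flip component visibility between the pre-submit, confirm, and cancel steps. A minimal, self-contained sketch of that two-step confirm pattern (the labels and handler names here are hypothetical, not the Space's own components):

import gradio as gr

with gr.Blocks() as demo:
    # Confirmation widgets start hidden, like submit_prompt and the yes/no buttons above.
    prompt = gr.Markdown("Do you really want to submit?", visible=False)
    pre_btn = gr.Button("Pre-submit")
    yes_btn = gr.Button("Submit", visible=False)
    no_btn = gr.Button("Cancel", visible=False)

    def show_confirm():
        # Reveal the prompt and the confirm/cancel buttons.
        return gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)

    def hide_confirm():
        # Hide everything again, e.g. after cancelling.
        return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)

    pre_btn.click(fn=show_confirm, outputs=[prompt, yes_btn, no_btn])
    no_btn.click(fn=hide_confirm, outputs=[prompt, yes_btn, no_btn])

demo.launch()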
compare_significance.py CHANGED

Old version (removed lines):
@@ -3,20 +3,18 @@ import json
-import numpy
-# from leaderboard import SUPPORTED_METRICS
@@ -44,43 +42,70 @@ def _get_CMs(i, probabilities, references, thresholds):
-    scores_B = [1 if pred == ref else 0 for pred, ref in zip(predsB, referencesB)]
-    delta = np.mean(scores_A) - np.mean(scores_B)
-    model_A_scores = get_mc_auc_samples(probsA, referencesA, Nsamples=
-    model_B_scores = get_mc_auc_samples(probsB, referencesB, Nsamples=
-    delta = np.mean(model_A_scores) - np.mean(model_B_scores)
@@ -93,23 +118,24 @@ def get_mc_auc_samples(probs, references, Nsamples=1_000_000):
-        tpr_variates_for_each_fpr = []
-        for k in range(len(thresholds[i])):
-        tpr_variates_for_each_fpr = np.array(tpr_variates_for_each_fpr)
-        for tpr_variates in
-                             desc=f"Computing AUCs for class {i + 1}/{len(n_classes)}"):
@@ -160,32 +195,33 @@ def check_significance_task(fileA, fileB, task, significance_level=0.05):
-        raise NotImplementedError("Exact match is not supported yet.")
-    elif metricA == "rouge":
-        raise NotImplementedError("Rouge is not supported yet.")
-    elif metricA == "ppl":
-        raise NotImplementedError("Perplexity is not supported yet.")
-    dataA
-    dataB
@@ -195,37 +231,38 @@ def check_significance(fileA, fileB, significance_level=0.05):
-        elif metricA
-            p_value, delta =
-        elif metricA == "rouge":
-            raise NotImplementedError("Rouge is not supported yet.")
-        elif metricA == "ppl":
-            raise NotImplementedError("Perplexity is not supported yet.")
-    parser.add_argument("--modelA", help="ModelA
-    parser.add_argument("--modelB", help="ModelB

New version (added lines marked "+"):
@@ -3,20 +3,18 @@
 from collections import defaultdict
 from typing import Sequence
 
 import numpy as np
+from numba import njit, prange
+from scipy.stats import ttest_rel
 from sklearn.metrics import roc_curve, auc
 from tqdm import tqdm
 
 SUPPORTED_METRICS = [
     "avg_mcauroc",  # for classification tasks
+    "exact_match",  # for QA tasks
     "acc",  # for multichoice tasks
+    "rouge_raw_r2_mid_f",  # for summarization tasks
+    "word_perplexity",  # for language modeling tasks
 ]
 
@@ -44,43 +42,70 @@
     return confusion_matrices
 
 
+def compute_significance_ttest(scores_A, scores_B):
+    delta = np.mean(scores_A) - np.mean(scores_B)
+    if delta <= 0:
+        return 1.0, delta
     t, p = ttest_rel(scores_A, scores_B)
     # correct for one-tailed test
     p_value = p / 2
     return p_value, delta
+
+
+@njit(parallel=True)
+def compute_significance_bootstrap(scores_A, scores_B):
+    n = len(scores_A)
+    R = 1_000
+    delta_orig = np.mean(scores_A) - np.mean(scores_B)
+
+    if delta_orig <= 0:
+        return 1.0, delta_orig
+    r = 0
+    for _ in prange(R):
+        samples = np.random.choice(n, n, replace=True)
+        temp_A = scores_A[samples]
+        temp_B = scores_B[samples]
+        delta = np.mean(temp_A) - np.mean(temp_B)
+        if delta > 2 * delta_orig:
+            r += 1
+
+    pval = r / R
+    return pval, delta_orig
+
+
 def compute_significance_avg_mcauroc(probsA: Sequence[Sequence[float]], referencesA: Sequence[int],
                                      probsB: Sequence[Sequence[float]], referencesB: Sequence[int]):
     # compute MC-AUC for model A
+    model_A_scores = get_mc_auc_samples(probsA, referencesA, Nsamples=100)
+    model_B_scores = get_mc_auc_samples(probsB, referencesB, Nsamples=100)
+    delta = np.mean(model_A_scores) - np.mean(model_B_scores)
 
     # one-tailed test
     p_value = ((model_A_scores[:, np.newaxis] <= model_B_scores[np.newaxis, :]).sum()
                / (len(model_A_scores) * len(model_B_scores)))
 
     return p_value, delta
 
 
+# Helper function to convert confusion matrices to numba-compatible arrays
+def convert_confusion_matrices(confusion_matrices):
+    num_thresholds = len(confusion_matrices)
+    tp = np.empty(num_thresholds)
+    fn = np.empty(num_thresholds)
+    for k in range(num_thresholds):
+        tp[k] = confusion_matrices[k]["TP"]
+        fn[k] = confusion_matrices[k]["FN"]
+    return tp, fn
+
+
+@njit(parallel=True)
+def compute_tpr_variates(tp, fn, λ, Nsamples, num_thresholds):
+    tpr_variates_for_each_fpr = np.empty((num_thresholds, Nsamples))
+    for k in prange(num_thresholds):
+        tpr_variates_for_each_fpr[k, :] = np.random.beta(tp[k] + λ, fn[k] + λ, Nsamples)
+    return tpr_variates_for_each_fpr
+
+
 def get_mc_auc_samples(probs, references, Nsamples=1_000_000):
     n_classes = list(range(len(probs[0])))
     fpr = dict()
@@ -93,23 +118,24 @@
                                           y_score=[prob[i] for prob in probs])
 
         confusion_matrices = _get_CMs(i, probs, references, thresholds)
+        tp, fn = convert_confusion_matrices(confusion_matrices)
 
         λ = 1.0  # <- Flat prior
         # λ = 0.5  # <- Jeffrey's prior
 
         # sample variates for every threshold
+        # tpr_variates_for_each_fpr = []
+        # for k in range(len(thresholds[i])):
+        #     tpr_variates_for_each_fpr.append(
+        #         numpy.random.beta(confusion_matrices[k]["TP"] + λ, confusion_matrices[k]["FN"] + λ, Nsamples))
+        tpr_variates_for_each_fpr = compute_tpr_variates(tp, fn, λ, Nsamples, len(thresholds[i]))
 
         # fprs x tpr_variates
+        # tpr_variates_for_each_fpr = np.array(tpr_variates_for_each_fpr)
 
         # now pick 1 variate for each fpr, and compute AUC
         auc_scores = []
+        for tpr_variates in tpr_variates_for_each_fpr.T:
            auc_score = auc(fpr[i], tpr_variates)
            # if numpy.isnan(auc_score):
            #     auc_score = 0
@@ -141,18 +167,27 @@
             golds = unzipped_list[0]
             probs = unzipped_list[1]
             data[task] = (golds, probs), metric
+        else:
+            scores = [line[metric] for line in fc["predictions"][task]]
+            data[task] = scores, metric
+
+    # make sure all tasks are submitted
+    METADATA_FILE = "tasks_metadata.json"
+    with open(METADATA_FILE, "r") as f:
+        metadata = json.load(f)
+
+    all_tasks = list(metadata["tasks"].keys())
+    all_missing_tasks = []
+    for task in all_tasks:
+        if task not in data:
+            all_missing_tasks.append(task)
+    if len(all_missing_tasks) > 0:
+        EOLN = "\n"
+        raise ValueError(f"Missing tasks in {file_path}: {EOLN.join(all_missing_tasks)}")
+    return data
+
+
+def process_task(task, dataA, dataB, significance_level):
     metricA = dataA[task][1]
     metricB = dataB[task][1]
     assert metricA == metricB
@@ -160,32 +195,33 @@
 
     if metricA == "avg_mcauroc":
         p_value, delta = compute_significance_avg_mcauroc(probsA=dataA[task][0][1], referencesA=dataA[task][0][0],
+                                                          probsB=dataB[task][0][1], referencesB=dataB[task][0][0])
+    elif metricA in ["acc", "exact_match"]:
+        p_value, delta = compute_significance_ttest(scores_A=dataA[task][0], scores_B=dataB[task][0])
+    elif metricA in ["rouge_raw_r2_mid_f", "word_perplexity"]:
+        p_value, delta = compute_significance_bootstrap(scores_A=np.array(dataA[task][0]),
+                                                        scores_B=np.array(dataB[task][0]))
     else:
         raise ValueError(f"Unsupported metric {metricA}")
+
+    if delta <= 0:
+        p_value = 1.0
+
+    return task, {
         "significant": not (p_value > significance_level),
        "p_value": p_value,
        "delta": delta,
    }
+
 
 def check_significance(fileA, fileB, significance_level=0.05):
+    dataA = read_json(fileA)
+    dataB = read_json(fileB)
+
     decisions = dict()
+    _iter = tqdm(list(dataA.keys()))
+    for task in _iter:
+        _iter.set_description(f"Processing task: {task}")
         metricA = dataA[task][1]
         metricB = dataB[task][1]
         assert metricA == metricB
@@ -195,37 +231,38 @@
             p_value, delta = compute_significance_avg_mcauroc(probsA=dataA[task][0][1], referencesA=dataA[task][0][0],
                                                               probsB=dataB[task][0][1], referencesB=dataB[task][0][0])
 
+        elif metricA in ["acc", "exact_match"]:
+            p_value, delta = compute_significance_ttest(scores_A=dataA[task][0], scores_B=dataB[task][0])
+        elif metricA in ["rouge_raw_r2_mid_f", "word_perplexity"]:
+            p_value, delta = compute_significance_bootstrap(scores_A=np.array(dataA[task][0]),
+                                                            scores_B=np.array(dataB[task][0]))
         else:
             raise ValueError(f"Unsupported metric {metricA}")
+        if delta <= 0:
+            p_value = 1.0
         decisions[task] = {
             "significant": not (p_value > significance_level),
             "p_value": p_value,
             "delta": delta,
         }
+
     return decisions
 
 
 def main():
     parser = argparse.ArgumentParser(description="One-tailed test if model A improves over model B.")
+    parser.add_argument("--modelA", help="ModelA JSON file from lm harness.")
+    parser.add_argument("--modelB", help="ModelB JSON file from lm harness.")
     parser.add_argument("--significance_level", type=float, default=0.05, help="Significance level (e.g., 0.05)")
     args = parser.parse_args()
 
     result = check_significance(args.modelA, args.modelB, args.significance_level)
     print(json.dumps(result, indent=2))
 
+
 # harness already returns stderr estimate for sampling distribution
 # see https://github.com/EleutherAI/lm-evaluation-harness/blob/6433bd3fe3033d302b22cdcd53af237e9039ef29/lm_eval/api/metrics.py#L213
 
 if __name__ == "__main__":
+    check_significance("../csmpt.json", "../llama3_instruct.json", 0.05)
     main()
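Besides the --modelA/--modelB CLI shown in main(), check_significance can be driven directly from Python; the file names below are placeholders for two lm-evaluation-harness result dumps:

from compare_significance import check_significance

decisions = check_significance("results_modelA.json", "results_modelB.json", significance_level=0.05)
for task, result in decisions.items():
    # Each entry has the shape produced above: {"significant": bool, "p_value": float, "delta": float}.
    print(f"{task}: A beats B = {result['significant']} (p={result['p_value']:.3f}, delta={result['delta']:.3f})")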
content.py CHANGED
@@ -2,55 +2,27 @@

Old version (removed lines):
-#
-# For details, visit:
-# 1. [DASR](https://www.chimechallenge.org/current/task1/index)
-# 2. [NOTSOFAR](https://www.chimechallenge.org/current/task2/index)
-# 3. [MMCSG](https://www.chimechallenge.org/current/task3/index)
-#
-#
-# ### DASR and NOTSOFAR - the scientific story
-# Both tasks focus on distant automatic speech recognition and speaker diarization, offering a fundamental comparison
-# among different system designs:
-# - Single-channel (SC), 1 device (NOTSOFAR-SC)
-# - Multi-channel (MC), known-geometry, 1 device (NOTSOFAR-MC)
-# - Multi-channel (MC), geometry-agnostic, multiple devices (DASR-Constrained-LM and DASR-Unconstrained-LM)
-#
-# Featured in both tasks, the NOTSOFAR recorded meeting dataset is leveraged as a common benchmark:
-# each geometry-agnostic MC system submitted to DASR tracks (constrained or not) will also be **automatically submitted**
-# to the known-geometry single-device NOTSOFAR-MC track. These entries will be marked with "DASR" to denote their origin.
-# """
-HEADER_MARKDOWN = """ """
-- *Team Name:* The name of your team, as it will appear on the leaderboard'
-- *Results:* Results zip file to submit
-- *Submission track:* The track to submit results to
-- *Token:* Your Hugging Face token
-**Team creation:** Upon the first submission, your team name is associated with your Hugging Face user account.
-Any token generated by your account can be used. All team members should use this specific user's token for
-future submissions.
-**Submission limit:** 5 submissions per team every 24 hours. Each participant should only belong to one team.
-Changing team names is allowed, but it is not intended to bypass the daily submission limit.
-"""
-SUBMISSION_TAB_TITLE_MARKDOWN = """

New version (added lines marked "+"):
 This file contains the text content for the leaderboard client.
 """
 
+HEADER_MARKDOWN = """
+# BenCzechMark
+Welcome to the leaderboard! Here you can submit your model and compare it with the existing models.
+"""
 
 LEADERBOARD_TAB_TITLE_MARKDOWN = """
+## Leaderboard
+The leaderboard below shows the current ranking of the models...
+
+"""
 
 SUBMISSION_TAB_TITLE_MARKDOWN = """
 ## Submission
+To submit your model, please fill in the form below.
 
+- *Team name:* The name of your team, as it will appear on the leaderboard
+- *Submission ID:* Results JSON file to submit
 - *Description:* Short description of your submission (optional)
+- *Link to model:* Link to the model's repository or documentation
 
+After filling in the form, click the **Pre-submit model** button.
+This will run a comparison of your model with the existing leaderboard models.
+After the tournament is complete, you will be able to submit your model to the leaderboard.
 """
model_compare.py CHANGED
@@ -1,62 +1,34 @@

Old version (removed lines):
-class ModelCompare
-        "propaganda_relativizace",
-        "propaganda_argumentace",
-        "propaganda_lokace",
-        "propaganda_nazor",
-        "propaganda_emoce",
-        "propaganda_fabulace",
-        "propaganda_nalepkovani",
-        "propaganda_zamereni",
-        "propaganda_zanr",
-        "propaganda_rusko",
-        "propaganda_strach",
-        "benczechmark_sentiment"]
-    def __init__(self, ranks:dict=None):
-    def compare_models(self,
-        res = self.ranks[
-        if res
-        elif res
-    def get_tasks_ranks(self, ranks:dict) -> dict:
-        '''Order models based on the significance improvement'''
-        for task in self.
-# models = {
-#   model1 : {
-#     task1 : order_idx
-#     task2 : order_idx
-#     task3 : order_idx
-#   }
-# }

New version (added lines marked "+"):
 from functools import cmp_to_key
+
+
+class ModelCompare:
+
+    def __init__(self, tasks, ranks: dict = None):
+        self.current_task = None
         self.ranks = ranks
+        self.tasks = tasks
 
+    def compare_models(self, model_a, model_b):
         if not self.ranks:
             raise Exception("Missing model rankings")
+
+        res = self.ranks[model_a][model_b][self.current_task]
+        if res:
             return 1
+        elif not res:
             return -1
         else:
             return -1
 
+    def get_tasks_ranks(self, ranks: dict) -> dict:
+        """Order models based on the significance improvement"""
 
         self.ranks = ranks
+
         tasks_ranks = {}
+
         models = ranks.keys()
+        for task in self.tasks:
             self.current_task = task
             tasks_ranks[task] = sorted(models, key=cmp_to_key(self.compare_models))
         return tasks_ranks
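A toy illustration of the nested structure get_tasks_ranks consumes (model and task names here are made up; the real dictionaries are produced by the tournament in server.py): ranks[model_a][model_b][task] holds the significance decision of model_a compared against model_b.

from model_compare import ModelCompare

ranks = {
    "team1_model": {"team1_model": {"taskX": False}, "team2_model": {"taskX": True}},
    "team2_model": {"team1_model": {"taskX": False}, "team2_model": {"taskX": False}},
}

comparer = ModelCompare(tasks=["taskX"])
# With this toy input the comparator orders the models as ['team2_model', 'team1_model'] for taskX.
print(comparer.get_tasks_ranks(ranks))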
server.py ADDED
@@ -0,0 +1,144 @@
import copy
import glob
import json
import os

import gradio as gr
import pandas as pd
from huggingface_hub import HfApi, snapshot_download

from compare_significance import check_significance
from model_compare import ModelCompare

api = HfApi()

ORG = "CZLC"
REPO = f"{ORG}/LLM_benchmark_data"
HF_TOKEN = os.environ.get("HF_TOKEN")
TASKS_METADATA_PATH = "./tasks_metadata.json"


class LeaderboardServer:
    def __init__(self):
        self.server_address = REPO
        self.repo_type = "dataset"
        self.local_leaderboard = snapshot_download(self.server_address, repo_type=self.repo_type, token=HF_TOKEN,
                                                   local_dir="./")
        self.submisssion_id_to_file = {}  # Map submission ids to file paths
        self.tasks_metadata = json.load(open(TASKS_METADATA_PATH))['tasks']
        self.submission_ids = set()
        self.comparer = ModelCompare(self.tasks_metadata.keys())
        self.fetch_existing_models()
        self.tournament_results = self.load_tournament_results()
        self.pre_submit = None

    def update_leaderboard(self):
        self.local_leaderboard = snapshot_download(self.server_address, repo_type=self.repo_type, token=HF_TOKEN,
                                                   local_dir="./")
        self.fetch_existing_models()
        self.tournament_results = self.load_tournament_results()

    def load_tournament_results(self):
        metadata_rank_paths = os.path.join(self.local_leaderboard, "tournament.json")
        if not os.path.exists(metadata_rank_paths):
            return {}
        with open(metadata_rank_paths) as ranks_file:
            results = json.load(ranks_file)
        return results

    def fetch_existing_models(self):
        # Models data
        for submission in glob.glob(os.path.join(self.local_leaderboard, "data") + "/*.json"):
            data = json.load(open(submission))
            metadata = data.get('metadata')
            if metadata is None:
                continue
            submission_id = metadata["team_name"] + "_" + metadata["submission_id"]
            self.submission_ids.add(submission_id)

            self.submisssion_id_to_file[submission_id] = submission

    def get_leaderboard(self, tournament_results=None):
        rank_based_on = tournament_results if tournament_results else self.tournament_results

        if len(rank_based_on) == 0:
            return pd.DataFrame(columns=['No submissions yet'])
        else:
            ranks = self.comparer.get_tasks_ranks(rank_based_on)
            results = []
            for submission in rank_based_on.keys():
                path = self.submisssion_id_to_file.get(submission)
                if path is None:
                    if self.pre_submit and submission == self.pre_submit[1]:
                        data = json.load(open(self.pre_submit[2]))
                    else:
                        raise gr.Error(f"Internal error: Submission [{submission}] not found")
                elif path:
                    data = json.load(open(path))
                else:
                    raise gr.Error(f"Submission [{submission}] not found")
                submission_id = data["metadata"]["team_name"] + "_" + data["metadata"]["submission_id"]

                local_results = {task: list(task_ranks).index(submission_id) + 1 for task, task_ranks in ranks.items()}
                local_results["submission_id"] = submission_id
                if self.pre_submit and submission == self.pre_submit[1]:
                    results.insert(0, local_results)
                else:
                    results.append(local_results)
            dataframe = pd.DataFrame.from_records(results)
            df_order = ["submission_id"] + [col for col in dataframe.columns if col != "submission_id"]
            dataframe = dataframe[df_order]
            dataframe = dataframe.rename(columns={key: value["name"] for key, value in self.tasks_metadata.items()})
            return dataframe

    def start_tournament(self, new_model_id, new_model_file):
        new_tournament = copy.deepcopy(self.tournament_results)
        new_tournament[new_model_id] = {}
        new_tournament[new_model_id][new_model_id] = {task: False for task in self.tasks_metadata.keys()}

        for model in self.submission_ids:
            res = check_significance(new_model_file, self.submisssion_id_to_file[model])
            res_inverse = check_significance(self.submisssion_id_to_file[model], new_model_file)
            new_tournament[new_model_id][model] = {
                task: data["significant"] for task, data in res.items()
            }
            new_tournament[model][new_model_id] = {
                task: data["significant"] for task, data in res_inverse.items()
            }
        return new_tournament

    def prepare_model_for_submission(self, file, metadata) -> None:
        with open(file, "r") as f:
            data = json.load(f)
        data["metadata"] = metadata
        with open(file, "w") as f:
            json.dump(data, f)

        model_id = metadata["team_name"] + "_" + metadata["submission_id"]
        tournament_results = self.start_tournament(model_id, file)
        self.pre_submit = tournament_results, model_id, file

    def save_pre_submit(self):
        if self.pre_submit:
            tournament_results, model_id, file = self.pre_submit
            filename = os.path.basename(file)
            api.upload_file(
                path_or_fileobj=file,
                path_in_repo=f"data/{model_id}_{filename}",
                repo_id=self.server_address,
                repo_type=self.repo_type,
                token=HF_TOKEN,
            )

            # Temporary save tournament results
            tournament_results_path = os.path.join(self.local_leaderboard, "tournament.json")
            with open(tournament_results_path, "w") as f:
                json.dump(tournament_results, f)

            api.upload_file(
                path_or_fileobj=tournament_results_path,
                path_in_repo="tournament.json",
                repo_id=self.server_address,
                repo_type=self.repo_type,
                token=HF_TOKEN,
            )
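For reference, the tournament.json file written by save_pre_submit is the nested mapping built in start_tournament; the submission ids and the task below are made up:

# tournament_results[submission][opponent][task] -> was `submission` significantly better than `opponent`?
tournament_results = {
    "teamA_run1": {
        "teamA_run1": {"benczechmark_summarization": False},  # a new model is initialised as not beating itself
        "teamB_run1": {"benczechmark_summarization": True},
    },
    "teamB_run1": {
        "teamA_run1": {"benczechmark_summarization": False},
    },
}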
tasks_metadata.json ADDED
@@ -0,0 +1,204 @@
{
  "tasks": {
    "benczechmark_propaganda_argumentace": {"task": "benczechmark_propaganda_argumentace", "name": "P-Argumentace", "source_url": "https://huggingface.co/datasets/CZLC/propaganda_argumentace"},
    "benczechmark_propaganda_fabulace": {"task": "benczechmark_propaganda_fabulace", "name": "P-Fabulace", "source_url": "https://huggingface.co/datasets/CZLC/propaganda_fabulace"},
    "benczechmark_propaganda_nazor": {"task": "benczechmark_propaganda_nazor", "name": "P-Názor", "source_url": "https://huggingface.co/datasets/CZLC/propaganda_nazor"},
    "benczechmark_propaganda_strach": {"task": "benczechmark_propaganda_strach", "name": "P-Strach", "source_url": "https://huggingface.co/datasets/CZLC/propaganda_strach"},
    "benczechmark_propaganda_zamereni": {"task": "benczechmark_propaganda_zamereni", "name": "P-Zaměření", "source_url": "https://huggingface.co/datasets/CZLC/propaganda_zamereni"},
    "benczechmark_propaganda_demonizace": {"task": "benczechmark_propaganda_demonizace", "name": "P-Demonizace", "source_url": "https://huggingface.co/datasets/CZLC/propaganda_demonizace"},
    "benczechmark_propaganda_lokace": {"task": "benczechmark_propaganda_lokace", "name": "P-Lokace", "source_url": "https://huggingface.co/datasets/CZLC/propaganda_lokace"},
    "benczechmark_propaganda_relativizace": {"task": "benczechmark_propaganda_relativizace", "name": "P-Relativizace", "source_url": "https://huggingface.co/datasets/CZLC/propaganda_relativizace"},
    "benczechmark_propaganda_vina": {"task": "benczechmark_propaganda_vina", "name": "P-Vina", "source_url": "https://huggingface.co/datasets/CZLC/propaganda_vina"},
    "benczechmark_propaganda_zanr": {"task": "benczechmark_propaganda_zanr", "name": "P-Žánr", "source_url": "https://huggingface.co/datasets/CZLC/propaganda_zanr"},
    "benczechmark_propaganda_emoce": {"task": "benczechmark_propaganda_emoce", "name": "P-Emoce", "source_url": "https://huggingface.co/datasets/CZLC/propaganda_emoce"},
    "benczechmark_propaganda_nalepkovani": {"task": "benczechmark_propaganda_nalepkovani", "name": "P-Nalepkování", "source_url": "https://huggingface.co/datasets/CZLC/propaganda_nalepkovani"},
    "benczechmark_propaganda_rusko": {"task": "benczechmark_propaganda_rusko", "name": "P-Rusko", "source_url": "https://huggingface.co/datasets/CZLC/propaganda_rusko"},
    "benczechmark_sentiment_mall": {"task": "benczechmark_sentiment_mall", "name": "S-Mall", "source_url": "https://huggingface.co/datasets/CZLC/mall_sentiment_balanced"},
    "benczechmark_sentiment_fb": {"task": "benczechmark_sentiment_fb", "name": "S-FB", "source_url": "https://huggingface.co/datasets/CZLC/fb_sentiment_balanced"},
    "benczechmark_sentiment_csfd": {"task": "benczechmark_sentiment_csfd", "name": "S-CSFD", "source_url": "https://huggingface.co/datasets/CZLC/csfd_sentiment_balanced"},
    "benczechmark_summarization": {"task": "benczechmark_summarization", "name": "Summarization", "source_url": "https://huggingface.co/datasets/CZLC/sumeczech_downsampled"},
    "benczechmark_grammarerrorcorrection": {"task": "benczechmark_grammarerrorcorrection", "name": "Grammar Error Correction", "source_url": "https://huggingface.co/datasets/CZLC/cs_gec"},
    "benczechmark_cs_naturalquestions": {"task": "benczechmark_cs_naturalquestions", "name": "CS Natural Questions", "source_url": "https://huggingface.co/datasets/CZLC/cs_naturalquestions"},
    "benczechmark_cs_sqad32": {"task": "benczechmark_cs_sqad32", "name": "CS SQAD 3.2", "source_url": "https://huggingface.co/datasets/CZLC/SQAD_3.2"},
    "benczechmark_cs_triviaQA": {"task": "benczechmark_cs_triviaQA", "name": "CS TriviaQA", "source_url": "https://huggingface.co/datasets/CZLC/cs_triviaqa"},
    "benczechmark_csfever_nli": {"task": "benczechmark_csfever_nli", "name": "CSFever NLI", "source_url": "https://huggingface.co/datasets/CZLC/ctu-aic/csfever_nli"},
    "benczechmark_ctkfacts_nli": {"task": "benczechmark_ctkfacts_nli", "name": "CTKFacts NLI", "source_url": "https://huggingface.co/datasets/CZLC/ctu-aic/ctkfacts_nli"},
    "benczechmark_cs_ner": {"task": "benczechmark_cs_ner", "name": "CS NER", "source_url": "https://huggingface.co/datasets/CZLC/fewshot-goes-multilingual/cs_czech-named-entity-corpus_2.0"},
    "benczechmark_hellaswag": {"task": "benczechmark_hellaswag", "name": "HellaSwag", "source_url": "https://huggingface.co/datasets/CZLC/cs_hellaswag"},
    "benczechmark_histcorpus": {"task": "benczechmark_histcorpus", "name": "HistCorpus", "source_url": "https://huggingface.co/datasets/CZLC/benczechmark_histcorpus"},
    "benczechmark_klokan_qa": {"task": "benczechmark_klokan_qa", "name": "Klokan QA", "source_url": "https://huggingface.co/datasets/hynky/klokan-qa"},
    "benczechmark_cs_court_decisions_ner": {"task": "benczechmark_cs_court_decisions_ner", "name": "CS Court Decisions NER", "source_url": "https://huggingface.co/datasets/CZLC/fewshot-goes-multilingual/cs_czech-court-decisions-ner"},
    "benczechmark_umimeto_biology": {"task": "benczechmark_umimeto_biology", "name": "Umimeto.cz - Biology", "source_url": "https://huggingface.co/datasets/CZLC/umimeto-biology"},
    "benczechmark_umimeto_chemistry": {"task": "benczechmark_umimeto_chemistry", "name": "Umimeto.cz - Chemistry", "source_url": "https://huggingface.co/datasets/CZLC/umimeto-chemistry"},
    "benczechmark_umimeto_czech": {"task": "benczechmark_umimeto_czech", "name": "Umimeto.cz - Czech", "source_url": "https://huggingface.co/datasets/CZLC/umimeto-czech"},
    "benczechmark_umimeto_history": {"task": "benczechmark_umimeto_history", "name": "Umimeto.cz - History", "source_url": "https://huggingface.co/datasets/CZLC/umimeto-history"},
    "benczechmark_umimeto_informatics": {"task": "benczechmark_umimeto_informatics", "name": "Umimeto.cz - Informatics", "source_url": "https://huggingface.co/datasets/CZLC/umimeto-informatics"},
    "benczechmark_umimeto_math": {"task": "benczechmark_umimeto_math", "name": "Umimeto.cz - Math", "source_url": "https://huggingface.co/datasets/CZLC/umimeto-math"},
    "benczechmark_umimeto_physics": {"task": "benczechmark_umimeto_physics", "name": "Umimeto.cz - Physics", "source_url": "https://huggingface.co/datasets/CZLC/umimeto-physics"},
    "benczechmark_cermat_czmath_mc": {"task": "benczechmark_cermat_czmath_mc", "name": "Cermat Czech Math MC", "source_url": "https://huggingface.co/datasets/CZLC/cermat_math_mc"},
    "benczechmark_cermat_czmath_open": {"task": "benczechmark_cermat_czmath_open", "name": "Cermat Czech Math Open", "source_url": "https://huggingface.co/datasets/CZLC/cermat_math_open"},
    "benczechmark_cermat_czech_tf": {"task": "benczechmark_cermat_czech_tf", "name": "Cermat Czech Language TF", "source_url": "https://huggingface.co/datasets/CZLC/cermat_czech_tf"},
    "benczechmark_cermat_czech_mc": {"task": "benczechmark_cermat_czech_mc", "name": "Cermat Czech Language MC", "source_url": "https://huggingface.co/datasets/CZLC/cermat_czech_mc"},
    "benczechmark_cermat_czech_open": {"task": "benczechmark_cermat_czech_open", "name": "Cermat Czech Language Open", "source_url": "https://huggingface.co/datasets/CZLC/cermat_czech_open"}
  }
}
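read_json in compare_significance.py rejects result files that do not cover every task listed here. A quick way to see what a submission must contain (assuming the snippet runs next to tasks_metadata.json):

import json

with open("tasks_metadata.json") as f:
    tasks = json.load(f)["tasks"]

# Print the task identifier, its display name, and the source dataset.
for task_id, info in tasks.items():
    print(f"{task_id} -> {info['name']} ({info['source_url']})")
print(f"{len(tasks)} tasks are required in every submission")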