from typing import List

# Maps raw column names to the pretty names displayed on the leaderboard
COLUMNS_PRETTY = {
"bleu": "BLEU",
"chrf": "ChrF",
"rouge1": "ROUGE-1",
"rouge2": "ROUGE-2",
"rougeL": "ROUGE-L",
"bertscore": "BERTScore",
"bertscore_normalized": "BERTScore (Normalized)",
"model_name": "Model Name",
"model_availability": "Availability",
"urls": "Resources",
"context_size": "Context Size",
"submitted_by": "Submitted By",
"EM infile": "EM infile",
"EM inproject": "EM inproject",
"EM common": "EM common",
"EM commited": "EM committed",
"EM non_informative": "EM non-informative",
"EM random": "EM random",
"EM all": "EM all",
"dataset": "Dataset",
"CompScore": "CompScore",
"context": "Context",
"task_type": "Task type",
}
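

# Illustrative sketch, not part of the original leaderboard code: COLUMNS_PRETTY is typically
# applied to rename the raw keys of a result row before display. The helper name `prettify_row`
# and the example row below are hypothetical.
def prettify_row(raw_row: dict) -> dict:
    """Rename raw result keys to their display names, leaving unknown keys unchanged."""
    return {COLUMNS_PRETTY.get(key, key): value for key, value in raw_row.items()}

# Example: prettify_row({"model_name": "some-model", "bleu": 12.3})
# -> {"Model Name": "some-model", "BLEU": 12.3}
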

# Metrics shown on the leaderboard for each task; add new metrics here when extending
METRICS_PER_TASK = {
"commit_message_generation": [
"BLEU",
"ChrF",
"ROUGE-1",
"ROUGE-2",
"ROUGE-L",
"BERTScore",
"BERTScore (Normalized)",
],
"project_code_completion": [
"EM infile",
"EM inproject",
"EM common",
"EM committed",
"EM non-informative",
"EM random",
"EM all",
],
"bug_localization": [
"k",
"P@k",
"R@k",
"f1-score",
],
"module_summarization": [
"CompScore",
],
"library_based_code_generation": [
"ChrF",
"API Recall",
],
"ci_builds_repair": [
"Pass@1",
],
}

# Column used to sort each task's leaderboard table
SORT_COLUMN_PER_TASK = {
"commit_message_generation": "ROUGE-1",
"project_code_completion": "EM inproject",
"bug_localization": "Model Name",
"module_summarization": "CompScore",
"library_based_code_generation": "API Recall",
"ci_builds_repair": "Pass@1",
}
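

# Illustrative sketch, not part of the original leaderboard code: SORT_COLUMN_PER_TASK names the
# column each task's table is ordered by. The helper name `sort_rows_for_task` and the
# descending-order choice for metric columns are assumptions; the real sorting happens in the app.
def sort_rows_for_task(rows: list, task_id: str) -> list:
    """Sort leaderboard rows (dicts keyed by pretty column names) by the task's sort column."""
    sort_column = SORT_COLUMN_PER_TASK[task_id]
    descending = sort_column != "Model Name"  # metric columns high-to-low, model names A-to-Z
    return sorted(rows, key=lambda row: row[sort_column], reverse=descending)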


def get_columns_per_task(task_id: str) -> List[str]:
    """Return the ordered column headers for the given task's leaderboard table."""
metrics_per_task = METRICS_PER_TASK[task_id]
    if task_id == "project_code_completion":
        return (["Model Name", "Context Size", "Dataset Name", "Dataset"] + metrics_per_task
                + ["Availability", "Submitted By", "Resources"])
    if task_id == "bug_localization":
        return (["Model Name", "Availability", "Context Size", "Dataset"] + metrics_per_task
                + ["Submitted By", "Resources"])
    if task_id == "module_summarization":
        return ["Model Name", "Context Size"] + metrics_per_task + ["Submitted By", "Resources"]
    if task_id == "library_based_code_generation":
        return (["Model Name", "Context"] + metrics_per_task
                + ["Availability", "Submitted By", "Resources"])
    if task_id == "ci_builds_repair":
        return (["Model Name", "Context Size", "Task type"] + metrics_per_task
                + ["Availability", "Submitted By", "Resources"])
    return (["Model Name", "Context Size", "Availability"] + metrics_per_task
            + ["Submitted By", "Resources"])


def get_types_per_task(task_id: str) -> List[str]:
    """Return the per-column datatypes matching the columns from get_columns_per_task."""
    # Only the length of the fallback matters: an unknown task gets five numeric metric columns.
    metrics_per_task = METRICS_PER_TASK.get(task_id, (0, 0, 0, 0, 0))
    if task_id == "project_code_completion":
        return (["html", "markdown", "markdown", "html"] + ["number" for _ in metrics_per_task]
                + ["markdown", "markdown", "html"])
    if task_id == "bug_localization":
        return (["html", "markdown", "markdown", "html"] + ["number" for _ in metrics_per_task]
                + ["markdown", "html"])
    if task_id == "module_summarization":
        # get_columns_per_task returns five columns for this task, so return five matching types.
        return ["html", "markdown"] + ["number" for _ in metrics_per_task] + ["markdown", "html"]
    if task_id == "ci_builds_repair":
        return (["html", "markdown", "markdown"] + ["number" for _ in metrics_per_task]
                + ["markdown", "markdown", "html"])
    return ["html", "markdown", "markdown"] + ["number" for _ in metrics_per_task] + ["markdown", "html"]
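

# Illustrative smoke check, not part of the original leaderboard code. Running this module
# directly prints, for each task, whether the column list and type list produced by the two
# functions above have matching lengths.
if __name__ == "__main__":
    for task in METRICS_PER_TASK:
        columns = get_columns_per_task(task)
        types = get_types_per_task(task)
        status = "OK" if len(columns) == len(types) else "MISMATCH"
        print(f"{task}: {len(columns)} columns, {len(types)} types -> {status}")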