Add metrics computation for CMG task
- app.py +1 -1
- requirements.txt +5 -1
- src/__init__.py +0 -0
- src/evaluation/__init__.py +3 -0
- src/evaluation/base_task_metrics.py +17 -0
- src/evaluation/commit_message_generation/__init__.py +3 -0
- src/evaluation/commit_message_generation/cmg_metrics.py +53 -0
- src/evaluation/metrics.py +13 -0
- src/formatting.py +12 -0
- src/get_results_for_task.py +1 -3
- src/submission_uploader.py +171 -33
app.py
CHANGED
@@ -1,6 +1,6 @@
 import os
 
-import gradio as gr
+import gradio as gr  # type: ignore[import]
 
 from src.content import (INTRODUCTION_TEXT, INTRODUCTION_TITLE,
                          LEADERBOARD_TEXT, LEADERBOARD_TITLE,
requirements.txt
CHANGED
@@ -1 +1,5 @@
-huggingface_hub
+huggingface_hub
+jsonlines
+pandas
+tqdm
+evaluate
src/__init__.py
ADDED
(empty file)
src/evaluation/__init__.py
ADDED
@@ -0,0 +1,3 @@
from .metrics import METRICS

__all__ = ["METRICS"]
src/evaluation/base_task_metrics.py
ADDED
@@ -0,0 +1,17 @@
from abc import ABC, abstractmethod
from typing import Dict, List


class BaseTaskMetrics(ABC):
    def reset(self):
        pass

    @abstractmethod
    def add_batch(
        self, predictions: List[str], references: List[str], *args, **kwargs
    ) -> None:
        pass

    @abstractmethod
    def compute(self, *args, **kwargs) -> Dict[str, float]:
        pass
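For reference, a minimal concrete subclass could look like the sketch below. ExactMatch is a hypothetical metric invented purely for illustration (it is not part of this commit), and the import path assumes the repository layout above; the sketch only shows the reset/add_batch/compute contract that the evaluation code relies on.

from typing import Dict, List

from src.evaluation.base_task_metrics import BaseTaskMetrics


class ExactMatch(BaseTaskMetrics):
    # Hypothetical metric: percentage of predictions that equal their reference.
    def __init__(self):
        self._hits = 0
        self._total = 0

    def reset(self):
        self._hits = 0
        self._total = 0

    def add_batch(
        self, predictions: List[str], references: List[str], *args, **kwargs
    ) -> None:
        for pred, ref in zip(predictions, references):
            self._hits += int(pred.strip() == ref.strip())
            self._total += 1

    def compute(self, *args, **kwargs) -> Dict[str, float]:
        return {"exact_match": 100.0 * self._hits / max(self._total, 1)}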
src/evaluation/commit_message_generation/__init__.py
ADDED
@@ -0,0 +1,3 @@
from .cmg_metrics import CMGMetrics

__all__ = ["CMGMetrics"]
src/evaluation/commit_message_generation/cmg_metrics.py
ADDED
@@ -0,0 +1,53 @@
from typing import Dict, List

import evaluate  # type: ignore[import]

from ..base_task_metrics import BaseTaskMetrics
from .b_norm import BNorm


class CMGMetrics(BaseTaskMetrics):
    def __init__(self):
        self.bnorm = BNorm()
        self.bleu = evaluate.load("sacrebleu")
        self.chrf = evaluate.load("chrf")
        self.rouge = evaluate.load("rouge")
        self.bertscore = evaluate.load("bertscore")
        self.bertscore_normalized = evaluate.load("bertscore")

    def reset(self):
        self.bnorm.reset()

    def add_batch(
        self, predictions: List[str], references: List[str], *args, **kwargs
    ) -> None:
        self.bnorm.update(predictions=predictions, references=references)
        self.bleu.add_batch(
            predictions=predictions, references=[[ref] for ref in references]
        )
        self.chrf.add_batch(
            predictions=predictions, references=[[ref] for ref in references]
        )
        self.rouge.add_batch(predictions=predictions, references=references)
        self.bertscore.add_batch(predictions=predictions, references=references)
        self.bertscore_normalized.add_batch(
            predictions=predictions, references=references
        )

    def compute(self, *args, **kwargs) -> Dict[str, float]:
        rouge = self.rouge.compute()
        bertscore = self.bertscore.compute(lang="en")
        bertscore_normalized = self.bertscore_normalized.compute(
            lang="en", rescale_with_baseline=True
        )
        return {
            "bnorm": self.bnorm.compute(),
            "bleu": self.bleu.compute(tokenize="13a")["score"],
            "chrf": self.chrf.compute()["score"],
            "rouge1": rouge["rouge1"] * 100,
            "rouge2": rouge["rouge2"] * 100,
            "rougeL": rouge["rougeL"] * 100,
            "bertscore": sum(bertscore["f1"]) / len(bertscore["f1"]),
            "bertscore_normalized": sum(bertscore_normalized["f1"])
            / len(bertscore_normalized["f1"]),
        }
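A quick usage sketch (assuming the evaluate modules download successfully and that b_norm.py, which is not part of this commit, is present alongside cmg_metrics.py):

from src.evaluation.commit_message_generation import CMGMetrics

cmg_metrics = CMGMetrics()
cmg_metrics.add_batch(
    predictions=["Fix typo in README"],
    references=["Fixed a typo in README.md"],
)
print(cmg_metrics.compute())
# e.g. {'bnorm': ..., 'bleu': ..., 'chrf': ..., 'rouge1': ..., ...}

Note that reset() only clears the BNorm accumulator; the evaluate-backed metrics release their cached batches when compute() is called.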
src/evaluation/metrics.py
ADDED
@@ -0,0 +1,13 @@
from typing import Dict, Optional

from .base_task_metrics import BaseTaskMetrics
from .commit_message_generation import CMGMetrics

METRICS: Dict[str, Optional[BaseTaskMetrics]] = {
    "commit_message_generation": CMGMetrics(),
    "bug_localization": None,
    "module_to_text": None,
    "library_usage": None,
    "project_code_completion": None,
    "bug_localization_build_logs": None,
}
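METRICS acts as a simple task registry: each task identifier maps to a metrics implementation, or to None while metrics for that task are not supported yet. A hypothetical caller-side sketch, mirroring the assert the uploader below performs:

from src.evaluation import METRICS

metrics_module = METRICS.get("commit_message_generation")
assert metrics_module is not None, "Metrics for this task are not supported."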
src/formatting.py
ADDED
@@ -0,0 +1,12 @@
def styled_error(error):
    return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"


def styled_warning(warn):
    return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"


def styled_message(message):
    return (
        f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
    )
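These helpers wrap plain strings into colored, centered HTML paragraphs; upload_files below returns them directly, presumably for rendering in a Gradio HTML component (an assumption, based on app.py importing gradio):

from src.formatting import styled_message

styled_message("Success!")
# "<p style='color: green; font-size: 20px; text-align: center;'>Success!</p>"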
src/get_results_for_task.py
CHANGED
@@ -1,6 +1,4 @@
-import pandas as pd
-
-RESULTS_DATASET = "JetBrains-Research/lca-results"
+import pandas as pd  # type: ignore[import]
 
 
 def get_results_for_task_stub(task: str) -> pd.DataFrame:
src/submission_uploader.py
CHANGED
@@ -1,9 +1,16 @@
 import json
+import logging
 import os
+from tempfile import TemporaryDirectory
+from typing import Dict, List, Optional
 
+import jsonlines
+from huggingface_hub import CommitOperationAdd  # type: ignore[import]
+from huggingface_hub import Discussion, HfApi, HfFileSystem
+from tqdm import tqdm
 
+from .evaluation import METRICS
+from .formatting import styled_error, styled_message, styled_warning
 from .tasks import TASKS_PRETTY_REVERSE
 
 
@@ -39,19 +46,30 @@ class SubmissionUploader:
                 and discussion.title == pr_title
             ):
                 return discussion
+        return None
 
+    def _get_metadata(
         self,
-        task_id: str,
-        model_folder: str,
         model_name_pretty: str,
         model_availability: str,
         urls: str,
         context_size: str,
         submitted_by: str,
+    ) -> Dict[str, str]:
+        return {
+            "model_name": model_name_pretty,
+            "model_availability": model_availability,
+            "urls": urls,
+            "context_size": context_size,
+            "submitted_by": submitted_by,
+        }
+
+    def _upload_predictions(
+        self,
+        task_id: str,
+        model_folder: str,
+        filenames: List[str],
     ) -> List[CommitOperationAdd]:
-        # add predictions files
         commit_operations = [
             CommitOperationAdd(
                 path_in_repo=f"{task_id}/predictions/{model_folder}/{os.path.basename(filename)}",
@@ -59,25 +77,114 @@ class SubmissionUploader:
             )
             for filename in filenames
         ]
+        return commit_operations
 
+    def _compute_metrics_for_predictions(
+        self, task_id: str, filenames: Optional[List[str]], temp_directory: str
+    ) -> None:
+        metrics_module = METRICS[task_id]
+        assert (
+            metrics_module is not None
+        ), f"Computing metrics for {task_id} is not supported."
+        metrics_module.reset()
+        open(os.path.join(temp_directory, "metrics.jsonl"), "w").close()
+
+        # compute the metrics for each submitted file
+        for filename in filenames:
+            with jsonlines.open(filename, "r") as reader:
+                for example in tqdm(
+                    reader, desc=f"Computing metrics for {os.path.basename(filename)}"
+                ):
+                    metrics_module.add_batch(
+                        predictions=[example["prediction"]],
+                        references=[example["reference"]],
+                    )
+            computed_metrics = metrics_module.compute()
+            metrics_module.reset()
+            with jsonlines.open(
+                os.path.join(temp_directory, "metrics.jsonl"), "a"
+            ) as writer:
+                writer.write(computed_metrics)
+
+        # aggregate the metrics over submitted files
+        with jsonlines.open(
+            os.path.join(temp_directory, "metrics.jsonl"), "r"
+        ) as reader:
+            metrics_results = [line for line in reader]
+        final_metrics_results = {
+            key: sum(entry[key] for entry in metrics_results) / len(metrics_results)
+            for key in metrics_results[0]
+        }
+        with open(os.path.join(temp_directory, "final_metrics.json"), "w") as f:
+            json.dump(final_metrics_results, f)
+
+    def _upload_results(
+        self,
+        task_id: str,
+        model_folder: str,
+        model_name_pretty: str,
+        model_availability: str,
+        urls: str,
+        context_size: str,
+        submitted_by: str,
+        temp_directory: str,
+    ) -> List[CommitOperationAdd]:
+        final_results = {}
+        with open(os.path.join(temp_directory, "final_metrics.json"), "r") as f:
+            metrics = json.load(f)
+            final_results.update(metrics)
+        metadata_dict = self._get_metadata(
+            model_name_pretty=model_name_pretty,
+            model_availability=model_availability,
+            urls=urls,
+            context_size=context_size,
+            submitted_by=submitted_by,
+        )
+        final_results.update(metadata_dict)
 
+        with jsonlines.open(
+            os.path.join(temp_directory, "final_results.jsonl"), "w"
+        ) as writer:
+            writer.write(final_results)
 
+        return [
             CommitOperationAdd(
+                path_in_repo=f"{task_id}/results/{model_folder}.jsonl",
+                path_or_fileobj=os.path.join(temp_directory, "final_results.jsonl"),
             )
+        ]
 
+    def _verify_arguments(
+        self,
+        model_folder: str,
+        model_name_pretty: str,
+        model_availability: str,
+        urls: str,
+        context_size: str,
+        submitted_by: str,
+        filenames: Optional[List[str]],
+    ):
+        assert (
+            model_folder
+        ), "Please, specify non-empty name for a directory with a model's results."
+        assert model_name_pretty, "Please, specify non-empty name for a model."
+        assert (
+            model_availability
+        ), "Please, specify non-empty information about a model's availability."
+        assert (
+            context_size
+        ), "Please, specify non-empty information about a model's context size."
+        try:
+            _ = int(context_size)
+        except ValueError:
+            raise ValueError(
+                "Please, specify a model's context size as an integer (e.g., 16000)."
+            )
+
+        assert (
+            submitted_by
+        ), "Please, specify non-empty information about a submission's author(s)."
+        assert filenames, "Please, attach at least one file with predictions."
 
     def upload_files(
         self,
@@ -92,10 +199,21 @@ class SubmissionUploader:
         force: bool = False,
     ) -> str:
        try:
+            self._verify_arguments(
+                model_folder=model_folder,
+                model_name_pretty=model_name_pretty,
+                model_availability=model_availability,
+                urls=urls,
+                context_size=context_size,
+                submitted_by=submitted_by,
+                filenames=filenames,
+            )
+
            pr_title = f"🚀 New submission to {task_pretty} task: {model_name_pretty} with {context_size} context size from {submitted_by}"
 
            task_id = TASKS_PRETTY_REVERSE[task_pretty]
 
+            logging.info("Checking if this request is already submitted...")
            if not force:
                if model_name_pretty in self._fs.ls(
                    f"datasets/{self._dataset_id}/{task_id}/predictions"
@@ -106,29 +224,46 @@ class SubmissionUploader:
                    )
                    for filename in filenames + ["metadata.json"]
                ):
-                    return (
+                    return styled_warning(
                        f"{model_name_pretty} is already present in {self._dataset_id}."
                    )
 
            prev_pr = self._get_previous_pr(pr_title)
            if prev_pr is not None:
                url = f"https://huggingface.co/datasets/{self._dataset_id}/discussions/{prev_pr.num}"
+                return styled_warning(
+                    f"{self._dataset_id} already has an open PR for this submission: {url}."
+                )
 
+            logging.info("Processing predictions...")
+            predictions_commit_operations = self._upload_predictions(
                task_id=task_id,
                model_folder=model_folder,
-                model_name_pretty=model_name_pretty,
-                model_availability=model_availability,
-                urls=urls,
-                context_size=context_size,
-                submitted_by=submitted_by,
                filenames=filenames,
            )
 
+            with TemporaryDirectory() as d:
+                logging.info("Computing metrics...")
+                self._compute_metrics_for_predictions(
+                    task_id=task_id, filenames=filenames, temp_directory=str(d)
+                )
+
+                logging.info("Processing results...")
+                results_commit_operations = self._upload_results(
+                    task_id=task_id,
+                    model_folder=model_folder,
+                    model_name_pretty=model_name_pretty,
+                    model_availability=model_availability,
+                    urls=urls,
+                    context_size=context_size,
+                    submitted_by=submitted_by,
+                    temp_directory=str(d),
+                )
+
+            logging.info("Creating commit...")
            new_pr = self._api.create_commit(
                repo_id=self._dataset_id,
+                operations=predictions_commit_operations + results_commit_operations,
                commit_message=pr_title,
                commit_description=f"""New submission to {task_pretty} task in 🏟️ Long Code Arena benchmark!
@@ -141,7 +276,10 @@ class SubmissionUploader:
                create_pr=True,
                repo_type="dataset",
            )
-            return f"🎉 PR created at {new_pr.pr_url}."
 
-        except Exception:
+            return styled_message(f"🎉 PR created at {new_pr.pr_url}.")
 
+        except Exception as e:
+            logging.exception(e)
+            if str(e):
+                return styled_error(str(e))
+            return styled_error("An exception occured.")
|