Commit 073db2c
Petr Tsvetkov committed
Parent(s): f5faae7

Add noref gpt-eval to the pipeline
custom_metrics/gpt_eval.py
CHANGED

@@ -20,6 +20,24 @@ lowest quality and 10 is the highest quality. Do not include any other text or explanation in your response.
 """
 
 
+def build_prompt_noref(prediction, diff):
+    return f"""Evaluate the following commit message based on clarity, specificity, context, and conciseness without
+providing any additional feedback or commentary:
+
+START OF THE COMMIT MESSAGE YOU HAVE TO EVALUATE
+{prediction}
+END OF THE COMMIT MESSAGE YOU HAVE TO EVALUATE
+
+These are the code changes included in the commit:
+START OF THE CODE CHANGES
+{diff}
+END OF THE CODE CHANGES
+
+YOUR TASK: Provide a single number as a response, representing the rating on a scale from 1 to 10, where 1 is the
+lowest quality and 10 is the highest quality. Do not include any other text or explanation in your response.
+"""
+
+
 N_RETRIES = 3
 
 

@@ -51,3 +69,13 @@ def compute_ref(prediction, reference, n_requests):
     ]
 
     return sum(results) / len(results)
+
+
+def compute_noref(prediction, diff, n_requests):
+    prompt = build_prompt_noref(prediction, diff)
+    results = [
+        get_number_for_prompt(prompt)
+        for _ in range(n_requests)
+    ]
+
+    return sum(results) / len(results)
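The helper get_number_for_prompt, which compute_noref calls once per request, is not part of this diff. As a rough sketch of what it plausibly does under the retry budget N_RETRIES = 3 visible above: query a chat model with the prompt and parse the bare integer rating, retrying when the reply is not a clean number. Everything below is an assumption (the OpenAI client, the model name, the parsing), not the repository's actual implementation.

# Hypothetical sketch -- the real helper lives elsewhere in custom_metrics/gpt_eval.py.
# The OpenAI client, model name, and parsing strategy here are all assumptions.
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

N_RETRIES = 3


def get_number_for_prompt(prompt):
    """Ask the model for a 1-10 rating, retrying on unparseable replies."""
    for _ in range(N_RETRIES):
        response = client.chat.completions.create(
            model="gpt-4",  # assumption: the actual model is not shown in the diff
            messages=[{"role": "user", "content": prompt}],
        )
        text = response.choices[0].message.content.strip()
        try:
            rating = int(text)
        except ValueError:
            continue  # the model returned extra text instead of a bare number; retry
        if 1 <= rating <= 10:
            return rating
    raise ValueError(f"No parseable rating after {N_RETRIES} attempts")

compute_noref then averages n_requests such ratings, mirroring what compute_ref already does for the reference-based score.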
generation_steps/metrics_analysis.py
CHANGED

@@ -80,10 +80,25 @@ def gptscore_ref_5_fn(pred, ref, **kwargs):
     return gpt_eval.compute_ref(prediction=pred, reference=ref, n_requests=5)
 
 
+def gptscore_noref_1_fn(pred, ref, **kwargs):
+    return gpt_eval.compute_noref(prediction=pred, diff=kwargs['diff'], n_requests=1)
+
+
+def gptscore_noref_3_fn(pred, ref, **kwargs):
+    return gpt_eval.compute_noref(prediction=pred, diff=kwargs['diff'], n_requests=3)
+
+
+def gptscore_noref_5_fn(pred, ref, **kwargs):
+    return gpt_eval.compute_noref(prediction=pred, diff=kwargs['diff'], n_requests=5)
+
+
 IND_METRICS = {
     "gptscore-ref-1-req": gptscore_ref_1_fn,
-    "gptscore-ref-3-req": gptscore_ref_3_fn,
+    # "gptscore-ref-3-req": gptscore_ref_3_fn,
     # "gptscore-ref-5-req": gptscore_ref_5_fn,
+    "gptscore-noref-1-req": gptscore_noref_1_fn,
+    # "gptscore-noref-3-req": gptscore_noref_3_fn,
+    # "gptscore-noref-5-req": gptscore_noref_5_fn,
     "editdist": edit_distance_fn,
     "bleu": bleu_fn,
     "meteor": meteor_fn,

@@ -111,7 +126,7 @@ def compute_metrics(df):
     tqdm.pandas()
 
     def apply_metric_fn_to_row(row, fn, col_pred, col_ref):
-        return fn(row[col_pred], row[col_ref], edittime=row['edit_time'])
+        return fn(row[col_pred], row[col_ref], edittime=row['edit_time'], diff=str(row['mods']))
 
     for metric in REL_METRICS:
         print(f"Computing {metric} for the related pairs")
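To make the wiring concrete, here is a self-contained sketch of how apply_metric_fn_to_row now threads the diff through to a noref metric. The DataFrame column names 'prediction' and 'reference' and the stub scorer are assumptions for illustration; only 'edit_time' and 'mods' appear in the diff itself.

# Illustration of the per-row metric application from compute_metrics.
# The scorer is a stub standing in for gpt_eval.compute_noref; the column
# names 'prediction' and 'reference' are assumed.
import pandas as pd


def stub_noref_fn(pred, ref, **kwargs):
    # Stand-in for gptscore_noref_1_fn: a real run would call the GPT
    # evaluator with kwargs['diff']; here we just return a fixed score.
    assert 'diff' in kwargs and 'edittime' in kwargs
    return 7.0


df = pd.DataFrame({
    'prediction': ['Fix typo in README'],
    'reference': ['Fix typo in README.md'],
    'edit_time': [12],
    'mods': [[{'change_type': 'MODIFY', 'new_path': 'README.md'}]],
})


def apply_metric_fn_to_row(row, fn, col_pred, col_ref):
    # Mirrors the updated helper: every metric call now receives the
    # commit's modifications serialized as a string.
    return fn(row[col_pred], row[col_ref], edittime=row['edit_time'], diff=str(row['mods']))


df['gptscore-noref-1-req'] = df.apply(
    apply_metric_fn_to_row, axis=1,
    fn=stub_noref_fn, col_pred='prediction', col_ref='reference',
)
print(df['gptscore-noref-1-req'].tolist())  # [7.0]

Passing diff=str(row['mods']) unconditionally works because the gptscore wrappers shown above all accept **kwargs, so the reference-based metrics simply ignore the extra keyword.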