Petr Tsvetkov committed · Commit aef1dbe · 1 parent: 9e1ff19

Visualizer bugs fixed; added normalized editdist

Files changed:
- analysis_util.py (+10, -10)
- change_visualizer.py (+2, -2)
- generate_annotated_diffs.py (+7, -2)
- generation_steps/metrics_analysis.py (+42, -4)
analysis_util.py (CHANGED)

@@ -55,21 +55,21 @@ def get_correlations_df(df, right_side):
 
 
 def get_correlations_for_groups(df, right_side):
+    correlations = {"all": get_correlations_df(df, right_side=right_side)}
 
     for e2s in (False, True):
         for s2e in (False, True):
+            group = ""
             if e2s:
+                group += "+e2s"
             if s2e:
+                group += "+s2e"
+            if group == "":
+                group = "golden"
 
             subdf = df[(df["end_to_start"] == e2s) & (df["start_to_end"] == s2e)]
+            subdf_corr = get_correlations_df(subdf, right_side=right_side)
+            correlations[group] = subdf_corr
 
+    correlations = pd.concat(correlations, axis=1)
+    return correlations
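A note on the new pd.concat(correlations, axis=1) step: concatenating a dict of DataFrames column-wise promotes the dict keys ("all", "golden", "+e2s", ...) to the top level of a column MultiIndex, so every group keeps its own correlation columns side by side. A minimal sketch of the pattern, with toy frames standing in for what get_correlations_df returns (values are hypothetical):

    import pandas as pd

    # Toy stand-ins for per-group correlation frames.
    corr_all = pd.DataFrame({"pearson": [0.91], "spearman": [0.88]}, index=["bleu"])
    corr_e2s = pd.DataFrame({"pearson": [0.72], "spearman": [0.69]}, index=["bleu"])

    combined = pd.concat({"all": corr_all, "+e2s": corr_e2s}, axis=1)
    print(combined.columns.tolist())
    # [('all', 'pearson'), ('all', 'spearman'), ('+e2s', 'pearson'), ('+e2s', 'spearman')]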
change_visualizer.py (CHANGED)

@@ -110,8 +110,8 @@ if __name__ == '__main__':
         gr.Markdown(f"### Reference-only correlations")
         gr.Markdown(value=analysis_util.get_correlations_for_groups(df_synthetic, right_side="ind").to_markdown())
 
-        gr.Markdown(f"### Aggregated correlations")
-        gr.Markdown(value=analysis_util.get_correlations_for_groups(df_synthetic, right_side="aggr").to_markdown())
+        # gr.Markdown(f"### Aggregated correlations")
+        # gr.Markdown(value=analysis_util.get_correlations_for_groups(df_synthetic, right_side="aggr").to_markdown())
 
     application.load(update_dataset_view_manual, inputs=slider_manual,
                      outputs=view_manual)
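For context, the lines above render pandas frames inside the Gradio app by converting them to markdown text. A standalone sketch of that pattern (toy frame; a Blocks layout is assumed, as elsewhere in the file):

    import gradio as gr
    import pandas as pd

    df = pd.DataFrame({"pearson": [0.9]}, index=["bleu"])

    with gr.Blocks() as application:
        gr.Markdown("### Reference-only correlations")
        gr.Markdown(value=df.to_markdown())  # to_markdown() needs the tabulate package

    application.launch()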
generate_annotated_diffs.py (CHANGED)

@@ -1,4 +1,5 @@
 import diff_match_patch as dmp_module
+from tqdm import tqdm
 
 from api_wrappers import hf_data_loader
 
@@ -26,14 +27,18 @@ def annotated_diff_for_row(row):
 
 
 def manual_data_with_annotated_diffs():
+    tqdm.pandas()
+
     df = hf_data_loader.load_raw_rewriting_as_pandas()
-    annotated = df.apply(annotated_diff_for_row, axis=1)
+    annotated = df.progress_apply(annotated_diff_for_row, axis=1)
     df['annotated_diff'] = annotated
     return df
 
 
 def synthetic_data_with_annotated_diffs():
+    tqdm.pandas()
+
     df = hf_data_loader.load_synthetic_as_pandas()
-    annotated = df.apply(annotated_diff_for_row, axis=1)
+    annotated = df.progress_apply(annotated_diff_for_row, axis=1)
     df['annotated_diff'] = annotated
     return df
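The apply to progress_apply switch relies on tqdm's pandas integration: tqdm.pandas() registers progress_apply on DataFrame and Series, which behaves like apply but draws a progress bar while iterating. A minimal sketch, independent of the loaders above:

    import pandas as pd
    from tqdm import tqdm

    tqdm.pandas()  # registers DataFrame.progress_apply / Series.progress_apply

    df = pd.DataFrame({"x": range(10_000)})

    # Same result as df.apply(..., axis=1), plus a console progress bar.
    digits = df.progress_apply(lambda row: len(str(row["x"])), axis=1)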
generation_steps/metrics_analysis.py (CHANGED)

@@ -15,6 +15,8 @@ BLEU = evaluate.load('bleu', cache_dir=config.CACHE_DIR)
 
 
 def bleu_fn(pred, ref, **kwargs):
+    if "refs" in kwargs:
+        return BLEU.compute(predictions=[pred] * len(kwargs["refs"]), references=kwargs["refs"])["bleu"]
     return BLEU.compute(predictions=[pred], references=[ref])["bleu"]
 
 
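Every metric function in this file gains the same guard: when the caller passes a refs list in kwargs, the prediction is scored against all references (BLEU/METEOR/ROUGE receive the whole list in one compute call; the edit-distance and GPT metrics below score each reference and average), otherwise the single-reference path runs unchanged. The shape of the averaging variant, as a standalone sketch around a placeholder metric:

    def metric_fn(pred, ref, **kwargs):
        def score_one(p, r):  # placeholder for any single-reference metric
            return float(p == r)

        if "refs" in kwargs:
            # Multi-reference mode: score against each reference, then average.
            scores = [score_one(pred, r) for r in kwargs["refs"]]
            return sum(scores) / len(scores)
        return score_one(pred, ref)  # single-reference fallback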
@@ -22,6 +24,8 @@ METEOR = evaluate.load('meteor', cache_dir=config.CACHE_DIR)
 
 
 def meteor_fn(pred, ref, **kwargs):
+    if "refs" in kwargs:
+        return METEOR.compute(predictions=[pred] * len(kwargs["refs"]), references=kwargs["refs"])["meteor"]
     return METEOR.compute(predictions=[pred], references=[ref])["meteor"]
 
 
@@ -29,14 +33,20 @@ ROUGE = evaluate.load('rouge', cache_dir=config.CACHE_DIR)
 
 
 def rouge1_fn(pred, ref, **kwargs):
+    if "refs" in kwargs:
+        return ROUGE.compute(predictions=[pred] * len(kwargs["refs"]), references=kwargs["refs"])["rouge1"]
     return ROUGE.compute(predictions=[pred], references=[ref])["rouge1"]
 
 
 def rouge2_fn(pred, ref, **kwargs):
+    if "refs" in kwargs:
+        return ROUGE.compute(predictions=[pred] * len(kwargs["refs"]), references=kwargs["refs"])["rouge2"]
     return ROUGE.compute(predictions=[pred], references=[ref])["rouge2"]
 
 
 def rougeL_fn(pred, ref, **kwargs):
+    if "refs" in kwargs:
+        return ROUGE.compute(predictions=[pred] * len(kwargs["refs"]), references=kwargs["refs"])["rougeL"]
     return ROUGE.compute(predictions=[pred], references=[ref])["rougeL"]
 
 
@@ -44,6 +54,10 @@ BERTSCORE = evaluate.load('bertscore', cache_dir=config.CACHE_DIR)
 
 
 def bertscore_fn(pred, ref, **kwargs):
+    if "refs" in kwargs:
+        return \
+            BERTSCORE.compute(predictions=[pred], references=[kwargs["refs"]], model_type="distilbert-base-uncased")[
+                "f1"][0]
     return BERTSCORE.compute(predictions=[pred], references=[ref], model_type="distilbert-base-uncased")["f1"][0]
 
 
@@ -51,6 +65,8 @@ CHRF = evaluate.load("chrf")
 
 
 def chrf_fn(pred, ref, **kwargs):
+    if "refs" in kwargs:
+        return CHRF.compute(predictions=[pred], references=[kwargs["refs"]])["score"]
     return CHRF.compute(predictions=[pred], references=[[ref]])["score"]
 
 
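bertscore_fn and chrf_fn above (and ter_fn below) take a different multi-reference route than BLEU: these evaluate metrics accept several references per prediction, so references=[kwargs["refs"]] submits all references for the one prediction in a single call. A minimal sketch of the nesting, with chrf loaded exactly as in the file:

    import evaluate

    CHRF = evaluate.load("chrf")

    pred = "fix typo in readme"
    refs = ["fix typo in readme", "fix a typo in the readme"]

    # One prediction, several references: note the list of lists.
    score = CHRF.compute(predictions=[pred], references=[refs])["score"]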
@@ -58,26 +74,46 @@ TER = evaluate.load("ter")
 
 
 def ter_fn(pred, ref, **kwargs):
+    if "refs" in kwargs:
+        return TER.compute(predictions=[pred], references=[kwargs["refs"]])["score"]
     return TER.compute(predictions=[pred], references=[[ref]])["score"]
 
 
 def edit_distance_fn(pred, ref, **kwargs):
+    if "refs" in kwargs:
+        scores = [Levenshtein.distance(pred, ref) for ref in kwargs["refs"]]
+        return sum(scores) / len(scores)
     return Levenshtein.distance(pred, ref)
 
+def edit_distance_norm_fn(pred, ref, **kwargs):
+    if "refs" in kwargs:
+        scores = [Levenshtein.distance(pred, ref) / len(pred) for ref in kwargs["refs"]]
+        return sum(scores) / len(scores)
+    return Levenshtein.distance(pred, ref) / len(pred)
+
 
 def edit_time_fn(pred, ref, **kwargs):
     return kwargs["edittime"]
 
 
 def gptscore_ref_1_fn(pred, ref, **kwargs):
+    if "refs" in kwargs:
+        scores = [gpt_eval.compute_ref(prediction=pred, reference=ref, n_requests=1) for ref in kwargs["refs"]]
+        return sum(scores) / len(scores)
     return gpt_eval.compute_ref(prediction=pred, reference=ref, n_requests=1)
 
 
 def gptscore_ref_3_fn(pred, ref, **kwargs):
+    if "refs" in kwargs:
+        scores = [gpt_eval.compute_ref(prediction=pred, reference=ref, n_requests=3) for ref in kwargs["refs"]]
+        return sum(scores) / len(scores)
     return gpt_eval.compute_ref(prediction=pred, reference=ref, n_requests=3)
 
 
 def gptscore_ref_5_fn(pred, ref, **kwargs):
+    if "refs" in kwargs:
+        scores = [gpt_eval.compute_ref(prediction=pred, reference=ref, n_requests=5) for ref in kwargs["refs"]]
+        return sum(scores) / len(scores)
     return gpt_eval.compute_ref(prediction=pred, reference=ref, n_requests=5)
 
 
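The headline addition, edit_distance_norm_fn, divides the raw Levenshtein distance by len(pred), so scores are comparable across messages of different lengths: 0.0 is an exact match, and values near 1.0 mean roughly every character of the prediction changes. (An empty pred would divide by zero; callers are assumed to filter those out.) A quick sketch with the Levenshtein package:

    import Levenshtein

    pred = "fix: handle empty diff"
    ref = "fix: handle empty diffs"

    raw = Levenshtein.distance(pred, ref)  # 1 (one insertion)
    norm = raw / len(pred)                 # ~0.045, length-independent
    print(raw, round(norm, 3))             # 1 0.045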
@@ -94,13 +130,14 @@ def gptscore_noref_5_fn(pred, ref, **kwargs):
 
 
 IND_METRICS = {
+    "editdist": edit_distance_fn,
+    "editdist-norm": edit_distance_norm_fn,
     "gptscore-ref-1-req": gptscore_ref_1_fn,
     # "gptscore-ref-3-req": gptscore_ref_3_fn,
     # "gptscore-ref-5-req": gptscore_ref_5_fn,
     "gptscore-noref-1-req": gptscore_noref_1_fn,
     # "gptscore-noref-3-req": gptscore_noref_3_fn,
     # "gptscore-noref-5-req": gptscore_noref_5_fn,
-    "editdist": edit_distance_fn,
     "bleu": bleu_fn,
     "meteor": meteor_fn,
     "rouge1": rouge1_fn,
@@ -112,11 +149,12 @@ IND_METRICS = {
 }
 
 AGGR_METRICS = IND_METRICS.copy()
-del AGGR_METRICS["gptscore-ref-1-req"]
-del AGGR_METRICS["gptscore-noref-1-req"]
+# del AGGR_METRICS["gptscore-ref-1-req"]
+# del AGGR_METRICS["gptscore-noref-1-req"]
 
 REL_METRICS = {
     "editdist": edit_distance_fn,
+    "editdist-norm": edit_distance_norm_fn,
     "edittime": edit_time_fn,
 }
 
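Because IND_METRICS, AGGR_METRICS, and REL_METRICS are plain name-to-function dicts, registering editdist-norm is all it takes for downstream scoring loops to pick it up. A sketch of how such a registry is typically consumed (hypothetical pred/ref; registry trimmed to the edit-distance entries):

    import Levenshtein

    def edit_distance_fn(pred, ref, **kwargs):
        return Levenshtein.distance(pred, ref)

    def edit_distance_norm_fn(pred, ref, **kwargs):
        return Levenshtein.distance(pred, ref) / len(pred)

    REL_METRICS = {
        "editdist": edit_distance_fn,
        "editdist-norm": edit_distance_norm_fn,
    }

    pred, ref = "update readme", "update the readme"
    scores = {name: fn(pred, ref) for name, fn in REL_METRICS.items()}
    print(scores)  # {'editdist': 4, 'editdist-norm': 0.3076...}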
@@ -183,7 +221,7 @@ def compute_metrics(df):
         df[f"rel_{rel_metric}_aggr_{aggr_metric}_pearson"] = (
             df[f"{rel_metric}_related"].corr(df[f"{aggr_metric}_aggr"], method="pearson"))
 
-        df[f"rel_{rel_metric}_aggr_{aggr_metric}_pearson"] = (
+        df[f"rel_{rel_metric}_aggr_{aggr_metric}_spearman"] = (
             df[f"{rel_metric}_related"].corr(df[f"{aggr_metric}_aggr"], method="spearman"))
 
     return df
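The fix above gives the Spearman correlation its own column key. Both flavors come from pandas Series.corr; a toy sketch of the difference:

    import pandas as pd

    related = pd.Series([1.0, 2.0, 3.0, 4.0])  # e.g. an editdist_related column
    aggr = pd.Series([1.0, 4.0, 9.0, 30.0])    # e.g. a bleu_aggr column (toy values)

    pearson = related.corr(aggr, method="pearson")    # linear association (< 1 here)
    spearman = related.corr(aggr, method="spearman")  # rank-based; 1.0, perfectly monotonic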