Petr Tsvetkov committed
Commit aef1dbe • 1 parent: 9e1ff19

Visualizer bugs fixed; added normalized editdist

analysis_util.py CHANGED
@@ -55,21 +55,21 @@ def get_correlations_df(df, right_side):
 
 
 def get_correlations_for_groups(df, right_side):
-    noref_correlations = {"all": get_correlations_df(df, right_side=right_side)}
+    correlations = {"all": get_correlations_df(df, right_side=right_side)}
 
     for e2s in (False, True):
         for s2e in (False, True):
-            suffix = ""
+            group = ""
             if e2s:
-                suffix += "+e2s"
+                group += "+e2s"
             if s2e:
-                suffix += "+s2e"
-            if suffix == "":
-                suffix = "golden"
+                group += "+s2e"
+            if group == "":
+                group = "golden"
 
             subdf = df[(df["end_to_start"] == e2s) & (df["start_to_end"] == s2e)]
-            subdf_noref_corr = get_correlations_for_groups(subdf, right_side=right_side)
-            noref_correlations[suffix] = subdf_noref_corr
+            subdf_corr = get_correlations_df(subdf, right_side=right_side)
+            correlations[group] = subdf_corr
 
-    noref_correlations = pd.concat(noref_correlations, axis=1)
-    return noref_correlations
+    correlations = pd.concat(correlations, axis=1)
+    return correlations
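
Note: this hunk is the visualizer bug named in the commit message. The old get_correlations_for_groups recursed into itself on every subgroup instead of calling get_correlations_df, so it had no base case; the fixed version computes one correlation table per group and joins them with pd.concat(..., axis=1), which promotes the dict keys ("all", "golden", "+e2s", ...) to an outer column level. A minimal sketch of that concat pattern, with made-up numbers rather than repo data:

import pandas as pd

# Hypothetical per-group correlation tables (toy values).
correlations = {
    "all": pd.DataFrame({"pearson": [0.42]}, index=["bleu"]),
    "golden": pd.DataFrame({"pearson": [0.51]}, index=["bleu"]),
    "+e2s": pd.DataFrame({"pearson": [0.33]}, index=["bleu"]),
}

# Concatenating a dict along axis=1 places the tables side by side and
# adds the dict keys as an outer column level.
wide = pd.concat(correlations, axis=1)
print(wide.columns.tolist())
# [('all', 'pearson'), ('golden', 'pearson'), ('+e2s', 'pearson')]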
change_visualizer.py CHANGED
@@ -110,8 +110,8 @@ if __name__ == '__main__':
             gr.Markdown(f"### Reference-only correlations")
             gr.Markdown(value=analysis_util.get_correlations_for_groups(df_synthetic, right_side="ind").to_markdown())
 
-            gr.Markdown(f"### Aggregated correlations")
-            gr.Markdown(value=analysis_util.get_correlations_for_groups(df_synthetic, right_side="aggr").to_markdown())
+            # gr.Markdown(f"### Aggregated correlations")
+            # gr.Markdown(value=analysis_util.get_correlations_for_groups(df_synthetic, right_side="aggr").to_markdown())
 
     application.load(update_dataset_view_manual, inputs=slider_manual,
                      outputs=view_manual)
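
Note: this change just comments out the aggregated-correlations view. The surviving pattern is the one worth knowing: pandas tables are rendered by converting them to Markdown and handing the string to gr.Markdown. A self-contained sketch of the same pattern, assuming a toy DataFrame rather than the repo's data (DataFrame.to_markdown requires the tabulate package):

import gradio as gr
import pandas as pd

df = pd.DataFrame({"metric": ["bleu", "meteor"], "pearson": [0.42, 0.37]})

with gr.Blocks() as demo:
    gr.Markdown("### Reference-only correlations")
    # to_markdown() emits a Markdown table that gr.Markdown renders.
    gr.Markdown(value=df.to_markdown())

if __name__ == "__main__":
    demo.launch()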
generate_annotated_diffs.py CHANGED
@@ -1,4 +1,5 @@
 import diff_match_patch as dmp_module
+from tqdm import tqdm
 
 from api_wrappers import hf_data_loader
 
@@ -26,14 +27,18 @@ def annotated_diff_for_row(row):
 
 
 def manual_data_with_annotated_diffs():
+    tqdm.pandas()
+
     df = hf_data_loader.load_raw_rewriting_as_pandas()
-    annotated = df.apply(annotated_diff_for_row, axis=1)
+    annotated = df.progress_apply(annotated_diff_for_row, axis=1)
     df['annotated_diff'] = annotated
     return df
 
 
 def synthetic_data_with_annotated_diffs():
+    tqdm.pandas()
+
     df = hf_data_loader.load_synthetic_as_pandas()
-    annotated = df.apply(annotated_diff_for_row, axis=1)
+    annotated = df.progress_apply(annotated_diff_for_row, axis=1)
     df['annotated_diff'] = annotated
     return df
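
Note: tqdm.pandas() monkey-patches pandas with progress_apply, a drop-in replacement for apply that draws a progress bar while the annotated diffs are computed row by row. A minimal standalone sketch with a toy frame and function:

import pandas as pd
from tqdm import tqdm

tqdm.pandas()  # registers DataFrame.progress_apply / Series.progress_apply

df = pd.DataFrame({"text": ["foo", "bar"] * 500})
# Same semantics as df.apply(..., axis=1), plus a progress bar on stderr.
lengths = df.progress_apply(lambda row: len(row["text"]), axis=1)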
generation_steps/metrics_analysis.py CHANGED
@@ -15,6 +15,8 @@ BLEU = evaluate.load('bleu', cache_dir=config.CACHE_DIR)
 
 
 def bleu_fn(pred, ref, **kwargs):
+    if "refs" in kwargs:
+        return BLEU.compute(predictions=[pred] * len(kwargs["refs"]), references=kwargs["refs"])["bleu"]
     return BLEU.compute(predictions=[pred], references=[ref])["bleu"]
 
 
@@ -22,6 +24,8 @@ METEOR = evaluate.load('meteor', cache_dir=config.CACHE_DIR)
 
 
 def meteor_fn(pred, ref, **kwargs):
+    if "refs" in kwargs:
+        return METEOR.compute(predictions=[pred] * len(kwargs["refs"]), references=kwargs["refs"])["meteor"]
     return METEOR.compute(predictions=[pred], references=[ref])["meteor"]
 
 
@@ -29,14 +33,20 @@ ROUGE = evaluate.load('rouge', cache_dir=config.CACHE_DIR)
 
 
 def rouge1_fn(pred, ref, **kwargs):
+    if "refs" in kwargs:
+        return ROUGE.compute(predictions=[pred] * len(kwargs["refs"]), references=kwargs["refs"])["rouge1"]
     return ROUGE.compute(predictions=[pred], references=[ref])["rouge1"]
 
 
 def rouge2_fn(pred, ref, **kwargs):
+    if "refs" in kwargs:
+        return ROUGE.compute(predictions=[pred] * len(kwargs["refs"]), references=kwargs["refs"])["rouge2"]
     return ROUGE.compute(predictions=[pred], references=[ref])["rouge2"]
 
 
 def rougeL_fn(pred, ref, **kwargs):
+    if "refs" in kwargs:
+        return ROUGE.compute(predictions=[pred] * len(kwargs["refs"]), references=kwargs["refs"])["rougeL"]
     return ROUGE.compute(predictions=[pred], references=[ref])["rougeL"]
 
 
@@ -44,6 +54,10 @@ BERTSCORE = evaluate.load('bertscore', cache_dir=config.CACHE_DIR)
 
 
 def bertscore_fn(pred, ref, **kwargs):
+    if "refs" in kwargs:
+        return \
+            BERTSCORE.compute(predictions=[pred], references=[kwargs["refs"]], model_type="distilbert-base-uncased")[
+                "f1"][0]
     return BERTSCORE.compute(predictions=[pred], references=[ref], model_type="distilbert-base-uncased")["f1"][0]
 
 
@@ -51,6 +65,8 @@ CHRF = evaluate.load("chrf")
 
 
 def chrf_fn(pred, ref, **kwargs):
+    if "refs" in kwargs:
+        return CHRF.compute(predictions=[pred], references=[kwargs["refs"]])["score"]
     return CHRF.compute(predictions=[pred], references=[[ref]])["score"]
 
 
@@ -58,26 +74,46 @@ TER = evaluate.load("ter")
 
 
 def ter_fn(pred, ref, **kwargs):
+    if "refs" in kwargs:
+        return TER.compute(predictions=[pred], references=[kwargs["refs"]])["score"]
     return TER.compute(predictions=[pred], references=[[ref]])["score"]
 
 
 def edit_distance_fn(pred, ref, **kwargs):
+    if "refs" in kwargs:
+        scores = [Levenshtein.distance(pred, ref) for ref in kwargs["refs"]]
+        return sum(scores) / len(scores)
     return Levenshtein.distance(pred, ref)
 
+def edit_distance_norm_fn(pred, ref, **kwargs):
+    if "refs" in kwargs:
+        scores = [Levenshtein.distance(pred, ref) / len(pred) for ref in kwargs["refs"]]
+        return sum(scores) / len(scores)
+    return Levenshtein.distance(pred, ref) / len(pred)
+
 
 def edit_time_fn(pred, ref, **kwargs):
     return kwargs["edittime"]
 
 
 def gptscore_ref_1_fn(pred, ref, **kwargs):
+    if "refs" in kwargs:
+        scores = [gpt_eval.compute_ref(prediction=pred, reference=ref, n_requests=1) for ref in kwargs["refs"]]
+        return sum(scores) / len(scores)
     return gpt_eval.compute_ref(prediction=pred, reference=ref, n_requests=1)
 
 
 def gptscore_ref_3_fn(pred, ref, **kwargs):
+    if "refs" in kwargs:
+        scores = [gpt_eval.compute_ref(prediction=pred, reference=ref, n_requests=3) for ref in kwargs["refs"]]
+        return sum(scores) / len(scores)
     return gpt_eval.compute_ref(prediction=pred, reference=ref, n_requests=3)
 
 
 def gptscore_ref_5_fn(pred, ref, **kwargs):
+    if "refs" in kwargs:
+        scores = [gpt_eval.compute_ref(prediction=pred, reference=ref, n_requests=5) for ref in kwargs["refs"]]
+        return sum(scores) / len(scores)
     return gpt_eval.compute_ref(prediction=pred, reference=ref, n_requests=5)
 
 
@@ -94,13 +130,14 @@ def gptscore_noref_5_fn(pred, ref, **kwargs):
 
 
 IND_METRICS = {
+    "editdist": edit_distance_fn,
+    "editdist-norm": edit_distance_norm_fn,
     "gptscore-ref-1-req": gptscore_ref_1_fn,
     # "gptscore-ref-3-req": gptscore_ref_3_fn,
     # "gptscore-ref-5-req": gptscore_ref_5_fn,
     "gptscore-noref-1-req": gptscore_noref_1_fn,
     # "gptscore-noref-3-req": gptscore_noref_3_fn,
     # "gptscore-noref-5-req": gptscore_noref_5_fn,
-    "editdist": edit_distance_fn,
     "bleu": bleu_fn,
     "meteor": meteor_fn,
     "rouge1": rouge1_fn,
@@ -112,11 +149,12 @@ IND_METRICS = {
 }
 
 AGGR_METRICS = IND_METRICS.copy()
-del AGGR_METRICS["gptscore-ref-1-req"]
-del AGGR_METRICS["gptscore-noref-1-req"]
+# del AGGR_METRICS["gptscore-ref-1-req"]
+# del AGGR_METRICS["gptscore-noref-1-req"]
 
 REL_METRICS = {
     "editdist": edit_distance_fn,
+    "editdist-norm": edit_distance_norm_fn,
     "edittime": edit_time_fn,
 }
 
@@ -183,7 +221,7 @@ def compute_metrics(df):
         df[f"rel_{rel_metric}_aggr_{aggr_metric}_pearson"] = (
            df[f"{rel_metric}_related"].corr(df[f"{aggr_metric}_aggr"], method="pearson"))
 
-        df[f"rel_{rel_metric}_ind_{aggr_metric}_spearman"] = (
+        df[f"rel_{rel_metric}_aggr_{aggr_metric}_spearman"] = (
            df[f"{rel_metric}_related"].corr(df[f"{aggr_metric}_aggr"], method="spearman"))
 
     return df
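
Note: across metrics_analysis.py every wrapper gains an optional refs kwarg. Corpus-level metrics from HF evaluate (BLEU, METEOR, ROUGE) are fed the prediction repeated once per reference; metrics with native multi-reference support (BERTScore, chrF, TER) receive the whole reference list against a single prediction; pairwise scores (Levenshtein, GPT-based) are averaged over references. A runnable sketch of the averaging shape, using the commit's new normalized edit distance (example strings are made up; note the division by len(pred), so the score is not capped at 1):

import Levenshtein

def edit_distance_norm_fn(pred, ref, **kwargs):
    # Multi-reference path: mean of per-reference normalized distances.
    # (Uses `r` here to avoid shadowing the `ref` parameter.)
    if "refs" in kwargs:
        scores = [Levenshtein.distance(pred, r) / len(pred) for r in kwargs["refs"]]
        return sum(scores) / len(scores)
    # Single-reference path: raw distance normalized by prediction length.
    return Levenshtein.distance(pred, ref) / len(pred)

print(edit_distance_norm_fn("fix visualizer bug", "fix the visualizer bug"))
# 0.222...: 4 edits / 18 characters
print(edit_distance_norm_fn("fix visualizer bug", None,
                            refs=["fix the visualizer bug", "visualizer bugfix"]))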
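
The last hunk is a pure naming fix: the Spearman correlation between relative and aggregated metrics was written to a rel_..._ind_..._spearman column even though it was computed against {aggr_metric}_aggr; only the column name changes. For reference, the two methods measure different things; a toy contrast with made-up series:

import pandas as pd

a = pd.Series([1.0, 2.0, 3.0, 4.0])
b = pd.Series([1.0, 4.0, 9.0, 16.0])  # monotone in a, but nonlinear

print(a.corr(b, method="pearson"))   # ~0.98: strength of linear association
print(a.corr(b, method="spearman"))  # 1.0: rank-based, invariant to monotone maps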