Petr Tsvetkov committed on
Commit
f26a894
•
1 Parent(s): 642fae1

Add datasets comparison

Browse files
Files changed (2) hide show
  1. change_visualizer.py +22 -0
  2. statistics.py +38 -0
change_visualizer.py CHANGED
@@ -1,6 +1,7 @@
1
  import gradio as gr
2
 
3
  import generate_annotated_diffs
 
4
 
5
  df_manual = generate_annotated_diffs.manual_data_with_annotated_diffs()
6
  n_diffs_manual = len(df_manual)
@@ -8,6 +9,9 @@ n_diffs_manual = len(df_manual)
8
  df_synthetic = generate_annotated_diffs.synthetic_data_with_annotated_diffs()
9
  n_diffs_synthetic = len(df_synthetic)
10
 
 
 
 
11
 
12
  def update_manual_view(diff_idx):
13
  diff_idx -= 1
@@ -74,6 +78,24 @@ if __name__ == '__main__':
74
 
75
  slider_synthetic.change(update_synthetic_view, inputs=slider_synthetic,
76
  outputs=view_synthetic)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
  application.load(update_manual_view, inputs=slider_manual,
79
  outputs=view_manual)
 
1
  import gradio as gr
2
 
3
  import generate_annotated_diffs
4
+ import statistics
5
 
6
  df_manual = generate_annotated_diffs.manual_data_with_annotated_diffs()
7
  n_diffs_manual = len(df_manual)
 
9
  df_synthetic = generate_annotated_diffs.synthetic_data_with_annotated_diffs()
10
  n_diffs_synthetic = len(df_synthetic)
11
 
12
+ STATISTICS = {"manual": statistics.get_statistics_for_manual_df(df_manual),
13
+ "synthetic": statistics.get_statistics_for_synthetic_df(df_synthetic)}
14
+
15
 
16
  def update_manual_view(diff_idx):
17
  diff_idx -= 1
 
78
 
79
  slider_synthetic.change(update_synthetic_view, inputs=slider_synthetic,
80
  outputs=view_synthetic)
81
+ with gr.Tab("Compare"):
82
def layout_for_statistics(statistics_group_name):
    """Render the heading and the three average-ratio fields for one group.

    ``statistics_group_name`` is used both as the Markdown heading text and
    as the key into the module-level ``STATISTICS`` mapping.
    """
    gr.Markdown(f"### {statistics_group_name}")
    group_stats = STATISTICS[statistics_group_name]

    # (key in the statistics dict, label shown on the read-only field),
    # listed in display order.
    fields = (
        ('deletions', "Average deletions number (rel to the result length)"),
        ('insertions', "Average insertions number (rel to the result length)"),
        ('changes', "Average changes number (rel to the result length)"),
    )
    for stat_key, field_label in fields:
        average = group_stats[stat_key].mean().item()
        gr.Number(label=field_label, interactive=False,
                  value=average, precision=3)
91
+
92
+
93
+ with gr.Row():
94
+ with gr.Column(scale=1):
95
+ layout_for_statistics("manual")
96
+
97
+ with gr.Column(scale=1):
98
+ layout_for_statistics("synthetic")
99
 
100
  application.load(update_manual_view, inputs=slider_manual,
101
  outputs=view_manual)
statistics.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+
4
+
5
def get_statistics_for_df(df: pd.DataFrame, end_col, annotated_col):
    """Compute per-row edit sizes relative to the length of the end text.

    Args:
        df: Frame holding one sample per row.
        end_col: Name of the column with the resulting text; its ``len()``
            is the denominator of every ratio.
        annotated_col: Name of the column whose cells are iterables of
            ``(text, change_type)`` pairs. Pairs tagged ``'-'`` count as
            deletions, pairs tagged ``'+'`` as insertions; any other tag is
            ignored.

    Returns:
        A dict with the keys ``"deletions"``, ``"insertions"`` and
        ``"changes"`` (deletions + insertions), each mapping to a NumPy
        array with one ratio per row of ``df``, in row order.

        Rows whose end text is empty have no length to normalise by; their
        ratios are reported as NaN instead of raising ``ZeroDivisionError``.
        Aggregate with ``np.nanmean`` if such rows may be present.
    """
    nan = float("nan")
    relative_deletions = []
    relative_insertions = []
    relative_changes = []

    # Only two columns are needed, so walk them directly instead of paying
    # for a full Series per row via ``iterrows``.
    for end_text, annotated_diff in zip(df[end_col], df[annotated_col]):
        sum_deletions = 0
        sum_insertions = 0
        for text, change_type in annotated_diff:
            if change_type == '-':
                sum_deletions += len(text)
            elif change_type == '+':
                sum_insertions += len(text)

        sum_changes = sum_deletions + sum_insertions
        end_length = len(end_text)

        if end_length == 0:
            # The ratio is undefined for an empty result.
            relative_deletions.append(nan)
            relative_insertions.append(nan)
            relative_changes.append(nan)
            continue

        relative_deletions.append(sum_deletions / end_length)
        relative_insertions.append(sum_insertions / end_length)
        relative_changes.append(sum_changes / end_length)

    return {
        "deletions": np.asarray(relative_deletions),
        "insertions": np.asarray(relative_insertions),
        "changes": np.asarray(relative_changes),
    }
31
+
32
+
33
def get_statistics_for_manual_df(df):
    """Return edit statistics for ``df`` using the manual-data column names.

    Delegates to ``get_statistics_for_df`` with ``'commit_msg_end'`` as the
    end-text column and ``'annotated_diff'`` as the annotated-diff column.
    """
    column_names = {
        "end_col": 'commit_msg_end',
        "annotated_col": 'annotated_diff',
    }
    return get_statistics_for_df(df, **column_names)
35
+
36
+
37
def get_statistics_for_synthetic_df(df):
    """Return edit statistics for ``df`` using the synthetic-data column names.

    Delegates to ``get_statistics_for_df`` with ``'reference'`` as the
    end-text column and ``'annotated_diff'`` as the annotated-diff column.
    """
    column_names = {
        "end_col": 'reference',
        "annotated_col": 'annotated_diff',
    }
    return get_statistics_for_df(df, **column_names)