Petr Tsvetkov committed on
Commit
5434c4b
•
1 Parent(s): 6503e4e

Create visualize app

Browse files
Files changed (2) hide show
  1. change_visualizer.py +29 -0
  2. generate_annotated_diffs.py +43 -42
change_visualizer.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ import generate_annotated_diffs
4
+
5
# Build the annotated dataset once, at import time: update_view reads rows
# from `df`, and the slider's upper bound and label are derived from `n_diffs`.
df = generate_annotated_diffs.data_with_annotated_diffs()
n_diffs = len(df)
7
+
8
+
9
def update_view(diff_idx):
    """Return the values shown in the UI for the selected sample.

    Args:
        diff_idx: 1-based sample number as delivered by the slider.

    Returns:
        A 5-tuple ``(annotated_diff, commit_msg_start, commit_msg_end,
        session, commit_url)`` taken from the matching row of ``df``; the
        last item is the GitHub URL built from the row's ``repo`` and
        ``hash`` fields.
    """
    # The slider counts from 1 while DataFrame.iloc is 0-based. The value is
    # coerced to int because a numeric widget may hand over a float, which
    # positional indexing rejects.
    row = df.iloc[int(diff_idx) - 1]

    commit_url = f"https://github.com/{row['repo']}/commit/{row['hash']}"
    return (
        row['annotated_diff'],
        row['commit_msg_start'],
        row['commit_msg_end'],
        row['session'],
        commit_url,
    )
14
+
15
+
16
if __name__ == '__main__':
    with gr.Blocks(theme=gr.themes.Soft()) as application:
        slider = gr.Slider(minimum=1, maximum=n_diffs, step=1, value=1, label=f"Sample number (total: {n_diffs})")

        # HighlightedText is the canonical component name; the lowercase
        # "Highlightedtext" spelling is a deprecated alias that newer Gradio
        # releases no longer provide.
        diff_view = gr.HighlightedText(combine_adjacent=True, color_map={'+': "green", '-': "red"})
        start_view = gr.Textbox(interactive=False, label="Start message", container=True)
        end_view = gr.Textbox(interactive=False, label="End message", container=True)
        session_view = gr.Textbox(interactive=False, label="Session", container=True)
        link_view = gr.Markdown()

        # Order must match the tuple returned by update_view.
        views = [diff_view, start_view, end_view, session_view, link_view]

        # Refresh all views whenever the slider moves...
        slider.change(update_view, inputs=slider, outputs=views)
        # ...and once on page load so the first sample is shown immediately.
        application.load(update_view, inputs=slider, outputs=views)

    application.launch()
generate_annotated_diffs.py CHANGED
@@ -1,7 +1,6 @@
1
- import json
2
  from datetime import datetime
3
 
4
- import gradio as gr
5
 
6
  import hf_data_loader
7
 
@@ -22,51 +21,53 @@ def group_changes(changes):
22
  return grouped_changes
23
 
24
 
25
- def get_annotated_diff(initial_text, changes):
26
- grouped_changes = group_changes(changes)
27
- text = [(c, None) for c in initial_text]
28
- for change_group in grouped_changes:
29
- next_text = []
30
- text_pointer = 0
31
- real_text_ind = 0
32
- change_pointer = 0
33
- while text_pointer < len(text):
34
- if change_pointer >= len(change_group) or real_text_ind < change_group[change_pointer]['p']:
35
- next_text.append(text[text_pointer])
36
- real_text_ind += 1
37
- text_pointer += 1
38
- elif change_group[change_pointer]['t'] == '+':
39
- if not (text[text_pointer][1] == '-' and text[text_pointer][0] == change_group[change_pointer]['c']):
40
- next_text.append((change_group[change_pointer]['c'], '+'))
41
- else:
42
- text_pointer += 1
43
-
44
- real_text_ind += 1
45
- change_pointer += 1
46
- elif change_group[change_pointer]['t'] == '-':
47
- if not (text[text_pointer][1] == '+' and text[text_pointer][0] == change_group[change_pointer]['c']):
48
- next_text.append((text[text_pointer][0], '-'))
49
- text_pointer += 1
50
-
51
- real_text_ind += 1
52
- change_pointer += 1
53
- else:
54
- raise RuntimeError("Unexpected branch")
55
- text = next_text
56
- return text
 
 
 
 
 
57
 
58
 
59
  def annotated_diff_for_row(row):
60
  start = row['commit_msg_start']
61
- changes = json.loads(row['commit_msg_history'])
62
- return get_annotated_diff(start, changes)
63
 
64
 
65
- if __name__ == '__main__':
66
  df = hf_data_loader.load_raw_dataset_as_pandas()
67
  annotated = df.apply(annotated_diff_for_row, axis=1)
68
- with gr.Blocks(theme=gr.themes.Soft()) as application:
69
- gr.Highlightedtext(value=annotated[0], combine_adjacent=True, color_map={'+': "green", '-': "red"})
70
- gr.Markdown(value=df.iloc[0]['commit_msg_start'])
71
- gr.Markdown(value=df.iloc[0]['commit_msg_end'])
72
- application.launch()
 
 
1
  from datetime import datetime
2
 
3
+ import diff_match_patch as dmp_module
4
 
5
  import hf_data_loader
6
 
 
21
  return grouped_changes
22
 
23
 
24
def fill_in_annotation_gaps(annotated_text):
    """Forward-fill missing tags in a list of ``[text, tag]`` pairs, in place.

    Entries before the first tagged entry are left untouched. From the first
    tagged entry onwards, every entry whose tag is ``None`` receives the tag
    of the closest tagged entry that precedes it, up to the end of the list.

    Args:
        annotated_text: list of mutable two-item sequences ``[text, tag]``
            where ``tag`` is ``None`` for unannotated entries.

    Returns:
        The same ``annotated_text`` object, after modification.
    """
    run_begin = None  # index where the current run of one tag started
    run_tag = None    # tag of the current run

    for pos, entry in enumerate(annotated_text):
        tag = entry[1]
        if tag is None:
            continue

        if run_tag is None:
            # First tagged entry seen: open the first run here.
            run_begin = pos
        elif tag != run_tag:
            # Tag changed: close the previous run by stamping its tag on
            # everything it spans, then open a new run at this position.
            for item in annotated_text[run_begin:pos]:
                item[1] = run_tag
            run_begin = pos
        run_tag = tag

    # Close the last open run; it extends to the end of the list.
    if run_begin is not None:
        for item in annotated_text[run_begin:]:
            item[1] = run_tag

    return annotated_text
45
+
46
+
47
def get_annotated_diff(start_text, end_text):
    """Diff two texts and tag every fragment of the result.

    Args:
        start_text: the text before editing.
        end_text: the text after editing.

    Returns:
        A list of ``[fragment, tag]`` pairs in diff order, where ``tag`` is
        ``'-'`` for diff operation -1, ``'+'`` for operation 1 and ``None``
        for operation 0.
    """
    differ = dmp_module.diff_match_patch()

    fragments = differ.diff_main(start_text, end_text)
    # The return value is not used: the cleanup is relied on to rewrite
    # `fragments` in place.
    differ.diff_cleanupSemantic(fragments)

    tag_by_op = {-1: '-', 0: None, 1: '+'}
    return [[text, tag_by_op[op]] for op, text in fragments]
61
 
62
 
63
def annotated_diff_for_row(row):
    """Return the annotated diff between a row's start and end commit messages.

    Args:
        row: a mapping-like record exposing ``commit_msg_start`` and
            ``commit_msg_end``.
    """
    return get_annotated_diff(row['commit_msg_start'], row['commit_msg_end'])
67
 
68
 
69
def data_with_annotated_diffs():
    """Load the raw dataset and attach an ``annotated_diff`` column.

    Returns:
        The loaded frame with one extra column, ``annotated_diff``, holding
        the result of ``annotated_diff_for_row`` for each row.
    """
    dataset = hf_data_loader.load_raw_dataset_as_pandas()
    # axis=1 applies the function row by row.
    dataset['annotated_diff'] = dataset.apply(annotated_diff_for_row, axis=1)
    return dataset