Spaces:
Runtime error
Runtime error
Petr Tsvetkov
committed on
Commit
·
5f3a4af
1
Parent(s):
0c136d8
Synthetic dataset visualization
Browse files- change_visualizer.py +32 -0
- config.py +8 -0
- generate_annotated_diffs.py +7 -0
- generate_synthetic_dataset.py +2 -1
- hf_data_loader.py +10 -3
change_visualizer.py
CHANGED
|
@@ -5,6 +5,9 @@ import generate_annotated_diffs
|
|
| 5 |
df_manual = generate_annotated_diffs.manual_data_with_annotated_diffs()
|
| 6 |
n_diffs_manual = len(df_manual)
|
| 7 |
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
def update_manual_view(diff_idx):
|
| 10 |
diff_idx -= 1
|
|
@@ -14,6 +17,14 @@ def update_manual_view(diff_idx):
|
|
| 14 |
'session'], f"https://github.com/{df_manual.iloc[diff_idx]['repo']}/commit/{df_manual.iloc[diff_idx]['hash']}"
|
| 15 |
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
if __name__ == '__main__':
|
| 18 |
with gr.Blocks(theme=gr.themes.Soft()) as application:
|
| 19 |
with gr.Tab("Manual"):
|
|
@@ -36,7 +47,28 @@ if __name__ == '__main__':
|
|
| 36 |
slider_manual.change(update_manual_view, inputs=slider_manual,
|
| 37 |
outputs=view_manual)
|
| 38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
application.load(update_manual_view, inputs=slider_manual,
|
| 40 |
outputs=view_manual)
|
| 41 |
|
|
|
|
|
|
|
|
|
|
| 42 |
application.launch()
|
|
|
|
| 5 |
df_manual = generate_annotated_diffs.manual_data_with_annotated_diffs()
|
| 6 |
n_diffs_manual = len(df_manual)
|
| 7 |
|
| 8 |
+
df_synthetic = generate_annotated_diffs.synthetic_data_with_annotated_diffs()
|
| 9 |
+
n_diffs_synthetic = len(df_synthetic)
|
| 10 |
+
|
| 11 |
|
| 12 |
def update_manual_view(diff_idx):
|
| 13 |
diff_idx -= 1
|
|
|
|
| 17 |
'session'], f"https://github.com/{df_manual.iloc[diff_idx]['repo']}/commit/{df_manual.iloc[diff_idx]['hash']}"
|
| 18 |
|
| 19 |
|
| 20 |
+
def update_synthetic_view(diff_idx):
    """Collect the values displayed for one sample of the synthetic dataset.

    :param diff_idx: 1-based sample number (the slider that feeds this
        callback is created with ``minimum=1``).
    :return: a 4-tuple of (annotated diff, value of ``initial_msg_pred``,
        value of ``get_annotated_diff``, GitHub commit URL) read from the
        selected row of ``df_synthetic``.
    """
    # Sample numbers start at 1, DataFrame positions start at 0.
    row = df_synthetic.iloc[diff_idx - 1]

    # NOTE(review): 'get_annotated_diff' reads like a function name rather
    # than a dataset column -- confirm this key exists in the synthetic
    # dataset; it is kept unchanged here.
    return (
        row['annotated_diff'],
        row['initial_msg_pred'],
        row['get_annotated_diff'],
        f"https://github.com/{row['repo']}/commit/{row['hash']}",
    )
|
| 26 |
+
|
| 27 |
+
|
| 28 |
if __name__ == '__main__':
|
| 29 |
with gr.Blocks(theme=gr.themes.Soft()) as application:
|
| 30 |
with gr.Tab("Manual"):
|
|
|
|
| 47 |
slider_manual.change(update_manual_view, inputs=slider_manual,
|
| 48 |
outputs=view_manual)
|
| 49 |
|
| 50 |
+
with gr.Tab("Synthetic"):
|
| 51 |
+
slider_synthetic = gr.Slider(minimum=1, maximum=n_diffs_synthetic, step=1, value=1,
|
| 52 |
+
label=f"Sample number (total: {n_diffs_synthetic})")
|
| 53 |
+
|
| 54 |
+
diff_view_synthetic = gr.Highlightedtext(combine_adjacent=True, color_map={'+': "green", '-': "red"})
|
| 55 |
+
start_view_synthetic = gr.Textbox(interactive=False, label="Start message", container=True)
|
| 56 |
+
end_view_synthetic = gr.Textbox(interactive=False, label="End message", container=True)
|
| 57 |
+
link_view_synthetic = gr.Markdown()
|
| 58 |
+
view_synthetic = [
|
| 59 |
+
diff_view_synthetic,
|
| 60 |
+
start_view_synthetic,
|
| 61 |
+
end_view_synthetic,
|
| 62 |
+
link_view_synthetic
|
| 63 |
+
]
|
| 64 |
+
|
| 65 |
+
slider_synthetic.change(update_synthetic_view, inputs=slider_synthetic,
|
| 66 |
+
outputs=view_synthetic)
|
| 67 |
+
|
| 68 |
application.load(update_manual_view, inputs=slider_manual,
|
| 69 |
outputs=view_manual)
|
| 70 |
|
| 71 |
+
application.load(update_synthetic_view, inputs=slider_synthetic,
|
| 72 |
+
outputs=view_synthetic)
|
| 73 |
+
|
| 74 |
application.launch()
|
config.py
CHANGED
|
@@ -4,9 +4,17 @@ from pathlib import Path
|
|
| 4 |
GRAZIE_API_JWT_TOKEN = os.environ.get("GRAZIE_API_JWT_TOKEN")
|
| 5 |
|
| 6 |
HF_TOKEN = os.environ.get('HF_TOKEN')
|
|
|
|
| 7 |
HF_RAW_DATASET_NAME = "petrtsv-jb/commit-msg-rewriting"
|
| 8 |
HF_RAW_DATASET_SPLIT = 'train'
|
| 9 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
CACHE_DIR = Path("cache")
|
| 11 |
CACHE_DIR.mkdir(exist_ok=True)
|
| 12 |
|
|
|
|
| 4 |
GRAZIE_API_JWT_TOKEN = os.environ.get("GRAZIE_API_JWT_TOKEN")
|
| 5 |
|
| 6 |
HF_TOKEN = os.environ.get('HF_TOKEN')
|
| 7 |
+
|
| 8 |
HF_RAW_DATASET_NAME = "petrtsv-jb/commit-msg-rewriting"
|
| 9 |
HF_RAW_DATASET_SPLIT = 'train'
|
| 10 |
|
| 11 |
+
HF_FULL_COMMITS_DATASET_NAME = "JetBrains-Research/lca-commit-message-generation"
|
| 12 |
+
HF_FULL_COMMITS_DATASET_SUBNAME = "commitchronicle-py-long"
|
| 13 |
+
HF_FULL_COMMITS_DATASET_SPLIT = "test"
|
| 14 |
+
|
| 15 |
+
HF_SYNTHETIC_DATASET_NAME = "petrtsv-jb/synthetic-commit-msg-rewriting"
|
| 16 |
+
HF_SYNTHETIC_DATASET_SPLIT = 'train'
|
| 17 |
+
|
| 18 |
CACHE_DIR = Path("cache")
|
| 19 |
CACHE_DIR.mkdir(exist_ok=True)
|
| 20 |
|
generate_annotated_diffs.py
CHANGED
|
@@ -36,3 +36,10 @@ def manual_data_with_annotated_diffs():
|
|
| 36 |
annotated = df.apply(annotated_diff_for_row_manual_df, axis=1)
|
| 37 |
df['annotated_diff'] = annotated
|
| 38 |
return df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
annotated = df.apply(annotated_diff_for_row_manual_df, axis=1)
|
| 37 |
df['annotated_diff'] = annotated
|
| 38 |
return df
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def synthetic_data_with_annotated_diffs():
    """Load the synthetic dataset and add an ``annotated_diff`` column.

    The column is produced by applying ``annotated_diff_for_row_synthetic_df``
    to every row (``axis=1``) of the loaded DataFrame.

    :return: the loaded DataFrame with the extra ``annotated_diff`` column.
    """
    synthetic_df = hf_data_loader.load_synthetic_dataset_as_pandas()
    synthetic_df['annotated_diff'] = synthetic_df.apply(annotated_diff_for_row_synthetic_df, axis=1)
    return synthetic_df
|
generate_synthetic_dataset.py
CHANGED
|
@@ -58,7 +58,8 @@ def generate_synthetic_dataset():
|
|
| 58 |
initial_messages_pred = []
|
| 59 |
|
| 60 |
for prompt in tqdm(df['initial_msg_prompt']):
|
| 61 |
-
|
|
|
|
| 62 |
|
| 63 |
df['initial_msg_pred'] = initial_messages_pred
|
| 64 |
|
|
|
|
| 58 |
initial_messages_pred = []
|
| 59 |
|
| 60 |
for prompt in tqdm(df['initial_msg_prompt']):
|
| 61 |
+
output = generate_initial_msg(prompt)
|
| 62 |
+
initial_messages_pred.append(output)
|
| 63 |
|
| 64 |
df['initial_msg_pred'] = initial_messages_pred
|
| 65 |
|
hf_data_loader.py
CHANGED
|
@@ -11,8 +11,15 @@ def load_raw_rewriting_dataset_as_pandas():
|
|
| 11 |
|
| 12 |
|
| 13 |
def load_full_commit_dataset_as_pandas():
|
| 14 |
-
return load_dataset(
|
| 15 |
-
|
| 16 |
-
split=
|
| 17 |
cache_dir=config.CACHE_DIR).to_pandas().rename(
|
| 18 |
columns={'message': 'reference'})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
|
| 13 |
def load_full_commit_dataset_as_pandas():
    """Load the full-commits dataset as a pandas DataFrame.

    The dataset path, configuration name and split are taken from ``config``;
    downloaded files are cached under ``config.CACHE_DIR``.

    :return: the dataset converted to pandas, with the ``message`` column
        renamed to ``reference``.
    """
    full_commits = load_dataset(
        path=config.HF_FULL_COMMITS_DATASET_NAME,
        name=config.HF_FULL_COMMITS_DATASET_SUBNAME,
        split=config.HF_FULL_COMMITS_DATASET_SPLIT,
        cache_dir=config.CACHE_DIR,
    )
    commits_df = full_commits.to_pandas()
    return commits_df.rename(columns={'message': 'reference'})
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def load_synthetic_dataset_as_pandas():
    """Load the synthetic commit-message dataset as a pandas DataFrame.

    The dataset name and split are taken from ``config``; ``config.HF_TOKEN``
    is passed as the access token and downloaded files are cached under
    ``config.CACHE_DIR``.

    :return: the dataset converted to a pandas DataFrame.
    """
    # The result must be returned: callers invoke ``.apply`` on it, which
    # fails on ``None`` if the DataFrame is built and then discarded.
    return load_dataset(config.HF_SYNTHETIC_DATASET_NAME,
                        split=config.HF_SYNTHETIC_DATASET_SPLIT,
                        token=config.HF_TOKEN,
                        cache_dir=config.CACHE_DIR).to_pandas()
|