Petr Tsvetkov
Remove the histograms
827777f
raw
history blame
No virus
1.61 kB
import Levenshtein
import numpy as np
import pandas as pd
import plotly.figure_factory as ff
def get_statistics(start_msg, end_msg, annotated_msg):
    """Compute character-level edit statistics between two messages.

    The edit script that turns ``start_msg`` into ``end_msg`` is obtained
    from ``Levenshtein.editops``. A replacement counts once towards
    ``changes`` but contributes to both ``deletions`` and ``insertions``.

    Args:
        start_msg: The initial message.
        end_msg: The final message.
        annotated_msg: Not used by this function; kept so that existing
            callers passing three arguments keep working.

    Returns:
        A dict with the raw counts ``deletions``, ``insertions`` and
        ``changes``, plus ``deletions_norm`` (deletions divided by the
        length of ``start_msg``), ``insertions_norm`` and ``changes_norm``
        (both divided by the length of ``end_msg``).
    """
    op_names = [op for op, _, _ in Levenshtein.editops(start_msg, end_msg)]
    pure_deletes = op_names.count("delete")
    pure_inserts = op_names.count("insert")
    n_replaces = op_names.count("replace")

    # Every edit operation is exactly one change ...
    n_changes = pure_deletes + pure_inserts + n_replaces
    # ... while a replacement both removes a character and adds one.
    n_deletes = pure_deletes + n_replaces
    n_inserts = pure_inserts + n_replaces

    # Clamp the denominators to 1 so that an empty message yields a finite
    # value instead of raising ZeroDivisionError. For non-empty messages the
    # result is unchanged.
    start_len = max(len(start_msg), 1)
    end_len = max(len(end_msg), 1)

    return {
        "deletions": n_deletes,
        "insertions": n_inserts,
        "changes": n_changes,
        "deletions_norm": n_deletes / start_len,
        "insertions_norm": n_inserts / end_len,
        "changes_norm": n_changes / end_len,
    }
def get_statistics_for_df(df: pd.DataFrame):
    """Collect per-row edit statistics of a data frame into arrays.

    For every row, ``get_statistics`` is called with the values of the
    ``commit_msg_start``, ``commit_msg_end`` and ``annotated_diff`` columns.

    Args:
        df: Data frame providing the three columns named above.

    Returns:
        A dict mapping each statistic name to a NumPy array holding that
        statistic for every row, in row order.

    Raises:
        AssertionError: If ``df`` has no rows.
    """
    per_row = []
    for _, record in df.iterrows():
        per_row.append(
            get_statistics(
                record["commit_msg_start"],
                record["commit_msg_end"],
                record["annotated_diff"],
            )
        )

    # The statistic names are taken from the first entry, so one must exist.
    assert len(per_row) > 0

    by_name = {}
    for key in per_row[0]:
        by_name[key] = np.asarray([entry[key] for entry in per_row])
    return by_name
def build_plotly_chart(stat_golden, stat_e2s, stat_s2e, stat_e2s_s2e, stat_name):
    """Build a distribution plot comparing one statistic across groups.

    Five curves are drawn: one for each of the four inputs, plus a
    "Synthetic" curve built from the concatenation of ``stat_e2s``,
    ``stat_s2e`` and ``stat_e2s_s2e``.

    Args:
        stat_golden: Values of the statistic for the "Golden" group.
        stat_e2s: Values of the statistic for the "e2s" group.
        stat_s2e: Values of the statistic for the "s2e" group.
        stat_e2s_s2e: Values of the statistic for the "e2s+s2e" group.
        stat_name: Text used as the figure title.

    Returns:
        The figure produced by ``plotly.figure_factory.create_distplot``.
    """
    synthetic = np.concatenate((stat_e2s, stat_s2e, stat_e2s_s2e), axis=0)
    hist_data = [stat_golden, stat_e2s, stat_s2e, stat_e2s_s2e, synthetic]
    # Labels are positional: they must stay in the same order as hist_data.
    group_labels = ['Golden', 'e2s', 's2e', 'e2s+s2e', 'Synthetic']

    # Rug and histogram bars are switched off in this call.
    fig = ff.create_distplot(
        hist_data,
        group_labels,
        bin_size=.1,
        show_rug=False,
        show_hist=False,
    )
    fig.update_layout(title_text=stat_name)
    return fig