commit-message-editing-visualization / dataset_statistics.py
Petr Tsvetkov
Fix the synthetic data generation pipeline
347f566
raw
history blame contribute delete
No virus
2.48 kB
import pickle
import Levenshtein
import numpy as np
import pandas as pd
import plotly.figure_factory as ff
from scipy.stats import stats
import config
def get_statistics_for_sample(start_msg, end_msg, row=None):
    """Compute character-level edit statistics between two messages.

    The statistics are derived from the Levenshtein edit operations that
    transform ``start_msg`` into ``end_msg``.

    Args:
        start_msg: The message before editing.
        end_msg: The message after editing.
        row: Optional mapping for the sample. When given, its
            ``"editdist_related"`` entry is used as the edit distance
            instead of recomputing it from the two messages.

    Returns:
        A dict with the keys ``deletions``, ``insertions``, ``changes``,
        their ``*_norm`` counterparts, ``lendiff`` and ``editdist``.
        ``deletions_norm`` is divided by the length of ``start_msg``;
        ``insertions_norm`` and ``changes_norm`` are divided by the length
        of ``end_msg``. An empty message is treated as having length 1 for
        normalisation, so the function never divides by zero.
    """
    edit_ops = Levenshtein.editops(start_msg, end_msg)

    # Single pass over the operations instead of one pass per operation kind.
    op_counts = {"delete": 0, "insert": 0, "replace": 0}
    for op, _, _ in edit_ops:
        if op in op_counts:
            op_counts[op] += 1

    n_replaces = op_counts["replace"]
    n_changes = op_counts["delete"] + op_counts["insert"] + n_replaces
    # A replacement is counted both as a deletion and as an insertion,
    # while it contributes only once to the total number of changes.
    n_deletes = op_counts["delete"] + n_replaces
    n_inserts = op_counts["insert"] + n_replaces

    # Clamp the denominators: an empty message would otherwise raise
    # ZeroDivisionError. For non-empty messages the result is unchanged.
    start_len = max(len(start_msg), 1)
    end_len = max(len(end_msg), 1)

    if row is not None:
        edit_distance = row["editdist_related"]
    else:
        edit_distance = Levenshtein.distance(start_msg, end_msg)

    return {
        "deletions": n_deletes,
        "insertions": n_inserts,
        "changes": n_changes,
        "deletions_norm": n_deletes / start_len,
        "insertions_norm": n_inserts / end_len,
        "changes_norm": n_changes / end_len,
        "lendiff": abs(len(start_msg) - len(end_msg)),
        "editdist": edit_distance,
    }
def get_statistics_for_row(row):
    """Compute edit statistics for one dataset row.

    Reads the ``"commit_msg_start"`` and ``"commit_msg_end"`` entries of
    ``row`` and forwards them, together with the row itself, to
    ``get_statistics_for_sample``.
    """
    return get_statistics_for_sample(
        row["commit_msg_start"],
        row["commit_msg_end"],
        row=row,
    )
def get_statistics_for_df(df: pd.DataFrame):
    """Collect per-row edit statistics of ``df`` into one array per statistic.

    Args:
        df: Data frame whose rows are passed to ``get_statistics_for_row``.
            It must contain at least one row.

    Returns:
        A dict mapping each statistic name (taken from the first row's
        result) to a NumPy array holding that statistic for every row, in
        row order.
    """
    per_row = []
    for _, row in df.iterrows():
        per_row.append(get_statistics_for_row(row))

    assert len(per_row) > 0

    by_stat = {}
    for stat_name in per_row[0]:
        values = [entry[stat_name] for entry in per_row]
        by_stat[stat_name] = np.asarray(values)
    return by_stat
def build_plotly_chart(stat_golden, stat_e2s, stat_s2e, stat_e2s_s2e, stat_name):
    """Build a distribution plot for one statistic and pickle the plotted data.

    Five series are plotted: the golden values, each of the three synthetic
    groups (``e2s``, ``s2e``, ``e2s+s2e``), and the concatenation of those
    three groups labelled ``Synthetic``. The histogram and rug layers of the
    distplot are disabled.

    Side effect: the list of plotted series is pickled to
    ``config.OUTPUT_CHARTS_DIR / "<stat_name>_data.pkl"``.

    Args:
        stat_golden: Values of the statistic for the golden group.
        stat_e2s: Values of the statistic for the ``e2s`` group.
        stat_s2e: Values of the statistic for the ``s2e`` group.
        stat_e2s_s2e: Values of the statistic for the ``e2s+s2e`` group.
        stat_name: Name of the statistic; used as chart title and as the
            prefix of the pickle file name.

    Returns:
        The Plotly figure produced by ``ff.create_distplot``.
    """
    synthetic_all = np.concatenate((stat_e2s, stat_s2e, stat_e2s_s2e), axis=0)
    series = [stat_golden, stat_e2s, stat_s2e, stat_e2s_s2e, synthetic_all]
    labels = ["Golden", "e2s", "s2e", "e2s+s2e", "Synthetic"]

    chart = ff.create_distplot(
        series,
        labels,
        bin_size=0.05,
        show_rug=False,
        show_hist=False,
    )
    chart.update_layout(title_text=stat_name)

    # Persist the raw series next to the charts so they can be re-plotted later.
    dump_path = config.OUTPUT_CHARTS_DIR / f"{stat_name}_data.pkl"
    with open(dump_path, "wb") as out_file:
        pickle.dump(series, out_file)

    return chart
def t_test(group_stats, main_group="manual"):
    """Run Welch's t-test of every group against the main group, per statistic.

    Args:
        group_stats: Mapping ``group name -> {statistic name -> samples}``.
        main_group: Key of the reference group inside ``group_stats``.

    Returns:
        A nested dict ``{group: {statistic: p_value}}``. The main group is
        also compared with itself.

    Raises:
        KeyError: If ``main_group`` is missing from ``group_stats``, or lacks
            a statistic that another group has.
    """
    # Import from the public namespace: ``scipy.stats.stats`` is deprecated.
    from scipy import stats as scipy_stats

    results = {}
    for group, group_values in group_stats.items():
        # ``random_state`` is intentionally not passed: SciPy only uses it for
        # permutation tests (``permutations`` set), so it had no effect here,
        # and the argument is deprecated in recent SciPy releases.
        results[group] = {
            stat: scipy_stats.ttest_ind(
                group_stats[main_group][stat],
                samples,
                equal_var=False,  # Welch's test: no equal-variance assumption
            ).pvalue
            for stat, samples in group_values.items()
        }
    return results