Spaces:
Runtime error
Runtime error
| import pickle | |
| import Levenshtein | |
| import numpy as np | |
| import pandas as pd | |
| import plotly.figure_factory as ff | |
| from scipy.stats import stats | |
| import config | |
def get_statistics_for_sample(start_msg, end_msg, row=None):
    """Compute character-level edit statistics between two messages.

    The edit script from ``Levenshtein.editops(start_msg, end_msg)`` is
    tallied by operation kind. A replacement is counted both as a deletion
    and as an insertion, but only once in the total number of changes.

    Args:
        start_msg: The initial message.
        end_msg: The final message.
        row: Optional record. When given, its ``"editdist_related"`` value is
            reported as ``"editdist"`` instead of recomputing the distance.

    Returns:
        A dict with the keys ``deletions``, ``insertions``, ``changes``,
        their ``*_norm`` counterparts, ``lendiff`` and ``editdist``.
        ``deletions_norm`` is divided by the length of ``start_msg``;
        ``insertions_norm`` and ``changes_norm`` by the length of ``end_msg``.
        An empty message is treated as having length 1 for normalization, so
        the function never raises ``ZeroDivisionError``.
    """
    # Tally every operation kind in a single pass over the edit script.
    op_counts = {"delete": 0, "insert": 0, "replace": 0}
    for op, _, _ in Levenshtein.editops(start_msg, end_msg):
        if op in op_counts:
            op_counts[op] += 1

    n_replaces = op_counts["replace"]
    n_changes = op_counts["delete"] + op_counts["insert"] + n_replaces
    # A replacement removes one character and adds another, so it
    # contributes to both the deletion and the insertion totals.
    n_deletes = op_counts["delete"] + n_replaces
    n_inserts = op_counts["insert"] + n_replaces

    # Clamp the denominators so an empty message does not divide by zero.
    start_len = max(len(start_msg), 1)
    end_len = max(len(end_msg), 1)

    if row is not None:
        edit_distance = row["editdist_related"]
    else:
        edit_distance = Levenshtein.distance(start_msg, end_msg)

    return {
        "deletions": n_deletes,
        "insertions": n_inserts,
        "changes": n_changes,
        "deletions_norm": n_deletes / start_len,
        "insertions_norm": n_inserts / end_len,
        "changes_norm": n_changes / end_len,
        "lendiff": abs(len(start_msg) - len(end_msg)),
        "editdist": edit_distance,
    }
def get_statistics_for_row(row):
    """Compute edit statistics for a single record.

    Reads the ``"commit_msg_start"`` and ``"commit_msg_end"`` entries of
    ``row`` and forwards them, together with ``row`` itself, to
    ``get_statistics_for_sample``.
    """
    return get_statistics_for_sample(
        row["commit_msg_start"],
        row["commit_msg_end"],
        row=row,
    )
def get_statistics_for_df(df: pd.DataFrame):
    """Collect per-row edit statistics of ``df`` into one array per statistic.

    Every row is passed through ``get_statistics_for_row``. The result maps
    each statistic name (taken from the first row's result) to a NumPy array
    holding that statistic for all rows, in row order.

    Asserts that ``df`` yields at least one row.
    """
    per_row = []
    for _, record in df.iterrows():
        per_row.append(get_statistics_for_row(record))
    assert len(per_row) > 0

    columns = {}
    for stat_name in per_row[0]:
        columns[stat_name] = np.asarray([entry[stat_name] for entry in per_row])
    return columns
def build_plotly_chart(stat_golden, stat_e2s, stat_s2e, stat_e2s_s2e, stat_name):
    """Build a distribution plot comparing one statistic across groups.

    Five series are plotted: the four inputs plus a "Synthetic" series that
    concatenates the three non-golden inputs. The plotted data is also
    pickled to ``config.OUTPUT_CHARTS_DIR / "<stat_name>_data.pkl"``.

    Returns:
        The figure produced by ``ff.create_distplot``, titled ``stat_name``.
    """
    synthetic = np.concatenate((stat_e2s, stat_s2e, stat_e2s_s2e), axis=0)
    series = [stat_golden, stat_e2s, stat_s2e, stat_e2s_s2e, synthetic]
    labels = ['Golden', 'e2s', 's2e', 'e2s+s2e', 'Synthetic']

    chart = ff.create_distplot(
        series,
        labels,
        bin_size=.05,
        show_rug=False,
        show_hist=False,
    )
    chart.update_layout(title_text=stat_name)

    # Persist the raw series next to the charts, after the figure is built.
    dump_path = config.OUTPUT_CHARTS_DIR / f"{stat_name}_data.pkl"
    with open(dump_path, "wb") as out:
        pickle.dump(series, out)

    return chart
def t_test(group_stats, main_group="manual"):
    """Run a t-test of every group against the main group.

    For each group and each statistic in that group, the values of
    ``main_group`` are compared with the group's values via
    ``stats.ttest_ind`` with ``equal_var=False``.

    Args:
        group_stats: Mapping of group name to a mapping of statistic name to
            values.
        main_group: Name of the group every other group is compared against.

    Returns:
        A nested dict ``{group: {stat: p_value}}``. The main group itself is
        included, compared with its own values.
    """
    return {
        group: {
            stat: stats.ttest_ind(
                group_stats[main_group][stat],
                group_stats[group][stat],
                equal_var=False,
                random_state=config.RANDOM_STATE,
            ).pvalue
            for stat in group_stats[group]
        }
        for group in group_stats
    }