import numpy as np
import pandas as pd


def _count_changed_chars(annotated_diff):
    """Return ``(deleted_chars, inserted_chars)`` for one annotated diff.

    ``annotated_diff`` is iterated as ``(text, change_type)`` pairs. The
    length of ``text`` is added to the deletion total when ``change_type``
    is ``'-'`` and to the insertion total when it is ``'+'``; any other
    change type is ignored.
    """
    deleted = 0
    inserted = 0
    for text, change_type in annotated_diff:
        if change_type == '-':
            deleted += len(text)
        elif change_type == '+':
            inserted += len(text)
    return deleted, inserted


def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or NaN when the denominator is 0."""
    # An empty text has length 0; the relative measure is undefined there, so
    # report NaN instead of aborting the whole computation with
    # ZeroDivisionError.
    if denominator == 0:
        return float("nan")
    return numerator / denominator


def get_statistics_for_df(
    df: pd.DataFrame,
    start_col: str,
    end_col: str,
    annotated_col: str,
) -> dict:
    """Compute per-row relative edit statistics from annotated diffs.

    For every row, the characters marked as deleted (``'-'``) and inserted
    (``'+'``) in ``df[annotated_col]`` are counted, then normalised:

    * ``deletions``  = deleted characters / ``len`` of the ``start_col`` value
    * ``insertions`` = inserted characters / ``len`` of the ``end_col`` value
    * ``changes``    = (deleted + inserted) / ``len`` of the ``end_col`` value

    Args:
        df: Frame holding the three columns named below.
        start_col: Column whose values' lengths normalise the deletions.
        end_col: Column whose values' lengths normalise the insertions and
            the combined changes.
        annotated_col: Column whose values are iterables of
            ``(text, change_type)`` pairs.

    Returns:
        A dict with keys ``"deletions"``, ``"insertions"`` and ``"changes"``,
        each mapping to a NumPy array with one entry per row of ``df``. An
        entry is NaN when the length it would be divided by is zero.
    """
    relative_deletions = []
    relative_insertions = []
    relative_changes = []

    # Walk only the three needed columns in lockstep; this avoids building a
    # Series for every row as ``DataFrame.iterrows`` does.
    for start_text, end_text, annotated_diff in zip(
        df[start_col], df[end_col], df[annotated_col]
    ):
        sum_deletions, sum_insertions = _count_changed_chars(annotated_diff)
        start_length = len(start_text)
        end_length = len(end_text)

        relative_deletions.append(_safe_ratio(sum_deletions, start_length))
        relative_insertions.append(_safe_ratio(sum_insertions, end_length))
        relative_changes.append(
            _safe_ratio(sum_deletions + sum_insertions, end_length)
        )

    return {
        "deletions": np.asarray(relative_deletions),
        "insertions": np.asarray(relative_insertions),
        "changes": np.asarray(relative_changes),
    }


def get_statistics_for_manual_df(df):
    """Run ``get_statistics_for_df`` with the manual-data column names.

    Uses ``commit_msg_start`` as the start column, ``commit_msg_end`` as the
    end column and ``annotated_diff`` as the annotated column.
    """
    return get_statistics_for_df(
        df,
        start_col="commit_msg_start",
        end_col='commit_msg_end',
        annotated_col='annotated_diff',
    )


def get_statistics_for_synthetic_df(df):
    """Run ``get_statistics_for_df`` with the synthetic-data column names.

    Uses ``commit_msg_start`` as the start column, ``reference`` as the end
    column and ``annotated_diff`` as the annotated column.
    """
    return get_statistics_for_df(
        df,
        start_col="commit_msg_start",
        end_col='reference',
        annotated_col='annotated_diff',
    )