Petr Tsvetkov
# of deletions rel to initial message length
4017643
raw
history blame
1.36 kB
import numpy as np
import pandas as pd
def _relative(count, length):
    """Return ``count / length``, or NaN when ``length`` is zero."""
    return count / length if length else np.nan


def get_statistics_for_df(df: pd.DataFrame, start_col: str, end_col: str, annotated_col: str) -> dict:
    """Compute per-row relative edit sizes from an annotated diff column.

    Each cell of ``annotated_col`` is iterated as ``(text, change_type)``
    pairs. Pairs whose ``change_type`` is ``'-'`` count as deletions and
    those with ``'+'`` count as insertions (by ``len(text)``); every other
    change type is ignored.

    Args:
        df: Frame holding the three columns named below.
        start_col: Column with the initial text; its length normalizes the
            deletions.
        end_col: Column with the final text; its length normalizes both the
            insertions and the total changes.
        annotated_col: Column with the iterable of ``(text, change_type)``
            pairs.

    Returns:
        A dict with keys ``"deletions"``, ``"insertions"`` and ``"changes"``,
        each mapping to a float NumPy array with one entry per row of ``df``.
        An entry is NaN when the text it is normalized by is empty, because
        the ratio is undefined there.
    """
    relative_deletions = []
    relative_insertions = []
    relative_changes = []

    # Walk the three columns in lockstep; this avoids the per-row Series
    # that DataFrame.iterrows() would build.
    for start_text, end_text, annotated in zip(df[start_col], df[end_col], df[annotated_col]):
        sum_deletions = 0
        sum_insertions = 0
        for text, change_type in annotated:
            if change_type == '-':
                sum_deletions += len(text)
            elif change_type == '+':
                sum_insertions += len(text)
        sum_changes = sum_deletions + sum_insertions

        start_length = len(start_text)
        end_length = len(end_text)

        # Deletions are relative to the initial text; insertions and total
        # changes are relative to the final text. An empty text yields NaN
        # instead of raising ZeroDivisionError for the whole frame.
        relative_deletions.append(_relative(sum_deletions, start_length))
        relative_insertions.append(_relative(sum_insertions, end_length))
        relative_changes.append(_relative(sum_changes, end_length))

    return {
        "deletions": np.asarray(relative_deletions, dtype=float),
        "insertions": np.asarray(relative_insertions, dtype=float),
        "changes": np.asarray(relative_changes, dtype=float),
    }
def get_statistics_for_manual_df(df):
    """Run ``get_statistics_for_df`` with the manual-dataset column names.

    The start text is read from ``commit_msg_start``, the end text from
    ``commit_msg_end`` and the annotated diff from ``annotated_diff``.
    """
    column_names = {
        "start_col": "commit_msg_start",
        "end_col": "commit_msg_end",
        "annotated_col": "annotated_diff",
    }
    return get_statistics_for_df(df, **column_names)
def get_statistics_for_synthetic_df(df):
    """Run ``get_statistics_for_df`` with the synthetic-dataset column names.

    The start text is read from ``commit_msg_start``, the end text from
    ``reference`` and the annotated diff from ``annotated_diff``.
    """
    column_names = {
        "start_col": "commit_msg_start",
        "end_col": "reference",
        "annotated_col": "annotated_diff",
    }
    return get_statistics_for_df(df, **column_names)