|
import numpy as np |
|
import pandas as pd |
|
|
|
|
|
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or NaN when the denominator is 0."""
    return numerator / denominator if denominator else float("nan")


def get_statistics_for_df(df: pd.DataFrame, start_col, end_col, annotated_col):
    """Compute per-row relative edit statistics from an annotated diff column.

    For every row, the lengths of all diff fragments tagged ``'-'`` are
    summed as deletions and those tagged ``'+'`` as insertions; fragments
    with any other tag are ignored.

    Args:
        df: Frame holding the start text, end text and annotated diff.
        start_col: Label of the column with the text before the edit.
        end_col: Label of the column with the text after the edit.
        annotated_col: Label of the column whose cells are iterables of
            ``(text, change_type)`` pairs.

    Returns:
        A dict with three NumPy arrays, one entry per row of ``df``:
        ``"deletions"`` (deleted length / start length),
        ``"insertions"`` (inserted length / end length) and
        ``"changes"`` ((deleted + inserted) length / end length).
        A ratio whose denominator length is 0 is reported as NaN instead
        of raising ``ZeroDivisionError``.
    """
    relative_deletions = []
    relative_insertions = []
    relative_changes = []

    # Walk the three columns in lockstep; this avoids building a Series
    # per row the way ``DataFrame.iterrows`` does.
    rows = zip(df[start_col], df[end_col], df[annotated_col])
    for start_text, end_text, annotated in rows:
        sum_deletions = 0
        sum_insertions = 0
        for text, change_type in annotated:
            if change_type == '-':
                sum_deletions += len(text)
            elif change_type == '+':
                sum_insertions += len(text)

        start_length = len(start_text)
        end_length = len(end_text)

        relative_deletions.append(_safe_ratio(sum_deletions, start_length))
        relative_insertions.append(_safe_ratio(sum_insertions, end_length))
        # Total change is normalised by the end length, like insertions.
        relative_changes.append(
            _safe_ratio(sum_deletions + sum_insertions, end_length)
        )

    return {
        "deletions": np.asarray(relative_deletions),
        "insertions": np.asarray(relative_insertions),
        "changes": np.asarray(relative_changes),
    }
|
|
|
|
|
def get_statistics_for_manual_df(df):
    """Run ``get_statistics_for_df`` with the manual-frame column names.

    Uses ``commit_msg_start`` as the start text, ``commit_msg_end`` as the
    end text and ``annotated_diff`` as the annotated diff column.
    """
    column_names = {
        "start_col": "commit_msg_start",
        "end_col": "commit_msg_end",
        "annotated_col": "annotated_diff",
    }
    return get_statistics_for_df(df, **column_names)
|
|
|
|
|
def get_statistics_for_synthetic_df(df):
    """Run ``get_statistics_for_df`` with the synthetic-frame column names.

    Uses ``initial_msg_pred`` as the start text, ``reference`` as the end
    text and ``annotated_diff`` as the annotated diff column.
    """
    column_names = {
        "start_col": "initial_msg_pred",
        "end_col": "reference",
        "annotated_col": "annotated_diff",
    }
    return get_statistics_for_df(df, **column_names)
|
|