File size: 1,363 Bytes
f26a894
 
 
 
4017643
f26a894
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4017643
f26a894
4017643
f26a894
 
 
 
 
 
 
 
 
 
 
4017643
 
f26a894
 
 
e2a35c0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import numpy as np
import pandas as pd


def _relative(count, length):
    """Return ``count / length``, or NaN when ``length`` is zero."""
    # An empty reference text makes the ratio undefined; NaN keeps the row
    # in the output instead of aborting the whole run with ZeroDivisionError.
    return count / length if length else float("nan")


def get_statistics_for_df(df: pd.DataFrame, start_col, end_col, annotated_col):
    """Compute per-row relative edit sizes from an annotated diff column.

    Each cell of ``annotated_col`` is iterated as ``(text, change_type)``
    pairs. Pairs whose ``change_type`` is ``'-'`` count ``len(text)`` towards
    deletions, pairs with ``'+'`` count towards insertions, and every other
    ``change_type`` is ignored.

    Args:
        df: Dataframe holding the three columns named below.
        start_col: Column whose cell length normalizes the deletions.
        end_col: Column whose cell length normalizes the insertions and the
            combined changes.
        annotated_col: Column holding the iterable of
            ``(text, change_type)`` pairs.

    Returns:
        A dict with the keys ``"deletions"``, ``"insertions"`` and
        ``"changes"``; each value is a NumPy array with one entry per row.
        ``"deletions"`` is the deleted length divided by ``len(start)``;
        ``"insertions"`` and ``"changes"`` (deleted + inserted length) are
        divided by ``len(end)``. An entry is NaN when its divisor is zero.
    """
    relative_deletions = []
    relative_insertions = []
    relative_changes = []

    # Walk only the three needed columns in lockstep; this avoids building a
    # Series per row the way ``iterrows`` does.
    rows = zip(df[start_col], df[end_col], df[annotated_col])
    for start_text, end_text, annotated in rows:
        sum_deletions = 0
        sum_insertions = 0
        for text, change_type in annotated:
            if change_type == '-':
                sum_deletions += len(text)
            elif change_type == '+':
                sum_insertions += len(text)

        start_length = len(start_text)
        end_length = len(end_text)

        relative_deletions.append(_relative(sum_deletions, start_length))
        relative_insertions.append(_relative(sum_insertions, end_length))
        relative_changes.append(
            _relative(sum_deletions + sum_insertions, end_length)
        )

    return {
        "deletions": np.asarray(relative_deletions),
        "insertions": np.asarray(relative_insertions),
        "changes": np.asarray(relative_changes)
    }


def get_statistics_for_manual_df(df):
    """Run ``get_statistics_for_df`` with a fixed set of column names.

    The start text is read from ``commit_msg_start``, the end text from
    ``commit_msg_end`` and the annotated diff from ``annotated_diff``.
    The result of ``get_statistics_for_df`` is returned unchanged.
    """
    column_names = {
        "start_col": "commit_msg_start",
        "end_col": 'commit_msg_end',
        "annotated_col": 'annotated_diff',
    }
    return get_statistics_for_df(df, **column_names)


def get_statistics_for_synthetic_df(df):
    """Run ``get_statistics_for_df`` with a fixed set of column names.

    The start text is read from ``initial_msg_pred``, the end text from
    ``reference`` and the annotated diff from ``annotated_diff``.
    The result of ``get_statistics_for_df`` is returned unchanged.
    """
    column_names = {
        "start_col": "initial_msg_pred",
        "end_col": 'reference',
        "annotated_col": 'annotated_diff',
    }
    return get_statistics_for_df(df, **column_names)