File size: 1,733 Bytes
aab3281
 
5434c4b
6503e4e
aab3281
 
 
 
 
 
6503e4e
aab3281
 
 
 
 
 
6503e4e
aab3281
 
 
 
 
5434c4b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6503e4e
 
 
 
5434c4b
 
aab3281
 
5434c4b
6503e4e
 
5434c4b
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
from datetime import datetime

import diff_match_patch as dmp_module

import hf_data_loader


def group_changes(changes):
    """Bucket changes by timestamp and return the buckets chronologically.

    Each change is a mapping with a ``'ts'`` entry (a string accepted by
    ``datetime.fromisoformat``) and a ``'p'`` entry used as the sort key
    inside a bucket (presumably a position -- confirm against the caller).

    Returns a list of lists: one inner list per distinct parsed timestamp,
    ordered by ascending timestamp, each inner list ordered by ``'p'``.
    Changes with equal ``'p'`` keep their input order (the sort is stable).
    """
    by_timestamp = {}
    for entry in changes:
        moment = datetime.fromisoformat(entry['ts'])
        by_timestamp.setdefault(moment, []).append(entry)

    ordered = []
    for moment in sorted(by_timestamp):
        bucket = by_timestamp[moment]
        bucket.sort(key=lambda item: item['p'])
        ordered.append(bucket)

    return ordered


def fill_in_annotation_gaps(annotated_text):
    """Propagate annotations forward over unannotated entries, in place.

    Each entry of ``annotated_text`` is a mutable sequence whose index 1
    holds the annotation, with ``None`` meaning "unannotated". Every
    unannotated entry that follows an annotated one receives the annotation
    of the nearest preceding annotated entry. Entries before the first
    annotated one are left as ``None``.

    The previous implementation never recorded the annotation of the first
    segment, so its tracking variable stayed ``None``; as a result it filled
    nothing and instead overwrote the last annotated entry (and everything
    after it) with ``None``.

    Args:
        annotated_text: list of ``[text, annotation]``-style entries; it is
            modified in place.

    Returns:
        The same ``annotated_text`` list object, after filling.
    """
    active_type = None

    for entry in annotated_text:
        if entry[1] is not None:
            # An explicit annotation starts (or continues) a segment.
            active_type = entry[1]
        elif active_type is not None:
            # Gap inside or after a segment: inherit the segment's type.
            entry[1] = active_type

    return annotated_text


def get_annotated_diff(start_text, end_text):
    """Diff two texts and label each resulting chunk.

    Runs ``diff_main`` from ``diff_match_patch`` on the two texts, applies
    ``diff_cleanupSemantic`` to the resulting diff list, and converts every
    ``(op, text)`` pair into a ``[text, label]`` list, where the label is
    ``'-'`` for op ``-1``, ``'+'`` for op ``1`` and ``None`` for op ``0``.

    Returns:
        A list of two-element ``[text, label]`` lists in diff order.
    """
    op_labels = {
        -1: '-',
        0: None,
        1: '+'
    }

    differ = dmp_module.diff_match_patch()
    chunks = differ.diff_main(start_text, end_text)
    # The cleanup call's return value is unused; `chunks` is read afterwards.
    differ.diff_cleanupSemantic(chunks)

    annotated = []
    for op, text in chunks:
        annotated.append([text, op_labels[op]])

    return annotated


def annotated_diff_for_row(row):
    """Return the annotated diff between a row's two commit messages.

    Reads ``row['commit_msg_start']`` and ``row['commit_msg_end']`` and
    passes them, in that order, to ``get_annotated_diff``.
    """
    return get_annotated_diff(row['commit_msg_start'], row['commit_msg_end'])


def data_with_annotated_diffs():
    """Load the raw dataset and attach an ``'annotated_diff'`` column.

    The dataset comes from ``hf_data_loader.load_raw_dataset_as_pandas()``
    (presumably a pandas DataFrame -- confirm in that module). The new
    column holds the result of ``annotated_diff_for_row`` applied to each
    row (``axis=1``).

    Returns:
        The loaded dataset object with the extra column assigned.
    """
    dataset = hf_data_loader.load_raw_dataset_as_pandas()
    dataset['annotated_diff'] = dataset.apply(annotated_diff_for_row, axis=1)
    return dataset