"""Helpers for grouping timestamped changes and building annotated text diffs."""

from datetime import datetime

import diff_match_patch as dmp_module

import hf_data_loader


def group_changes(changes):
    """Group change records by timestamp and order them.

    Args:
        changes: Iterable of mappings. Each must provide a ``'ts'`` key holding
            an ISO-8601 string accepted by ``datetime.fromisoformat`` and a
            ``'p'`` key used as the sort key inside a group (presumably a
            position -- confirm against the producer of these records).

    Returns:
        A list of lists: one inner list per distinct timestamp, ordered by
        ascending timestamp, each inner list sorted by ``'p'``.

    Raises:
        KeyError: If a change lacks ``'ts'`` or ``'p'``.
        ValueError: If a ``'ts'`` value is not a valid ISO-8601 string.
    """
    groups = {}
    for change in changes:
        # Parse the timestamp so groups are ordered chronologically rather
        # than lexicographically by the raw string.
        timestamp = datetime.fromisoformat(change['ts'])
        groups.setdefault(timestamp, []).append(change)

    grouped_changes = []
    # NOTE: mixing naive and timezone-aware timestamps makes the keys
    # unorderable and sorted() will raise TypeError.
    for timestamp in sorted(groups):
        group = groups[timestamp]
        group.sort(key=lambda change: change['p'])
        grouped_changes.append(group)
    return grouped_changes


def fill_in_annotation_gaps(annotated_text):
    """Forward-fill missing annotations in a list of ``[text, tag]`` entries.

    Every entry whose tag (index 1) is ``None`` and that comes after at least
    one tagged entry receives the tag of the closest preceding tagged entry.
    Untagged entries before the first tagged entry are left as ``None``.

    Args:
        annotated_text: List of mutable two-item sequences ``[text, tag]``.
            The entries are modified in place.

    Returns:
        The same ``annotated_text`` list object, after filling.
    """
    current_tag = None
    for entry in annotated_text:
        if entry[1] is not None:
            # A tagged entry starts (or continues) a segment.
            current_tag = entry[1]
        elif current_tag is not None:
            # Gap inside or after a segment: inherit the segment's tag.
            entry[1] = current_tag
    return annotated_text


def get_annotated_diff(start_text, end_text):
    """Compute a semantically cleaned-up diff between two strings.

    Args:
        start_text: The original text.
        end_text: The modified text.

    Returns:
        A list of ``[text, tag]`` lists in diff order, where ``tag`` is
        ``'-'`` for diff operation -1, ``'+'`` for operation 1 and ``None``
        for operation 0.
    """
    dmp = dmp_module.diff_match_patch()
    # Translate diff-match-patch operation codes into annotation tags.
    dmp_mapping = {-1: '-', 0: None, 1: '+'}

    diff = dmp.diff_main(start_text, end_text)
    # Cleans the diff list in place.
    dmp.diff_cleanupSemantic(diff)

    return [[text, dmp_mapping[op]] for op, text in diff]


def annotated_diff_for_row(row):
    """Return the annotated diff for one dataset row.

    Args:
        row: Mapping-like object (e.g. a pandas row) providing the
            ``'commit_msg_start'`` and ``'commit_msg_end'`` fields.

    Returns:
        The result of ``get_annotated_diff`` applied to those two fields.
    """
    start = row['commit_msg_start']
    end = row['commit_msg_end']
    return get_annotated_diff(start, end)


def data_with_annotated_diffs():
    """Load the raw dataset and attach an ``'annotated_diff'`` column.

    The data comes from ``hf_data_loader.load_raw_dataset_as_pandas()``; it is
    assumed to be a DataFrame with ``'commit_msg_start'`` and
    ``'commit_msg_end'`` columns -- confirm against the loader.

    Returns:
        The loaded DataFrame with the extra ``'annotated_diff'`` column holding
        the per-row output of ``annotated_diff_for_row``.
    """
    df = hf_data_loader.load_raw_dataset_as_pandas()
    df['annotated_diff'] = df.apply(annotated_diff_for_row, axis=1)
    return df