import json
from datetime import datetime, timedelta

from datasets import load_dataset

import config


def load_raw_rewriting_as_pandas():
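    """Load the raw commit-message rewriting dataset from the Hugging Face Hub as a pandas DataFrame."""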
    return load_dataset(config.HF_RAW_DATASET_NAME,
                        split=config.HF_RAW_DATASET_SPLIT,
                        token=config.HF_TOKEN,
                        cache_dir=config.CACHE_DIR).to_pandas()


def load_full_commit_as_pandas():
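    """Load the full-commit dataset as a pandas DataFrame, renaming the 'message' column to 'reference'."""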
    return load_dataset(path=config.HF_FULL_COMMITS_DATASET_NAME,
                        name=config.HF_FULL_COMMITS_DATASET_SUBNAME,
                        split=config.HF_FULL_COMMITS_DATASET_SPLIT,
                        cache_dir=config.CACHE_DIR).to_pandas().rename(
        columns={'message': 'reference'})


def edit_time_from_history(history_str):
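    """Return the span in milliseconds between the earliest and latest entries of a JSON-encoded edit history."""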
    history = json.loads(history_str)

    if len(history) == 0:
        return 0

    timestamps = [datetime.fromisoformat(e['ts']) for e in history]
    delta = max(timestamps) - min(timestamps)

    return delta // timedelta(milliseconds=1)


def edit_time_from_timestamps(row):
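    """Return the milliseconds between a row's loaded and submitted timestamps, or None if negative."""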
    loaded_ts = datetime.fromisoformat(row['loaded_ts'])
    submitted_ts = datetime.fromisoformat(row['submitted_ts'])

    delta = submitted_ts - loaded_ts

    result = delta // timedelta(milliseconds=1)

    return result if result >= 0 else None


def load_processed_rewriting_as_pandas():
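    """Load the manual rewriting data with derived edit-time columns, joined with each commit's mods."""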
    manual_rewriting = load_raw_rewriting_as_pandas()[
        ["hash", "repo", "commit_msg_start", "commit_msg_end", "session", "commit_msg_history", "loaded_ts",
         "submitted_ts"]]

    manual_rewriting['edit_time_hist'] = manual_rewriting['commit_msg_history'].apply(edit_time_from_history)
    manual_rewriting['edit_time'] = manual_rewriting.apply(edit_time_from_timestamps, axis=1)

    manual_rewriting = manual_rewriting.drop(columns=['commit_msg_history', 'loaded_ts', 'submitted_ts'])

    manual_rewriting.set_index(["hash", "repo"], inplace=True)

    mods_dataset = load_full_commit_as_pandas()[["hash", "repo", "mods"]]
    mods_dataset.set_index(["hash", "repo"], inplace=True)

    return manual_rewriting.join(other=mods_dataset, how='left').reset_index()


def load_synthetic_as_pandas():
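    """Load the synthetic dataset from the Hugging Face Hub as a pandas DataFrame."""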
    return load_dataset(config.HF_SYNTHETIC_DATASET_NAME,
                        split=config.HF_SYNTHETIC_DATASET_SPLIT,
                        token=config.HF_TOKEN,
                        cache_dir=config.CACHE_DIR).to_pandas()


def load_full_commit_with_predictions_as_pandas():
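    """Load the full-commit dataset left-joined with one randomly kept prediction per (hash, repo) pair."""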
    full_dataset = load_full_commit_as_pandas()
    predictions_dataset = load_dataset(config.HF_PREDICTIONS_DATASET_NAME,
                                       config.HF_PREDICTIONS_DATASET_SUBNAME,
                                       split=config.HF_PREDICTIONS_DATASET_SPLIT,
                                       cache_dir=config.CACHE_DIR).to_pandas()
    predictions_dataset = (predictions_dataset.sample(frac=1, random_state=config.RANDOM_STATE)
                           .set_index(['hash', 'repo'])[['prediction']])

    # The shuffle above randomizes which prediction survives this de-duplication.
    predictions_dataset = predictions_dataset[~predictions_dataset.index.duplicated(keep='first')]

    dataset = full_dataset.join(other=predictions_dataset, on=['hash', 'repo'])

    return dataset
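

# Minimal usage sketch, assuming config.py defines the HF_* dataset names,
# CACHE_DIR, RANDOM_STATE, and an HF_TOKEN with access to the datasets above.
if __name__ == '__main__':
    processed = load_processed_rewriting_as_pandas()
    print(processed.columns.tolist())
    print(processed[['edit_time', 'edit_time_hist']].head())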