"""Utilities for loading the commit-message rewriting datasets from the Hugging Face Hub as pandas DataFrames."""
from datasets import load_dataset

import config


def load_raw_rewriting_dataset_as_pandas():
    """Load the raw manual rewriting dataset from the HF Hub and convert it to a pandas DataFrame."""
    return load_dataset(config.HF_RAW_DATASET_NAME,
                        split=config.HF_RAW_DATASET_SPLIT,
                        token=config.HF_TOKEN,
                        cache_dir=config.CACHE_DIR).to_pandas()


def load_full_commit_dataset_as_pandas():
    """Load the full-commits dataset as a pandas DataFrame, renaming 'message' to 'reference'."""
    full_commits = load_dataset(path=config.HF_FULL_COMMITS_DATASET_NAME,
                                name=config.HF_FULL_COMMITS_DATASET_SUBNAME,
                                split=config.HF_FULL_COMMITS_DATASET_SPLIT,
                                cache_dir=config.CACHE_DIR).to_pandas()
    return full_commits.rename(columns={'message': 'reference'})


def load_processed_rewriting_dataset_as_pandas():
    """Attach code modifications ('mods') to the manual rewriting data, joined on (hash, repo)."""
    manual_rewriting = load_raw_rewriting_dataset_as_pandas()[
        ["hash", "repo", "commit_msg_start", "commit_msg_end", "session"]]
    manual_rewriting.set_index(["hash", "repo"], inplace=True)

    # Code modifications come from the full-commits dataset, keyed by the same (hash, repo) pair.
    mods_dataset = load_full_commit_dataset_as_pandas()[["hash", "repo", "mods"]]
    mods_dataset.set_index(["hash", "repo"], inplace=True)

    return manual_rewriting.join(other=mods_dataset, how='left').reset_index()


def load_synthetic_dataset_as_pandas():
    """Load the synthetic commit-message dataset from the HF Hub and convert it to a pandas DataFrame."""
    return load_dataset(config.HF_SYNTHETIC_DATASET_NAME,
                        split=config.HF_SYNTHETIC_DATASET_SPLIT,
                        token=config.HF_TOKEN,
                        cache_dir=config.CACHE_DIR).to_pandas()
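

# A minimal usage sketch, not part of the original module: it assumes config.py defines
# the HF_* names and CACHE_DIR referenced above and that HF_TOKEN grants access to the
# raw dataset. Run it to sanity-check that the joined DataFrame loads as expected.
if __name__ == "__main__":
    processed_df = load_processed_rewriting_dataset_as_pandas()
    print(processed_df.shape)
    print(processed_df.columns.tolist())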