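"""Helpers for loading the commit-message rewriting datasets from the Hugging Face Hub as pandas DataFrames."""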
import json
from datetime import datetime, timedelta
from datasets import load_dataset
import config


def load_raw_rewriting_as_pandas():
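    """Load the raw commit-message rewriting dataset as a pandas DataFrame."""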
return load_dataset(config.HF_RAW_DATASET_NAME,
split=config.HF_RAW_DATASET_SPLIT,
token=config.HF_TOKEN,
cache_dir=config.CACHE_DIR).to_pandas()


def load_full_commit_as_pandas():
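    """Load the full-commit dataset, renaming its 'message' column to 'reference'."""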
return load_dataset(path=config.HF_FULL_COMMITS_DATASET_NAME,
name=config.HF_FULL_COMMITS_DATASET_SUBNAME,
split=config.HF_FULL_COMMITS_DATASET_SPLIT,
cache_dir=config.CACHE_DIR).to_pandas().rename(
columns={'message': 'reference'})


def edit_time_from_history(history_str):
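    """Return the time span in milliseconds covered by a JSON-encoded edit history (0 if the history is empty)."""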
history = json.loads(history_str)
    if not history:
        return 0
    timestamps = [datetime.fromisoformat(e['ts']) for e in history]
    delta = max(timestamps) - min(timestamps)
return delta // timedelta(milliseconds=1)


def edit_time_from_timestamps(row):
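    """Return the milliseconds between a row's 'loaded_ts' and 'submitted_ts', or None if the difference is negative."""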
loaded_ts = datetime.fromisoformat(row['loaded_ts'])
submitted_ts = datetime.fromisoformat(row['submitted_ts'])
delta = submitted_ts - loaded_ts
result = delta // timedelta(milliseconds=1)
return result if result >= 0 else None


def load_processed_rewriting_as_pandas():
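    """Load the manual rewriting data, compute edit times, and left-join file modifications ('mods') from the full-commit dataset."""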
manual_rewriting = load_raw_rewriting_as_pandas()[
["hash", "repo", "commit_msg_start", "commit_msg_end", "session", "commit_msg_history", "loaded_ts",
"submitted_ts"]]
manual_rewriting['edit_time_hist'] = manual_rewriting['commit_msg_history'].apply(edit_time_from_history)
manual_rewriting['edit_time'] = manual_rewriting.apply(edit_time_from_timestamps, axis=1)
    manual_rewriting = manual_rewriting.drop(columns=['commit_msg_history', 'loaded_ts', 'submitted_ts'])
manual_rewriting.set_index(["hash", "repo"], inplace=True)
mods_dataset = load_full_commit_as_pandas()[["hash", "repo", "mods"]]
mods_dataset.set_index(["hash", "repo"], inplace=True)
return manual_rewriting.join(other=mods_dataset, how='left').reset_index()


def load_synthetic_as_pandas():
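    """Load the synthetic dataset as a pandas DataFrame."""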
return load_dataset(config.HF_SYNTHETIC_DATASET_NAME,
split=config.HF_SYNTHETIC_DATASET_SPLIT,
token=config.HF_TOKEN,
cache_dir=config.CACHE_DIR).to_pandas()


def load_full_commit_with_predictions_as_pandas():
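    """Join model predictions (shuffled, then deduplicated by ('hash', 'repo')) onto the full-commit dataset."""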
full_dataset = load_full_commit_as_pandas()
    predictions_dataset = load_dataset(config.HF_PREDICTIONS_DATASET_NAME,
                                       config.HF_PREDICTIONS_DATASET_SUBNAME,
                                       split=config.HF_PREDICTIONS_DATASET_SPLIT,
                                       cache_dir=config.CACHE_DIR).to_pandas()
    predictions_dataset = (predictions_dataset.sample(frac=1, random_state=config.RANDOM_STATE)
                           .set_index(['hash', 'repo'])[['prediction']])
predictions_dataset = predictions_dataset[~predictions_dataset.index.duplicated(keep='first')]
    # The caller's default RangeIndex is preserved by join(on=...), so no reset is needed.
    return full_dataset.join(other=predictions_dataset, on=['hash', 'repo'])
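

# Minimal usage sketch. It assumes config defines the HF_* dataset names,
# CACHE_DIR, HF_TOKEN, and RANDOM_STATE referenced above, and that the
# datasets are reachable from this environment.
if __name__ == '__main__':
    df = load_processed_rewriting_as_pandas()
    print(df[['hash', 'repo', 'edit_time', 'edit_time_hist']].head())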