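"""Helpers that load the commit-message rewriting, full-commit, synthetic, and prediction datasets from the Hugging Face Hub as pandas DataFrames."""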
import json
import os
from datetime import datetime, timedelta

import pandas as pd
from datasets import load_dataset
from huggingface_hub import hf_hub_download, list_repo_tree

import config
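

# Raw manual-rewriting dataset, loaded with the split/token/cache settings from config.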
def load_raw_rewriting_as_pandas():
    return load_dataset(
        config.HF_RAW_DATASET_NAME, split=config.HF_RAW_DATASET_SPLIT, token=config.HF_TOKEN, cache_dir=config.CACHE_DIR
    ).to_pandas()
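

# Full-commit dataset; the "message" column is renamed to "reference".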
def load_full_commit_as_pandas():
    return (
        load_dataset(
            path=config.HF_FULL_COMMITS_DATASET_NAME,
            name=config.HF_FULL_COMMITS_DATASET_SUBNAME,
            split=config.HF_FULL_COMMITS_DATASET_SPLIT,
            cache_dir=config.CACHE_DIR,
        )
        .to_pandas()
        .rename(columns={"message": "reference"})
    )
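

# Editing time in milliseconds, computed from the recorded commit_msg_history (latest minus earliest event timestamp).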
def edit_time_from_history(history_str):
    history = json.loads(history_str)
    if len(history) == 0:
        return 0
    timestamps = [datetime.fromisoformat(entry["ts"]) for entry in history]
    delta = max(timestamps) - min(timestamps)
    return delta // timedelta(milliseconds=1)
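

# Editing time in milliseconds from the loaded/submitted timestamps; negative deltas are treated as invalid and mapped to None.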
def edit_time_from_timestamps(row):
    loaded_ts = datetime.fromisoformat(row["loaded_ts"])
    submitted_ts = datetime.fromisoformat(row["submitted_ts"])
    delta = submitted_ts - loaded_ts
    result = delta // timedelta(milliseconds=1)
    return result if result >= 0 else None
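

# Manual-rewriting rows with both edit-time measures, joined with the "mods" column of the full-commit dataset on (hash, repo).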
def load_processed_rewriting_as_pandas():
    # .copy() so that the new columns are assigned to a real DataFrame, not a slice of the cached dataset.
    manual_rewriting = load_raw_rewriting_as_pandas()[
        [
            "hash",
            "repo",
            "commit_msg_start",
            "commit_msg_end",
            "session",
            "commit_msg_history",
            "loaded_ts",
            "submitted_ts",
        ]
    ].copy()
    manual_rewriting["edit_time_hist"] = manual_rewriting["commit_msg_history"].apply(edit_time_from_history)
    manual_rewriting["edit_time"] = manual_rewriting.apply(edit_time_from_timestamps, axis=1)
    # drop() is not in-place, so the result has to be assigned back for the intermediate columns to be removed.
    manual_rewriting = manual_rewriting.drop(columns=["commit_msg_history", "loaded_ts", "submitted_ts"])
    manual_rewriting.set_index(["hash", "repo"], inplace=True)
    mods_dataset = load_full_commit_as_pandas()[["hash", "repo", "mods"]]
    mods_dataset.set_index(["hash", "repo"], inplace=True)
    return manual_rewriting.join(other=mods_dataset, how="left").reset_index()
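

# Synthetic dataset, "all_pairs_with_metrics" configuration.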
def load_synthetic_as_pandas():
    return load_dataset(
        config.HF_SYNTHETIC_DATASET_NAME,
        "all_pairs_with_metrics",
        split=config.HF_SYNTHETIC_DATASET_SPLIT,
        token=config.HF_TOKEN,
        cache_dir=config.CACHE_DIR,
    ).to_pandas()
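

# Full-commit dataset with one model prediction per (hash, repo) attached.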
def load_full_commit_with_predictions_as_pandas():
    full_dataset = load_full_commit_as_pandas()
    # Download every prediction file stored for the configured model in the predictions dataset repo.
    predictions_paths = []
    for prediction_file in list_repo_tree(
        repo_id=config.HF_PREDICTIONS_DATASET_NAME,
        path=os.path.join("commit_message_generation/predictions", config.HF_PREDICTIONS_MODEL),
        repo_type="dataset",
    ):
        predictions_paths.append(
            hf_hub_download(
                repo_id=config.HF_PREDICTIONS_DATASET_NAME,
                filename=prediction_file.path,
                repo_type="dataset",
                cache_dir=config.CACHE_DIR,
            )
        )
    dfs = []
    for path in predictions_paths:
        dfs.append(pd.read_json(path, orient="records", lines=True))
    predictions_dataset = pd.concat(dfs, axis=0, ignore_index=True)
    # Shuffle, then keep a single prediction per (hash, repo) pair.
    predictions_dataset = predictions_dataset.sample(frac=1, random_state=config.RANDOM_STATE).set_index(
        ["hash", "repo"]
    )[["prediction"]]
    predictions_dataset = predictions_dataset[~predictions_dataset.index.duplicated(keep="first")]
    dataset = full_dataset.join(other=predictions_dataset, on=["hash", "repo"])
    return dataset.reset_index()
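

if __name__ == "__main__":
    # Minimal smoke-test sketch (an addition, not part of the original module): it assumes the
    # constants in config point at accessible Hugging Face datasets and that HF_TOKEN grants read access.
    processed = load_processed_rewriting_as_pandas()
    print(processed[["hash", "repo", "edit_time", "edit_time_hist"]].head())

    with_predictions = load_full_commit_with_predictions_as_pandas()
    print(with_predictions[["hash", "repo", "reference", "prediction"]].head())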