from datasets import load_dataset

import config


def load_raw_rewriting_as_pandas():
    # Raw manual-rewriting dataset, authenticated via config.HF_TOKEN.
    return load_dataset(
        config.HF_RAW_DATASET_NAME,
        split=config.HF_RAW_DATASET_SPLIT,
        token=config.HF_TOKEN,
        cache_dir=config.CACHE_DIR,
    ).to_pandas()


def load_full_commit_as_pandas():
    # Full-commits dataset; the original commit message is renamed to
    # 'reference' so it can serve as the reference text downstream.
    return (
        load_dataset(
            path=config.HF_FULL_COMMITS_DATASET_NAME,
            name=config.HF_FULL_COMMITS_DATASET_SUBNAME,
            split=config.HF_FULL_COMMITS_DATASET_SPLIT,
            cache_dir=config.CACHE_DIR,
        )
        .to_pandas()
        .rename(columns={'message': 'reference'})
    )


def load_processed_rewriting_as_pandas():
    # Join the manual rewriting sessions with the commit modifications
    # ('mods') from the full-commits dataset, keyed on the (hash, repo) pair.
    manual_rewriting = load_raw_rewriting_as_pandas()[
        ["hash", "repo", "commit_msg_start", "commit_msg_end", "session"]
    ]
    manual_rewriting.set_index(["hash", "repo"], inplace=True)

    mods_dataset = load_full_commit_as_pandas()[["hash", "repo", "mods"]]
    mods_dataset.set_index(["hash", "repo"], inplace=True)

    return manual_rewriting.join(other=mods_dataset, how='left').reset_index()


def load_synthetic_as_pandas():
    # Synthetic dataset, authenticated via config.HF_TOKEN.
    return load_dataset(
        config.HF_SYNTHETIC_DATASET_NAME,
        split=config.HF_SYNTHETIC_DATASET_SPLIT,
        token=config.HF_TOKEN,
        cache_dir=config.CACHE_DIR,
    ).to_pandas()


def load_full_commit_with_predictions_as_pandas():
    # Attach model predictions to the full-commits dataset.
    full_dataset = load_full_commit_as_pandas()

    # Shuffle deterministically, index by (hash, repo), keep only the
    # 'prediction' column, and drop duplicate (hash, repo) keys (keep first).
    predictions_dataset = (
        load_dataset(
            config.HF_PREDICTIONS_DATASET_NAME,
            config.HF_PREDICTIONS_DATASET_SUBNAME,
            split=config.HF_PREDICTIONS_DATASET_SPLIT,
            cache_dir=config.CACHE_DIR,
        )
        .to_pandas()
        .sample(frac=1, random_state=config.RANDOM_STATE)
        .set_index(['hash', 'repo'])[["prediction"]]
    )
    predictions_dataset = predictions_dataset[~predictions_dataset.index.duplicated(keep='first')]

    # `on` should be a list of column names when joining against a MultiIndex.
    dataset = full_dataset.join(other=predictions_dataset, on=['hash', 'repo'])
    return dataset.reset_index()
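

# --- Usage sketch (illustrative, not part of the loaders) ---
# A minimal example of how these loaders might be exercised, assuming `config`
# defines the HF_* dataset names/splits, HF_TOKEN, CACHE_DIR, and RANDOM_STATE
# referenced above. The columns named below are exactly those produced by the
# functions in this module.
if __name__ == "__main__":
    processed = load_processed_rewriting_as_pandas()
    # Expected columns: hash, repo, commit_msg_start, commit_msg_end,
    # session, mods.
    print(processed.columns.tolist())
    print(processed.head())

    with_predictions = load_full_commit_with_predictions_as_pandas()
    # Left-join semantics: rows without a matching prediction hold NaN in the
    # 'prediction' column, so this prints the fraction of rows covered.
    print(with_predictions["prediction"].notna().mean())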