Spaces:

JetBrains-Research
/

commit-message-editing-visualization

Sleeping

Petr Tsvetkov

- New version of the end->start synthetics samples generation

a8a595d 4 months ago

1.41 kB

	from datasets import load_dataset

	import config


	def load_raw_rewriting_dataset_as_pandas():
	    """Load the raw rewriting dataset and convert it to pandas.

	    The dataset name, split, access token and cache directory are all
	    taken from the ``config`` module.

	    Returns:
	        The result of calling ``to_pandas()`` on the loaded split.
	    """
	    raw_split = load_dataset(
	        config.HF_RAW_DATASET_NAME,
	        split=config.HF_RAW_DATASET_SPLIT,
	        token=config.HF_TOKEN,
	        cache_dir=config.CACHE_DIR,
	    )
	    return raw_split.to_pandas()


	def load_full_commit_dataset_as_pandas():
	    """Load the full commits dataset as pandas, renaming ``message``.

	    The dataset path, configuration name, split and cache directory come
	    from the ``config`` module. No access token is passed for this dataset.

	    Returns:
	        The pandas conversion of the loaded split, with the ``message``
	        column renamed to ``reference``.
	    """
	    commits_split = load_dataset(
	        path=config.HF_FULL_COMMITS_DATASET_NAME,
	        name=config.HF_FULL_COMMITS_DATASET_SUBNAME,
	        split=config.HF_FULL_COMMITS_DATASET_SPLIT,
	        cache_dir=config.CACHE_DIR,
	    )
	    commits_frame = commits_split.to_pandas()
	    return commits_frame.rename(columns={'message': 'reference'})


	def load_processed_rewriting_dataset_as_pandas():
	    """Attach the ``mods`` column of the full commits data to the rewriting data.

	    Both frames are indexed by the ``(hash, repo)`` pair and left-joined, so
	    every row of the rewriting data is kept; rows without a match in the
	    full commits data get a missing ``mods`` value.

	    Returns:
	        A frame with the columns ``hash``, ``repo``, ``commit_msg_start``,
	        ``commit_msg_end`` and ``mods``, with the join keys restored as
	        ordinary columns.
	    """
	    commit_keys = ["hash", "repo"]

	    rewriting_frame = load_raw_rewriting_dataset_as_pandas()
	    rewriting_by_commit = rewriting_frame[
	        commit_keys + ["commit_msg_start", "commit_msg_end"]
	    ].set_index(commit_keys)

	    commits_frame = load_full_commit_dataset_as_pandas()
	    mods_by_commit = commits_frame[commit_keys + ["mods"]].set_index(commit_keys)

	    joined = rewriting_by_commit.join(other=mods_by_commit, how='left')
	    return joined.reset_index()


	def load_synthetic_dataset_as_pandas():
	    """Load the synthetic dataset and convert it to pandas.

	    The dataset name, split, access token and cache directory are all
	    taken from the ``config`` module.

	    Returns:
	        The result of calling ``to_pandas()`` on the loaded split.
	    """
	    synthetic_split = load_dataset(
	        config.HF_SYNTHETIC_DATASET_NAME,
	        split=config.HF_SYNTHETIC_DATASET_SPLIT,
	        token=config.HF_TOKEN,
	        cache_dir=config.CACHE_DIR,
	    )
	    return synthetic_split.to_pandas()