Spaces:

JetBrains-Research
/

commit-message-editing

Sleeping

commit-message-editing / data_loader.py

update dataset name to our organization

ff0444e verified 4 months ago

514 Bytes

	import os

	from datasets import load_dataset

	# Directory handed to load_dataset() as its cache_dir argument.
	CACHE_DIR = 'cache'
	# Upper bound on the number of records load_data() returns.
	N_SAMPLES = 15
	# Values of the 'hash' column whose rows load_data() filters out.
	REMOVED_COMMITS = ['9cc896202dc38d962c01aa2637dbc5bbc3e3dd9b']


	def load_data():
	    """Load the commit-rewriting samples and return the first few as records.

	    Fetches the "train" split of the
	    "JetBrains-Research/commit-rewriting-samples" dataset, authenticating
	    with the value of the HF_REWRITING_TOKEN environment variable (None is
	    passed when the variable is unset) and caching under CACHE_DIR. Rows
	    whose 'hash' value appears in REMOVED_COMMITS are dropped.

	    Returns:
	        A list of at most N_SAMPLES dicts, one per remaining row in dataset
	        order, each mapping column name to that row's value.
	    """
	    dataset = load_dataset(
	        "JetBrains-Research/commit-rewriting-samples",
	        split="train",
	        token=os.environ.get('HF_REWRITING_TOKEN'),
	        cache_dir=CACHE_DIR,
	    )
	    df = dataset.to_pandas()

	    # Exclude the commits that were explicitly removed from the sample set.
	    df = df[~df['hash'].isin(REMOVED_COMMITS)]

	    # Truncate before converting so that only the rows actually returned
	    # are turned into dicts, instead of the whole dataset.
	    return df.head(N_SAMPLES).to_dict('records')