Spaces:

theresatvan
/

patent-language-model

Runtime error

App Files Files Community

patent-language-model / models /preprocessing.py

theresatvan

Preprocess data & train model

81414ba over 1 year ago

raw

history blame

1.86 kB

	from datasets import load_dataset

	# Base directory (kept with its trailing slash) that output paths are built from
	file_path = "/app/models/content/"

	"""## Loading the Dataset

	We will be fine-tuning the DistilBERT model on a subset of patents filed in January 2016. We perform the train/validation
	split by filing date: any patents filed on or before January 21st, 2016 will be part of the training set, and any patents
	filed on or after January 22nd, 2016 will be part of the validation set.
	"""

	# Keyword options handed to `load_dataset` for 'HUPD/hupd'. The filing-date
	# windows request 2016-01-01..2016-01-21 for the train split and
	# 2016-01-22..2016-01-31 for the validation split.
	hupd_options = dict(
	    name='sample',
	    data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
	    icpr_label=None,
	    train_filing_start_date='2016-01-01',
	    train_filing_end_date='2016-01-21',
	    val_filing_start_date='2016-01-22',
	    val_filing_end_date='2016-01-31',
	)
	dataset_dict = load_dataset('HUPD/hupd', **hupd_options)

	# Report the loaded splits and the shape of each one.
	print(dataset_dict)
	train_shape = dataset_dict["train"].shape
	print(f'Train dataset size: {train_shape}')
	validation_shape = dataset_dict["validation"].shape
	print(f'Validation dataset size: {validation_shape}')

	"""## Pre-Processing Steps

	Our model will only be able to predict rejections or acceptances. We will have to filter out any
	other decisions from our training and validation set in order to proceed.
	"""

	# Label-to-index mapping for the decision status field.
	# NOTE: despite its name, this maps decision strings to integer class ids.
	decision_to_str = {
	    'REJECTED': 0,
	    'ACCEPTED': 1,
	    'PENDING': 2,
	    'CONT-REJECTED': 3,
	    'CONT-ACCEPTED': 4,
	    'CONT-PENDING': 5,
	}


	def map_decision_to_string(example):
	    """Return the integer-encoded decision for a single example.

	    Args:
	        example: Mapping with a 'decision' key whose value is one of the
	            keys of ``decision_to_str``.

	    Returns:
	        A one-item dict ``{'decision': <int id>}``.

	    Raises:
	        KeyError: If 'decision' is missing from ``example`` or its value is
	            not a key of ``decision_to_str``.
	    """
	    return {'decision': decision_to_str[example['decision']]}

	# Re-label every example in both splits, then keep only the decisions that
	# are 'REJECTED' or 'ACCEPTED'.
	for name in ['train', 'validation']:
	    # Replace each decision string with its integer id.
	    dataset_dict[name] = dataset_dict[name].map(map_decision_to_string)
	    # Ids 0 and 1 are 'REJECTED' and 'ACCEPTED' in `decision_to_str`; every
	    # larger id (pending and CONT-* applications) is removed.
	    dataset_dict[name] = dataset_dict[name].filter(lambda e: e['decision'] <= 1)


	# Write the dataset dictionary to the 'dataset_dict' location under `file_path`
	dataset_dict.save_to_disk(f'{file_path}dataset_dict')