from datasets import load_dataset

# Initializing global variables
# Directory prefix for saved artifacts; it ends with '/' because the save step
# below builds its target path by plain string concatenation.
file_path = '/app/models/content/'

"""## Loading the Dataset

We will be finetuning the DistilBERT model on a subset of patents filed in
January 2016. For the train-validation split, patents filed on or before
January 21st, 2016 form the training set, and patents filed on or after
January 22nd, 2016 form the validation set.
"""

# The filing-date keyword arguments select the train/validation windows
# described above (2016-01-01..2016-01-21 and 2016-01-22..2016-01-31).
dataset_dict = load_dataset(
    'HUPD/hupd',
    name='sample',
    data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
    icpr_label=None,
    train_filing_start_date='2016-01-01',
    train_filing_end_date='2016-01-21',
    val_filing_start_date='2016-01-22',
    val_filing_end_date='2016-01-31',
)

print(dataset_dict)

print(f'Train dataset size: {dataset_dict["train"].shape}')
print(f'Validation dataset size: {dataset_dict["validation"].shape}')

"""## Pre-Processing Steps

Our model will only be able to predict rejections or acceptances. We will have
to filter out any other decisions from our training and validation set in
order to proceed.
"""

# Label-to-index mapping for the decision status field.
# NOTE: despite its name, this maps decision *strings* to integer codes.
# The filter below relies on 'REJECTED' and 'ACCEPTED' being the two lowest
# codes (0 and 1).
decision_to_str = {
    'REJECTED': 0,
    'ACCEPTED': 1,
    'PENDING': 2,
    'CONT-REJECTED': 3,
    'CONT-ACCEPTED': 4,
    'CONT-PENDING': 5,
}


# Helper function
def map_decision_to_string(example):
    """Return the integer code for an example's decision label.

    Args:
        example: A mapping with a 'decision' key whose value is one of the
            keys of ``decision_to_str``.

    Returns:
        A one-item dict ``{'decision': <int code>}``, intended to be passed to
        ``Dataset.map`` so the 'decision' field is replaced by its code.

    Raises:
        KeyError: If the decision label is not present in ``decision_to_str``.
    """
    return {'decision': decision_to_str[example['decision']]}


# Re-labeling/mapping.
# Filtering out any decisions that are not 'REJECTED' or 'ACCEPTED'.
for name in ['train', 'validation']:
    dataset_dict[name] = dataset_dict[name].map(map_decision_to_string)
    # Remove the pending and CONT-patent applications:
    # codes 0 and 1 are REJECTED and ACCEPTED, everything above is dropped.
    dataset_dict[name] = dataset_dict[name].filter(lambda e: e['decision'] <= 1)

# Save the dataset dictionary to disk
dataset_dict.save_to_disk(file_path + 'dataset_dict')