# Spaces:
# Runtime error
# Runtime error
from datasets import load_dataset
# Initializing global variables
# Directory prefix (with trailing slash) that output paths are built from by
# plain string concatenation.
file_path = '/app/models/content/'
"""## Loading the Dataset | |
We will be fine-tuning the DistilBERT model on a subset of patents filed in January 2016. For the train/validation split,
patents filed on or before January 21st, 2016 form the training set, and patents filed on or after January 22nd,
2016 form the validation set.
"""
# Load the 'sample' configuration of HUPD/hupd. The filing-date keyword
# arguments define the two windows: 2016-01-01..2016-01-21 for training and
# 2016-01-22..2016-01-31 for validation.
dataset_dict = load_dataset(
    'HUPD/hupd',
    name='sample',
    data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
    # NOTE(review): presumably None means "do not restrict by IPCR label";
    # confirm against the dataset's loading script.
    icpr_label=None,
    train_filing_start_date='2016-01-01',
    train_filing_end_date='2016-01-21',
    val_filing_start_date='2016-01-22',
    val_filing_end_date='2016-01-31',
)

# Quick sanity check of what was loaded: the splits and their shapes.
print(dataset_dict)
print(f'Train dataset size: {dataset_dict["train"].shape}')
print(f'Validation dataset size: {dataset_dict["validation"].shape}')
"""## Pre-Processing Steps | |
Our model will only be able to predict rejections or acceptances, so we have to filter out any
other decisions from our training and validation sets before proceeding.
"""
# Label-to-index mapping for the decision status field.
# NOTE: despite the name, this maps decision label strings to integer ids.
decision_to_str = {
    'REJECTED': 0,
    'ACCEPTED': 1,
    'PENDING': 2,
    'CONT-REJECTED': 3,
    'CONT-ACCEPTED': 4,
    'CONT-PENDING': 5,
}


# Helper function
def map_decision_to_string(example):
    """Return the integer id for an example's decision label.

    Args:
        example: Mapping with a 'decision' key whose value is one of the
            keys of ``decision_to_str``.

    Returns:
        A one-item dict ``{'decision': <int id>}``; other keys of
        ``example`` are not included in the result.

    Raises:
        KeyError: If 'decision' is missing or its label is not a key of
            ``decision_to_str``.
    """
    return {'decision': decision_to_str[example['decision']]}
# Re-labeling/mapping.
# Filtering out any decisions that are not 'REJECTED' or 'ACCEPTED'.
for name in ['train', 'validation']:
    # Replace each example's decision label with its integer id.
    dataset_dict[name] = dataset_dict[name].map(map_decision_to_string)
    # Remove the pending and CONT-patent applications: after the mapping,
    # only ids 0 ('REJECTED') and 1 ('ACCEPTED') satisfy `<= 1`.
    dataset_dict[name] = dataset_dict[name].filter(lambda e: e['decision'] <= 1)

# Save the dataset dictionary to disk (once, after both splits are processed).
dataset_dict.save_to_disk(file_path + 'dataset_dict')