theresatvan committed
Commit 81414ba
1 Parent(s): 0fb0bdd

Preprocess data & train model

.gitattributes ADDED
@@ -0,0 +1,8 @@
+ models/content/dataset_dict/train/cache-b56665f85f005b13.arrow filter=lfs diff=lfs merge=lfs -text
+ models/content/dataset_dict/train/data-00000-of-00002.arrow filter=lfs diff=lfs merge=lfs -text
+ models/content/dataset_dict/train/data-00001-of-00002.arrow filter=lfs diff=lfs merge=lfs -text
+ models/content/dataset_dict/validation/cache-0fb09a456da0a13c.arrow filter=lfs diff=lfs merge=lfs -text
+ models/content/dataset_dict/validation/cache-56a4339a2c8de01a.arrow filter=lfs diff=lfs merge=lfs -text
+ models/content/dataset_dict/validation/cache-8107f3f237676f0e.arrow filter=lfs diff=lfs merge=lfs -text
+ models/content/dataset_dict/validation/cache-f992378180dfe232.arrow filter=lfs diff=lfs merge=lfs -text
+ models/content/dataset_dict/validation/data-00000-of-00001.arrow filter=lfs diff=lfs merge=lfs -text
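Each entry above routes one large Arrow file through Git LFS, so the repository itself stores only small pointer files. A single gitignore-style wildcard would cover these and any future Arrow files (a hypothetical alternative, not what this commit does):

models/content/**/*.arrow filter=lfs diff=lfs merge=lfs -text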
models/content/dataset_dict/dataset_dict.json ADDED
@@ -0,0 +1 @@
+ {"splits": ["train", "validation"]}
models/content/dataset_dict/train/cache-b56665f85f005b13.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:95c4796d4a7e5d26b19049d858dbde51415ecd026957fb0ee8529b16ec286d55
+ size 658677464
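These three lines are a standard Git LFS pointer: the spec version, the SHA-256 of the actual file, and its size in bytes (about 659 MB here); the Arrow data itself lives in LFS storage. A sketch of parsing and verifying such a pointer against a downloaded blob, with illustrative paths:

import hashlib, os

def read_lfs_pointer(path):
    # A pointer file is a few 'key value' lines.
    with open(path) as f:
        fields = dict(line.strip().split(' ', 1) for line in f if line.strip())
    return fields['oid'].removeprefix('sha256:'), int(fields['size'])

def matches_pointer(pointer_path, blob_path):
    oid, size = read_lfs_pointer(pointer_path)
    with open(blob_path, 'rb') as f:
        digest = hashlib.sha256(f.read()).hexdigest()
    return digest == oid and os.path.getsize(blob_path) == size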
models/content/dataset_dict/train/data-00000-of-00002.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ca9f363c02cb2049734a724c1324a27d659e66b4f5fe90e820b6dcbca3caac0f
+ size 309302984
models/content/dataset_dict/train/data-00001-of-00002.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3aed3e86a963f7d8f374839818366af5ee654a2793ca5b751214e7dd62aa68d7
+ size 326984520
models/content/dataset_dict/train/dataset_info.json ADDED
@@ -0,0 +1,112 @@
+ {
+   "builder_name": "hupd",
+   "citation": "@InProceedings{suzgun2021:hupd,\ntitle = {The Harvard USPTO Patent Dataset},\nauthors={Mirac Suzgun and Suproteem Sarkar and Luke Melas-Kyriazi and Scott Kominers and Stuart Shieber},\nyear={2021}\n}\n",
+   "config_name": "sample",
+   "dataset_size": 1848322042,
+   "description": "\nThe Harvard USPTO Patent Dataset (HUPD) is a large-scale, well-structured, and multi-purpose corpus \nof English-language patent applications filed to the United States Patent and Trademark Office (USPTO) \nbetween 2004 and 2018. With more than 4.5 million patent documents, HUPD is two to three times larger \nthan comparable corpora. Unlike other NLP patent datasets, HUPD contains the inventor-submitted versions \nof patent applications, not the final versions of granted patents, allowing us to study patentability at \nthe time of filing using NLP methods for the first time.\n",
+   "download_checksums": {
+     "https://huggingface.co/datasets/HUPD/hupd/resolve/main/hupd_metadata_jan16_2022-02-22.feather": {
+       "num_bytes": 6665746,
+       "checksum": null
+     },
+     "https://huggingface.co/datasets/HUPD/hupd/resolve/main/data/sample-jan-2016.tar.gz": {
+       "num_bytes": 387636489,
+       "checksum": null
+     }
+   },
+   "download_size": 394302235,
+   "features": {
+     "patent_number": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "decision": {
+       "dtype": "int64",
+       "_type": "Value"
+     },
+     "title": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "abstract": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "claims": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "background": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "summary": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "description": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "cpc_label": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "ipc_label": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "filing_date": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "patent_issue_date": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "date_published": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "examiner_id": {
+       "dtype": "string",
+       "_type": "Value"
+     }
+   },
+   "homepage": "https://github.com/suzgunmirac/hupd",
+   "license": "",
+   "size_in_bytes": 2242624277,
+   "splits": {
+     "train": {
+       "name": "train",
+       "num_bytes": 1184126558,
+       "num_examples": 16153,
+       "shard_lengths": [
+         7000,
+         7000,
+         2153
+       ],
+       "dataset_name": "hupd"
+     },
+     "validation": {
+       "name": "validation",
+       "num_bytes": 664195484,
+       "num_examples": 9094,
+       "shard_lengths": [
+         7000,
+         2094
+       ],
+       "dataset_name": "hupd"
+     }
+   },
+   "supervised_keys": {
+     "input": "claims",
+     "output": "decision"
+   },
+   "version": {
+     "version_str": "0.0.0",
+     "major": 0,
+     "minor": 0,
+     "patch": 0
+   }
+ }
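dataset_info.json stores the builder metadata next to the Arrow shards. Note that "decision" already reads int64 because preprocessing.py (below) maps the decision strings to integers before saving, and "supervised_keys" designates claims/decision as the default input/output pair. A small sketch of inspecting this metadata after loading, assuming a local checkout:

from datasets import load_from_disk

train = load_from_disk('models/content/dataset_dict')['train']
print(train.info.builder_name)       # 'hupd'
print(train.features['decision'])    # Value(dtype='int64'), after re-labeling
print(train.info.supervised_keys)    # input='claims', output='decision'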
models/content/dataset_dict/train/state.json ADDED
@@ -0,0 +1,16 @@
+ {
+   "_data_files": [
+     {
+       "filename": "data-00000-of-00002.arrow"
+     },
+     {
+       "filename": "data-00001-of-00002.arrow"
+     }
+   ],
+   "_fingerprint": "5fe802206a6f6c6f",
+   "_format_columns": null,
+   "_format_kwargs": {},
+   "_format_type": null,
+   "_output_all_columns": false,
+   "_split": "train"
+ }
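state.json records which Arrow shards make up the split and the fingerprint that datasets uses for cache bookkeeping. It is plain JSON, so it can be read directly (the local path is an assumption):

import json

with open('models/content/dataset_dict/train/state.json') as f:
    state = json.load(f)
print([d['filename'] for d in state['_data_files']])
# ['data-00000-of-00002.arrow', 'data-00001-of-00002.arrow']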
models/content/dataset_dict/validation/cache-0fb09a456da0a13c.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:da9135daa052e81c84a26b153b6794de3089413cec1d15e2ea7bd2bc3f407913
+ size 367685864
models/content/dataset_dict/validation/cache-56a4339a2c8de01a.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7312bb14fe9365dcc94d783bf6ae32574dcda815a5cffe309b8ea84687989988
+ size 367685864
models/content/dataset_dict/validation/cache-8107f3f237676f0e.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:47409881486805513c0efdb578039ef5ebd96d6912948565e1174ae45d90e838
+ size 367685864
models/content/dataset_dict/validation/cache-f992378180dfe232.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ecbe5a4baa1c1f422305e6b7a59da5a5cc6b5133850cc7d3bea2e833873b34f4
+ size 367685864
models/content/dataset_dict/validation/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dac0232941a79f4bea7b714d57931bba46e0c67e97d474bb1722107dafa59e4a
+ size 355132024
models/content/dataset_dict/validation/dataset_info.json ADDED
@@ -0,0 +1,112 @@
+ {
+   "builder_name": "hupd",
+   "citation": "@InProceedings{suzgun2021:hupd,\ntitle = {The Harvard USPTO Patent Dataset},\nauthors={Mirac Suzgun and Suproteem Sarkar and Luke Melas-Kyriazi and Scott Kominers and Stuart Shieber},\nyear={2021}\n}\n",
+   "config_name": "sample",
+   "dataset_size": 1848322042,
+   "description": "\nThe Harvard USPTO Patent Dataset (HUPD) is a large-scale, well-structured, and multi-purpose corpus \nof English-language patent applications filed to the United States Patent and Trademark Office (USPTO) \nbetween 2004 and 2018. With more than 4.5 million patent documents, HUPD is two to three times larger \nthan comparable corpora. Unlike other NLP patent datasets, HUPD contains the inventor-submitted versions \nof patent applications, not the final versions of granted patents, allowing us to study patentability at \nthe time of filing using NLP methods for the first time.\n",
+   "download_checksums": {
+     "https://huggingface.co/datasets/HUPD/hupd/resolve/main/hupd_metadata_jan16_2022-02-22.feather": {
+       "num_bytes": 6665746,
+       "checksum": null
+     },
+     "https://huggingface.co/datasets/HUPD/hupd/resolve/main/data/sample-jan-2016.tar.gz": {
+       "num_bytes": 387636489,
+       "checksum": null
+     }
+   },
+   "download_size": 394302235,
+   "features": {
+     "patent_number": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "decision": {
+       "dtype": "int64",
+       "_type": "Value"
+     },
+     "title": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "abstract": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "claims": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "background": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "summary": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "description": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "cpc_label": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "ipc_label": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "filing_date": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "patent_issue_date": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "date_published": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "examiner_id": {
+       "dtype": "string",
+       "_type": "Value"
+     }
+   },
+   "homepage": "https://github.com/suzgunmirac/hupd",
+   "license": "",
+   "size_in_bytes": 2242624277,
+   "splits": {
+     "train": {
+       "name": "train",
+       "num_bytes": 1184126558,
+       "num_examples": 16153,
+       "shard_lengths": [
+         7000,
+         7000,
+         2153
+       ],
+       "dataset_name": "hupd"
+     },
+     "validation": {
+       "name": "validation",
+       "num_bytes": 664195484,
+       "num_examples": 9094,
+       "shard_lengths": [
+         7000,
+         2094
+       ],
+       "dataset_name": "hupd"
+     }
+   },
+   "supervised_keys": {
+     "input": "claims",
+     "output": "decision"
+   },
+   "version": {
+     "version_str": "0.0.0",
+     "major": 0,
+     "minor": 0,
+     "patch": 0
+   }
+ }
models/content/dataset_dict/validation/state.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "_data_files": [
+     {
+       "filename": "data-00000-of-00001.arrow"
+     }
+   ],
+   "_fingerprint": "da58beb6a9e5af41",
+   "_format_columns": null,
+   "_format_kwargs": {},
+   "_format_type": null,
+   "_output_all_columns": false,
+   "_split": "validation"
+ }
models/preprocessing.py ADDED
@@ -0,0 +1,49 @@
+ from datasets import load_dataset
+
+ # Initializing global variables
+ file_path = '/app/models/content/'
+
+ """## Loading the Dataset
+
+ We will be fine-tuning the DistilBERT model on a subset of patents filed in January 2016. We split the data by filing date:
+ patents filed on or before January 21st, 2016 form the training set, and patents filed on or after January 22nd, 2016
+ form the validation set.
+ """
+
+ dataset_dict = load_dataset('HUPD/hupd',
+     name='sample',
+     data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
+     icpr_label=None,
+     train_filing_start_date='2016-01-01',
+     train_filing_end_date='2016-01-21',
+     val_filing_start_date='2016-01-22',
+     val_filing_end_date='2016-01-31',
+ )
+
+ print(dataset_dict)
+ print(f'Train dataset size: {dataset_dict["train"].shape}')
+ print(f'Validation dataset size: {dataset_dict["validation"].shape}')
+
+ """## Pre-Processing Steps
+
+ Our model will only predict rejections or acceptances, so we must filter every other
+ decision status out of the training and validation sets before proceeding.
+ """
+
+ # Label-to-index mapping for the decision status field
+ decision_to_str = {'REJECTED': 0, 'ACCEPTED': 1, 'PENDING': 2, 'CONT-REJECTED': 3, 'CONT-ACCEPTED': 4, 'CONT-PENDING': 5}
+
+ # Helper function: map each decision string to its integer label
+ def map_decision_to_string(example):
+     return {'decision': decision_to_str[example['decision']]}
+
+ # Re-labeling/mapping.
+ # Filtering out any decisions that are not 'REJECTED' or 'ACCEPTED'.
+ for name in ['train', 'validation']:
+     dataset_dict[name] = dataset_dict[name].map(map_decision_to_string)
+     # Remove the PENDING and CONT- patent applications
+     dataset_dict[name] = dataset_dict[name].filter(lambda e: e['decision'] <= 1)
+
+
+ # Save the dataset dictionary to disk
+ dataset_dict.save_to_disk(file_path + 'dataset_dict')
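As a quick sanity check (a sketch, not part of the commit), the filtered splits should now contain only the two labels the model will see:

from collections import Counter

for name in ['train', 'validation']:
    counts = Counter(dataset_dict[name]['decision'])
    assert set(counts) <= {0, 1}  # only REJECTED (0) and ACCEPTED (1) remain
    print(name, counts)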
models/train.py ADDED
@@ -0,0 +1,173 @@
+ # -*- coding: utf-8 -*-
+ """Finetuning Language Models - Can I Patent This?.ipynb
+
+ Automatically generated by Colaboratory.
+
+ Original file is located at
+     https://colab.research.google.com/drive/1x9XfLKvGNBsajOK8rztsZnCoD2ucGfO6
+
+ # Finetuning Language Models - Can I Patent This?
+
+ Using the [Harvard USPTO patent dataset](https://github.com/suzgunmirac/hupd), we will fine-tune a DistilBERT model
+ from Hugging Face to predict whether a patent application is accepted or rejected based on its abstract or claims.
+ """
+ import gc
+ import argparse
+ import numpy as np
+
+ import torch
+ from torch.utils.data import DataLoader
+ from torch.optim import AdamW
+
+ from datasets import load_from_disk
+ from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
+
+
+ # Initializing global variables
+ file_path = '/app/models/content/'
+ decision_to_str = {'REJECTED': 0, 'ACCEPTED': 1, 'PENDING': 2, 'CONT-REJECTED': 3, 'CONT-ACCEPTED': 4, 'CONT-PENDING': 5}
+ criterion = torch.nn.CrossEntropyLoss()
+
+
+ def create_dataloaders(dataset_dict, section):
+     # Initializing the tokenizer
+     model_name = 'distilbert-base-uncased'
+     tokenizer = DistilBertTokenizer.from_pretrained(model_name, do_lower_case=True)
+
+     train_set, val_set = dataset_dict['train'], dataset_dict['validation']
+
+     # Tokenize the chosen section of the training set
+     train_set = train_set.map(
+         lambda e: tokenizer(e[section], truncation=True, padding='max_length'),
+         batched=True)
+
+     # Tokenize the chosen section of the validation set
+     val_set = val_set.map(
+         lambda e: tokenizer(e[section], truncation=True, padding='max_length'),
+         batched=True)
+
+     train_set.set_format(type='torch',
+                          columns=['input_ids', 'attention_mask', 'decision'])
+
+     val_set.set_format(type='torch',
+                        columns=['input_ids', 'attention_mask', 'decision'])
+
+     train_loader = DataLoader(train_set, batch_size=8, shuffle=True)
+     val_loader = DataLoader(val_set, batch_size=8, shuffle=False)
+
+     return train_loader, val_loader, tokenizer
+
+
+ def measure_accuracy(outputs, labels):
+     # Accepts a model's logits and the actual decisions, and returns the
+     # number of correct predictions along with the number of samples.
+
+     preds = np.argmax(outputs, axis=1).flatten()
+     labels = labels.flatten()
+     correct = np.sum(preds == labels)
+
+     return correct, len(labels)
+
+ def validation(model, val_loader):
+     # Accepts a model and a validation set DataLoader as its parameters
+     # and returns the validation accuracy (in percent).
+
+     model.eval()
+
+     total_correct = 0
+     total_samples = 0
+
+     for batch in val_loader:
+         input_ids = batch['input_ids'].to(device)
+         attention_mask = batch['attention_mask'].to(device)
+         labels = batch['decision'].to(device)
+
+         with torch.no_grad():
+             outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
+
+         logits = outputs.logits
+         num_correct, num_samples = measure_accuracy(logits.cpu().numpy(), labels.cpu().numpy())
+
+         total_correct += num_correct
+         total_samples += num_samples
+
+         del input_ids, attention_mask, labels, logits
+         gc.collect()
+         torch.cuda.empty_cache()
+
+     return (total_correct / total_samples) * 100
+
+
+ def train(device, model, tokenizer, train_loader, val_loader, section):
+     # Accepts a model, the training set DataLoader, the validation set
+     # DataLoader, and the section as its parameters and returns the trained model.
+
+     model.train()
+
+     # Define optimizer.
+     optim = AdamW(model.parameters(), lr=5e-5)
+     num_epochs = 5
+     best_val_acc = 0
+
+     for epoch in range(num_epochs):
+         for batch in train_loader:
+             optim.zero_grad()
+
+             input_ids = batch['input_ids'].to(device, non_blocking=True)
+             attention_mask = batch['attention_mask'].to(device, non_blocking=True)
+             labels = batch['decision'].to(device, non_blocking=True)
+
+             outputs = model(input_ids, attention_mask=attention_mask, labels=labels).logits
+
+             loss = criterion(outputs, labels)
+             loss.backward()
+             optim.step()
+
+             del input_ids, attention_mask, labels
+             gc.collect()
+             torch.cuda.empty_cache()
+
+         # Calculate validation accuracy.
+         val_acc = validation(model, val_loader)
+
+         # Save the model that yields the best validation accuracy.
+         if best_val_acc < val_acc:
+             best_val_acc = val_acc
+
+             model.save_pretrained(file_path + section + '/')
+             tokenizer.save_pretrained(file_path + section + '_model_tokenizer/')
+
+         model.train()
+
+     return model
+
+
+ if __name__ == '__main__':
+     device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+     parser = argparse.ArgumentParser()
+
+     parser.add_argument('--section', type=str, help="Patent section to train on, e.g. 'abstract' or 'claims'")
+
+     args = parser.parse_args()
+     section = args.section
+
+     dataset_dict = load_from_disk(file_path + 'dataset_dict')
+
+     train_loader, val_loader, tokenizer = create_dataloaders(dataset_dict, section)
+
+     del dataset_dict
+     gc.collect()
+     torch.cuda.empty_cache()
+
+     # Defining the model. Loading the pretrained checkpoint so that we fine-tune it rather than train from scratch.
+     model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2, output_hidden_states=False)
+     model.to(device)
+
+     # Train the model.
+     model = train(device, model, tokenizer, train_loader, val_loader, section)
+
+     val_acc = validation(model, val_loader)
+
+     print(f'*** Accuracy on the validation set ({section}): {val_acc}')
+
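Once training has finished, the best checkpoint can be reloaded for inference. A hypothetical sketch (not part of the commit), assuming the script was run with --section claims so the artifacts live under the save paths used above:

import torch
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

model = DistilBertForSequenceClassification.from_pretrained('/app/models/content/claims/')
tokenizer = DistilBertTokenizer.from_pretrained('/app/models/content/claims_model_tokenizer/')
model.eval()

# '1. A method for ...' stands in for a real claims section.
inputs = tokenizer('1. A method for ...', truncation=True, padding='max_length', return_tensors='pt')
with torch.no_grad():
    logits = model(**inputs).logits
# Label 1 is ACCEPTED and 0 is REJECTED, per decision_to_str.
print('ACCEPTED' if logits.argmax(-1).item() == 1 else 'REJECTED')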