AmitGarage committed
Commit: 3ab8bd6
1 Parent(s): 4439cab

Upload 8 files

scripts/__init__.py ADDED
@@ -0,0 +1 @@
scripts/custom_functions.py ADDED
@@ -0,0 +1,3 @@
from scripts.azure.azure_ner_pipe import make_azure_entity_recognizer
from scripts.torch_ner_model import build_torch_ner_model
from scripts.torch_ner_pipe import make_torch_entity_recognizer
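
These imports exist for their side effects: loading the module registers the custom architecture and pipeline factory with spaCy's registries, which is why such a file is typically passed to spacy train via --code (note the first import also assumes a scripts/azure package that is not part of this upload). A minimal registration check, assuming the project root is on the import path, could look like:

    import scripts.torch_ner_model  # noqa: F401 - registers "TorchEntityRecognizer.v1"
    import scripts.torch_ner_pipe   # noqa: F401 - registers the "torch_ner" factory
    from spacy.language import Language
    from spacy.util import registry

    assert "TorchEntityRecognizer.v1" in registry.architectures.get_all()
    assert Language.has_factory("torch_ner")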
scripts/preprocess.py ADDED
@@ -0,0 +1,214 @@
from collections import defaultdict
import random
from typing import List
import tarfile
import shutil
import typer
from pathlib import Path
import spacy
from spacy.language import Language
from spacy.tokens import Doc, DocBin, Span
from spacy.util import filter_spans
from wasabi import msg
from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_infix_regex, compile_suffix_regex
import functools

random.seed(42)


def main(
    input_dir: Path = typer.Argument(..., exists=True),
    output_dir: Path = typer.Argument(...),
    beth_train_tar_name: str = "i2b2_Beth_Train_Release.tar.gz",
    partners_train_tar_name: str = "i2b2_Partners_Train_Release.tar.gz",
    test_zip_name: str = "Task_1C.zip",
    merge_docs: bool = True,
):
    """Extract and preprocess raw n2c2 2011 Challenge data into spaCy DocBin format.
    input_dir (Path): Input directory with raw downloads from Harvard DBMI Portal.
    output_dir (Path): Output directory to save spaCy .docbin files to.
    beth_train_tar_name (str): Filename of downloaded tarfile for Beth Training Data.
    partners_train_tar_name (str): Filename of downloaded tarfile for Partners Training Data.
    test_zip_name (str): Filename of downloaded tarfile for n2c2 Test Data.
    merge_docs (bool): If False, create spaCy docs for each line of each medical record
    """
    # Unpack compressed data files
    msg.info("Extracting raw data.")
    beth_train_tar_path = input_dir / beth_train_tar_name
    partners_train_tar_path = input_dir / partners_train_tar_name
    test_zip_path = input_dir / test_zip_name

    # for path in [beth_train_tar_path, partners_train_tar_path]:
    #     if path.name.endswith("tar.gz"):
    #         msg.text(f"Extracting {path}")
    #         tar = tarfile.open(path, "r:gz")
    #         tar.extractall(path.parent)
    #         tar.close()

    # shutil.unpack_archive(test_zip_path, input_dir / test_zip_name.replace(".zip", ""))

    # preprocess data
    msg.info("Converting to spaCy Doc objects.")

    # Debug log of annotation/tokenization mismatches (note the hard-coded path).
    with open(r'/notebooks/Clinical_NER/difference.txt', 'a') as fp:
        fp.write(str((input_dir / "Beth_Train").stem) + '\n')

    beth_train_docs = docs_from_many_clinical_records(
        input_dir / "Beth_Train", merge_docs=merge_docs
    )

    with open(r'/notebooks/Clinical_NER/difference.txt', 'a') as fp:
        fp.write(str((input_dir / "Partners_Train").stem) + '\n')

    partners_train_docs = docs_from_many_clinical_records(
        input_dir / "Partners_Train", merge_docs=merge_docs
    )
    train_docs = beth_train_docs + partners_train_docs

    with open(r'/notebooks/Clinical_NER/difference.txt', 'a') as fp:
        fp.write(str((input_dir / "Task_1C/i2b2_Test/i2b2_Beth_Test").stem) + '\n')

    beth_test_docs = docs_from_many_clinical_records(
        input_dir / "Task_1C/i2b2_Test/i2b2_Beth_Test", merge_docs=merge_docs
    )

    with open(r'/notebooks/Clinical_NER/difference.txt', 'a') as fp:
        fp.write(str((input_dir / "Task_1C/i2b2_Test/i2b2_Partners_Test").stem) + '\n')

    partners_test_docs = docs_from_many_clinical_records(
        input_dir / "Task_1C/i2b2_Test/i2b2_Partners_Test", merge_docs=merge_docs
    )
    test_docs = beth_test_docs + partners_test_docs

    random.shuffle(train_docs)
    split_idx = int(len(train_docs) * 0.8)
    train_docs, dev_docs = train_docs[:split_idx], train_docs[split_idx:]

    msg.good(f"Num Train Docs: {len(train_docs)}")
    msg.good(f"Num Dev Docs: {len(dev_docs)}")
    msg.good(f"Num Test Docs: {len(test_docs)}")

    with msg.loading(f"Saving docs to: {output_dir}..."):
        DocBin(docs=train_docs).to_disk(output_dir / "train.spacy")
        DocBin(docs=dev_docs).to_disk(output_dir / "dev.spacy")
        DocBin(docs=test_docs).to_disk(output_dir / "test.spacy")
    msg.good("Done.")


def docs_from_clinical_record(
    lines: List[str], annotations: List[str], nlp: Language, merge_docs: bool = False
) -> List[Doc]:
    """Create spaCy docs from a single annotated medical record in the n2c2 2011 format
    lines (List[str]): Text of the clinical record as a list separated by newlines
    annotations (List[str]): Raw entity annotations in the n2c2 2011 format
    nlp (Language): spaCy Language object. Defaults to spacy.blank("en").
    merge_docs (bool): If True: merge all lines into a single spaCy doc so
        there is only 1 element in the output array.
        If False: create a spaCy doc for each line in the original record
    RETURNS (List[Doc]): List of spaCy Doc objects with entity spans set
    """
    difference = []
    docs = []
    spans_by_line = defaultdict(list)
    # Drop prefix patterns containing punctuation that would split tokens
    # differently from the n2c2 word-offset annotation scheme.
    nlp.Defaults.prefixes = [
        signs
        for signs in nlp.Defaults.prefixes
        if not any(
            char in signs
            for char in (":", "#", "+", "(", ")", "*", "'", "%", "_", ";", ">", ",", "&", '"', "<")
        )
    ]
    infixes = nlp.Defaults.prefixes + [r"[-]~"]

    infix_re = spacy.util.compile_infix_regex(infixes)

    def custom_tokenizer(nlp):
        return Tokenizer(nlp.vocab, infix_finditer=infix_re.finditer)

    nlp.tokenizer = custom_tokenizer(nlp)

    entities = {}

    for row in annotations:
        row = row.split("||")
        text_info = row[0]
        type_info = row[1]

        offset_start = text_info.split(" ")[-2]
        offset_end = text_info.split(" ")[-1]

        start_line, word_start = offset_start.split(":")
        end_line, word_end = offset_end.split(":")

        label = type_info.split('"')[-2]

        if start_line != end_line:
            # This happens very infrequently (only about 10 times in total)
            # so we just skip these annotations
            continue
        else:
            spans_by_line[int(start_line)].append(
                (int(word_start), int(word_end), label)
            )

        if start_line in entities:
            entities[start_line].append(text_info.split('"')[1])
        else:
            entities[start_line] = [text_info.split('"')[1]]

    extracted_entities = {}

    for i, line in enumerate(lines):
        n = i + 1
        line = line.replace(" ", " ")
        doc = nlp.make_doc(line)
        if n in spans_by_line:
            ents = [
                Span(doc, start, end + 1, label=label)
                for (start, end, label) in spans_by_line[n]
            ]
            ents = [
                e for e in ents if bool(e.text.strip()) and e.text.strip() == e.text
            ]
            doc.ents = filter_spans(ents)
            extracted_entities[str(n)] = [
                e.text for e in ents if bool(e.text.strip()) and e.text.strip() == e.text
            ]

        docs.append(doc)

    # Log annotated entity strings that the extracted spans could not reproduce,
    # so tokenization problems can be reviewed manually in difference.txt.
    for key, value in entities.items():
        if key in extracted_entities:
            if functools.reduce(
                lambda x, y: x and y,
                map(
                    lambda p, q: p.lower() != q.lower(),
                    entities[key],
                    extracted_entities[key],
                ),
                True,
            ):
                difference = difference + [key] + entities[key] + extracted_entities[key]
        else:
            difference = difference + [key + " Key not present"] + entities[key]

    with open(r'/notebooks/Clinical_NER/difference.txt', 'a') as fp:
        fp.write('\n'.join(difference))
    return [Doc.from_docs(docs)] if merge_docs else docs


def docs_from_many_clinical_records(
    base_path: Path, nlp: Language = spacy.blank("en"), merge_docs: bool = True
) -> List[Doc]:
    """Convert raw n2c2 annotated clinical records into a list of
    spaCy Doc objects to be ready to be used in training
    base_path (Path): Root path to the raw data
    nlp (Language): spaCy Language object. Defaults to spacy.blank("en").
    merge_docs (bool): If True: merge all lines into a single spaCy doc so
        there is only 1 element in the output array.
        If False: create a spaCy doc for each line in the original record

    RETURNS (List[Doc]): List of spaCy Doc objects with entity spans set
    """
    all_docs = []
    concept_paths = sorted((base_path / "concepts").glob("*.txt.con"))
    document_paths = sorted((base_path / "docs").glob("*.txt"))

    for con_path, doc_path in zip(concept_paths, document_paths):
        with open(r'/notebooks/Clinical_NER/difference.txt', 'a') as fp:
            fp.write('\n' + str(con_path.stem))
        annotations = con_path.open().read().splitlines()
        lines = doc_path.open().read().splitlines()

        docs = docs_from_clinical_record(lines, annotations, nlp, merge_docs=merge_docs)
        all_docs += docs

    return all_docs


if __name__ == "__main__":
    typer.run(main)
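
Once the script has run, the emitted DocBin files can be inspected independently of training. A small sketch, assuming the outputs were written to a corpus/ directory (the path is a placeholder):

    import spacy
    from spacy.tokens import DocBin

    nlp = spacy.blank("en")
    doc_bin = DocBin().from_disk("corpus/train.spacy")
    docs = list(doc_bin.get_docs(nlp.vocab))
    print(f"{len(docs)} training docs")
    print([(ent.text, ent.label_) for ent in docs[0].ents][:5])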
scripts/torch_ner_model.py ADDED
@@ -0,0 +1,203 @@
from collections import OrderedDict
from typing import Optional, List
from thinc.api import (
    with_array,
    chain,
    Model,
    PyTorchWrapper,
    PyTorchLSTM,
)
from thinc.types import Floats2d

from spacy.tokens import Doc
from spacy.util import registry
import torch
from torch import nn


@registry.architectures("TorchEntityRecognizer.v1")
def build_torch_ner_model(
    tok2vec: Model[List[Doc], List[Floats2d]],
    hidden_width: int,
    dropout: Optional[float] = None,
    nO: Optional[int] = None,
) -> Model[List[Doc], List[Floats2d]]:
    """Build a tagger model, using a provided token-to-vector component. The tagger
    model simply adds a linear layer with softmax activation to predict scores
    given the token vectors.
    tok2vec (Model[List[Doc], List[Floats2d]]): The token-to-vector subnetwork.
    nO (int or None): The number of tags to output. Inferred from the data if None.
    RETURNS (Model[List[Doc], List[Floats2d]]): Initialized Model
    """
    # print("Entered build_torch_ner_model - ")
    # print(tok2vec.dim_names, tok2vec.name)
    listener = tok2vec.maybe_get_ref("listener")
    # print(listener.maybe_get_dim("nI"))
    t2v_width = listener.maybe_get_dim("nO") if listener else None
    # print(t2v_width, hidden_width, nO, dropout)
    t2v_width = 768
    # print(t2v_width, hidden_width, nO, dropout)
    torch_model = TorchEntityRecognizer(t2v_width, hidden_width, nO, dropout)
    # print("torch_model - ", torch_model)
    wrapped_pt_model = PyTorchWrapper(torch_model)
    # print("wrapped")
    wrapped_pt_model.attrs["set_dropout_rate"] = torch_model.set_dropout_rate
    # print("set dropout")

    model = chain(tok2vec, with_array(wrapped_pt_model))
    # print(model.param_names)
    model.set_ref("tok2vec", tok2vec)
    model.set_ref("torch_model", wrapped_pt_model)
    model.init = init
    # print("Completed build_torch_ner_model")
    return model


def init(
    model: Model[List[Doc], Floats2d],
    X: Optional[List[Doc]] = None,
    Y: Optional[List[str]] = None,
) -> Model[List[Doc], List[Floats2d]]:
    """Dynamically set PyTorch Output Layer shape based on labels data
    model (Model[List[Doc], Floats2d]): Thinc Model wrapping tok2vec and PyTorch model
    X (Optional[List[Doc]], optional): Sample of Doc objects.
    Y (Optional[List[Ints2d]], optional): Available model labels.
    RETURNS (Model[List[Doc], List[Floats2d]]): Initialized Model
    """
    # print("Entered init - ")
    tok2vec = model.get_ref("tok2vec")
    # print(tok2vec.ref_names)
    torch_model = model.get_ref("torch_model")
    # print(torch_model)

    # print("Ref names - ", model.ref_names)
    # print(tok2vec.dim_names, tok2vec.name)
    # print(torch_model.dim_names, torch_model.name)
    listener = tok2vec.maybe_get_ref("listener")
    # print(listener)
    t2v_width = listener.maybe_get_dim("nO") if listener else None
    # print(t2v_width, " - ", Y)
    if t2v_width:
        # print(torch_model.shims[0]._model)
        # print("Searching - ", torch_model.maybe_get_dim("nI"))
        torch_model.shims[0]._model.set_input_shape(t2v_width)
        torch_model.set_dim("nI", t2v_width)
        # print(torch_model.dim_names)

    if Y is not None:
        nO = len(Y)
        # print(nO)
        torch_model.shims[0]._model.set_output_shape(nO)
        torch_model.set_dim("nO", nO)
        # print(torch_model)

    tok2vec = model.get_ref("tok2vec")
    tok2vec.initialize()
    # print(tok2vec)
    torch_model = model.get_ref("torch_model")
    # print("Found - ", torch_model.get_dim("nI"))
    # print("Exit")
    return model


def is_dropout_module(
    module: nn.Module,
    dropout_modules: List[nn.Module] = [nn.Dropout, nn.Dropout2d, nn.Dropout3d],
) -> bool:
    """Detect if a PyTorch Module is a Dropout layer
    module (nn.Module): Module to check
    dropout_modules (List[nn.Module], optional): List of Modules that count as Dropout layers.
    RETURNS (bool): True if module is a Dropout layer.
    """
    # print("Entered is_dropout_module - ")
    for m in dropout_modules:
        if isinstance(module, m):
            return True
    return False


class TorchEntityRecognizer(nn.Module):
    """Torch Entity Recognizer Model Head"""

    def __init__(self, nI: int, nH: int, nO: int, dropout: float):
        """Initialize TorchEntityRecognizer.
        nI (int): Input Dimension
        nH (int): Hidden Dimension Width
        nO (int): Output Dimension Width
        dropout (float): Dropout ratio (0 - 1.0)
        """
        super(TorchEntityRecognizer, self).__init__()

        # Just for initialization of PyTorch layer. Output shape set during Model.init
        # print("Entered TorchEntityRecognizer.__init__ - ")
        nI = nI or 1
        nO = nO or 1

        self.nH = nH
        self.model = nn.Sequential(
            OrderedDict(
                {
                    "input_layer": nn.Linear(nI, nH),
                    "input_activation": nn.ReLU(),
                    "input_dropout": nn.Dropout2d(dropout),
                    "output_layer": nn.Linear(nH, nO),
                    "output_dropout": nn.Dropout2d(dropout),
                    "softmax": nn.Softmax(dim=1),
                }
            )
        )
        # print(self.model)

    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
        """Forward pass of the model.
        inputs (torch.Tensor): Batch of outputs from spaCy tok2vec layer
        RETURNS (torch.Tensor): Batch of results with a score for each tag for each token
        """
        # print("Entered TorchEntityRecognizer.forward - ")
        return self.model(inputs)

    def _set_layer_shape(self, name: str, nI: int, nO: int):
        """Dynamically set the shape of a layer
        name (str): Layer name
        nI (int): New input shape
        nO (int): New output shape
        """
        # print("Entered TorchEntityRecognizer._set_layer_shape - ", nO, nI)
        with torch.no_grad():
            layer = getattr(self.model, name)
            # print(layer)
            layer.out_features = nO
            layer.weight = nn.Parameter(torch.Tensor(nO, nI))
            # print(layer.weight.shape)
            if layer.bias is not None:
                layer.bias = nn.Parameter(torch.Tensor(nO))
                # print(layer)
            layer.reset_parameters()
            # print(layer.weight.shape)
            # print(layer)

    def set_input_shape(self, nI: int):
        """Dynamically set the shape of the input layer
        nI (int): New input layer shape
        """
        # print("Entered TorchEntityRecognizer.set_input_shape - ", nI, self.nH)
        self._set_layer_shape("input_layer", nI, self.nH)

    def set_output_shape(self, nO: int):
        """Dynamically set the shape of the output layer
        nO (int): New output layer shape
        """
        # print("Entered TorchEntityRecognizer.set_output_shape - ", self.nH, nO)
        self._set_layer_shape("output_layer", self.nH, nO)

    def set_dropout_rate(self, dropout: float):
        """Set the dropout rate of all Dropout layers in the model.
        dropout (float): Dropout rate to set
        """
        # print("Entered TorchEntityRecognizer.set_dropout_rate - ")
        dropout_layers = [
            module for module in self.modules() if is_dropout_module(module)
        ]
        for layer in dropout_layers:
            layer.p = dropout
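
The nn.Module head can be smoke-tested on its own, outside the Thinc wrapper. A minimal sketch with made-up dimensions (768-dim token vectors, 23 tags); eval() keeps the Dropout2d layers inert:

    import torch
    from scripts.torch_ner_model import TorchEntityRecognizer

    head = TorchEntityRecognizer(nI=768, nH=48, nO=23, dropout=0.1)
    head.eval()                     # disable dropout for the smoke test
    vectors = torch.randn(5, 768)   # 5 tokens' worth of tok2vec output
    scores = head(vectors)          # shape (5, 23), softmax over the tag set
    print(scores.shape, scores.sum(dim=1))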
scripts/torch_ner_model_test.py ADDED
@@ -0,0 +1,203 @@
from collections import OrderedDict
from typing import Optional, List
from thinc.api import (
    with_array,
    chain,
    Model,
    PyTorchWrapper,
    PyTorchLSTM,
)
from thinc.types import Floats2d

from spacy.tokens import Doc
from spacy.util import registry
import torch
from torch import nn


@registry.architectures("TorchEntityRecognizer.v1")
def build_torch_ner_model(
    tok2vec: Model[List[Doc], List[Floats2d]],
    hidden_width: int,
    dropout: Optional[float] = None,
    nO: Optional[int] = None,
) -> Model[List[Doc], List[Floats2d]]:
    """Build a tagger model, using a provided token-to-vector component. The tagger
    model simply adds a linear layer with softmax activation to predict scores
    given the token vectors.
    tok2vec (Model[List[Doc], List[Floats2d]]): The token-to-vector subnetwork.
    nO (int or None): The number of tags to output. Inferred from the data if None.
    RETURNS (Model[List[Doc], List[Floats2d]]): Initialized Model
    """
    print("Entered build_torch_ner_model - ")
    print(tok2vec.dim_names, tok2vec.name)
    listener = tok2vec.maybe_get_ref("listener")
    print(listener.maybe_get_dim("nI"))
    t2v_width = listener.maybe_get_dim("nO") if listener else None
    print(t2v_width, hidden_width, nO, dropout)
    t2v_width = 768
    print(t2v_width, hidden_width, nO, dropout)
    torch_model = TorchEntityRecognizer(t2v_width, hidden_width, nO, dropout)
    print("torch_model - ", torch_model)
    wrapped_pt_model = PyTorchWrapper(torch_model)
    print("wrapped")
    wrapped_pt_model.attrs["set_dropout_rate"] = torch_model.set_dropout_rate
    print("set dropout")

    model = chain(tok2vec, with_array(wrapped_pt_model))
    print(model.param_names)
    model.set_ref("tok2vec", tok2vec)
    model.set_ref("torch_model", wrapped_pt_model)
    model.init = init
    print("Completed build_torch_ner_model")
    return model


def init(
    model: Model[List[Doc], Floats2d],
    X: Optional[List[Doc]] = None,
    Y: Optional[List[str]] = None,
) -> Model[List[Doc], List[Floats2d]]:
    """Dynamically set PyTorch Output Layer shape based on labels data
    model (Model[List[Doc], Floats2d]): Thinc Model wrapping tok2vec and PyTorch model
    X (Optional[List[Doc]], optional): Sample of Doc objects.
    Y (Optional[List[Ints2d]], optional): Available model labels.
    RETURNS (Model[List[Doc], List[Floats2d]]): Initialized Model
    """
    print("Entered init - ")
    tok2vec = model.get_ref("tok2vec")
    print(tok2vec.ref_names)
    torch_model = model.get_ref("torch_model")
    print(torch_model)

    print("Ref names - ", model.ref_names)
    print(tok2vec.dim_names, tok2vec.name)
    print(torch_model.dim_names, torch_model.name)
    listener = tok2vec.maybe_get_ref("listener")
    print(listener)
    t2v_width = listener.maybe_get_dim("nO") if listener else None
    print(t2v_width, " - ", Y)
    if t2v_width:
        print(torch_model.shims[0]._model)
        print("Searching - ", torch_model.maybe_get_dim("nI"))
        torch_model.shims[0]._model.set_input_shape(t2v_width)
        torch_model.set_dim("nI", t2v_width)
        print(torch_model.dim_names)

    if Y is not None:
        nO = len(Y)
        print(nO)
        torch_model.shims[0]._model.set_output_shape(nO)
        torch_model.set_dim("nO", nO)
        print(torch_model)

    tok2vec = model.get_ref("tok2vec")
    tok2vec.initialize()
    print(tok2vec)
    torch_model = model.get_ref("torch_model")
    print("Found - ", torch_model.get_dim("nI"))
    print("Exit")
    return model


def is_dropout_module(
    module: nn.Module,
    dropout_modules: List[nn.Module] = [nn.Dropout, nn.Dropout2d, nn.Dropout3d],
) -> bool:
    """Detect if a PyTorch Module is a Dropout layer
    module (nn.Module): Module to check
    dropout_modules (List[nn.Module], optional): List of Modules that count as Dropout layers.
    RETURNS (bool): True if module is a Dropout layer.
    """
    print("Entered is_dropout_module - ")
    for m in dropout_modules:
        if isinstance(module, m):
            return True
    return False


class TorchEntityRecognizer(nn.Module):
    """Torch Entity Recognizer Model Head"""

    def __init__(self, nI: int, nH: int, nO: int, dropout: float):
        """Initialize TorchEntityRecognizer.
        nI (int): Input Dimension
        nH (int): Hidden Dimension Width
        nO (int): Output Dimension Width
        dropout (float): Dropout ratio (0 - 1.0)
        """
        super(TorchEntityRecognizer, self).__init__()

        # Just for initialization of PyTorch layer. Output shape set during Model.init
        print("Entered TorchEntityRecognizer.__init__ - ")
        nI = nI or 1
        nO = nO or 1

        self.nH = nH
        self.model = nn.Sequential(
            OrderedDict(
                {
                    "input_layer": nn.Linear(nI, nH),
                    "input_activation": nn.ReLU(),
                    "input_dropout": nn.Dropout2d(dropout),
                    "output_layer": nn.Linear(nH, nO),
                    "output_dropout": nn.Dropout2d(dropout),
                    "softmax": nn.Softmax(dim=1),
                }
            )
        )
        print(self.model)

    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
        """Forward pass of the model.
        inputs (torch.Tensor): Batch of outputs from spaCy tok2vec layer
        RETURNS (torch.Tensor): Batch of results with a score for each tag for each token
        """
        print("Entered TorchEntityRecognizer.forward - ")
        return self.model(inputs)

    def _set_layer_shape(self, name: str, nI: int, nO: int):
        """Dynamically set the shape of a layer
        name (str): Layer name
        nI (int): New input shape
        nO (int): New output shape
        """
        print("Entered TorchEntityRecognizer._set_layer_shape - ", nO, nI)
        with torch.no_grad():
            layer = getattr(self.model, name)
            print(layer)
            layer.out_features = nO
            layer.weight = nn.Parameter(torch.Tensor(nO, nI))
            print(layer.weight.shape)
            if layer.bias is not None:
                layer.bias = nn.Parameter(torch.Tensor(nO))
                print(layer)
            layer.reset_parameters()
            print(layer.weight.shape)
            print(layer)

    def set_input_shape(self, nI: int):
        """Dynamically set the shape of the input layer
        nI (int): New input layer shape
        """
        print("Entered TorchEntityRecognizer.set_input_shape - ", nI, self.nH)
        self._set_layer_shape("input_layer", nI, self.nH)

    def set_output_shape(self, nO: int):
        """Dynamically set the shape of the output layer
        nO (int): New output layer shape
        """
        print("Entered TorchEntityRecognizer.set_output_shape - ", self.nH, nO)
        self._set_layer_shape("output_layer", self.nH, nO)

    def set_dropout_rate(self, dropout: float):
        """Set the dropout rate of all Dropout layers in the model.
        dropout (float): Dropout rate to set
        """
        print("Entered TorchEntityRecognizer.set_dropout_rate - ")
        dropout_layers = [
            module for module in self.modules() if is_dropout_module(module)
        ]
        for layer in dropout_layers:
            layer.p = dropout
scripts/torch_ner_pipe.py ADDED
@@ -0,0 +1,294 @@
from collections import OrderedDict
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
import numpy
from thinc.api import (
    Config,
    Model,
    set_dropout_rate,
    SequenceCategoricalCrossentropy,
    Optimizer,
)
from thinc.types import Ints1d, Floats2d
from itertools import islice

from spacy.tokens.doc import Doc
from spacy.vocab import Vocab

from spacy.training import Example
from spacy.training.iob_utils import biluo_tags_to_spans, biluo_to_iob, iob_to_biluo
from spacy.pipeline.trainable_pipe import TrainablePipe
from spacy.pipeline.pipe import deserialize_config
from spacy.language import Language
from spacy.attrs import POS, ID
from spacy.parts_of_speech import X
from spacy.errors import Errors
from spacy.scorer import get_ner_prf
from spacy.training import validate_examples, validate_get_examples
from spacy import util


def set_torch_dropout_rate(model: Model, dropout_rate: float):
    """Set dropout rate for Thinc and wrapped PyTorch models

    Args:
        model (Model): Thinc Model (with PyTorch sub-modules)
        dropout_rate (float): Dropout rate
    """
    # print("Entered set_torch_dropout_rate - ")
    set_dropout_rate(model, dropout_rate)
    func = model.get_ref("torch_model").attrs["set_dropout_rate"]
    func(dropout_rate)


default_model_config = """
[model]
@architectures = "TorchEntityRecognizer.v1"
hidden_width = 48
dropout = 0.1
nO = null

[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = null
width = 96
depth = 4
embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
"""
DEFAULT_MODEL = Config().from_str(default_model_config)["model"]


@Language.factory(
    "torch_ner",
    assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
    default_config={"model": DEFAULT_MODEL},
    default_score_weights={
        "ents_f": 1.0,
        "ents_p": 0.0,
        "ents_r": 0.0,
        "ents_per_type": None,
    },
)
def make_torch_entity_recognizer(nlp: Language, name: str, model: Model):
    """Construct a PyTorch based Named Entity Recognition model
    model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
    the tag probabilities. The output vectors should match the number of tags
    in size, and be normalized as probabilities (all scores between 0 and 1,
    with the rows summing to 1).
    """
    # print("Entered make_torch_entity_recognizer - ")
    return TorchEntityRecognizer(nlp.vocab, model, name)


class TorchEntityRecognizer(TrainablePipe):
    """Pipeline component Named Entity Recognition using PyTorch"""

    def __init__(self, vocab: Vocab, model: Model, name: str = "torch_ner"):
        """Initialize the PyTorch entity recognizer component.
        vocab (Vocab): The shared vocabulary.
        model (thinc.api.Model): The Thinc Model powering the pipeline component.
        name (str): The component instance name, used to add entries to the
            losses during training.
        """
        # print("Entered pipe TorchEntityRecognizer.__init__ - ")
        self.vocab = vocab
        self.model = model
        self.name = name
        cfg = {"labels": []}
        self.cfg = dict(sorted(cfg.items()))
        # print(self.vocab, self.model, self.name, self.cfg)
        # print(self.model.layers[0].ref_names)
        # print(self.model.layers[1].ref_names)
        # print("Completed pipe TorchEntityRecognizer.__init__ - ")

    @property
    def labels(self) -> Tuple[str, ...]:
        """The labels currently added to the component.
        RETURNS (Tuple[str]): The labels.
        """
        # print("Entered TorchEntityRecognizer.labels - ")
        labels = ["O"]
        for label in self.cfg["labels"]:
            for iob in ["B", "I"]:
                labels.append(f"{iob}-{label}")
        return tuple(labels)

    def predict(self, docs: Iterable[Doc]) -> Iterable[Ints1d]:
        """Apply the pipeline's model to a batch of docs, without modifying them.
        docs (Iterable[Doc]): The documents to predict.
        RETURNS: The model's prediction for each document.
        """
        # print("Entered pipe TorchEntityRecognizer.predict - ")
        if not any(len(doc) for doc in docs):
            # Handle cases where there are no tokens in any docs.
            n_labels = len(self.labels)
            guesses = [self.model.ops.alloc((0, n_labels)) for doc in docs]
            assert len(guesses) == len(docs)
            return guesses
        scores = self.model.predict(docs)

        assert len(scores) == len(docs), (len(scores), len(docs))
        guesses = []
        for doc_scores in scores:
            doc_guesses = doc_scores.argmax(axis=1)
            if not isinstance(doc_guesses, numpy.ndarray):
                doc_guesses = doc_guesses.get()
            guesses.append(doc_guesses)
        assert len(guesses) == len(docs)
        return guesses

    def set_annotations(self, docs: Iterable[Doc], preds: Iterable[Ints1d]):
        """Modify a batch of documents, using pre-computed scores.
        docs (Iterable[Doc]): The documents to modify.
        preds (Iterable[Ints1d]): The IDs to set, produced by TorchEntityRecognizer.predict.
        """
        # print("Entered pipe TorchEntityRecognizer.set_annotations - ")
        if isinstance(docs, Doc):
            docs = [docs]
        for doc, tag_ids in zip(docs, preds):
            labels = iob_to_biluo([self.labels[tag_id] for tag_id in tag_ids])
            try:
                spans = biluo_tags_to_spans(doc, labels)
            except ValueError:
                # Note:
                # biluo_tags_to_spans will raise an exception for an invalid tag sequence
                # this could be fixed using a more complex transition system
                # (e.g. a Conditional Random Field model head)
                spans = []
            doc.ents = spans

    def update(
        self,
        examples: Iterable[Example],
        *,
        drop: float = 0.0,
        sgd: Optimizer = None,
        losses: Dict[str, float] = None,
    ) -> Dict[str, float]:
        """Learn from a batch of documents and gold-standard information,
        updating the pipe's model. Delegates to predict and get_loss.
        examples (Iterable[Example]): A batch of Example objects.
        drop (float): The dropout rate.
        sgd (thinc.api.Optimizer): The optimizer.
        losses (Dict[str, float]): Optional record of the loss during training.
            Updated using the component name as the key.
        RETURNS (Dict[str, float]): The updated losses dictionary.
        """
        # print("Entered pipe TorchEntityRecognizer.update - ")
        if losses is None:
            losses = {}
        losses.setdefault(self.name, 0.0)
        validate_examples(examples, "TorchEntityRecognizer.update")
        if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
            # Handle cases where there are no tokens in any docs.
            return losses
        set_torch_dropout_rate(self.model, drop)
        tag_scores, bp_tag_scores = self.model.begin_update(
            [eg.predicted for eg in examples]
        )
        for sc in tag_scores:
            if self.model.ops.xp.isnan(sc.sum()):
                raise ValueError(Errors.E940)
        loss, d_tag_scores = self.get_loss(examples, tag_scores)
        bp_tag_scores(d_tag_scores)
        if sgd not in (None, False):
            self.finish_update(sgd)

        losses[self.name] += loss
        return losses

    def get_loss(
        self, examples: Iterable[Example], scores: Iterable[Floats2d]
    ) -> Tuple[float, float]:
        """Find the loss and gradient of loss for the batch of documents and
        their predicted scores.
        examples (Iterable[Example]): The batch of examples.
        scores: Scores representing the model's predictions.
        RETURNS (Tuple[float, float]): The loss and the gradient.
        """
        # print("Entered pipe TorchEntityRecognizer.get_loss - ")
        validate_examples(examples, "TorchEntityRecognizer.get_loss")
        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
        truths = []
        for eg in examples:
            eg_truths = [
                tag if tag != "" else None for tag in biluo_to_iob(eg.get_aligned_ner())
            ]
            truths.append(eg_truths)
        d_scores, loss = loss_func(scores, truths)
        if self.model.ops.xp.isnan(loss):
            raise ValueError(Errors.E910.format(name=self.name))
        return float(loss), d_scores

    def initialize(
        self,
        get_examples: Callable[[], Iterable[Example]],
        *,
        nlp: Optional[Language] = None,
        labels: Optional[List[str]] = None,
    ):
        """Initialize the pipe for training, using a representative set
        of data examples.
        get_examples (Callable[[], Iterable[Example]]): Function that
            returns a representative sample of gold-standard Example objects.
        nlp (Language): The current nlp object the component is part of.
        labels (Optional[List[str]]): The labels to add to the component, typically generated by the
            `init labels` command. If no labels are provided, the get_examples
            callback is used to extract the labels from the data.
        """
        # print("Entered pipe TorchEntityRecognizer.initialize - ")
        validate_get_examples(get_examples, "TorchEntityRecognizer.initialize")
        if labels is not None:
            for tag in labels:
                self.add_label(tag)
        else:
            tags = set()
            for example in get_examples():
                for token in example.y:
                    if token.ent_type_:
                        tags.add(token.ent_type_)
            for tag in sorted(tags):
                self.add_label(tag)
        doc_sample = []
        for example in islice(get_examples(), 10):
            doc_sample.append(example.x)

        self._require_labels()
        assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
        # print(nlp.config["components"][self.name]["model"]["nO"])
        # print(nlp.config["components"][self.name]["model"]["nI"])
        self.model.initialize(X=doc_sample, Y=self.labels)
        # print("self.model.initialize exit")
        # print(self.model.name)
        # print(self.model.layers[0].ref_names)
        # print(self.model.layers[1].ref_names)
        # print(self.name)
        nlp.config["components"][self.name]["model"]["nO"] = len(self.labels)
        # nlp.config["components"][self.name]["model"]["nI"] = 768
        # print(nlp.config["components"][self.name]["model"])

    def add_label(self, label: str) -> int:
        """Add a new label to the pipe.
        label (str): The label to add.
        RETURNS (int): 0 if label is already present, otherwise 1.
        """
        # print("Entered pipe TorchEntityRecognizer.add_label - ")
        if not isinstance(label, str):
            raise ValueError(Errors.E187)
        if label in self.labels:
            return 0
        self._allow_extra_label()
        self.cfg["labels"].append(label)
        self.vocab.strings.add(label)
        return 1

    def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
        """Score a batch of examples.
        examples (Iterable[Example]): The examples to score.
        RETURNS (Dict[str, Any]): The NER precision, recall and f-scores.
        """
        # print("Entered pipe TorchEntityRecognizer.score - ")
        validate_examples(examples, "TorchEntityRecognizer.score")
        return get_ner_prf(examples)
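
A quick way to confirm the factory wires up is to add it to a blank pipeline. Full training is expected to go through spacy train (the architecture hard-codes a 768-wide input, i.e. a transformer-sized tok2vec), so this sketch only checks construction:

    import spacy
    import scripts.torch_ner_model  # noqa: F401 - registers the architecture
    import scripts.torch_ner_pipe   # noqa: F401 - registers the "torch_ner" factory

    nlp = spacy.blank("en")
    ner = nlp.add_pipe("torch_ner")  # built from default_model_config above
    print(nlp.pipe_names, ner.labels)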
scripts/torch_ner_pipe_test.py ADDED
@@ -0,0 +1,294 @@
from collections import OrderedDict
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
import numpy
from thinc.api import (
    Config,
    Model,
    set_dropout_rate,
    SequenceCategoricalCrossentropy,
    Optimizer,
)
from thinc.types import Ints1d, Floats2d
from itertools import islice

from spacy.tokens.doc import Doc
from spacy.vocab import Vocab

from spacy.training import Example
from spacy.training.iob_utils import biluo_tags_to_spans, biluo_to_iob, iob_to_biluo
from spacy.pipeline.trainable_pipe import TrainablePipe
from spacy.pipeline.pipe import deserialize_config
from spacy.language import Language
from spacy.attrs import POS, ID
from spacy.parts_of_speech import X
from spacy.errors import Errors
from spacy.scorer import get_ner_prf
from spacy.training import validate_examples, validate_get_examples
from spacy import util


def set_torch_dropout_rate(model: Model, dropout_rate: float):
    """Set dropout rate for Thinc and wrapped PyTorch models

    Args:
        model (Model): Thinc Model (with PyTorch sub-modules)
        dropout_rate (float): Dropout rate
    """
    print("Entered set_torch_dropout_rate - ")
    set_dropout_rate(model, dropout_rate)
    func = model.get_ref("torch_model").attrs["set_dropout_rate"]
    func(dropout_rate)


default_model_config = """
[model]
@architectures = "TorchEntityRecognizer.v1"
hidden_width = 48
dropout = 0.1
nO = null

[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = null
width = 96
depth = 4
embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
"""
DEFAULT_MODEL = Config().from_str(default_model_config)["model"]


@Language.factory(
    "torch_ner",
    assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
    default_config={"model": DEFAULT_MODEL},
    default_score_weights={
        "ents_f": 1.0,
        "ents_p": 0.0,
        "ents_r": 0.0,
        "ents_per_type": None,
    },
)
def make_torch_entity_recognizer(nlp: Language, name: str, model: Model):
    """Construct a PyTorch based Named Entity Recognition model
    model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
    the tag probabilities. The output vectors should match the number of tags
    in size, and be normalized as probabilities (all scores between 0 and 1,
    with the rows summing to 1).
    """
    print("Entered make_torch_entity_recognizer - ")
    return TorchEntityRecognizer(nlp.vocab, model, name)


class TorchEntityRecognizer(TrainablePipe):
    """Pipeline component Named Entity Recognition using PyTorch"""

    def __init__(self, vocab: Vocab, model: Model, name: str = "torch_ner"):
        """Initialize the PyTorch entity recognizer component.
        vocab (Vocab): The shared vocabulary.
        model (thinc.api.Model): The Thinc Model powering the pipeline component.
        name (str): The component instance name, used to add entries to the
            losses during training.
        """
        print("Entered pipe TorchEntityRecognizer.__init__ - ")
        self.vocab = vocab
        self.model = model
        self.name = name
        cfg = {"labels": []}
        self.cfg = dict(sorted(cfg.items()))
        print(self.vocab, self.model, self.name, self.cfg)
        print(self.model.layers[0].ref_names)
        print(self.model.layers[1].ref_names)
        print("Completed pipe TorchEntityRecognizer.__init__ - ")

    @property
    def labels(self) -> Tuple[str, ...]:
        """The labels currently added to the component.
        RETURNS (Tuple[str]): The labels.
        """
        # print("Entered TorchEntityRecognizer.labels - ")
        labels = ["O"]
        for label in self.cfg["labels"]:
            for iob in ["B", "I"]:
                labels.append(f"{iob}-{label}")
        return tuple(labels)

    def predict(self, docs: Iterable[Doc]) -> Iterable[Ints1d]:
        """Apply the pipeline's model to a batch of docs, without modifying them.
        docs (Iterable[Doc]): The documents to predict.
        RETURNS: The model's prediction for each document.
        """
        print("Entered pipe TorchEntityRecognizer.predict - ")
        if not any(len(doc) for doc in docs):
            # Handle cases where there are no tokens in any docs.
            n_labels = len(self.labels)
            guesses = [self.model.ops.alloc((0, n_labels)) for doc in docs]
            assert len(guesses) == len(docs)
            return guesses
        scores = self.model.predict(docs)

        assert len(scores) == len(docs), (len(scores), len(docs))
        guesses = []
        for doc_scores in scores:
            doc_guesses = doc_scores.argmax(axis=1)
            if not isinstance(doc_guesses, numpy.ndarray):
                doc_guesses = doc_guesses.get()
            guesses.append(doc_guesses)
        assert len(guesses) == len(docs)
        return guesses

    def set_annotations(self, docs: Iterable[Doc], preds: Iterable[Ints1d]):
        """Modify a batch of documents, using pre-computed scores.
        docs (Iterable[Doc]): The documents to modify.
        preds (Iterable[Ints1d]): The IDs to set, produced by TorchEntityRecognizer.predict.
        """
        print("Entered pipe TorchEntityRecognizer.set_annotations - ")
        if isinstance(docs, Doc):
            docs = [docs]
        for doc, tag_ids in zip(docs, preds):
            labels = iob_to_biluo([self.labels[tag_id] for tag_id in tag_ids])
            try:
                spans = biluo_tags_to_spans(doc, labels)
            except ValueError:
                # Note:
                # biluo_tags_to_spans will raise an exception for an invalid tag sequence
                # this could be fixed using a more complex transition system
                # (e.g. a Conditional Random Field model head)
                spans = []
            doc.ents = spans

    def update(
        self,
        examples: Iterable[Example],
        *,
        drop: float = 0.0,
        sgd: Optimizer = None,
        losses: Dict[str, float] = None,
    ) -> Dict[str, float]:
        """Learn from a batch of documents and gold-standard information,
        updating the pipe's model. Delegates to predict and get_loss.
        examples (Iterable[Example]): A batch of Example objects.
        drop (float): The dropout rate.
        sgd (thinc.api.Optimizer): The optimizer.
        losses (Dict[str, float]): Optional record of the loss during training.
            Updated using the component name as the key.
        RETURNS (Dict[str, float]): The updated losses dictionary.
        """
        print("Entered pipe TorchEntityRecognizer.update - ")
        if losses is None:
            losses = {}
        losses.setdefault(self.name, 0.0)
        validate_examples(examples, "TorchEntityRecognizer.update")
        if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
            # Handle cases where there are no tokens in any docs.
            return losses
        set_torch_dropout_rate(self.model, drop)
        tag_scores, bp_tag_scores = self.model.begin_update(
            [eg.predicted for eg in examples]
        )
        for sc in tag_scores:
            if self.model.ops.xp.isnan(sc.sum()):
                raise ValueError(Errors.E940)
        loss, d_tag_scores = self.get_loss(examples, tag_scores)
        bp_tag_scores(d_tag_scores)
        if sgd not in (None, False):
            self.finish_update(sgd)

        losses[self.name] += loss
        return losses

    def get_loss(
        self, examples: Iterable[Example], scores: Iterable[Floats2d]
    ) -> Tuple[float, float]:
        """Find the loss and gradient of loss for the batch of documents and
        their predicted scores.
        examples (Iterable[Example]): The batch of examples.
        scores: Scores representing the model's predictions.
        RETURNS (Tuple[float, float]): The loss and the gradient.
        """
        print("Entered pipe TorchEntityRecognizer.get_loss - ")
        validate_examples(examples, "TorchEntityRecognizer.get_loss")
        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
        truths = []
        for eg in examples:
            eg_truths = [
                tag if tag != "" else None for tag in biluo_to_iob(eg.get_aligned_ner())
            ]
            truths.append(eg_truths)
        d_scores, loss = loss_func(scores, truths)
        if self.model.ops.xp.isnan(loss):
            raise ValueError(Errors.E910.format(name=self.name))
        return float(loss), d_scores

    def initialize(
        self,
        get_examples: Callable[[], Iterable[Example]],
        *,
        nlp: Optional[Language] = None,
        labels: Optional[List[str]] = None,
    ):
        """Initialize the pipe for training, using a representative set
        of data examples.
        get_examples (Callable[[], Iterable[Example]]): Function that
            returns a representative sample of gold-standard Example objects.
        nlp (Language): The current nlp object the component is part of.
        labels (Optional[List[str]]): The labels to add to the component, typically generated by the
            `init labels` command. If no labels are provided, the get_examples
            callback is used to extract the labels from the data.
        """
        print("Entered pipe TorchEntityRecognizer.initialize - ")
        validate_get_examples(get_examples, "TorchEntityRecognizer.initialize")
        if labels is not None:
            for tag in labels:
                self.add_label(tag)
        else:
            tags = set()
            for example in get_examples():
                for token in example.y:
                    if token.ent_type_:
                        tags.add(token.ent_type_)
            for tag in sorted(tags):
                self.add_label(tag)
        doc_sample = []
        for example in islice(get_examples(), 10):
            doc_sample.append(example.x)

        self._require_labels()
        assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
        print(nlp.config["components"][self.name]["model"]["nO"])
        # print(nlp.config["components"][self.name]["model"]["nI"])
        self.model.initialize(X=doc_sample, Y=self.labels)
        print("self.model.initialize exit")
        print(self.model.name)
        print(self.model.layers[0].ref_names)
        print(self.model.layers[1].ref_names)
        print(self.name)
        nlp.config["components"][self.name]["model"]["nO"] = len(self.labels)
        # nlp.config["components"][self.name]["model"]["nI"] = 768
        print(nlp.config["components"][self.name]["model"])

    def add_label(self, label: str) -> int:
        """Add a new label to the pipe.
        label (str): The label to add.
        RETURNS (int): 0 if label is already present, otherwise 1.
        """
        print("Entered pipe TorchEntityRecognizer.add_label - ")
        if not isinstance(label, str):
            raise ValueError(Errors.E187)
        if label in self.labels:
            return 0
        self._allow_extra_label()
        self.cfg["labels"].append(label)
        self.vocab.strings.add(label)
        return 1

    def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
        """Score a batch of examples.
        examples (Iterable[Example]): The examples to score.
        RETURNS (Dict[str, Any]): The NER precision, recall and f-scores.
        """
        print("Entered pipe TorchEntityRecognizer.score - ")
        validate_examples(examples, "TorchEntityRecognizer.score")
        return get_ner_prf(examples)
scripts/visualize_model.py ADDED
@@ -0,0 +1,19 @@
import spacy_streamlit
import typer
from torch_ner_model import build_torch_ner_model
from torch_ner_pipe import make_torch_entity_recognizer


def main(models: str, default_text: str):
    models = [name.strip() for name in models.split(",")]
    labels = ["person", "problem", "pronoun", "test", "treatment"]
    spacy_streamlit.visualize(
        models, default_text, visualizers=["ner"], ner_labels=labels
    )


if __name__ == "__main__":
    try:
        typer.run(main)
    except SystemExit:
        pass
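
The SystemExit guard is there because typer ends every invocation with sys.exit(), which would otherwise surface as an error inside a Streamlit session. The app is typically launched with something like "streamlit run scripts/visualize_model.py -- <model-path> <default text>" (model path and text are placeholders), passing a comma-separated list of model paths if several pipelines should be compared side by side.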