Spaces:
Runtime error
Runtime error
AmitGarage
committed on
Commit
•
3ab8bd6
1
Parent(s):
4439cab
Upload 8 files
Browse files- scripts/__init__.py +1 -0
- scripts/custom_functions.py +3 -0
- scripts/preprocess.py +214 -0
- scripts/torch_ner_model.py +203 -0
- scripts/torch_ner_model_test.py +203 -0
- scripts/torch_ner_pipe.py +294 -0
- scripts/torch_ner_pipe_test.py +294 -0
- scripts/visualize_model.py +19 -0
scripts/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
|
scripts/custom_functions.py
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
from scripts.azure.azure_ner_pipe import make_azure_entity_recognizer
|
2 |
+
from scripts.torch_ner_model import build_torch_ner_model
|
3 |
+
from scripts.torch_ner_pipe import make_torch_entity_recognizer
|
scripts/preprocess.py
ADDED
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from collections import defaultdict
|
2 |
+
import random
|
3 |
+
from typing import List
|
4 |
+
import tarfile
|
5 |
+
import shutil
|
6 |
+
import typer
|
7 |
+
from pathlib import Path
|
8 |
+
import spacy
|
9 |
+
from spacy.language import Language
|
10 |
+
from spacy.tokens import Doc, DocBin, Span
|
11 |
+
from spacy.util import filter_spans
|
12 |
+
from wasabi import msg
|
13 |
+
from spacy.tokenizer import Tokenizer
|
14 |
+
from spacy.util import compile_prefix_regex, compile_infix_regex, compile_suffix_regex
|
15 |
+
import functools
|
16 |
+
|
17 |
+
random.seed(42)
|
18 |
+
|
19 |
+
|
20 |
+
def main(
    input_dir: Path = typer.Argument(..., exists=True),
    output_dir: Path = typer.Argument(...),
    beth_train_tar_name: str = "i2b2_Beth_Train_Release.tar.gz",
    partners_train_tar_name: str = "i2b2_Partners_Train_Release.tar.gz",
    test_zip_name: str = "Task_1C.zip",
    merge_docs: bool = True,
):
    """Preprocess raw n2c2 2011 Challenge data into spaCy DocBin format.

    The training records (Beth + Partners) are shuffled and split 80/20 into
    train/dev sets; the two test sets are concatenated unchanged.

    input_dir (Path): Input directory with raw downloads from Harvard DBMI Portal.
    output_dir (Path): Output directory to save spaCy .spacy files to.
        Created if it does not exist.
    beth_train_tar_name (str): Filename of downloaded tarfile for Beth Training Data.
    partners_train_tar_name (str): Filename of downloaded tarfile for Partners Training Data.
    test_zip_name (str): Filename of downloaded zip archive for n2c2 Test Data.
    merge_docs (bool): If False, create spaCy docs for each line of each medical record
    """
    # NOTE: archive extraction is currently disabled, so the three archive
    # name parameters are unused. The data is expected to be already unpacked
    # under input_dir ("Beth_Train", "Partners_Train",
    # "Task_1C/i2b2_Test/...").
    msg.info("Extracting raw data.")

    # preprocess data
    msg.info("Converting to spaCy Doc objects.")

    def load_split(relative_dir: str) -> List[Doc]:
        """Log the split name to the debug file, then convert its records."""
        split_dir = input_dir / relative_dir
        # NOTE(review): hard-coded absolute debug-log path; this raises if
        # the directory does not exist on the current machine.
        with open(r'/notebooks/Clinical_NER/difference.txt', 'a') as fp:
            fp.write(str(split_dir.stem)+'\n')
        return docs_from_many_clinical_records(split_dir, merge_docs=merge_docs)

    train_docs = load_split("Beth_Train") + load_split("Partners_Train")
    test_docs = load_split("Task_1C/i2b2_Test/i2b2_Beth_Test") + load_split(
        "Task_1C/i2b2_Test/i2b2_Partners_Test"
    )

    # Shuffle is deterministic because the module seeds `random` at import.
    random.shuffle(train_docs)
    split_idx = int(len(train_docs) * 0.8)
    train_docs, dev_docs = train_docs[:split_idx], train_docs[split_idx:]

    msg.good(f"Num Train Docs: {len(train_docs)}")
    msg.good(f"Num Dev Docs: {len(dev_docs)}")
    msg.good(f"Num Test Docs: {len(test_docs)}")

    # DocBin.to_disk opens the target file directly and does not create
    # missing parent directories, so make sure the output directory exists.
    output_dir.mkdir(parents=True, exist_ok=True)

    with msg.loading(f"Saving docs to: {output_dir}..."):
        DocBin(docs=train_docs).to_disk(output_dir / "train.spacy")
        DocBin(docs=dev_docs).to_disk(output_dir / "dev.spacy")
        DocBin(docs=test_docs).to_disk(output_dir / "test.spacy")
        msg.good("Done.")
|
97 |
+
|
98 |
+
|
99 |
+
def docs_from_clinical_record(
    lines: List[str], annotations: List[str], nlp: Language, merge_docs: bool = False
) -> List[Doc]:
    """Build spaCy docs for one annotated clinical record (n2c2 2011 format).

    Each annotation row is split on "||"; the last two space-separated fields
    of the first part are "line:token" start/end offsets and the label is the
    last quoted value of the second part. Annotations spanning more than one
    line are dropped. A comparison between the annotated entity texts and the
    texts of the spans actually created is appended to a debug log file.

    lines (List[str]): Text of the clinical record, one element per line.
    annotations (List[str]): Raw entity annotations in the n2c2 2011 format.
    nlp (Language): spaCy Language object. Its tokenizer is replaced and its
        Defaults.prefixes are filtered by this function.
    merge_docs (bool): If True: merge all lines into a single spaCy doc so
        there is only 1 element in the output array.
        If False: create a spaCy doc for each line in the original record
    RETURNS (List[Doc]): List of spaCy Doc objects with entity spans set
    """
    # Prefix patterns containing any of these characters are discarded.
    blocked_chars = (
        ':', '#', '+', '(', ')', '*', "'", "%", "_", ";", ">", ",", "&", '"', "<"
    )
    # NOTE(review): this reassigns the class-level Defaults (shared state)
    # and the filtered *prefix* patterns are then used as infix patterns.
    nlp.Defaults.prefixes = [
        pattern
        for pattern in nlp.Defaults.prefixes
        if not any(ch in pattern for ch in blocked_chars)
    ]
    infix_re = compile_infix_regex(nlp.Defaults.prefixes + [r"[-]~"])
    # Tokenizer with only infix rules (no prefix/suffix search configured).
    nlp.tokenizer = Tokenizer(nlp.vocab, infix_finditer=infix_re.finditer)

    spans_by_line = defaultdict(list)
    entities = {}  # line number (as str) -> annotated entity texts

    for raw_row in annotations:
        fields = raw_row.split("||")
        text_info, type_info = fields[0], fields[1]

        offset_fields = text_info.split(" ")
        offset_start, offset_end = offset_fields[-2], offset_fields[-1]

        start_line, word_start = offset_start.split(":")
        end_line, word_end = offset_end.split(":")

        label = type_info.split('"')[-2]

        if start_line != end_line:
            # This happens very infrequently (only about 10 times in total)
            # so we just skip these annotations
            continue

        spans_by_line[int(start_line)].append(
            (int(word_start), int(word_end), label)
        )
        entities.setdefault(start_line, []).append(text_info.split('"')[1])

    docs = []
    extracted_entities = {}  # line number (as str) -> texts of created spans

    for n, line in enumerate(lines, start=1):
        # NOTE(review): as written this replaces a space with a space; the
        # first literal may originally have been another whitespace
        # character -- confirm against the upstream file.
        line = line.replace(" "," ")
        doc = nlp.make_doc(line)
        if n in spans_by_line:
            # Annotation end offsets are inclusive, Span ends are exclusive.
            candidates = [
                Span(doc, start, end + 1, label=label)
                for (start, end, label) in spans_by_line[n]
            ]
            # Keep only non-empty spans without leading/trailing whitespace.
            ents = [
                e for e in candidates if e.text.strip() and e.text == e.text.strip()
            ]
            doc.ents = filter_spans(ents)
            extracted_entities[str(n)] = [e.text for e in ents]

        docs.append(doc)

    difference = []
    for key, expected in entities.items():
        if key not in extracted_entities:
            difference += [key+" Key not present"] + expected
            continue
        found = extracted_entities[key]
        # Logged only when every position-wise pair differs (case-insensitive).
        if all(p.lower() != q.lower() for p, q in zip(expected, found)):
            difference += [key] + expected + found

    with open(r'/notebooks/Clinical_NER/difference.txt', 'a') as fp:
        fp.write('\n'.join(difference))

    if merge_docs:
        return [Doc.from_docs(docs)]
    return docs
|
182 |
+
|
183 |
+
|
184 |
+
def docs_from_many_clinical_records(
    base_path: Path, nlp: Language = spacy.blank("en"), merge_docs: bool = True
) -> List[Doc]:
    """Convert raw n2c2 annotated clinical records into a list of
    spaCy Doc objects to be ready to be used in training

    Concept files (base_path/concepts/*.txt.con) and document files
    (base_path/docs/*.txt) are each sorted by path and paired positionally.

    base_path (Path): Root path to the raw data
    nlp (Language): spaCy Language object. Defaults to spacy.blank("en").
        NOTE: the default is created once at import time and shared by
        every call that does not pass its own pipeline.
    merge_docs (bool): If True: merge all lines into a single spaCy doc so
        there is only 1 element in the output array.
        If False: create a spaCy doc for each line in the original record

    RETURNS (List[Doc]): List of spaCy Doc objects with entity spans set
    """
    all_docs = []
    concept_paths = sorted((base_path / "concepts").glob("*.txt.con"))
    document_paths = sorted((base_path / "docs").glob("*.txt"))

    if len(concept_paths) != len(document_paths):
        # zip() below stops at the shorter list, so surplus files would be
        # dropped silently and the pairing may be misaligned.
        msg.warn(
            f"Found {len(concept_paths)} concept files but "
            f"{len(document_paths)} document files in {base_path}"
        )

    for con_path, doc_path in zip(concept_paths, document_paths):
        with open(r'/notebooks/Clinical_NER/difference.txt', 'a') as fp:
            fp.write('\n'+str(con_path.stem))
        # read_text() closes the file; the bare open().read() leaked handles.
        annotations = con_path.read_text().splitlines()
        lines = doc_path.read_text().splitlines()

        all_docs.extend(
            docs_from_clinical_record(lines, annotations, nlp, merge_docs=merge_docs)
        )

    return all_docs
|
211 |
+
|
212 |
+
|
213 |
+
if __name__ == "__main__":
|
214 |
+
typer.run(main)
|
scripts/torch_ner_model.py
ADDED
@@ -0,0 +1,203 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from collections import OrderedDict
|
2 |
+
from typing import Optional, List
|
3 |
+
from thinc.api import (
|
4 |
+
with_array,
|
5 |
+
chain,
|
6 |
+
Model,
|
7 |
+
PyTorchWrapper,
|
8 |
+
PyTorchLSTM,
|
9 |
+
)
|
10 |
+
from thinc.types import Floats2d
|
11 |
+
|
12 |
+
from spacy.tokens import Doc
|
13 |
+
from spacy.util import registry
|
14 |
+
import torch
|
15 |
+
from torch import nn
|
16 |
+
|
17 |
+
|
18 |
+
@registry.architectures("TorchEntityRecognizer.v1")
def build_torch_ner_model(
    tok2vec: Model[List[Doc], List[Floats2d]],
    hidden_width: int,
    dropout: Optional[float] = None,
    nO: Optional[int] = None,
) -> Model[List[Doc], List[Floats2d]]:
    """Build an entity recognizer model from a token-to-vector component
    chained with a PyTorch head (linear -> ReLU -> linear -> softmax).

    tok2vec (Model[List[Doc], List[Floats2d]]): The token-to-vector subnetwork.
    hidden_width (int): Width of the hidden layer of the PyTorch head.
    dropout (float or None): Dropout rate of the PyTorch head. None means 0.0.
    nO (int or None): The number of tags to output. Inferred from the data if None.
    RETURNS (Model[List[Doc], List[Floats2d]]): Initialized Model
    """
    listener = tok2vec.maybe_get_ref("listener")
    inferred_width = listener.maybe_get_dim("nO") if listener else None
    # Use the width reported by the listener when it is known; 768 is kept
    # only as the fallback (previously it unconditionally replaced the
    # inferred value).
    t2v_width = inferred_width or 768
    # The signature allows dropout=None, but the torch dropout layers need
    # a number.
    dropout_rate = 0.0 if dropout is None else dropout

    torch_model = TorchEntityRecognizer(t2v_width, hidden_width, nO, dropout_rate)
    wrapped_pt_model = PyTorchWrapper(torch_model)
    # Exposed so the pipeline component can update the torch dropout layers.
    wrapped_pt_model.attrs["set_dropout_rate"] = torch_model.set_dropout_rate

    model = chain(tok2vec, with_array(wrapped_pt_model))
    model.set_ref("tok2vec", tok2vec)
    model.set_ref("torch_model", wrapped_pt_model)
    # Custom init resizes the torch layers once the label set is known.
    model.init = init
    return model
|
54 |
+
|
55 |
+
|
56 |
+
def init(
    model: Model[List[Doc], Floats2d],
    X: Optional[List[Doc]] = None,
    Y: Optional[List[str]] = None,
) -> Model[List[Doc], List[Floats2d]]:
    """Resize the wrapped PyTorch layers from the listener width and labels.

    model (Model[List[Doc], Floats2d]): Thinc Model wrapping tok2vec and PyTorch model
    X (Optional[List[Doc]], optional): Sample of Doc objects (not used here).
    Y (Optional[List[str]], optional): Available model labels; only its
        length is used, as the output width.
    RETURNS (Model[List[Doc], List[Floats2d]]): Initialized Model
    """
    tok2vec = model.get_ref("tok2vec")
    wrapper = model.get_ref("torch_model")

    listener = tok2vec.maybe_get_ref("listener")
    input_width = listener.maybe_get_dim("nO") if listener else None

    if input_width:
        # shims[0]._model is the underlying torch module held by the wrapper.
        wrapper.shims[0]._model.set_input_shape(input_width)
        wrapper.set_dim("nI", input_width)

    if Y is not None:
        label_count = len(Y)
        wrapper.shims[0]._model.set_output_shape(label_count)
        wrapper.set_dim("nO", label_count)

    tok2vec.initialize()
    return model
|
102 |
+
|
103 |
+
|
104 |
+
def is_dropout_module(
    module: nn.Module,
    dropout_modules: List[nn.Module] = [nn.Dropout, nn.Dropout2d, nn.Dropout3d],
) -> bool:
    """Tell whether a PyTorch Module is one of the given Dropout layer types.

    module (nn.Module): Module to check
    dropout_modules (List[nn.Module], optional): Module classes that count
        as Dropout layers. The default list is only read, never modified.
    RETURNS (bool): True if module is an instance of any listed class.
    """
    return any(isinstance(module, layer_type) for layer_type in dropout_modules)
|
118 |
+
|
119 |
+
|
120 |
+
class TorchEntityRecognizer(nn.Module):
    """Torch Entity Recognizer Model Head"""

    def __init__(self, nI: int, nH: int, nO: int, dropout: float):
        """Initialize TorchEntityRecognizer.

        nI (int): Input Dimension. A falsy value is replaced by 1.
        nH (int): Hidden Dimension Width
        nO (int): Output Dimension Width. A falsy value is replaced by 1.
        dropout (float): Dropout ratio (0 - 1.0). None is treated as 0.0.
        """
        super(TorchEntityRecognizer, self).__init__()

        # Just for initialization of PyTorch layer. Output shape set during Model.init
        nI = nI or 1
        nO = nO or 1
        # nn.Dropout2d compares p against 0 and 1, so None would raise.
        dropout = 0.0 if dropout is None else dropout

        self.nH = nH
        # NOTE(review): Dropout2d is applied to the linear layers' outputs
        # and a dropout layer sits between the output layer and the
        # softmax; kept as-is to preserve training behavior -- confirm
        # this is intended.
        self.model = nn.Sequential(
            OrderedDict(
                {
                    "input_layer": nn.Linear(nI, nH),
                    "input_activation": nn.ReLU(),
                    "input_dropout": nn.Dropout2d(dropout),
                    "output_layer": nn.Linear(nH, nO),
                    "output_dropout": nn.Dropout2d(dropout),
                    "softmax": nn.Softmax(dim=1),
                }
            )
        )

    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
        """Forward pass of the model.

        inputs (torch.Tensor): Batch of outputs from spaCy tok2vec layer
        RETURNS (torch.Tensor): Batch of results with a score for each tag for each token
        """
        return self.model(inputs)

    def _set_layer_shape(self, name: str, nI: int, nO: int):
        """Dynamically set the shape of a linear layer and re-initialize it.

        name (str): Layer name inside self.model
        nI (int): New input shape
        nO (int): New output shape
        """
        with torch.no_grad():
            layer = getattr(self.model, name)
            old_weight = layer.weight
            # Keep both bookkeeping attributes in sync with the new weight
            # (in_features was previously left stale).
            layer.in_features = nI
            layer.out_features = nO
            # new_empty keeps the dtype/device of the existing parameters.
            layer.weight = nn.Parameter(old_weight.new_empty(nO, nI))
            if layer.bias is not None:
                layer.bias = nn.Parameter(old_weight.new_empty(nO))
            # Fill the freshly allocated (uninitialized) tensors.
            layer.reset_parameters()

    def set_input_shape(self, nI: int):
        """Dynamically set the shape of the input layer

        nI (int): New input layer shape
        """
        self._set_layer_shape("input_layer", nI, self.nH)

    def set_output_shape(self, nO: int):
        """Dynamically set the shape of the output layer

        nO (int): New output layer shape
        """
        self._set_layer_shape("output_layer", self.nH, nO)

    def set_dropout_rate(self, dropout: float):
        """Set the dropout rate of all Dropout layers in the model.

        dropout (float): Dropout rate to set
        """
        for module in self.modules():
            if is_dropout_module(module):
                module.p = dropout
|
scripts/torch_ner_model_test.py
ADDED
@@ -0,0 +1,203 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from collections import OrderedDict
|
2 |
+
from typing import Optional, List
|
3 |
+
from thinc.api import (
|
4 |
+
with_array,
|
5 |
+
chain,
|
6 |
+
Model,
|
7 |
+
PyTorchWrapper,
|
8 |
+
PyTorchLSTM,
|
9 |
+
)
|
10 |
+
from thinc.types import Floats2d
|
11 |
+
|
12 |
+
from spacy.tokens import Doc
|
13 |
+
from spacy.util import registry
|
14 |
+
import torch
|
15 |
+
from torch import nn
|
16 |
+
|
17 |
+
|
18 |
+
@registry.architectures("TorchEntityRecognizer.v1")
def build_torch_ner_model(
    tok2vec: Model[List[Doc], List[Floats2d]],
    hidden_width: int,
    dropout: Optional[float] = None,
    nO: Optional[int] = None,
) -> Model[List[Doc], List[Floats2d]]:
    """Debug variant: build an entity recognizer model from a token-to-vector
    component chained with a PyTorch head, printing each step.

    tok2vec (Model[List[Doc], List[Floats2d]]): The token-to-vector subnetwork.
    hidden_width (int): Width of the hidden layer of the PyTorch head.
    dropout (float or None): Dropout rate of the PyTorch head. None means 0.0.
    nO (int or None): The number of tags to output. Inferred from the data if None.
    RETURNS (Model[List[Doc], List[Floats2d]]): Initialized Model
    """
    print("Entered build_torch_ner_model - ")
    print(tok2vec.dim_names,tok2vec.name)
    listener = tok2vec.maybe_get_ref("listener")
    # Guarded: tok2vec may have no "listener" ref, in which case the
    # unguarded debug print raised AttributeError on None.
    print(listener.maybe_get_dim("nI") if listener else None)
    t2v_width = listener.maybe_get_dim("nO") if listener else None
    print(t2v_width, hidden_width, nO, dropout)
    # 768 is only the fallback when the listener width is unknown
    # (previously it unconditionally replaced the inferred value).
    t2v_width = t2v_width or 768
    print(t2v_width, hidden_width, nO, dropout)
    # The signature allows dropout=None, but the torch dropout layers need
    # a number.
    dropout_rate = 0.0 if dropout is None else dropout
    torch_model = TorchEntityRecognizer(t2v_width, hidden_width, nO, dropout_rate)
    print("torch_model - ",torch_model)
    wrapped_pt_model = PyTorchWrapper(torch_model)
    print("wrapped")
    # Exposed so the pipeline component can update the torch dropout layers.
    wrapped_pt_model.attrs["set_dropout_rate"] = torch_model.set_dropout_rate
    print("set dropout")

    model = chain(tok2vec, with_array(wrapped_pt_model))
    print(model.param_names)
    model.set_ref("tok2vec", tok2vec)
    model.set_ref("torch_model", wrapped_pt_model)
    # Custom init resizes the torch layers once the label set is known.
    model.init = init
    print("Completed build_torch_ner_model")
    return model
|
54 |
+
|
55 |
+
|
56 |
+
def init(
    model: Model[List[Doc], Floats2d],
    X: Optional[List[Doc]] = None,
    Y: Optional[List[str]] = None,
) -> Model[List[Doc], List[Floats2d]]:
    """Debug variant: dynamically set PyTorch layer shapes based on the
    listener width and the labels data, printing each step.

    model (Model[List[Doc], Floats2d]): Thinc Model wrapping tok2vec and PyTorch model
    X (Optional[List[Doc]], optional): Sample of Doc objects (not used here).
    Y (Optional[List[str]], optional): Available model labels; only its
        length is used, as the output width.
    RETURNS (Model[List[Doc], List[Floats2d]]): Initialized Model
    """

    print("Entered init - ")
    tok2vec = model.get_ref("tok2vec")
    print(tok2vec.ref_names)
    torch_model = model.get_ref("torch_model")
    print(torch_model)

    print("Ref names - ",model.ref_names)
    print(tok2vec.dim_names,tok2vec.name)
    print(torch_model.dim_names,torch_model.name)
    listener = tok2vec.maybe_get_ref("listener")
    print(listener)
    t2v_width = listener.maybe_get_dim("nO") if listener else None
    print(t2v_width," - ",Y)
    if t2v_width:
        # shims[0]._model is the underlying torch module held by the wrapper.
        print(torch_model.shims[0]._model)
        print("Searching - ",torch_model.maybe_get_dim("nI"))
        torch_model.shims[0]._model.set_input_shape(t2v_width)
        torch_model.set_dim("nI", t2v_width)
        print(torch_model.dim_names)

    if Y is not None:
        nO = len(Y)
        print(nO)
        torch_model.shims[0]._model.set_output_shape(nO)
        torch_model.set_dim("nO", nO)
        print(torch_model)

    tok2vec.initialize()
    print(tok2vec)
    # maybe_get_dim instead of get_dim: "nI" is only set above when the
    # listener width is known, and get_dim raises for an unset dimension,
    # which would crash this debug print.
    print("Found - ",torch_model.maybe_get_dim("nI"))
    print("Exit")
    return model
|
102 |
+
|
103 |
+
|
104 |
+
def is_dropout_module(
    module: nn.Module,
    dropout_modules: List[nn.Module] = [nn.Dropout, nn.Dropout2d, nn.Dropout3d],
) -> bool:
    """Debug variant: tell whether a PyTorch Module is one of the given
    Dropout layer types, printing on entry.

    module (nn.Module): Module to check
    dropout_modules (List[nn.Module], optional): Module classes that count
        as Dropout layers. The default list is only read, never modified.
    RETURNS (bool): True if module is an instance of any listed class.
    """
    print("Entered is_dropout_module - ")
    return any(isinstance(module, layer_type) for layer_type in dropout_modules)
|
118 |
+
|
119 |
+
|
120 |
+
class TorchEntityRecognizer(nn.Module):
    """Torch Entity Recognizer Model Head (debug variant with prints)"""

    def __init__(self, nI: int, nH: int, nO: int, dropout: float):
        """Initialize TorchEntityRecognizer.

        nI (int): Input Dimension. A falsy value is replaced by 1.
        nH (int): Hidden Dimension Width
        nO (int): Output Dimension Width. A falsy value is replaced by 1.
        dropout (float): Dropout ratio (0 - 1.0). None is treated as 0.0.
        """
        super(TorchEntityRecognizer, self).__init__()

        # Just for initialization of PyTorch layer. Output shape set during Model.init
        print("Entered TorchEntityRecognizer.__init__ - ")
        nI = nI or 1
        nO = nO or 1
        # nn.Dropout2d compares p against 0 and 1, so None would raise.
        dropout = 0.0 if dropout is None else dropout

        self.nH = nH
        # NOTE(review): Dropout2d is applied to the linear layers' outputs
        # and a dropout layer sits between the output layer and the
        # softmax; kept as-is to preserve training behavior -- confirm
        # this is intended.
        self.model = nn.Sequential(
            OrderedDict(
                {
                    "input_layer": nn.Linear(nI, nH),
                    "input_activation": nn.ReLU(),
                    "input_dropout": nn.Dropout2d(dropout),
                    "output_layer": nn.Linear(nH, nO),
                    "output_dropout": nn.Dropout2d(dropout),
                    "softmax": nn.Softmax(dim=1),
                }
            )
        )
        print(self.model)

    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
        """Forward pass of the model.

        inputs (torch.Tensor): Batch of outputs from spaCy tok2vec layer
        RETURNS (torch.Tensor): Batch of results with a score for each tag for each token
        """
        print("Entered TorchEntityRecognizer.forward - ")
        return self.model(inputs)

    def _set_layer_shape(self, name: str, nI: int, nO: int):
        """Dynamically set the shape of a linear layer and re-initialize it.

        name (str): Layer name inside self.model
        nI (int): New input shape
        nO (int): New output shape
        """
        print("Entered TorchEntityRecognizer._set_layer_shape - ",nO, nI)
        with torch.no_grad():
            layer = getattr(self.model, name)
            print(layer)
            old_weight = layer.weight
            # Keep both bookkeeping attributes in sync with the new weight
            # (in_features was previously left stale).
            layer.in_features = nI
            layer.out_features = nO
            # new_empty keeps the dtype/device of the existing parameters.
            layer.weight = nn.Parameter(old_weight.new_empty(nO, nI))
            print(layer.weight.shape)
            if layer.bias is not None:
                layer.bias = nn.Parameter(old_weight.new_empty(nO))
            print(layer)
            # Fill the freshly allocated (uninitialized) tensors.
            layer.reset_parameters()
            print(layer.weight.shape)
            print(layer)

    def set_input_shape(self, nI: int):
        """Dynamically set the shape of the input layer

        nI (int): New input layer shape
        """
        print("Entered TorchEntityRecognizer.set_input_shape - ",nI, self.nH)
        self._set_layer_shape("input_layer", nI, self.nH)

    def set_output_shape(self, nO: int):
        """Dynamically set the shape of the output layer

        nO (int): New output layer shape
        """
        print("Entered TorchEntityRecognizer.set_output_shape - ", self.nH, nO)
        self._set_layer_shape("output_layer", self.nH, nO)

    def set_dropout_rate(self, dropout: float):
        """Set the dropout rate of all Dropout layers in the model.

        dropout (float): Dropout rate to set
        """
        print("Entered TorchEntityRecognizer.set_dropout_rate - ")
        dropout_layers = [
            module for module in self.modules() if is_dropout_module(module)
        ]
        for layer in dropout_layers:
            layer.p = dropout
|
scripts/torch_ner_pipe.py
ADDED
@@ -0,0 +1,294 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from collections import OrderedDict
|
2 |
+
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
|
3 |
+
import numpy
|
4 |
+
from thinc.api import (
|
5 |
+
Config,
|
6 |
+
Model,
|
7 |
+
set_dropout_rate,
|
8 |
+
SequenceCategoricalCrossentropy,
|
9 |
+
Optimizer,
|
10 |
+
)
|
11 |
+
from thinc.types import Ints1d, Floats2d
|
12 |
+
from itertools import islice
|
13 |
+
|
14 |
+
from spacy.tokens.doc import Doc
|
15 |
+
from spacy.vocab import Vocab
|
16 |
+
|
17 |
+
from spacy.training import Example
|
18 |
+
from spacy.training.iob_utils import biluo_tags_to_spans, biluo_to_iob, iob_to_biluo
|
19 |
+
from spacy.pipeline.trainable_pipe import TrainablePipe
|
20 |
+
from spacy.pipeline.pipe import deserialize_config
|
21 |
+
from spacy.language import Language
|
22 |
+
from spacy.attrs import POS, ID
|
23 |
+
from spacy.parts_of_speech import X
|
24 |
+
from spacy.errors import Errors
|
25 |
+
from spacy.scorer import get_ner_prf
|
26 |
+
from spacy.training import validate_examples, validate_get_examples
|
27 |
+
from spacy import util
|
28 |
+
|
29 |
+
|
30 |
+
def set_torch_dropout_rate(model: Model, dropout_rate: float):
    """Apply a dropout rate to the Thinc model and to its wrapped PyTorch model.

    Args:
        model (Model): Thinc Model holding a "torch_model" reference whose
            attrs contain a "set_dropout_rate" callable
        dropout_rate (float): Dropout rate
    """
    # Thinc-side layers first, then the PyTorch head via its exposed setter.
    set_dropout_rate(model, dropout_rate)
    torch_wrapper = model.get_ref("torch_model")
    torch_wrapper.attrs["set_dropout_rate"](dropout_rate)
|
41 |
+
|
42 |
+
|
43 |
+
# Default model settings for the "torch_ner" factory. `nO` is left as null
# here; TorchEntityRecognizer.initialize writes the real value into the
# pipeline config once the label set is known.
default_model_config = """
[model]
@architectures = "TorchEntityRecognizer.v1"
hidden_width = 48
dropout = 0.1
nO = null

[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = null
width = 96
depth = 4
embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
"""
# Only the [model] section is handed to the factory as its default config.
DEFAULT_MODEL = Config().from_str(default_model_config)["model"]
|
61 |
+
|
62 |
+
|
63 |
+
@Language.factory(
    "torch_ner",
    assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
    default_config={"model": DEFAULT_MODEL},
    default_score_weights={
        "ents_f": 1.0,
        "ents_p": 0.0,
        "ents_r": 0.0,
        "ents_per_type": None,
    },
)
def make_torch_entity_recognizer(nlp: Language, name: str, model: Model):
    """Factory for the PyTorch based Named Entity Recognition component.

    nlp (Language): The pipeline; only its vocab is used here.
    name (str): The component instance name.
    model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
        the tag probabilities. The output vectors should match the number of
        tags in size, and be normalized as probabilities (all scores between
        0 and 1, with the rows summing to 1).
    RETURNS (TorchEntityRecognizer): The constructed component.
    """
    component = TorchEntityRecognizer(nlp.vocab, model, name)
    return component
|
83 |
+
|
84 |
+
|
85 |
+
class TorchEntityRecognizer(TrainablePipe):
    """Pipeline component Named Entity Recognition using PyTorch.

    Entity labels are stored in ``self.cfg["labels"]`` and expanded by the
    ``labels`` property into an IOB tag set ("O", "B-<label>", "I-<label>").
    The position of a tag in that tuple is the class id exchanged with the
    model (see ``set_annotations`` and ``get_loss``).
    """

    def __init__(self, vocab: Vocab, model: Model, name: str = "torch_ner"):
        """Initialize the entity recognizer.

        vocab (Vocab): The shared vocabulary.
        model (thinc.api.Model): The Thinc Model powering the pipeline component.
        name (str): The component instance name, used to add entries to the
            losses during training.
        """
        self.vocab = vocab
        self.model = model
        self.name = name
        cfg = {"labels": []}
        self.cfg = dict(sorted(cfg.items()))

    @property
    def labels(self) -> Tuple[str, ...]:
        """The IOB tags derived from the labels currently added.

        RETURNS (Tuple[str]): "O" followed by a "B-"/"I-" pair for every
            entity label, in the order the labels were added.
        """
        labels = ["O"]
        for label in self.cfg["labels"]:
            for iob in ["B", "I"]:
                labels.append(f"{iob}-{label}")
        return tuple(labels)

    def predict(self, docs: Iterable[Doc]) -> Iterable[Ints1d]:
        """Apply the pipeline's model to a batch of docs, without modifying them.

        docs (Iterable[Doc]): The documents to predict.
        RETURNS: The model's prediction for each document: one array of tag
            ids (indices into ``self.labels``) per doc.
        """
        if isinstance(docs, Doc):
            # Same convenience as set_annotations: accept a bare Doc.
            docs = [docs]
        else:
            # The batch is iterated twice and measured with len() below, so a
            # one-shot iterable has to be materialised first.
            docs = list(docs)
        if not any(len(doc) for doc in docs):
            # Handle cases where there are no tokens in any docs.
            n_labels = len(self.labels)
            return [self.model.ops.alloc((0, n_labels)) for _ in docs]
        scores = self.model.predict(docs)
        assert len(scores) == len(docs), (len(scores), len(docs))
        guesses = []
        for doc_scores in scores:
            doc_guesses = doc_scores.argmax(axis=1)
            if not isinstance(doc_guesses, numpy.ndarray):
                # Presumably a device (e.g. CuPy) array; .get() is expected to
                # copy it to host memory - confirm against the ops in use.
                doc_guesses = doc_guesses.get()
            guesses.append(doc_guesses)
        return guesses

    def set_annotations(self, docs: Iterable[Doc], preds: Iterable[Ints1d]):
        """Modify a batch of documents, using pre-computed scores.

        docs (Iterable[Doc]): The documents to modify.
        preds (Iterable[Ints1d]): The IDs to set, produced by
            TorchEntityRecognizer.predict.
        """
        if isinstance(docs, Doc):
            docs = [docs]
        # `labels` is a property that rebuilds the tuple on every access;
        # resolve it once instead of once per token.
        tag_names = self.labels
        for doc, tag_ids in zip(docs, preds):
            biluo_tags = iob_to_biluo([tag_names[tag_id] for tag_id in tag_ids])
            try:
                spans = biluo_tags_to_spans(doc, biluo_tags)
            except ValueError:
                # Note:
                # biluo_tags_to_spans will raise an exception for an invalid
                # tag sequence. This could be fixed using a more complex
                # transition system (e.g. a Conditional Random Field model
                # head). Until then such a doc gets no entities.
                spans = []
            doc.ents = spans

    def update(
        self,
        examples: Iterable[Example],
        *,
        drop: float = 0.0,
        sgd: Optional[Optimizer] = None,
        losses: Optional[Dict[str, float]] = None,
    ) -> Dict[str, float]:
        """Learn from a batch of documents and gold-standard information,
        updating the pipe's model. Runs the model's forward pass, delegates
        the loss computation to get_loss and backpropagates the gradient.

        examples (Iterable[Example]): A batch of Example objects.
        drop (float): The dropout rate.
        sgd (thinc.api.Optimizer): The optimizer. If None (or False) the
            gradients are computed but no optimizer step is taken.
        losses (Dict[str, float]): Optional record of the loss during training.
            Updated using the component name as the key.
        RETURNS (Dict[str, float]): The updated losses dictionary.
        """
        if losses is None:
            losses = {}
        losses.setdefault(self.name, 0.0)
        validate_examples(examples, "TorchEntityRecognizer.update")
        if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
            # Handle cases where there are no tokens in any docs.
            return losses
        set_torch_dropout_rate(self.model, drop)
        tag_scores, bp_tag_scores = self.model.begin_update(
            [eg.predicted for eg in examples]
        )
        for sc in tag_scores:
            # Fail fast on NaN scores instead of backpropagating them.
            if self.model.ops.xp.isnan(sc.sum()):
                raise ValueError(Errors.E940)
        loss, d_tag_scores = self.get_loss(examples, tag_scores)
        bp_tag_scores(d_tag_scores)
        if sgd not in (None, False):
            self.finish_update(sgd)

        losses[self.name] += loss
        return losses

    def get_loss(
        self, examples: Iterable[Example], scores: Iterable[Floats2d]
    ) -> Tuple[float, List[Floats2d]]:
        """Find the loss and gradient of loss for the batch of documents and
        their predicted scores.

        examples (Iterable[Example]): The batch of examples.
        scores: Scores representing the model's predictions.
        RETURNS (Tuple[float, List[Floats2d]]): The loss and the gradient of
            the loss with respect to the scores.
        """
        validate_examples(examples, "TorchEntityRecognizer.get_loss")
        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
        truths = []
        for eg in examples:
            # Gold tags are converted from BILUO to this component's IOB
            # scheme; empty-string tags are passed to the loss as None.
            eg_truths = [
                tag if tag != "" else None for tag in biluo_to_iob(eg.get_aligned_ner())
            ]
            truths.append(eg_truths)
        d_scores, loss = loss_func(scores, truths)
        if self.model.ops.xp.isnan(loss):
            raise ValueError(Errors.E910.format(name=self.name))
        return float(loss), d_scores

    def initialize(
        self,
        get_examples: Callable[[], Iterable[Example]],
        *,
        nlp: Optional[Language] = None,
        labels: Optional[List[str]] = None,
    ):
        """Initialize the pipe for training, using a representative set
        of data examples.

        get_examples (Callable[[], Iterable[Example]]): Function that
            returns a representative sample of gold-standard Example objects.
        nlp (Language): The current nlp object the component is part of. When
            given, the resolved output width is written back into its config.
        labels (Optional[List[str]]): The labels to add to the component,
            typically generated by the `init labels` command. If no labels are
            provided, the get_examples callback is used to extract the labels
            from the data.
        """
        validate_get_examples(get_examples, "TorchEntityRecognizer.initialize")
        if labels is not None:
            for tag in labels:
                self.add_label(tag)
        else:
            tags = set()
            for example in get_examples():
                for token in example.y:
                    if token.ent_type_:
                        tags.add(token.ent_type_)
            # Sorted so that the label order does not depend on set ordering.
            for tag in sorted(tags):
                self.add_label(tag)
        # A small sample of input docs is passed to the model's initialize.
        doc_sample = [example.x for example in islice(get_examples(), 10)]

        self._require_labels()
        assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
        self.model.initialize(X=doc_sample, Y=self.labels)
        if nlp is not None:
            # `nlp` is optional in the signature, so only record the resolved
            # number of output classes when a pipeline was actually passed.
            nlp.config["components"][self.name]["model"]["nO"] = len(self.labels)

    def add_label(self, label: str) -> int:
        """Add a new label to the pipe.

        label (str): The label to add.
        RETURNS (int): 0 if label is already present, otherwise 1.
        """
        if not isinstance(label, str):
            raise ValueError(Errors.E187)
        # Compare against the raw entity labels. `self.labels` holds the
        # expanded IOB tags ("B-X", "I-X"), so testing against it would never
        # recognise a label that was added before.
        if label in self.cfg["labels"]:
            return 0
        self._allow_extra_label()
        self.cfg["labels"].append(label)
        self.vocab.strings.add(label)
        return 1

    def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
        """Score a batch of examples.

        examples (Iterable[Example]): The examples to score.
        RETURNS (Dict[str, Any]): The NER precision, recall and f-scores.
        """
        validate_examples(examples, "TorchEntityRecognizer.score")
        return get_ner_prf(examples)
|
scripts/torch_ner_pipe_test.py
ADDED
@@ -0,0 +1,294 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from collections import OrderedDict
|
2 |
+
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
|
3 |
+
import numpy
|
4 |
+
from thinc.api import (
|
5 |
+
Config,
|
6 |
+
Model,
|
7 |
+
set_dropout_rate,
|
8 |
+
SequenceCategoricalCrossentropy,
|
9 |
+
Optimizer,
|
10 |
+
)
|
11 |
+
from thinc.types import Ints1d, Floats2d
|
12 |
+
from itertools import islice
|
13 |
+
|
14 |
+
from spacy.tokens.doc import Doc
|
15 |
+
from spacy.vocab import Vocab
|
16 |
+
|
17 |
+
from spacy.training import Example
|
18 |
+
from spacy.training.iob_utils import biluo_tags_to_spans, biluo_to_iob, iob_to_biluo
|
19 |
+
from spacy.pipeline.trainable_pipe import TrainablePipe
|
20 |
+
from spacy.pipeline.pipe import deserialize_config
|
21 |
+
from spacy.language import Language
|
22 |
+
from spacy.attrs import POS, ID
|
23 |
+
from spacy.parts_of_speech import X
|
24 |
+
from spacy.errors import Errors
|
25 |
+
from spacy.scorer import get_ner_prf
|
26 |
+
from spacy.training import validate_examples, validate_get_examples
|
27 |
+
from spacy import util
|
28 |
+
|
29 |
+
|
30 |
+
def set_torch_dropout_rate(model: Model, dropout_rate: float):
    """Apply one dropout rate to a Thinc model and its wrapped PyTorch model.

    Debug variant: prints a trace line on entry.

    Args:
        model (Model): Thinc Model (with PyTorch sub-modules). It must expose
            a node under the reference name "torch_model" whose attrs hold a
            "set_dropout_rate" callable.
        dropout_rate (float): Dropout rate
    """
    print("Entered set_torch_dropout_rate - ")
    # Thinc's own helper first ...
    set_dropout_rate(model, dropout_rate)
    # ... then the callback stored in the attrs of the "torch_model" node.
    model.get_ref("torch_model").attrs["set_dropout_rate"](dropout_rate)
|
41 |
+
|
42 |
+
|
43 |
+
# Default model settings for the "torch_ner" factory. `nO` is left as null
# here; TorchEntityRecognizer.initialize writes the real value into the
# pipeline config once the label set is known.
default_model_config = """
[model]
@architectures = "TorchEntityRecognizer.v1"
hidden_width = 48
dropout = 0.1
nO = null

[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = null
width = 96
depth = 4
embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
"""
# Only the [model] section is handed to the factory as its default config.
DEFAULT_MODEL = Config().from_str(default_model_config)["model"]
|
61 |
+
|
62 |
+
|
63 |
+
# NOTE(review): this registers the factory name "torch_ner"; presumably this
# debug module is not meant to be imported together with another module that
# registers the same name - confirm.
@Language.factory(
    "torch_ner",
    assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
    default_config={"model": DEFAULT_MODEL},
    default_score_weights={
        "ents_f": 1.0,
        "ents_p": 0.0,
        "ents_r": 0.0,
        "ents_per_type": None,
    },
)
def make_torch_entity_recognizer(nlp: Language, name: str, model: Model):
    """Factory for the PyTorch based Named Entity Recognition component.

    Debug variant: prints a trace line on entry.

    nlp (Language): The pipeline; only its vocab is used here.
    name (str): The component instance name.
    model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
        the tag probabilities. The output vectors should match the number of
        tags in size, and be normalized as probabilities (all scores between
        0 and 1, with the rows summing to 1).
    RETURNS (TorchEntityRecognizer): The constructed component.
    """
    print("Entered make_torch_entity_recognizer - ")
    component = TorchEntityRecognizer(nlp.vocab, model, name)
    return component
|
83 |
+
|
84 |
+
|
85 |
+
class TorchEntityRecognizer(TrainablePipe):
    """Pipeline component Named Entity Recognition using PyTorch.

    Debug variant: most methods print a trace line on entry.

    Entity labels are stored in ``self.cfg["labels"]`` and expanded by the
    ``labels`` property into an IOB tag set ("O", "B-<label>", "I-<label>").
    The position of a tag in that tuple is the class id exchanged with the
    model (see ``set_annotations`` and ``get_loss``).
    """

    def __init__(self, vocab: Vocab, model: Model, name: str = "torch_ner"):
        """Initialize the entity recognizer.

        vocab (Vocab): The shared vocabulary.
        model (thinc.api.Model): The Thinc Model powering the pipeline component.
        name (str): The component instance name, used to add entries to the
            losses during training.
        """
        print("Entered pipe TorchEntityRecognizer.__init__ - ")
        self.vocab = vocab
        self.model = model
        self.name = name
        cfg = {"labels": []}
        self.cfg = dict(sorted(cfg.items()))
        print(self.vocab,self.model,self.name,self.cfg)
        # NOTE(review): the two prints below assume the model has at least
        # two child layers - confirm for every architecture used with this pipe.
        print(self.model.layers[0].ref_names)
        print(self.model.layers[1].ref_names)
        print("Completed pipe TorchEntityRecognizer.__init__ - ")

    @property
    def labels(self) -> Tuple[str, ...]:
        """The IOB tags derived from the labels currently added.

        RETURNS (Tuple[str]): "O" followed by a "B-"/"I-" pair for every
            entity label, in the order the labels were added.
        """
        labels = ["O"]
        for label in self.cfg["labels"]:
            for iob in ["B", "I"]:
                labels.append(f"{iob}-{label}")
        return tuple(labels)

    def predict(self, docs: Iterable[Doc]) -> Iterable[Ints1d]:
        """Apply the pipeline's model to a batch of docs, without modifying them.

        docs (Iterable[Doc]): The documents to predict.
        RETURNS: The model's prediction for each document: one array of tag
            ids (indices into ``self.labels``) per doc.
        """
        print("Entered pipe TorchEntityRecognizer.predict - ")
        if isinstance(docs, Doc):
            # Same convenience as set_annotations: accept a bare Doc.
            docs = [docs]
        else:
            # The batch is iterated twice and measured with len() below, so a
            # one-shot iterable has to be materialised first.
            docs = list(docs)
        if not any(len(doc) for doc in docs):
            # Handle cases where there are no tokens in any docs.
            n_labels = len(self.labels)
            return [self.model.ops.alloc((0, n_labels)) for _ in docs]
        scores = self.model.predict(docs)
        assert len(scores) == len(docs), (len(scores), len(docs))
        guesses = []
        for doc_scores in scores:
            doc_guesses = doc_scores.argmax(axis=1)
            if not isinstance(doc_guesses, numpy.ndarray):
                # Presumably a device (e.g. CuPy) array; .get() is expected to
                # copy it to host memory - confirm against the ops in use.
                doc_guesses = doc_guesses.get()
            guesses.append(doc_guesses)
        return guesses

    def set_annotations(self, docs: Iterable[Doc], preds: Iterable[Ints1d]):
        """Modify a batch of documents, using pre-computed scores.

        docs (Iterable[Doc]): The documents to modify.
        preds (Iterable[Ints1d]): The IDs to set, produced by
            TorchEntityRecognizer.predict.
        """
        print("Entered pipe TorchEntityRecognizer.set_annotations - ")
        if isinstance(docs, Doc):
            docs = [docs]
        # `labels` is a property that rebuilds the tuple on every access;
        # resolve it once instead of once per token.
        tag_names = self.labels
        for doc, tag_ids in zip(docs, preds):
            biluo_tags = iob_to_biluo([tag_names[tag_id] for tag_id in tag_ids])
            try:
                spans = biluo_tags_to_spans(doc, biluo_tags)
            except ValueError:
                # Note:
                # biluo_tags_to_spans will raise an exception for an invalid
                # tag sequence. This could be fixed using a more complex
                # transition system (e.g. a Conditional Random Field model
                # head). Until then such a doc gets no entities.
                spans = []
            doc.ents = spans

    def update(
        self,
        examples: Iterable[Example],
        *,
        drop: float = 0.0,
        sgd: Optional[Optimizer] = None,
        losses: Optional[Dict[str, float]] = None,
    ) -> Dict[str, float]:
        """Learn from a batch of documents and gold-standard information,
        updating the pipe's model. Runs the model's forward pass, delegates
        the loss computation to get_loss and backpropagates the gradient.

        examples (Iterable[Example]): A batch of Example objects.
        drop (float): The dropout rate.
        sgd (thinc.api.Optimizer): The optimizer. If None (or False) the
            gradients are computed but no optimizer step is taken.
        losses (Dict[str, float]): Optional record of the loss during training.
            Updated using the component name as the key.
        RETURNS (Dict[str, float]): The updated losses dictionary.
        """
        print("Entered pipe TorchEntityRecognizer.update - ")
        if losses is None:
            losses = {}
        losses.setdefault(self.name, 0.0)
        validate_examples(examples, "TorchEntityRecognizer.update")
        if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
            # Handle cases where there are no tokens in any docs.
            return losses
        set_torch_dropout_rate(self.model, drop)
        tag_scores, bp_tag_scores = self.model.begin_update(
            [eg.predicted for eg in examples]
        )
        for sc in tag_scores:
            # Fail fast on NaN scores instead of backpropagating them.
            if self.model.ops.xp.isnan(sc.sum()):
                raise ValueError(Errors.E940)
        loss, d_tag_scores = self.get_loss(examples, tag_scores)
        bp_tag_scores(d_tag_scores)
        if sgd not in (None, False):
            self.finish_update(sgd)

        losses[self.name] += loss
        return losses

    def get_loss(
        self, examples: Iterable[Example], scores: Iterable[Floats2d]
    ) -> Tuple[float, List[Floats2d]]:
        """Find the loss and gradient of loss for the batch of documents and
        their predicted scores.

        examples (Iterable[Example]): The batch of examples.
        scores: Scores representing the model's predictions.
        RETURNS (Tuple[float, List[Floats2d]]): The loss and the gradient of
            the loss with respect to the scores.
        """
        print("Entered pipe TorchEntityRecognizer.get_loss - ")
        validate_examples(examples, "TorchEntityRecognizer.get_loss")
        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
        truths = []
        for eg in examples:
            # Gold tags are converted from BILUO to this component's IOB
            # scheme; empty-string tags are passed to the loss as None.
            eg_truths = [
                tag if tag != "" else None for tag in biluo_to_iob(eg.get_aligned_ner())
            ]
            truths.append(eg_truths)
        d_scores, loss = loss_func(scores, truths)
        if self.model.ops.xp.isnan(loss):
            raise ValueError(Errors.E910.format(name=self.name))
        return float(loss), d_scores

    def initialize(
        self,
        get_examples: Callable[[], Iterable[Example]],
        *,
        nlp: Optional[Language] = None,
        labels: Optional[List[str]] = None,
    ):
        """Initialize the pipe for training, using a representative set
        of data examples.

        get_examples (Callable[[], Iterable[Example]]): Function that
            returns a representative sample of gold-standard Example objects.
        nlp (Language): The current nlp object the component is part of. When
            given, the resolved output width is written back into its config.
        labels (Optional[List[str]]): The labels to add to the component,
            typically generated by the `init labels` command. If no labels are
            provided, the get_examples callback is used to extract the labels
            from the data.
        """
        print("Entered pipe TorchEntityRecognizer.initialize - ")
        validate_get_examples(get_examples, "TorchEntityRecognizer.initialize")
        if labels is not None:
            for tag in labels:
                self.add_label(tag)
        else:
            tags = set()
            for example in get_examples():
                for token in example.y:
                    if token.ent_type_:
                        tags.add(token.ent_type_)
            # Sorted so that the label order does not depend on set ordering.
            for tag in sorted(tags):
                self.add_label(tag)
        # A small sample of input docs is passed to the model's initialize.
        doc_sample = [example.x for example in islice(get_examples(), 10)]

        self._require_labels()
        assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
        # `nlp` is optional in the signature, so every access to its config
        # (including the debug prints) is guarded.
        if nlp is not None:
            print(nlp.config["components"][self.name]["model"]["nO"])
        self.model.initialize(X=doc_sample, Y=self.labels)
        print("self.model.initialize exit")
        print(self.model.name)
        print(self.model.layers[0].ref_names)
        print(self.model.layers[1].ref_names)
        print(self.name)
        if nlp is not None:
            # Record the resolved number of output classes in the config.
            nlp.config["components"][self.name]["model"]["nO"] = len(self.labels)
            print(nlp.config["components"][self.name]["model"])

    def add_label(self, label: str) -> int:
        """Add a new label to the pipe.

        label (str): The label to add.
        RETURNS (int): 0 if label is already present, otherwise 1.
        """
        print("Entered pipe TorchEntityRecognizer.add_label - ")
        if not isinstance(label, str):
            raise ValueError(Errors.E187)
        # Compare against the raw entity labels. `self.labels` holds the
        # expanded IOB tags ("B-X", "I-X"), so testing against it would never
        # recognise a label that was added before.
        if label in self.cfg["labels"]:
            return 0
        self._allow_extra_label()
        self.cfg["labels"].append(label)
        self.vocab.strings.add(label)
        return 1

    def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
        """Score a batch of examples.

        examples (Iterable[Example]): The examples to score.
        RETURNS (Dict[str, Any]): The NER precision, recall and f-scores.
        """
        print("Entered pipe TorchEntityRecognizer.score - ")
        validate_examples(examples, "TorchEntityRecognizer.score")
        return get_ner_prf(examples)
|
scripts/visualize_model.py
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import spacy_streamlit
|
2 |
+
import typer
|
3 |
+
from torch_ner_model import build_torch_ner_model
|
4 |
+
from torch_ner_pipe import make_torch_entity_recognizer
|
5 |
+
|
6 |
+
|
7 |
+
def main(models: str, default_text: str):
    """Open the spacy-streamlit NER visualizer.

    models (str): Comma-separated model names; whitespace around each entry
        is stripped before the list is passed on.
    default_text (str): Passed unchanged as the second argument of
        spacy_streamlit.visualize.
    """
    model_names = list(map(str.strip, models.split(",")))
    ner_labels = ["person", "problem", "pronoun", "test", "treatment"]
    spacy_streamlit.visualize(
        model_names, default_text, visualizers=["ner"], ner_labels=ner_labels
    )
|
13 |
+
|
14 |
+
|
15 |
+
if __name__ == "__main__":
    # The SystemExit coming out of typer.run is deliberately swallowed.
    # NOTE(review): presumably so that the CLI's normal exit does not surface
    # as an error when the script is run under Streamlit - confirm.
    try:
        typer.run(main)
    except SystemExit:
        pass
|