# NOTE(review): non-code scrape residue (Spaces header, runtime-error banner,
# file size, commit hash, and line-number gutter) removed from the top of file.
from collections import OrderedDict
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
import numpy
from thinc.api import (
Config,
Model,
set_dropout_rate,
SequenceCategoricalCrossentropy,
Optimizer,
)
from thinc.types import Ints1d, Floats2d
from itertools import islice
from spacy.tokens.doc import Doc
from spacy.vocab import Vocab
from spacy.training import Example
from spacy.training.iob_utils import biluo_tags_to_spans, biluo_to_iob, iob_to_biluo
from spacy.pipeline.trainable_pipe import TrainablePipe
from spacy.pipeline.pipe import deserialize_config
from spacy.language import Language
from spacy.attrs import POS, ID
from spacy.parts_of_speech import X
from spacy.errors import Errors
from spacy.scorer import get_ner_prf
from spacy.training import validate_examples, validate_get_examples
from spacy import util
def set_torch_dropout_rate(model: Model, dropout_rate: float):
    """Set the dropout rate on a Thinc model and its wrapped PyTorch model.

    model (Model): Thinc Model holding a "torch_model" reference whose
        attrs expose a "set_dropout_rate" callable.
    dropout_rate (float): The dropout rate to apply.
    """
    set_dropout_rate(model, dropout_rate)
    # Thinc's set_dropout_rate does not reach inside the wrapped PyTorch
    # module, so forward the rate through the hook stored on the reference.
    func = model.get_ref("torch_model").attrs["set_dropout_rate"]
    func(dropout_rate)
# Default architecture config for the "torch_ner" factory below.
# "TorchEntityRecognizer.v1" is a custom registered architecture (defined
# elsewhere in this project) that stacks a PyTorch head on a spaCy
# tok2vec embedding layer; nO (output label count) is left null and is
# resolved during TorchEntityRecognizer.initialize.
default_model_config = """
[model]
@architectures = "TorchEntityRecognizer.v1"
hidden_width = 48
dropout = 0.1
nO = null
[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = null
width = 96
depth = 4
embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
"""
# Parsed [model] section, used as the factory's default `model` argument.
DEFAULT_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory(
    "torch_ner",
    assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
    default_config={"model": DEFAULT_MODEL},
    default_score_weights={
        "ents_f": 1.0,
        "ents_p": 0.0,
        "ents_r": 0.0,
        "ents_per_type": None,
    },
)
def make_torch_entity_recognizer(nlp: Language, name: str, model: Model):
    """Construct a PyTorch-based Named Entity Recognition pipeline component.

    nlp (Language): The current nlp object the component is added to.
    name (str): The component instance name.
    model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
        the tag probabilities. The output vectors should match the number of
        tags in size, and be normalized as probabilities (all scores between
        0 and 1, with the rows summing to 1).
    RETURNS (TorchEntityRecognizer): The constructed pipe.
    """
    return TorchEntityRecognizer(nlp.vocab, model, name)
class TorchEntityRecognizer(TrainablePipe):
    """Pipeline component for Named Entity Recognition using PyTorch."""

    def __init__(self, vocab: Vocab, model: Model, name: str = "torch_ner"):
        """Initialize the entity recognizer.

        vocab (Vocab): The shared vocabulary.
        model (thinc.api.Model): The Thinc Model powering the pipeline component.
        name (str): The component instance name, used to add entries to the
            losses during training.
        """
        self.vocab = vocab
        self.model = model
        self.name = name
        # Entity types live in self.cfg so they serialize with the pipe;
        # the IOB expansion happens on demand in the `labels` property.
        self.cfg: Dict[str, Any] = {"labels": []}

    @property
    def labels(self) -> Tuple[str, ...]:
        """The labels currently added to the component, expanded to IOB tags.

        RETURNS (Tuple[str]): "O" plus a "B-" and "I-" variant per entity type.
        """
        labels = ["O"]
        for label in self.cfg["labels"]:
            for iob in ("B", "I"):
                labels.append(f"{iob}-{label}")
        return tuple(labels)

    def predict(self, docs: Iterable[Doc]) -> Iterable[Ints1d]:
        """Apply the pipeline's model to a batch of docs, without modifying them.

        docs (Iterable[Doc]): The documents to predict.
        RETURNS: Per-document arrays of predicted tag IDs.
        """
        if not any(len(doc) for doc in docs):
            # Handle cases where there are no tokens in any docs: emit empty
            # score arrays so downstream zips still line up with `docs`.
            n_labels = len(self.labels)
            guesses = [self.model.ops.alloc((0, n_labels)) for _ in docs]
            assert len(guesses) == len(docs)
            return guesses
        scores = self.model.predict(docs)
        assert len(scores) == len(docs), (len(scores), len(docs))
        guesses = []
        for doc_scores in scores:
            doc_guesses = doc_scores.argmax(axis=1)
            # A GPU (cupy) array has no cheap host-side indexing; copy it
            # back to a numpy array before use.
            if not isinstance(doc_guesses, numpy.ndarray):
                doc_guesses = doc_guesses.get()
            guesses.append(doc_guesses)
        assert len(guesses) == len(docs)
        return guesses

    def set_annotations(self, docs: Iterable[Doc], preds: Iterable[Ints1d]):
        """Modify a batch of documents, using pre-computed scores.

        docs (Iterable[Doc]): The documents to modify.
        preds (Iterable[Ints1d]): The IDs to set, produced by
            TorchEntityRecognizer.predict.
        """
        if isinstance(docs, Doc):
            docs = [docs]
        for doc, tag_ids in zip(docs, preds):
            labels = iob_to_biluo([self.labels[tag_id] for tag_id in tag_ids])
            try:
                spans = biluo_tags_to_spans(doc, labels)
            except ValueError:
                # biluo_tags_to_spans raises for an invalid tag sequence;
                # this could be fixed with a more complex transition system
                # (e.g. a Conditional Random Field model head).
                spans = []
            doc.ents = spans

    def update(
        self,
        examples: Iterable[Example],
        *,
        drop: float = 0.0,
        sgd: Optional[Optimizer] = None,
        losses: Optional[Dict[str, float]] = None,
    ) -> Dict[str, float]:
        """Learn from a batch of documents and gold-standard information,
        updating the pipe's model. Delegates to predict and get_loss.

        examples (Iterable[Example]): A batch of Example objects.
        drop (float): The dropout rate.
        sgd (thinc.api.Optimizer): The optimizer.
        losses (Dict[str, float]): Optional record of the loss during training.
            Updated using the component name as the key.
        RETURNS (Dict[str, float]): The updated losses dictionary.
        """
        if losses is None:
            losses = {}
        losses.setdefault(self.name, 0.0)
        validate_examples(examples, "TorchEntityRecognizer.update")
        if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
            # Handle cases where there are no tokens in any docs.
            return losses
        # Propagate dropout into the wrapped PyTorch module as well.
        set_torch_dropout_rate(self.model, drop)
        tag_scores, bp_tag_scores = self.model.begin_update(
            [eg.predicted for eg in examples]
        )
        for sc in tag_scores:
            if self.model.ops.xp.isnan(sc.sum()):
                raise ValueError(Errors.E940)
        loss, d_tag_scores = self.get_loss(examples, tag_scores)
        bp_tag_scores(d_tag_scores)
        if sgd not in (None, False):
            self.finish_update(sgd)
        losses[self.name] += loss
        return losses

    def get_loss(
        self, examples: Iterable[Example], scores: Iterable[Floats2d]
    ) -> Tuple[float, Floats2d]:
        """Find the loss and gradient of loss for the batch of documents and
        their predicted scores.

        examples (Iterable[Example]): The batch of examples.
        scores: Scores representing the model's predictions.
        RETURNS (Tuple[float, Floats2d]): The loss and the gradient.
        """
        validate_examples(examples, "TorchEntityRecognizer.get_loss")
        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
        truths = []
        for eg in examples:
            # Unaligned tokens come back as "" — map them to None so the
            # loss function skips them.
            eg_truths = [
                tag if tag != "" else None for tag in biluo_to_iob(eg.get_aligned_ner())
            ]
            truths.append(eg_truths)
        d_scores, loss = loss_func(scores, truths)
        if self.model.ops.xp.isnan(loss):
            raise ValueError(Errors.E910.format(name=self.name))
        return float(loss), d_scores

    def initialize(
        self,
        get_examples: Callable[[], Iterable[Example]],
        *,
        nlp: Optional[Language] = None,
        labels: Optional[List[str]] = None,
    ):
        """Initialize the pipe for training, using a representative set
        of data examples.

        get_examples (Callable[[], Iterable[Example]]): Function that
            returns a representative sample of gold-standard Example objects.
        nlp (Language): The current nlp object the component is part of.
        labels (Optional[List[str]]): The labels to add to the component,
            typically generated by the `init labels` command. If no labels
            are provided, the get_examples callback is used to extract the
            labels from the data.
        """
        validate_get_examples(get_examples, "TorchEntityRecognizer.initialize")
        if labels is not None:
            for tag in labels:
                self.add_label(tag)
        else:
            tags = set()
            for example in get_examples():
                for token in example.y:
                    if token.ent_type_:
                        tags.add(token.ent_type_)
            for tag in sorted(tags):
                self.add_label(tag)
        doc_sample = []
        for example in islice(get_examples(), 10):
            doc_sample.append(example.x)
        self._require_labels()
        assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
        self.model.initialize(X=doc_sample, Y=self.labels)
        # Record the resolved output width in the config so the model can be
        # reconstructed on deserialization. `nlp` is optional, so guard it.
        if nlp is not None:
            nlp.config["components"][self.name]["model"]["nO"] = len(self.labels)

    def add_label(self, label: str) -> int:
        """Add a new label to the pipe.

        label (str): The label to add.
        RETURNS (int): 0 if label is already present, otherwise 1.
        """
        if not isinstance(label, str):
            raise ValueError(Errors.E187)
        if label in self.labels:
            return 0
        self._allow_extra_label()
        self.cfg["labels"].append(label)
        self.vocab.strings.add(label)
        return 1

    def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
        """Score a batch of examples.

        examples (Iterable[Example]): The examples to score.
        RETURNS (Dict[str, Any]): The NER precision, recall and f-scores.
        """
        validate_examples(examples, "TorchEntityRecognizer.score")
        return get_ner_prf(examples)