from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
from itertools import islice

import numpy
from thinc.api import (
    Config,
    Model,
    set_dropout_rate,
    SequenceCategoricalCrossentropy,
    Optimizer,
)
from thinc.types import Ints1d, Floats2d

from spacy.tokens.doc import Doc
from spacy.vocab import Vocab
from spacy.training import Example, validate_examples, validate_get_examples
from spacy.training.iob_utils import biluo_tags_to_spans, biluo_to_iob, iob_to_biluo
from spacy.pipeline.trainable_pipe import TrainablePipe
from spacy.language import Language
from spacy.errors import Errors
from spacy.scorer import get_ner_prf


def set_torch_dropout_rate(model: Model, dropout_rate: float):
    """Set dropout rate for Thinc and wrapped PyTorch models

    Args:
        model (Model): Thinc Model (with PyTorch sub-modules)
        dropout_rate (float): Dropout rate
    """
    set_dropout_rate(model, dropout_rate)
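    # The wrapped architecture is assumed to expose its PyTorch module under the
    # "torch_model" ref and to store a "set_dropout_rate" callable in that layer's
    # attrs (wired up in this project's model-building code).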
    func = model.get_ref("torch_model").attrs["set_dropout_rate"]
    func(dropout_rate)


default_model_config = """
[model]
@architectures = "TorchEntityRecognizer.v1"
hidden_width = 48
dropout = 0.1
nO = null

[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = null
width = 96
depth = 4
embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
"""
DEFAULT_MODEL = Config().from_str(default_model_config)["model"]

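# The defaults above can be overridden when adding the component to a pipeline.
# A sketch (partial overrides are merged into the default config; "hidden_width"
# and "dropout" are parameters of the "TorchEntityRecognizer.v1" architecture,
# which is assumed to be registered elsewhere in this project):
#
#     nlp.add_pipe("torch_ner", config={"model": {"hidden_width": 64, "dropout": 0.2}})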

@Language.factory(
    "torch_ner",
    assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
    default_config={"model": DEFAULT_MODEL},
    default_score_weights={
        "ents_f": 1.0,
        "ents_p": 0.0,
        "ents_r": 0.0,
        "ents_per_type": None,
    },
)
def make_torch_entity_recognizer(nlp: Language, name: str, model: Model):
    """Construct a PyTorch based Named Entity Recognition model
    model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
        the tag probabilities. The output vectors should match the number of tags
        in size, and be normalized as probabilities (all scores between 0 and 1,
        with the rows summing to 1).
    """
    return TorchEntityRecognizer(nlp.vocab, model, name)


class TorchEntityRecognizer(TrainablePipe):
    """Pipeline component Named Entity Recognition using PyTorch"""

    def __init__(self, vocab: Vocab, model: Model, name: str = "torch_ner"):
        """Initialize a part-of-speech tagger.
        vocab (Vocab): The shared vocabulary.
        model (thinc.api.Model): The Thinc Model powering the pipeline component.
        name (str): The component instance name, used to add entries to the
            losses during training.
        """
        self.vocab = vocab
        self.model = model
        self.name = name
        cfg = {"labels": []}
        self.cfg = dict(sorted(cfg.items()))

    @property
    def labels(self) -> Tuple[str, ...]:
        """The labels currently added to the component.
        RETURNS (Tuple[str]): The labels.
        """
        labels = ["O"]
        for label in self.cfg["labels"]:
            for iob in ["B", "I"]:
                labels.append(f"{iob}-{label}")
        return tuple(labels)

    def predict(self, docs: Iterable[Doc]) -> Iterable[Ints1d]:
        """Apply the pipeline's model to a batch of docs, without modifying them.
        docs (Iterable[Doc]): The documents to predict.
        RETURNS (Iterable[Ints1d]): The model's predictions for each document.
        """
        if not any(len(doc) for doc in docs):
            # Handle cases where there are no tokens in any docs.
            n_labels = len(self.labels)
            guesses = [self.model.ops.alloc((0, n_labels)) for doc in docs]
            assert len(guesses) == len(docs)
            return guesses
        scores = self.model.predict(docs)

        assert len(scores) == len(docs), (len(scores), len(docs))
        guesses = []
        for doc_scores in scores:
            doc_guesses = doc_scores.argmax(axis=1)
            if not isinstance(doc_guesses, numpy.ndarray):
                doc_guesses = doc_guesses.get()
            guesses.append(doc_guesses)
        assert len(guesses) == len(docs)
        return guesses

    def set_annotations(self, docs: Iterable[Doc], preds: Iterable[Ints1d]):
        """Modify a batch of documents, using pre-computed scores.
        docs (Iterable[Doc]): The documents to modify.
        preds (Iterable[Ints1d]): The IDs to set, produced by TorchEntityRecognizer.predict.
        """
        if isinstance(docs, Doc):
            docs = [docs]
        for doc, tag_ids in zip(docs, preds):
            labels = iob_to_biluo([self.labels[tag_id] for tag_id in tag_ids])
            try:
                spans = biluo_tags_to_spans(doc, labels)
            except ValueError:
                # Note:
                # biluo_tags_to_spans will raise an exception for an invalid tag sequence
                # this could be fixed using a more complex transition system
                # (e.g. a Conditional Random Field model head)
                spans = []
            doc.ents = spans

    def update(
        self,
        examples: Iterable[Example],
        *,
        drop: float = 0.0,
        sgd: Optional[Optimizer] = None,
        losses: Optional[Dict[str, float]] = None,
    ) -> Dict[str, float]:
        """Learn from a batch of documents and gold-standard information,
        updating the pipe's model. Delegates to predict and get_loss.
        examples (Iterable[Example]): A batch of Example objects.
        drop (float): The dropout rate.
        sgd (thinc.api.Optimizer): The optimizer.
        losses (Dict[str, float]): Optional record of the loss during training.
            Updated using the component name as the key.
        RETURNS (Dict[str, float]): The updated losses dictionary.
        """
        if losses is None:
            losses = {}
        losses.setdefault(self.name, 0.0)
        validate_examples(examples, "TorchEntityRecognizer.update")
        if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
            # Handle cases where there are no tokens in any docs.
            return losses
        set_torch_dropout_rate(self.model, drop)
        tag_scores, bp_tag_scores = self.model.begin_update(
            [eg.predicted for eg in examples]
        )
        for sc in tag_scores:
            if self.model.ops.xp.isnan(sc.sum()):
                raise ValueError(Errors.E940)
        loss, d_tag_scores = self.get_loss(examples, tag_scores)
        bp_tag_scores(d_tag_scores)
        if sgd not in (None, False):
            self.finish_update(sgd)

        losses[self.name] += loss
        return losses

    def get_loss(
        self, examples: Iterable[Example], scores: Iterable[Floats2d]
    ) -> Tuple[float, List[Floats2d]]:
        """Find the loss and gradient of loss for the batch of documents and
        their predicted scores.
        examples (Iterable[Example]): The batch of examples.
        scores: Scores representing the model's predictions.
        RETURNS (Tuple[float, List[Floats2d]]): The loss and the gradient of the scores.
        """
        validate_examples(examples, "TorchEntityRecognizer.get_loss")
        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
        truths = []
        for eg in examples:
            eg_truths = [
                tag if tag != "" else None for tag in biluo_to_iob(eg.get_aligned_ner())
            ]
            truths.append(eg_truths)
        d_scores, loss = loss_func(scores, truths)
        if self.model.ops.xp.isnan(loss):
            raise ValueError(Errors.E910.format(name=self.name))
        return float(loss), d_scores

    def initialize(
        self,
        get_examples: Callable[[], Iterable[Example]],
        *,
        nlp: Optional[Language] = None,
        labels: Optional[List[str]] = None,
    ):
        """Initialize the pipe for training, using a representative set
        of data examples.
        get_examples (Callable[[], Iterable[Example]]): Function that
            returns a representative sample of gold-standard Example objects.
        nlp (Language): The current nlp object the component is part of.
        labels (Optional[List[str]]): The labels to add to the component, typically generated by the
            `init labels` command. If no labels are provided, the get_examples
            callback is used to extract the labels from the data.
        """
        validate_get_examples(get_examples, "TorchEntityRecognizer.initialize")
        if labels is not None:
            for tag in labels:
                self.add_label(tag)
        else:
            tags = set()
            for example in get_examples():
                for token in example.y:
                    if token.ent_type_:
                        tags.add(token.ent_type_)
            for tag in sorted(tags):
                self.add_label(tag)
        doc_sample = []
        for example in islice(get_examples(), 10):
            doc_sample.append(example.x)

        self._require_labels()
        assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
        self.model.initialize(X=doc_sample, Y=self.labels)
        nlp.config["components"][self.name]["model"]["nO"] = len(self.labels)
        if self.model.layers[0].maybe_get_ref("listener") != None :
            nlp.config["components"][self.name]["model"]["width"] = self.model.layers[0].maybe_get_ref("listener").maybe_get_dim("nO")

    def add_label(self, label: str) -> int:
        """Add a new label to the pipe.
        label (str): The label to add.
        RETURNS (int): 0 if label is already present, otherwise 1.
        """
        if not isinstance(label, str):
            raise ValueError(Errors.E187)
        if label in self.labels:
            return 0
        self._allow_extra_label()
        self.cfg["labels"].append(label)
        self.vocab.strings.add(label)
        return 1

    def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
        """Score a batch of examples.
        examples (Iterable[Example]): The examples to score.
        RETURNS (Dict[str, Any]): The NER precision, recall and f-scores.
        """
        validate_examples(examples, "TorchEntityRecognizer.score")
        return get_ner_prf(examples)
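

# Minimal end-to-end sketch (kept as a comment; it assumes the
# "TorchEntityRecognizer.v1" architecture is registered, e.g. by importing this
# project's model-building module first):
#
#     import spacy
#     from spacy.training import Example
#
#     nlp = spacy.blank("en")
#     ner = nlp.add_pipe("torch_ner")
#     train_data = [
#         (
#             "Apple is looking at buying U.K. startup",
#             {"entities": [(0, 5, "ORG"), (27, 31, "GPE")]},
#         )
#     ]
#     examples = [
#         Example.from_dict(nlp.make_doc(text), annots) for text, annots in train_data
#     ]
#     optimizer = nlp.initialize(lambda: examples)
#     losses = {}
#     ner.update(examples, sgd=optimizer, losses=losses)
#     doc = nlp("Apple is looking at buying U.K. startup")
#     print([(ent.text, ent.label_) for ent in doc.ents])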