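"""spaCy pipeline component integration for the Berkeley Neural Parser (benepar)."""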
import numpy as np
import torch

from .downloader import load_trained_model
from ..parse_base import BaseParser, BaseInputExample
from .spacy_extensions import ConstituentData, NonConstituentException
class PartialConstituentData:
    """Accumulates per-sentence parse output before it is merged into a single
    ConstituentData structure covering the whole Doc."""

    def __init__(self):
        self.starts = [np.array([], dtype=int)]
        self.ends = [np.array([], dtype=int)]
        self.labels = [np.array([], dtype=int)]
    def finalize(self, doc, label_vocab):
        """Concatenate the accumulated per-sentence arrays and build a lookup
        table from token position to constituent index."""
self.starts = np.hstack(self.starts)
self.ends = np.hstack(self.ends)
self.labels = np.hstack(self.labels)
# TODO(nikita): Python for loops aren't very fast
loc_to_constituent = np.full(len(doc), -1, dtype=int)
prev = None
for position in range(self.starts.shape[0]):
if self.starts[position] != prev:
prev = self.starts[position]
loc_to_constituent[self.starts[position]] = position
return ConstituentData(
self.starts, self.ends, self.labels, loc_to_constituent, label_vocab
)
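# Illustrative note (derived from the code above): in the finalized
# ConstituentData, starts/ends are token offsets into the whole Doc, labels
# index into label_vocab, and loc_to_constituent[i] is the index of the first
# constituent starting at token i (or -1 if none starts there). The extension
# attributes defined in spacy_extensions read this structure back from
# doc._._constituent_data.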
class SentenceWrapper(BaseInputExample):
    """Adapts a spaCy sentence (a Span from doc.sents) to the BaseInputExample
    interface expected by the parser."""

    # Normalize curly quotes, guillemets, and dashes to plain ASCII equivalents
    # before the words are passed to the parser.
    TEXT_NORMALIZATION_MAPPING = {
        "`": "'",
        "«": '"',
        "»": '"',
        "‘": "'",
        "’": "'",
        "“": '"',
        "”": '"',
        "„": '"',
        "‹": "'",
        "›": "'",
        "—": "--",  # em dash
    }
def __init__(self, spacy_sent):
self.sent = spacy_sent
@property
def words(self):
return [
self.TEXT_NORMALIZATION_MAPPING.get(token.text, token.text)
for token in self.sent
]
@property
def space_after(self):
return [bool(token.whitespace_) for token in self.sent]
@property
def tree(self):
return None
def leaves(self):
return self.words
def pos(self):
return [(word, "UNK") for word in self.words]
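# Illustrative example (hypothetical; exact tokenization depends on the loaded
# spaCy model):
#
#   >>> sent = list(nlp("“Hello” world").sents)[0]
#   >>> SentenceWrapper(sent).words   # curly quotes normalized to ASCII
#   ['"', 'Hello', '"', 'world']
#   >>> SentenceWrapper(sent).space_after
#   [False, False, True, False]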
class BeneparComponent:
"""
Berkeley Neural Parser (benepar) component for spaCy.
Sample usage:
>>> nlp = spacy.load('en_core_web_md')
>>> if spacy.__version__.startswith('2'):
nlp.add_pipe(BeneparComponent("benepar_en3"))
else:
nlp.add_pipe("benepar", config={"model": "benepar_en3"})
>>> doc = nlp("The quick brown fox jumps over the lazy dog.")
>>> sent = list(doc.sents)[0]
>>> print(sent._.parse_string)
This component is only responsible for constituency parsing and (for some
trained models) part-of-speech tagging. It should be preceded in the
pipeline by other components that can, at minimum, perform tokenization and
sentence segmentation.
"""
name = "benepar"
def __init__(
self,
name,
subbatch_max_tokens=500,
disable_tagger=False,
batch_size="ignored",
):
"""Load a trained parser model.
Args:
name (str): Model name, or path to pytorch saved model
subbatch_max_tokens (int): Maximum number of tokens to process in
each batch
disable_tagger (bool, default False): Unless disabled, the parser
will set predicted part-of-speech tags for the document,
overwriting any existing tags provided by spaCy models or
previous pipeline steps. This option has no effect for parser
models that do not have a part-of-speech tagger built in.
batch_size: deprecated and ignored; use subbatch_max_tokens instead
"""
self._parser = load_trained_model(name)
if torch.cuda.is_available():
self._parser.cuda()
self.subbatch_max_tokens = subbatch_max_tokens
self.disable_tagger = disable_tagger
        # Build an inverse table mapping label indices back to label tuples.
        # Entries containing "::" appear to encode collapsed unary chains, so
        # they are split back into their component labels; the empty label maps
        # to an empty tuple.
        self._label_vocab = self._parser.config["label_vocab"]
        label_vocab_size = max(self._label_vocab.values()) + 1
        self._label_from_index = [()] * label_vocab_size
        for label, i in self._label_vocab.items():
            if label:
                self._label_from_index[i] = tuple(label.split("::"))
            else:
                self._label_from_index[i] = ()
        self._label_from_index = tuple(self._label_from_index)

        if not self.disable_tagger:
            # Same inverse-table construction for part-of-speech tags, if the
            # loaded model includes a tagger.
            tag_vocab = self._parser.config["tag_vocab"]
            tag_vocab_size = max(tag_vocab.values()) + 1
            self._tag_from_index = [()] * tag_vocab_size
            for tag, i in tag_vocab.items():
                self._tag_from_index[i] = tag
            self._tag_from_index = tuple(self._tag_from_index)
        else:
            self._tag_from_index = None
def __call__(self, doc):
"""Update the input document with predicted constituency parses."""
# TODO(https://github.com/nikitakit/self-attentive-parser/issues/16): handle
# tokens that consist entirely of whitespace.
constituent_data = PartialConstituentData()
wrapped_sents = [SentenceWrapper(sent) for sent in doc.sents]
for sent, parse in zip(
doc.sents,
self._parser.parse(
wrapped_sents,
return_compressed=True,
subbatch_max_tokens=self.subbatch_max_tokens,
),
):
            # Shift sentence-relative token offsets to Doc-level offsets.
            constituent_data.starts.append(parse.starts + sent.start)
            constituent_data.ends.append(parse.ends + sent.start)
            constituent_data.labels.append(parse.labels)
if parse.tags is not None and not self.disable_tagger:
for i, tag_id in enumerate(parse.tags):
sent[i].tag_ = self._tag_from_index[tag_id]
doc._._constituent_data = constituent_data.finalize(doc, self._label_from_index)
return doc
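# After the component runs, parse results can be read back through the
# extension attributes registered in spacy_extensions, e.g. the
# sent._.parse_string attribute shown in the BeneparComponent docstring.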
def create_benepar_component(
    nlp,
    name,
    model: str,
    subbatch_max_tokens: int,
    disable_tagger: bool,
):
    """Factory invoked by spaCy 3 when "benepar" is added to a pipeline via
    nlp.add_pipe; nlp and name are supplied by spaCy and unused here."""
return BeneparComponent(
model,
subbatch_max_tokens=subbatch_max_tokens,
disable_tagger=disable_tagger,
)
def register_benepar_component_factory():
# Starting with spaCy 3.0, nlp.add_pipe no longer directly accepts
# BeneparComponent instances. We must instead register a component factory.
import spacy
if spacy.__version__.startswith("2"):
return
from spacy.language import Language
Language.factory(
"benepar",
default_config={
"subbatch_max_tokens": 500,
"disable_tagger": False,
},
func=create_benepar_component,
)
try:
register_benepar_component_factory()
except ImportError:
pass
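if __name__ == "__main__":
    # Minimal usage sketch, mirroring the BeneparComponent docstring. Assumes
    # spaCy >= 3 with the en_core_web_md pipeline installed and the benepar_en3
    # model already downloaded.
    import spacy

    nlp = spacy.load("en_core_web_md")
    nlp.add_pipe("benepar", config={"model": "benepar_en3"})
    doc = nlp("The quick brown fox jumps over the lazy dog.")
    sent = list(doc.sents)[0]
    print(sent._.parse_string)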