import spacy
from spacy.lang.char_classes import (
    ALPHA,
    ALPHA_LOWER,
    ALPHA_UPPER,
    CONCAT_QUOTES,
    HYPHENS,
    LIST_ELLIPSES,
    LIST_ICONS,
)
from spacy.util import compile_infix_regex, registry
# Default-style infix patterns plus custom rules for tokenizing reference
# lists, where the entry number is often fused to the first author initial.
infixes = (
    LIST_ELLIPSES
    + LIST_ICONS
    + [
        # Arithmetic operators between digits, e.g. "1+2".
        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
        # A period between a lowercase letter/quote and an uppercase
        # letter/quote, e.g. "end.Start".
        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
        ),
        # A comma between letters, e.g. "Nakamura,Kenzaka".
        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
        # Hyphens between letters, e.g. "state-of-the-art".
        r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
        # Closing bracket/paren/period after the entry number:
        #   [10]R. Nakamura and T. Kenzaka
        #   (10)R. Nakamura and T. Kenzaka
        #   10.R. Nakamura and T. Kenzaka
        r"(?<=[{a}0-9])[:<>\]\)\.](?=[{a}])".format(a=ALPHA),
        # A bare number fused to a letter:
        #   10R. Nakamura and T. Kenzaka
        r"(?<=[0-9])(?=[{a}])".format(a=ALPHA),
    ]
)
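
# Quick standalone check (an illustrative addition, not in the original file):
# the reference-numbering rule should find a split point at the "]" in "[10]R.".
import re

assert re.search(r"(?<=[{a}0-9])[:<>\]\)\.](?=[{a}])".format(a=ALPHA), "[10]R.")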
@spacy.registry.tokenizers("references_tokenizer")
def create_references_tokenizer():
    """Register a tokenizer factory that extends the default infix rules."""

    def create_tokenizer(nlp):
        # Build the stock spaCy tokenizer, then swap in the extended infixes.
        default_tokenizer_cfg = {"tokenizer": {"@tokenizers": "spacy.Tokenizer.v1"}}
        create_default_tokenizer = registry.resolve(default_tokenizer_cfg)["tokenizer"]
        tokenizer = create_default_tokenizer(nlp)
        infix_re = compile_infix_regex(infixes)
        tokenizer.infix_finditer = infix_re.finditer
        # Smoke test: the entry number should be split from the author initial.
        for s in [
            "[10]R. Nakamura and T. Kenzaka",
            "(10)R. Nakamura and T. Kenzaka",
            "10.R. Nakamura and T. Kenzaka",
            "10R. Nakamura and T. Kenzaka",
        ]:
            print(s, "->", [t.text for t in tokenizer(s)])
        return tokenizer

    return create_tokenizer
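

# Minimal usage sketch (an assumption about how the factory is consumed; the
# original file only registers it). The registered name can be referenced from
# a spaCy config:
#
#     [nlp.tokenizer]
#     @tokenizers = "references_tokenizer"
#
# or attached to a blank pipeline directly:
if __name__ == "__main__":
    nlp = spacy.blank("en")
    nlp.tokenizer = create_references_tokenizer()(nlp)
    print([t.text for t in nlp("[10]R. Nakamura and T. Kenzaka")])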