Spaces:
Build error
Build error
| import spacy | |
| from spacy.lang.char_classes import LIST_ELLIPSES | |
| from spacy.lang.char_classes import HYPHENS, LIST_ICONS | |
| from spacy.lang.char_classes import ( | |
| ALPHA, | |
| ALPHA_LOWER, | |
| ALPHA_UPPER, | |
| CONCAT_QUOTES, | |
| PUNCT, | |
| ) | |
| from spacy.util import compile_infix_regex, registry | |
# Infix patterns handed to ``compile_infix_regex`` when the references
# tokenizer is built: the imported ellipsis and icon patterns come first,
# followed by the separator patterns listed below (order preserved).
infixes = [
    *LIST_ELLIPSES,
    *LIST_ICONS,
    # One of + - * ^ preceded by a digit and followed by a digit or "-".
    r"(?<=[0-9])[+\-\*^](?=[0-9-])",
    # "." between an ALPHA_LOWER/quote character and an ALPHA_UPPER/quote
    # character.
    r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
        al=ALPHA_LOWER,
        au=ALPHA_UPPER,
        q=CONCAT_QUOTES,
    ),
    # "," between two ALPHA characters.
    r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
    # Any HYPHENS alternative between two ALPHA characters.
    r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
    # One of : < > = / after an ALPHA character or digit, before ALPHA.
    r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
    # Closing bracket / parenthesis / period glued to the following name:
    #   [10]R. Nakamura and T. Kenzaka
    #   (10)R. Nakamura and T. Kenzaka
    #   10.R. Nakamura and T. Kenzaka
    r"(?<=[{a}0-9])[:<>\]\)\.](?=[{a}])".format(a=ALPHA),
    # Zero-width boundary between a digit and a following ALPHA character:
    #   10R. Nakamura and T. Kenzaka
    r"(?<=[0-9])(?=[{a}])".format(a=ALPHA),
]
def create_references_tokenizer():
    """Return a factory that builds the references tokenizer.

    The returned callable takes an ``nlp`` object, builds the tokenizer
    registered as ``spacy.Tokenizer.v1`` for it, and replaces that
    tokenizer's ``infix_finditer`` with one compiled from the module-level
    ``infixes`` patterns.
    """

    def create_tokenizer(nlp):
        """Build the tokenizer for ``nlp``, print sample splits, return it."""
        # Resolve the registered tokenizer factory from a minimal config.
        make_default = registry.resolve(
            {"tokenizer": {"@tokenizers": "spacy.Tokenizer.v1"}}
        )["tokenizer"]
        tokenizer = make_default(nlp)

        # Swap in the custom infix matcher.
        tokenizer.infix_finditer = compile_infix_regex(infixes).finditer

        # Print how each sample reference string is tokenized.
        samples = (
            "[10]R. Nakamura and T. Kenzaka",
            "(10)R. Nakamura and T. Kenzaka",
            "10.R. Nakamura and T. Kenzaka",
            "10R. Nakamura and T. Kenzaka",
        )
        for sample in samples:
            print(sample, "->", list(tokenizer(sample)))

        return tokenizer

    return create_tokenizer