import spacy
from spacy.lang.char_classes import (
    ALPHA,
    ALPHA_LOWER,
    ALPHA_UPPER,
    CONCAT_QUOTES,
    HYPHENS,
    LIST_ELLIPSES,
    LIST_ICONS,
)
from spacy.util import compile_infix_regex, registry

# spaCy's default infix patterns, extended with rules that split reference
# numbering such as "[10]", "(10)" or "10." away from the text that follows it.
infixes = (
    LIST_ELLIPSES
    + LIST_ICONS
    + [
        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
        ),
        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
        r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
        # [10]R. Nakamura and T. Kenzaka
        # (10)R. Nakamura and T. Kenzaka
        # 10.R. Nakamura and T. Kenzaka
        r"(?<=[{a}0-9])[:<>\]\)\.](?=[{a}])".format(a=ALPHA),
        # 10R. Nakamura and T. Kenzaka
        r"(?<=[0-9])(?=[{a}])".format(a=ALPHA),
    ]
)


@spacy.registry.tokenizers("references_tokenizer")
def create_references_tokenizer():
    def create_tokenizer(nlp):
        # Build spaCy's default tokenizer through the registry, then swap in
        # the extended infix rules defined above.
        default_tokenizer_cfg = {"tokenizer": {"@tokenizers": "spacy.Tokenizer.v1"}}

        create_default_tokenizer = registry.resolve(default_tokenizer_cfg)["tokenizer"]
        tokenizer = create_default_tokenizer(nlp)

        infix_re = compile_infix_regex(infixes)
        tokenizer.infix_finditer = infix_re.finditer

        # Quick sanity check: print how typical reference lines are tokenized.
        for s in [
            "[10]R. Nakamura and T. Kenzaka",
            "(10)R. Nakamura and T. Kenzaka",
            "10.R. Nakamura and T. Kenzaka",
            "10R. Nakamura and T. Kenzaka",
        ]:
            print(s, "->", [t.text for t in tokenizer(s)])

        return tokenizer

    return create_tokenizer
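

# A minimal usage sketch (assuming spaCy v3.x, where spacy.blank accepts a
# config override): build a blank English pipeline whose tokenizer is the
# "references_tokenizer" registered above and print how a reference line
# is split.
if __name__ == "__main__":
    config = {"nlp": {"tokenizer": {"@tokenizers": "references_tokenizer"}}}
    nlp = spacy.blank("en", config=config)
    doc = nlp("[10]R. Nakamura and T. Kenzaka")
    # The reference marker "[10]" should now be separated from the initial "R."
    print([t.text for t in doc])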