import re
from spacy.tokenizer import Tokenizer
from spacy.util import compile_infix_regex, compile_prefix_regex, compile_suffix_regex, registry
from spacy.symbols import ORTH

@registry.tokenizers("customize_tokenizer")
def make_customize_tokenizer():
    def customize_tokenizer(nlp):
        return custom_tokenizer(nlp)

    return customize_tokenizer
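# Example usage: with the factory registered above, the tokenizer can be selected
# from a spaCy config, or custom_tokenizer() can be applied to a pipeline directly
# (the blank German pipeline below is only an illustration):
#
#   [nlp.tokenizer]
#   @tokenizers = "customize_tokenizer"
#
# or, in code:
#
#   import spacy
#   nlp = spacy.blank("de")
#   nlp.tokenizer = custom_tokenizer(nlp)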

# File included for bundling 
# spacy/custom_tokenizer/custom_tokenizer.py
EXTENDED_LETTER_RANGE = "A-Za-zäöüÄÖÜàòèéìù"
DATE = r"[0-3][0-9]\.[0-1][0-9]\.[1-2][0-9]{3}"  # DD.MM.YYYY (second digits include 0 so e.g. 10.10.2020 matches)
TOP_LEVEL_DOMAINS = "ch|at|de|com|edu|org|gov|net|fr|uk|be|es|pl|it|eu|nl|ba|cz|dk|al|ad|bg|by|fi|gr|ie|li|lu|no|pt|ro|rs|ru|se|si|sk"

DOT_AFTER_WORD = [
    rf"(?<!www\.)(?<=([a-zA-ZäöüÄÖÜ]){{{i}}})\.(?!({TOP_LEVEL_DOMAINS}))"
    for i in range(3, 30)
]
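# Python's `re` only supports fixed-width lookbehind, so DOT_AFTER_WORD (and
# LETTER_DOUBLE_ENDING_DOT_VAR_LENGTH below) emulate a variable-length lookbehind
# with one fixed-width pattern per word length.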

DOT_AFTER_DATE = rf"(?<=({DATE}))\."

infix_res = [
    r"[\(\[\]\)]",
    r"(?<=\.--)\.",  # DOT after .--
    rf"\.(?=[{EXTENDED_LETTER_RANGE}]{{3,20}})",  # DOT before word
    r"'\.\.",  # e.g., 'Tscheicha'.. -> "Tscheicha" "'..", then split ".." as suffix
    *DOT_AFTER_WORD,  # when there is no space after the dot
    r"[A-Z](?=\. )",  # DOT after capital letter
    DOT_AFTER_DATE,
]

LETTER_DOUBLE_ENDING_DOT_VAR_LENGTH = [  # DOT after letter, e.g., A.G., or u.s.w.
    rf"(?<=([{EXTENDED_LETTER_RANGE}]\.){{{i}}})\." for i in range(1, 30)
]

suffix_res = [
    r"(?<=\d)[\.]",  # DOT after number
    r"(?<=[\.])[\]\)]",  # Closing brackets with DOT before
    rf"[\)\]](?=[\(\[\.{EXTENDED_LETTER_RANGE}0-9]+)",  # Closing brackets with word/brackets after
    r"(?<=')\.\.",  # split "..'" -> ".." "'"
    r"\.\.\.",
    *LETTER_DOUBLE_ENDING_DOT_VAR_LENGTH,
    r"(?<=[A-Z])\.",
]
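# Note on ordering (spaCy's tokenizer algorithm): special cases and token_match are
# consulted first on each whitespace-separated chunk, prefixes/suffixes are then
# trimmed from the edges (re-checking special cases along the way), and the infix
# patterns split whatever remains.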

DOT_DOT_PLUS = r"\.\.+"
DOT_DOT_PLUS_FIXED = r"\.\.\.+"
NUMBER_DASH_NUMBER = r"(?<=[0-9])-(?=[0-9])"
NUMBER_SIGN_NUMBER = r"(?<=[0-9])[+\-\*^](?=[0-9-])"
NUMBER_SIGN_NUMBER_FIXED = r"(?<=[0-9])[+\*^](?=[0-9])"
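# The difference, for reference: NUMBER_SIGN_NUMBER (one of spaCy's default infixes)
# also splits on "-" between digits, whereas NUMBER_SIGN_NUMBER_FIXED leaves hyphens
# alone, so "2+3", "2*3" and "2^3" still split but the hyphens in CH-501.3.014.015-5
# are not split.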


# Given an nlp object, return a custom tokenizer that splits on the special cases above
# and removes some unwanted default tokenization rules.
def custom_tokenizer(nlp):
    nlp.tokenizer = Tokenizer(nlp.vocab)

    prefix_regex = compile_prefix_regex(nlp.Defaults.prefixes)
    nlp.tokenizer.prefix_search = prefix_regex.search

    # We use the default infixes and remove some cases that lead to unwanted tokenization.
    # The removed cases are [number]-[number] and [number][sign][number].
    # We don't want to drop all signs, so we re-add the NUMBER_SIGN_NUMBER_FIXED case, which
    # excludes only the minus sign: we don't want to split e.g. CH-501.3.014.015-5.
    infixes = list(nlp.Defaults.infixes)  # copy so the shared class-level defaults are not mutated
    if NUMBER_DASH_NUMBER in infixes:
        infixes.remove(NUMBER_DASH_NUMBER)
    if NUMBER_SIGN_NUMBER in infixes:
        infixes.remove(NUMBER_SIGN_NUMBER)
        infixes.append(NUMBER_SIGN_NUMBER_FIXED)
    infixes += infix_res
    infix_regex = compile_infix_regex(infixes)
    nlp.tokenizer.infix_finditer = infix_regex.finditer

    # We remove the "..+" case and replace it with "...+" to be able to split on ".."
    suffixes = list(nlp.Defaults.suffixes)  # copy, as above, to avoid mutating the defaults
    if DOT_DOT_PLUS in suffixes:
        suffixes.remove(DOT_DOT_PLUS)
        suffixes.append(DOT_DOT_PLUS_FIXED)
    suffixes += suffix_res
    suffix_regex = compile_suffix_regex(suffixes)
    nlp.tokenizer.suffix_search = suffix_regex.search

    # Add all special cases (e.g., "etc." stays one token, "Liq.." -> "Liq." + ".")
    for special_case, tokens in special_cases.items():
        nlp.tokenizer.add_special_case(special_case, tokens)

    # A lone "[" is a complete token; token_match keeps the bracket infix/suffix rules from splitting it
    nlp.tokenizer.token_match = re.compile(r"^\[$").search

    return nlp.tokenizer

# File included for bundling
# spacy/custom_tokenizer/custom_tokenizer_special_cases.py

# Special cases follow one of two patterns:
# word.  -> word.       e.g., "etc.", which we don't want to split (an exception to the general rules)
# word.. -> word. + "." e.g., "Liq..", which we want to split after the first dot
special_cases = {
    "cf.": [{ORTH: "cf."}],
    "etc.": [{ORTH: "etc."}],
    "usw.": [{ORTH: "usw."}],
    "u.s.w.": [{ORTH: "u.s.w."}],
    "u.ä.": [{ORTH: "u.ä."}],
    "Liq..": [{ORTH: "Liq."}, {ORTH: "."}],
    "Cie..": [{ORTH: "Cie."}, {ORTH: "."}],
    "Co..": [{ORTH: "Co."}, {ORTH: "."}],
    "S.à.r.l.": [{ORTH: "S.à.r.l."}],
    "r.l.": [{ORTH: "r.l."}],
    "R.l.": [{ORTH: "R.l."}],
    "g.l.": [{ORTH: "g.l."}],
    "S.c.r.l.": [{ORTH: "S.c.r.l."}],
    "u.a.": [{ORTH: "u.a."}],
    "u.a.m.": [{ORTH: "u.a.m."}],
    "s.à.r.l.": [{ORTH: "s.à.r.l."}],
    "S.a.r.l.": [{ORTH: "S.a.r.l."}],
    "s.a.r.l.": [{ORTH: "s.a.r.l."}],
    "s.àr.l.": [{ORTH: "s.àr.l."}],
    "u.d.g.": [{ORTH: "u.d.g."}],
    "S.a.g.l.": [{ORTH: "S.a.g.l."}],
    "S.r.l.": [{ORTH: "S.r.l."}],
    "S.r.": [{ORTH: "S.r."}],
    "Ltd..": [{ORTH: "Ltd."}, {ORTH: "."}],
    "LTD..": [{ORTH: "LTD."}, {ORTH: "."}],
    "ltd..": [{ORTH: "ltd."}, {ORTH: "."}],
    "Corp..": [{ORTH: "Corp."}, {ORTH: "."}],
    "Inc..": [{ORTH: "Inc."}, {ORTH: "."}],
    "dgl..": [{ORTH: "dgl."}, {ORTH: "."}],
    "ect..": [{ORTH: "ect."}, {ORTH: "."}],  # typo of etc.
    "co..": [{ORTH: "co."}, {ORTH: "."}],
    "CO..": [{ORTH: "CO."}, {ORTH: "."}],
    "Ing..": [{ORTH: "Ing."}, {ORTH: "."}],
    "HRegV..": [{ORTH: "HRegV."}, {ORTH: "."}],
    "ehf..": [{ORTH: "ehf."}, {ORTH: "."}],
    "Gen..": [{ORTH: "Gen."}, {ORTH: "."}],
    "Var..": [{ORTH: "Var."}, {ORTH: "."}],
    "b.v..": [{ORTH: "b.v."}, {ORTH: "."}],
    "Dr..": [{ORTH: "Dr."}, {ORTH: "."}],
    "Br..": [{ORTH: "Br."}, {ORTH: "."}],
    "iu..": [{ORTH: "iu."}, {ORTH: "."}],
    "Ch..": [{ORTH: "Ch."}, {ORTH: "."}],
    "Inh..": [{ORTH: "Inh."}, {ORTH: "."}],
    "sf..": [{ORTH: "sf."}, {ORTH: "."}],
    "sen..": [{ORTH: "sen."}, {ORTH: "."}],
    "Std..": [{ORTH: "Std."}, {ORTH: "."}],
    "d.o.o..": [{ORTH: "d.o.o."}, {ORTH: "."}],
    "M.Sc..": [{ORTH: "M.Sc."}, {ORTH: "."}],
    "s.a..": [{ORTH: "s.a."}, {ORTH: "."}],
    "ag..": [{ORTH: "ag."}, {ORTH: "."}],
    "Fa..": [{ORTH: "Fa."}, {ORTH: "."}],
    "Ti..": [{ORTH: "Ti."}, {ORTH: "."}],
    "div..": [{ORTH: "div."}, {ORTH: "."}],
    "ä..": [{ORTH: "ä."}, {ORTH: "."}],
    "v.k.s.s..": [{ORTH: "v.k.s.s."}, {ORTH: "."}],
    "ecc..": [{ORTH: "ecc."}, {ORTH: "."}],
    "fed..": [{ORTH: "fed."}, {ORTH: "."}],
    "Psy-K..": [{ORTH: "Psy-K."}, {ORTH: "."}],
    "dipl.fed..": [{ORTH: "dipl.fed."}, {ORTH: "."}],
    "Jr..": [{ORTH: "Jr."}, {ORTH: "."}],
    "succ..": [{ORTH: "succ."}, {ORTH: "."}],
    "méd..": [{ORTH: "méd."}, {ORTH: "."}],
    "ass..": [{ORTH: "ass."}, {ORTH: "."}],
    "env..": [{ORTH: "env."}, {ORTH: "."}],
    "Int..": [{ORTH: "Int."}, {ORTH: "."}],
    "Chr..": [{ORTH: "Chr."}, {ORTH: "."}],
}
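
# A minimal smoke test, assuming a local spaCy install; the blank German pipeline
# and the sample strings are illustrative only.
if __name__ == "__main__":
    import spacy

    nlp = spacy.blank("de")
    nlp.tokenizer = custom_tokenizer(nlp)
    for text in ("Muster u.s.w. AG.", "In Liq.. 01.02.1999."):
        print([t.text for t in nlp(text)])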