import re

from spacy.tokenizer import Tokenizer
from spacy.util import compile_infix_regex, compile_prefix_regex, compile_suffix_regex, registry
from spacy.symbols import ORTH


@registry.tokenizers("customize_tokenizer")
def make_customize_tokenizer():
    def customize_tokenizer(nlp):
        return custom_tokenizer(nlp)

    return customize_tokenizer
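
# The registered factory can then be selected in a pipeline config; a minimal
# sketch of the relevant config block:
#
#   [nlp.tokenizer]
#   @tokenizers = "customize_tokenizer"
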
# File included for bundling
# spacy/custom_tokenizer/custom_tokenizer.py
EXTENDED_LETTER_RANGE = "A-Za-zäöüÄÖÜàòèéìù"
DATE = r"[0-3][0-9]\.[0-1][0-9]\.[1-2][0-9]{3}"  # dates in DD.MM.YYYY form
TOP_LEVEL_DOMAINS = "ch|at|de|com|edu|org|gov|net|fr|uk|be|es|pl|it|eu|nl|ba|cz|dk|al|ad|bg|by|fi|gr|ie|li|lu|no|pt|ro|rs|ru|se|si|sk"
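# Python's re lookbehinds must be fixed-width, so the variable-length
# "dot after a word" rule is expanded into one pattern per word length.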
DOT_AFTER_WORD = [
    rf"(?<!www\.)(?<=([a-zA-ZäöüÄÖÜ]){{{i}}})\.(?!({TOP_LEVEL_DOMAINS}))"
    for i in range(3, 30)
]
DOT_AFTER_DATE = rf"(?<=({DATE}))\."
infix_res = [
    r"[\(\[\]\)]",
    r"(?<=\.--)\.",  # DOT after .--
    rf"\.(?=[{EXTENDED_LETTER_RANGE}]{{3,20}})",  # DOT before word
    r"'\.\.",  # e.g., 'Tscheicha'.. -> "Tscheicha" "'..", then split ".." as suffix
    *DOT_AFTER_WORD,  # applies when there is no space after the dot
    r"[A-Z](?=\. )",  # DOT after capital letter
    DOT_AFTER_DATE,
]
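# Illustrative infix splits under the rules above (assumed behaviour):
#   "Bern(Schweiz)" -> "Bern" "(" "Schweiz" ")"   (bracket rule)
#   "Muster.Meier"  -> "Muster" "." "Meier"       (dot before a word)
#   "muster.ch" stays whole (the TLD lookahead blocks DOT_AFTER_WORD)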
LETTER_DOUBLE_ENDING_DOT_VAR_LENGTH = [  # DOT after letter, e.g., A.G. or u.s.w.
    rf"(?<=([{EXTENDED_LETTER_RANGE}]\.){{{i}}})\." for i in range(1, 30)
]
suffix_res = [
    r"(?<=\d)[\.]",  # DOT after number
    r"(?<=[\.])[\]\)]",  # closing bracket with DOT before it
    rf"[\)\]](?=[\(\[\.{EXTENDED_LETTER_RANGE}0-9]+)",  # closing bracket with word/bracket after it
    r"(?<=')\.\.",  # split "..'" -> ".." "'"
    r"\.\.\.",
    *LETTER_DOUBLE_ENDING_DOT_VAR_LENGTH,
    r"(?<=[A-Z])\.",
]
DOT_DOT_PLUS = r"\.\.+"
DOT_DOT_PLUS_FIXED = r"\.\.\.+"
NUMBER_DASH_NUMBER = r"(?<=[0-9])-(?=[0-9])"
NUMBER_SIGN_NUMBER = r"(?<=[0-9])[+\-\*^](?=[0-9-])"
NUMBER_SIGN_NUMBER_FIXED = r"(?<=[0-9])[+\*^](?=[0-9])"
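# Intended behaviour of the number patterns (illustrative):
#   "2004-2006"          stays one token (NUMBER_DASH_NUMBER is removed below)
#   "CH-501.3.014.015-5" keeps its minus signs (excluded from the re-added rule)
#   "2+3", "2*3", "2^8"  split around the operator (NUMBER_SIGN_NUMBER_FIXED)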
# Given an nlp object, return a custom tokenizer that splits on special cases
# and has unwanted tokenization removed.
def custom_tokenizer(nlp):
    # Start from a bare tokenizer and attach our own affix rules.
    nlp.tokenizer = Tokenizer(nlp.vocab)

    prefix_regex = compile_prefix_regex(nlp.Defaults.prefixes)
    nlp.tokenizer.prefix_search = prefix_regex.search

    # We use the default infixes and remove some cases that lead to unwanted
    # tokenization: [number]-[number] and [number][sign][number].
    # We don't want to drop all signs, so we re-add the NUMBER_SIGN_NUMBER_FIXED
    # case, which only excludes the minus sign; we don't want to split
    # identifiers such as CH-501.3.014.015-5.
    infixes = list(nlp.Defaults.infixes)  # copy, so the shared class defaults stay untouched
    if NUMBER_DASH_NUMBER in infixes:
        infixes.remove(NUMBER_DASH_NUMBER)
    if NUMBER_SIGN_NUMBER in infixes:
        infixes.remove(NUMBER_SIGN_NUMBER)
        infixes.append(NUMBER_SIGN_NUMBER_FIXED)
    infixes += infix_res
    infix_regex = compile_infix_regex(infixes)
    nlp.tokenizer.infix_finditer = infix_regex.finditer

    # We remove the "..+" case and replace it with "...+" to be able to split on "..".
    suffixes = list(nlp.Defaults.suffixes)  # copy for the same reason as above
    if DOT_DOT_PLUS in suffixes:
        suffixes.remove(DOT_DOT_PLUS)
        suffixes.append(DOT_DOT_PLUS_FIXED)
    suffixes += suffix_res
    suffix_regex = compile_suffix_regex(suffixes)
    nlp.tokenizer.suffix_search = suffix_regex.search

    # Add all special cases (e.g., GmbH. -> GmbH .)
    for special_case, tokens in special_cases.items():
        nlp.tokenizer.add_special_case(special_case, tokens)

    # Keep a lone "[" as a single token instead of running the affix rules on it.
    nlp.tokenizer.token_match = re.compile(r"^\[$").search
    return nlp.tokenizer
# File included for bundling
# spacy/custom_tokenizer/custom_tokenizer_special_cases.py
# Special cases following either pattern:
#   word.  -> word.    e.g., etc., which we don't want to split (an exception to the general rule)
#   word.. -> word. .  e.g., Liq.., which we want to split after the first dot
special_cases = {
    "cf.": [{ORTH: "cf."}],
    "etc.": [{ORTH: "etc."}],
    "usw.": [{ORTH: "usw."}],
    "u.s.w.": [{ORTH: "u.s.w."}],
    "u.ä.": [{ORTH: "u.ä."}],
    "Liq..": [{ORTH: "Liq."}, {ORTH: "."}],
    "Cie..": [{ORTH: "Cie."}, {ORTH: "."}],
    "Co..": [{ORTH: "Co."}, {ORTH: "."}],
    "S.à.r.l.": [{ORTH: "S.à.r.l."}],
    "r.l.": [{ORTH: "r.l."}],
    "R.l.": [{ORTH: "R.l."}],
    "g.l.": [{ORTH: "g.l."}],
    "S.c.r.l.": [{ORTH: "S.c.r.l."}],
    "u.a.": [{ORTH: "u.a."}],
    "u.a.m.": [{ORTH: "u.a.m."}],
    "s.à.r.l.": [{ORTH: "s.à.r.l."}],
    "S.a.r.l.": [{ORTH: "S.a.r.l."}],
    "s.a.r.l.": [{ORTH: "s.a.r.l."}],
    "s.àr.l.": [{ORTH: "s.àr.l."}],
    "u.d.g.": [{ORTH: "u.d.g."}],
    "S.a.g.l.": [{ORTH: "S.a.g.l."}],
    "S.r.l.": [{ORTH: "S.r.l."}],
    "S.r.": [{ORTH: "S.r."}],
    "Ltd..": [{ORTH: "Ltd."}, {ORTH: "."}],
    "LTD..": [{ORTH: "LTD."}, {ORTH: "."}],
    "ltd..": [{ORTH: "ltd."}, {ORTH: "."}],
    "Corp..": [{ORTH: "Corp."}, {ORTH: "."}],
    "Inc..": [{ORTH: "Inc."}, {ORTH: "."}],
    "dgl..": [{ORTH: "dgl."}, {ORTH: "."}],
    "ect..": [{ORTH: "ect."}, {ORTH: "."}],  # typo of etc.
    "co..": [{ORTH: "co."}, {ORTH: "."}],
    "CO..": [{ORTH: "CO."}, {ORTH: "."}],
    "Ing..": [{ORTH: "Ing."}, {ORTH: "."}],
    "HRegV..": [{ORTH: "HRegV."}, {ORTH: "."}],
    "ehf..": [{ORTH: "ehf."}, {ORTH: "."}],
    "Gen..": [{ORTH: "Gen."}, {ORTH: "."}],
    "Var..": [{ORTH: "Var."}, {ORTH: "."}],
    "b.v..": [{ORTH: "b.v."}, {ORTH: "."}],
    "Dr..": [{ORTH: "Dr."}, {ORTH: "."}],
    "Br..": [{ORTH: "Br."}, {ORTH: "."}],
    "iu..": [{ORTH: "iu."}, {ORTH: "."}],
    "Ch..": [{ORTH: "Ch."}, {ORTH: "."}],
    "Inh..": [{ORTH: "Inh."}, {ORTH: "."}],
    "sf..": [{ORTH: "sf."}, {ORTH: "."}],
    "sen..": [{ORTH: "sen."}, {ORTH: "."}],
    "Std..": [{ORTH: "Std."}, {ORTH: "."}],
    "d.o.o..": [{ORTH: "d.o.o."}, {ORTH: "."}],
    "M.Sc..": [{ORTH: "M.Sc."}, {ORTH: "."}],
    "s.a..": [{ORTH: "s.a."}, {ORTH: "."}],
    "ag..": [{ORTH: "ag."}, {ORTH: "."}],
    "Fa..": [{ORTH: "Fa."}, {ORTH: "."}],
    "Ti..": [{ORTH: "Ti."}, {ORTH: "."}],
    "div..": [{ORTH: "div."}, {ORTH: "."}],
    "ä..": [{ORTH: "ä."}, {ORTH: "."}],
    "v.k.s.s..": [{ORTH: "v.k.s.s."}, {ORTH: "."}],
    "ecc..": [{ORTH: "ecc."}, {ORTH: "."}],
    "fed..": [{ORTH: "fed."}, {ORTH: "."}],
    "Psy-K..": [{ORTH: "Psy-K."}, {ORTH: "."}],
    "dipl.fed..": [{ORTH: "dipl.fed."}, {ORTH: "."}],
    "Jr..": [{ORTH: "Jr."}, {ORTH: "."}],
    "succ..": [{ORTH: "succ."}, {ORTH: "."}],
    "méd..": [{ORTH: "méd."}, {ORTH: "."}],
    "ass..": [{ORTH: "ass."}, {ORTH: "."}],
    "env..": [{ORTH: "env."}, {ORTH: "."}],
    "Int..": [{ORTH: "Int."}, {ORTH: "."}],
    "Chr..": [{ORTH: "Chr."}, {ORTH: "."}],
}
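
# A minimal usage sketch (an illustrative addition, not part of the bundled
# modules): it assumes a blank German pipeline, but any spaCy language object
# works the same way.
if __name__ == "__main__":
    import spacy

    nlp = spacy.blank("de")
    custom_tokenizer(nlp)
    doc = nlp("Die Muster AG in Liq.. wurde am 01.02.2003 gegründet, siehe muster.ch")
    print([token.text for token in doc])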