vitaly committed on
Commit
b303a60
1 Parent(s): 0f8d97e

Upload bib_tokenizers.py

Browse files
Files changed (1) hide show
  1. bib_tokenizers.py +56 -0
bib_tokenizers.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spacy
2
+ from spacy.lang.char_classes import LIST_ELLIPSES
3
+ from spacy.lang.char_classes import HYPHENS, LIST_ICONS
4
+ from spacy.lang.char_classes import (
5
+ ALPHA,
6
+ ALPHA_LOWER,
7
+ ALPHA_UPPER,
8
+ CONCAT_QUOTES,
9
+ PUNCT,
10
+ )
11
+ from spacy.util import compile_infix_regex, registry
12
+
13
# Infix patterns: places *inside* a whitespace-delimited chunk where the
# tokenizer should split.  spaCy's ellipsis and icon rules come first,
# followed by rules tuned for bibliography/reference strings.
infixes = [
    *LIST_ELLIPSES,
    *LIST_ICONS,
    # Arithmetic-style operator between a digit and a digit or hyphen.
    r"(?<=[0-9])[+\-\*^](?=[0-9-])",
    # Period between a lowercase letter (or quote) and an uppercase letter
    # (or quote).
    r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
        al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
    ),
    # Comma directly between two alphabetic characters.
    r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
    # Any of spaCy's hyphen variants between two alphabetic characters.
    r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
    # One of : < > = / preceded by a letter or digit and followed by a letter.
    r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
    # Closing bracket, closing parenthesis or period glued to the text that
    # follows it, e.g.:
    #   [10]R. Nakamura and T. Kenzaka
    #   (10)R. Nakamura and T. Kenzaka
    #   10.R. Nakamura and T. Kenzaka
    r"(?<=[{a}0-9])[:<>\]\)\.](?=[{a}])".format(a=ALPHA),
    # Zero-width split where a digit is immediately followed by a letter,
    # e.g.:
    #   10R. Nakamura and T. Kenzaka
    r"(?<=[0-9])(?=[{a}])".format(a=ALPHA),
]
32
+
33
+
34
@spacy.registry.tokenizers("references_tokenizer")
def create_references_tokenizer():
    """Return a tokenizer factory registered as ``references_tokenizer``.

    The returned callable takes an ``nlp`` object, builds the stock
    ``spacy.Tokenizer.v1`` tokenizer for it and replaces that tokenizer's
    infix matcher with one compiled from the module-level ``infixes`` list.
    """

    def create_tokenizer(nlp):
        """Build the default tokenizer for ``nlp`` with the custom infix rules."""
        # Function-scope import keeps this block self-contained.
        import logging

        logger = logging.getLogger(__name__)

        # Resolve the stock tokenizer factory through the registry so that
        # everything except the infix rules stays at spaCy's defaults.
        default_tokenizer_cfg = {"tokenizer": {"@tokenizers": "spacy.Tokenizer.v1"}}
        create_default_tokenizer = registry.resolve(default_tokenizer_cfg)["tokenizer"]
        tokenizer = create_default_tokenizer(nlp)

        infix_re = compile_infix_regex(infixes)
        tokenizer.infix_finditer = infix_re.finditer

        # Diagnostic samples: previously printed unconditionally to stdout on
        # every tokenizer construction.  They are now emitted only when DEBUG
        # logging is enabled, so normal pipeline loading stays silent and
        # does no extra tokenization work.
        if logger.isEnabledFor(logging.DEBUG):
            for sample in (
                "[10]R. Nakamura and T. Kenzaka",
                "(10)R. Nakamura and T. Kenzaka",
                "10.R. Nakamura and T. Kenzaka",
                "10R. Nakamura and T. Kenzaka",
            ):
                logger.debug("%s -> %s", sample, list(tokenizer(sample)))

        return tokenizer

    return create_tokenizer