File size: 6,516 Bytes
a8d4e3d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
"""
regex_arxiv.py

author: Matt Bierbaum
date: 2019-03-14

RegEx patterns for finding arXiv id citations in fulltext articles.
"""

import re

# Primary categories that occur in the OAI arXiv metadata.  The order is kept
# as originally listed; entries are grouped by initial letter for readability.
CATEGORIES = [
    "acc-phys", "adap-org", "alg-geom", "ao-sci", "astro-ph", "atom-ph",
    "bayes-an", "chao-dyn", "chem-ph", "cmp-lg", "comp-gas", "cond-mat", "cs",
    "dg-ga", "funct-an", "gr-qc",
    "hep-ex", "hep-lat", "hep-ph", "hep-th",
    "math", "math-ph", "mtrl-th", "nlin", "nucl-ex", "nucl-th",
    "patt-sol", "physics", "plasm-ph", "q-alg", "q-bio", "quant-ph",
    "solv-int", "supr-con",
    "eess", "econ", "q-fin", "stat",
]

# Subcategories spelled out as words, i.e. everything that is not a
# two-capital-letter code (so not SG, SI, SP, etc).
SUB_CATEGORIES = [
    "acc-ph", "ao-ph", "app-ph", "atm-clus", "atom-ph",
    "bio-ph", "chem-ph", "class-ph", "comp-ph",
    "data-an", "dis-nn", "ed-ph",
    "flu-dyn", "gen-ph", "geo-ph", "hist-ph", "ins-det",
    "med-ph", "mes-hall", "mtrl-sci",
    "optics", "other",
    "plasm-ph", "pop-ph", "quant-gas",
    "soc-ph", "soft", "space-ph", "stat-mech", "str-el", "supr-con",
]

# Names exported by a star-import of this module.
__all__ = (
    'REGEX_ARXIV_SIMPLE',
    'REGEX_ARXIV_STRICT',
    'REGEX_ARXIV_FLEXIBLE',
)

# Maps the hyphen-less spelling of every hyphenated category or subcategory
# name back to its proper form, e.g. 'condmat' -> 'cond-mat'.  Names without
# a hyphen need no entry.
dashdict = {
    hyphenated.replace('-', ''): hyphenated
    for hyphenated in CATEGORIES + SUB_CATEGORIES
    if '-' in hyphenated
}

# Finds a version token such as v1 or V12 (no leading zero); the token itself
# is captured.
REGEX_VERSION_SPLITTER = re.compile(r'([vV][1-9]\d*)')

def strip_version(name):
    """ Drop the version suffix from an id: 1501.21981v1 -> 1501.21981 """
    found = REGEX_VERSION_SPLITTER.search(name)
    if found is None:
        return name
    # Keep only the text that precedes the first version token.
    return name[:found.start()]

def format_cat(name):
    """
    Normalise the category of an old-style id: drop any subcategory and
    restore a missing hyphen, e.g. condmat.xx/9903350 -> cond-mat/9903350.
    Ids without a '/' (new-style) are returned unchanged.
    """
    if '/' not in name:
        return name
    # Old-style id of the form "<category>[.<subcategory>]/<number>"
    prefix, number = name.split('/')
    category = prefix.partition('.')[0]
    # Look up the hyphenated spelling; unknown names pass through as they are.
    return "{}/{}".format(dashdict.get(category, category), number)

def zeropad_1501(name):
    """
    Zero-pad the number of a new-style id to 5 digits when yymm is after 1500.

    1501.1234 -> 1501.01234, while 1412.1234 and hep-th/0306165 are returned
    unchanged.  The id is expected to carry no version suffix (see
    strip_version).

    Anything that is not of the form "<digits>.<digits>" -- old-style ids
    containing a '/', the empty string, ids without a dot -- is returned
    untouched rather than raising.
    """
    if '/' in name:  # old-style id: nothing to pad
        return name
    yymm, dot, num = name.partition('.')
    if not dot or not yymm.isdecimal() or not num.isdecimal():
        return name  # not a plain yymm.number id; leave it alone
    if int(yymm) > 1500 and len(num) < 5:
        # zfill pads all the way to 5 digits, however short the number is
        return yymm + '.' + num.zfill(5)
    return name

def clean(name):
    """ Normalise an arXiv id: drop version, fix category, zero-pad number """
    # The version is stripped first so that zeropad_1501 measures only the
    # digits after the dot.
    return zeropad_1501(format_cat(strip_version(name)))

# A common typo is to exclude the hyphen in the category, so both the
# hyphenated and the hyphen-less spelling of every name are accepted.
#
# The names are sorted longest-first (ties broken alphabetically) so that the
# alternation order -- and therefore the text of every pattern built from it --
# is the same on every run.  Listing a bare set of strings would give an order
# that changes with string-hash randomisation.
categories = sorted(
    set(CATEGORIES) | {cat.replace('-', '') for cat in CATEGORIES},
    key=lambda name: (-len(name), name),
)
subcategories = sorted(
    set(SUB_CATEGORIES) | {cat.replace('-', '') for cat in SUB_CATEGORIES},
    key=lambda name: (-len(name), name),
)

#  A category, optionally followed by a minor category: either a dot plus two
#  capital letters (cs.AI) or one of the spelled-out subcategory names.
#  NOTE(review): the subcategory alternative has no leading dot, so a dotted
#  form such as "cond-mat.str-el" is not consumed as one unit -- confirm
#  whether that is intended.
RE_CATEGORIES = r'(?:{})(?:(?:[.][A-Z]{{2}})|(?:{}))?'.format(
    r'|'.join(categories), r'|'.join(subcategories)
)

# Two-digit year followed by a two-digit month (YYMM).
# NOTE: this was written in 2019, so years 20 and later are deliberately not
# accepted.  The pattern does not itself look at the characters around the
# date.
RE_DATE = (
    r'(?:(?:[0-1][0-9])|(?:9[1-9]))'  # YY: 00-19 or 91-99
    r'(?:0[1-9]|1[0-2])'              # MM: 01-12
)
# Optional version suffix such as v2 or V10 (no leading zero).
RE_VERSION = r'(?:[vV][1-9]\d*)?'

# =============================================================================
# Numeric part of a new-style id: YYMM, a dot, then 4 or 5 digits.
RE_NUM_NEW = ''.join((RE_DATE, r'(?:[.]\d{4,5})', RE_VERSION))
# Numeric part of an old-style id: YYMM directly followed by 3 digits.
RE_NUM_OLD = ''.join((RE_DATE, r'(?:\d{3})', RE_VERSION))

# matches: 1612.00001 1203.0023v2
RE_ID_NEW = r'(?:' + RE_NUM_NEW + r')'

# Old-style id: category (with optional minor category), a slash, then the
# seven-digit number and optional version.
# matches: hep-th/0306165 cs/0112345v2 cs.AI/0112345v2
RE_ID_OLD = r'(?:' + RE_CATEGORIES + r'/' + RE_NUM_OLD + r')'

# =============================================================================
# URL-like lead-in, matched case-insensitively.
# matches: https://arxiv.org/abs/ arxiv.org/pdf/ abs/
RE_PREFIX_URL = ''.join([
    r'(?:',
    r'(?i:http[s]?\://)?',  # optional scheme
    r'(?i:arxiv\.org/)?',   # optional host
    r'(?i:abs/|pdf/)',      # the abs/ or pdf/ segment is always required
    r')',
])

# The word "arxiv" in any letter case, then any run of whitespace and
# : / , . separators.
# matches: arXiv: arxiv/ arxiv
RE_PREFIX_ARXIV = r'(?i:arxiv\s*[:/\s,.]*\s*)'

# A category (with optional minor category) used as a lead-in; the scoped
# (?i:...) flag makes the whole category pattern case-insensitive.
# matches:  cs.AI nucl-th
RE_PREFIX_CATEGORIES = r'(?i:' + RE_CATEGORIES + r')'

# matches: e-prints: e-print eprints:
RE_PREFIX_EPRINT = (
    r'(?i:'
    r'e[-]?print[s]?'  # e-print, eprint, e-prints or eprints, any letter case
    r'.{1,3}'          # followed by one to three arbitrary characters
    r')'
)

# =============================================================================
# A bare identifier, old- or new-style, with no prefix required and nothing
# captured.
REGEX_ARXIV_SIMPLE = r'(?:' + RE_ID_OLD + r'|' + RE_ID_NEW + r')'

# An "arXiv" prefix followed by an identifier, following the guide at:
#   https://arxiv.org/help/arxiv_identifier
# Group 1 captures an old-style id, group 2 a new-style id.
REGEX_ARXIV_STRICT = (
    r'(?:' + RE_PREFIX_ARXIV + r')'
    + r'(?:(' + RE_ID_OLD + r')|(' + RE_ID_NEW + r'))'
)

# This regex essentially accepts anything that looks like an arXiv id and has
# the slightest smell of being one as well. That is, if it is an id and
# mentions anything about the arXiv beforehand, then it is an id.
#
# The pattern is a top-level alternation of three branches:
#   1. a bare identifier (REGEX_ARXIV_SIMPLE);
#   2. an identifier introduced by a prefix, which is either
#        a. optional URL and e-print prefixes, followed by an optional arXiv
#           prefix and an old-style id, or by a mandatory arXiv prefix, an
#           optional "category/" and a new-style id; or
#        b. exactly one prefix (URL, e-print, category or arXiv), then the
#           shortest possible run of arbitrary characters, then an id;
#   3. an identifier enclosed in square or round brackets, with optional
#      whitespace inside the brackets.
# The identifier is captured in one of five groups, numbered in pattern order:
# (1) bare id, (2) old-style id of 2a, (3) new-style id of 2a, (4) id of 2b,
# (5) bracketed id.
#
# Each `.format` call below applies to the run of adjacent string literals
# that ends on its line, so every such run must hold exactly one `{}`.
REGEX_ARXIV_FLEXIBLE = (
    r'(?:'
      r'({})'.format(REGEX_ARXIV_SIMPLE) +  # capture (group 1)
    r')|(?:'
      r'(?:'
        r'(?:{})?'.format(RE_PREFIX_URL) +
        r'(?:{})?'.format(RE_PREFIX_EPRINT) +
        r'(?:'
          r'(?:{})?'.format(RE_PREFIX_ARXIV) +
          r'({})'.format(RE_ID_OLD) +  # capture (group 2)
        r'|'
          r'(?:{})'.format(RE_PREFIX_ARXIV) +
          r'(?:{}/)?'.format(RE_CATEGORIES) +
          r'({})'.format(RE_ID_NEW) +  # capture (group 3)
        r')'
      r')'
    r'|'
      r'(?:'
        r'(?:{})|'.format(RE_PREFIX_URL) +
        r'(?:{})|'.format(RE_PREFIX_EPRINT) +
        r'(?:{})|'.format(RE_PREFIX_CATEGORIES) +
        r'(?:{})'.format(RE_PREFIX_ARXIV) +
      r')'
      r'.*?'
      r'({})'.format(REGEX_ARXIV_SIMPLE) +  # capture (group 4)
    r')|(?:'
      r'(?:[\[\(]\s*)'
        r'({})'.format(REGEX_ARXIV_SIMPLE) +  # capture (group 5)
      r'(?:\s*[\]\)])'
    r')'
)

# Text snippets that each contain an arXiv id.  Every entry ends with a comma
# so that adjacent literals cannot silently merge into one string.
TEST_POSITIVE = [
    'arXiv:quant-ph 1503.01017v3',
    'math. RT/0903.2992',
    'arXiv, 1511.03262',
    'tions. arXiv preprint arXiv:1607.00021, 2016',
    'Math. Phys. 255, 577 (2005), hep-th/0306165',
    'Kuzovlev, arXiv:cond-mat/9903350 ',
    'arXiv:math.RT/1206.5933,',
    'arXiv e-prints 1306.1595',
    'ays, JHEP 07 (2009) 055, [ 0903.0883]',
    ' Rev. D71 (2005) 063534, [ astro-ph/0501562]',
    'e-print arXiv:1506.02215v1',
    'available at: http://arxiv.org/abs/1511.08977',
    'arXiv e-print: 1306.2144',
    'Preprint arXiv:math/0612139',
    'Vertices in a Digraph. arXiv preprint 1602.02129 ',
    'cond-mat/0309488.',
    'decays, 1701.01871 LHCB-PAPE',
    'Distribution. In: 1404.2485v3 (2015)',
    '113005 (2013), 1307.4331,',
    'scalar quantum 1610.07877v1',
    'cond-mat/0309488.',
    'cond-mat/0309488.8383',
]

# Text snippets that contain digit runs resembling an id but are not arXiv
# citations (DOIs).
TEST_NEGATIVE = [
    'doi: 10.1145/ 321105.321114 ',
    'doi: 10.1145/ 1105.321114 ',
]