File size: 1,000 Bytes
ff7710f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# Entity-level labels: the non-overlapping spans generated by CSL. These can be
# treated as the annotations for the NER task.
# NOTE(review): both lower- and upper-case spellings of "url"/"URL" and
# "doi"/"DOI" are listed; they are distinct strings — confirm both are needed.
tags_ent = [
    "citation-number", "citation-label",
    "family", "given",
    "title", "container-title",
    "issued", "url", "publisher", "page", "doi",
    "publisher-place", "number-of-pages",
    "collection-title", "collection-number",
    "genre", "authority",
    "URL", "DOI",
    "volume",
    # "title-short" is a valid tag, but is left out because the dataset ended up
    # containing only a single occurrence of it.
    "number", "note",
    "archive", "archive_location",
]

# Span labels that may enclose other annotated spans (spaCy can store overlapping
# spans within doc.spans). The list is made of the enclosing labels below followed
# by every entity label from tags_ent.
#
# "issued" and "url" are listed here and are also members of tags_ent, so a plain
# concatenation contained each of them twice. dict.fromkeys() drops the repeats
# while keeping first-occurrence order (dicts preserve insertion order on
# Python 3.7+), so every label appears exactly once and the leading labels keep
# their positions.
tags_span = list(
    dict.fromkeys(
        [
            "author",
            "year",
            "month",
            "day",
            "issued",
            "url",
            "bib",
        ]
        + tags_ent
    )
)

# Span label used to add sentence-boundary annotations: an annotated CSL style
# wraps every bibliography item in <bib>..</bib>.
tag_sentence_start = "bib"

# NOTE(review): presumably the span-group key under which the sentence-start
# spans are stored — confirm against the code that reads it.
spankey_sentence_start = "sc"