Spaces:
Build error
Build error
Parsed references are displayed in an HTML table. Colors for some labels
Browse files
app.py
CHANGED
@@ -4,6 +4,7 @@ import spacy
|
|
4 |
from spacy import displacy
|
5 |
|
6 |
from bib_tokenizers import create_references_tokenizer
|
|
|
7 |
|
8 |
|
9 |
nlp = spacy.load("en_bib_references_trf")
|
@@ -121,22 +122,51 @@ def split_up_references(
|
|
121 |
def text_analysis(text, is_eol_mode):
|
122 |
|
123 |
if not text:
|
124 |
-
return "<div style='max-width:100%;
|
125 |
-
|
126 |
-
html = ""
|
127 |
|
128 |
doc_with_linebreaks = split_up_references(
|
129 |
text, is_eol_mode=is_eol_mode, nlp=nlp, nlp_blank=nlp_blank
|
130 |
)
|
131 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
132 |
for i, sent in enumerate(doc_with_linebreaks.sents):
|
133 |
bib_item_doc = sent.as_doc()
|
134 |
-
bib_item_doc
|
135 |
-
html +=
|
136 |
|
137 |
html = (
|
138 |
-
"<div style='max-width:100%; max-height:720px; overflow:auto'>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
139 |
+ html
|
|
|
140 |
+ "</div>"
|
141 |
)
|
142 |
|
@@ -154,7 +184,7 @@ with demo:
|
|
154 |
is_eol_mode = gr.components.Checkbox(
|
155 |
label="My Unparsed Bibliography does not contain more than one reference per line (Multiline references are supported regardless of this choice)"
|
156 |
)
|
157 |
-
html = gr.components.HTML(label="Parsed
|
158 |
textbox.change(fn=text_analysis, inputs=[textbox, is_eol_mode], outputs=[html])
|
159 |
is_eol_mode.change(fn=text_analysis, inputs=[textbox, is_eol_mode], outputs=[html])
|
160 |
|
@@ -189,8 +219,7 @@ CFR
|
|
189 |
[Knu] Donald Knuth. Knuth: Computers and typesetting."""
|
190 |
],
|
191 |
[
|
192 |
-
"""
|
193 |
-
Bartkiewicz, A., Szymczak, M., Cohen, R. J., & Richards, A. M. S. 2005, MN- RAS, 361, 623
|
194 |
Bartkiewicz, A., Szymczak, M., & van Langevelde, H. J. 2016, A&A, 587, A104
|
195 |
Benjamin, R. A., Churchwell, E., Babler, B. L., et al. 2003, PASP, 115, 953
|
196 |
Beuther, H., Mottram, J. C., Ahmadi, A., et al. 2018, A&A, 617, A100
|
|
|
4 |
from spacy import displacy
|
5 |
|
6 |
from bib_tokenizers import create_references_tokenizer
|
7 |
+
from schema import tags_ent
|
8 |
|
9 |
|
10 |
nlp = spacy.load("en_bib_references_trf")
|
|
|
122 |
def text_analysis(text, is_eol_mode):
|
123 |
|
124 |
if not text:
|
125 |
+
return "<div style='max-width:100%; overflow:auto; color:grey'><p>Unparsed Bibliography Section is empty</p></div>"
|
|
|
|
|
126 |
|
127 |
doc_with_linebreaks = split_up_references(
|
128 |
text, is_eol_mode=is_eol_mode, nlp=nlp, nlp_blank=nlp_blank
|
129 |
)
|
130 |
|
131 |
+
html = ""
|
132 |
+
options = {
|
133 |
+
"ents": tags_ent,
|
134 |
+
"colors": {
|
135 |
+
"citation-number": "yellow",
|
136 |
+
"citation-label": "yellow",
|
137 |
+
"family": "DeepSkyBlue",
|
138 |
+
"given": "LightSkyBlue",
|
139 |
+
"title": "PeachPuff",
|
140 |
+
"container-title": "Moccasin",
|
141 |
+
"publisher": "PaleTurquoise",
|
142 |
+
"issued": "Gold",
|
143 |
+
},
|
144 |
+
}
|
145 |
for i, sent in enumerate(doc_with_linebreaks.sents):
|
146 |
bib_item_doc = sent.as_doc()
|
147 |
+
ref = displacy.render(bib_item_doc, style="ent", options=options)
|
148 |
+
html += f"<tr><td>{i}</td><td>{ref}</td></tr>"
|
149 |
|
150 |
html = (
|
151 |
+
"""<div style='max-width:100%; max-height:720px; overflow:auto'>
|
152 |
+
<style>table {
|
153 |
+
font-family: arial, sans-serif;
|
154 |
+
border-collapse: collapse;
|
155 |
+
width: 100%;
|
156 |
+
}
|
157 |
+
|
158 |
+
td, th {
|
159 |
+
border: 1px solid #b0b0b0;
|
160 |
+
text-align: left;
|
161 |
+
padding: 8px;
|
162 |
+
}
|
163 |
+
|
164 |
+
tr:nth-child(even) {
|
165 |
+
background-color: #f2f2f2;
|
166 |
+
}</style>"""
|
167 |
+
+ "<table><tr><th>Index</th><th>Parsed Reference</th></tr>"
|
168 |
+ html
|
169 |
+
+ "</table>"
|
170 |
+ "</div>"
|
171 |
)
|
172 |
|
|
|
184 |
is_eol_mode = gr.components.Checkbox(
|
185 |
label="My Unparsed Bibliography does not contain more than one reference per line (Multiline references are supported regardless of this choice)"
|
186 |
)
|
187 |
+
html = gr.components.HTML(label="Parsed References")
|
188 |
textbox.change(fn=text_analysis, inputs=[textbox, is_eol_mode], outputs=[html])
|
189 |
is_eol_mode.change(fn=text_analysis, inputs=[textbox, is_eol_mode], outputs=[html])
|
190 |
|
|
|
219 |
[Knu] Donald Knuth. Knuth: Computers and typesetting."""
|
220 |
],
|
221 |
[
|
222 |
+
"""Bartkiewicz, A., Szymczak, M., Cohen, R. J., & Richards, A. M. S. 2005, MN- RAS, 361, 623
|
|
|
223 |
Bartkiewicz, A., Szymczak, M., & van Langevelde, H. J. 2016, A&A, 587, A104
|
224 |
Benjamin, R. A., Churchwell, E., Babler, B. L., et al. 2003, PASP, 115, 953
|
225 |
Beuther, H., Mottram, J. C., Ahmadi, A., et al. 2018, A&A, 617, A100
|
schema.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Non-overlapping spans generated by CSL; they can be treated as annotations for the NER task.
|
2 |
+
tags_ent = [
|
3 |
+
"citation-number",
|
4 |
+
"citation-label",
|
5 |
+
"family",
|
6 |
+
"given",
|
7 |
+
"title",
|
8 |
+
"container-title",
|
9 |
+
"issued",
|
10 |
+
"url",
|
11 |
+
"publisher",
|
12 |
+
"page",
|
13 |
+
"doi",
|
14 |
+
"publisher-place",
|
15 |
+
"number-of-pages",
|
16 |
+
"collection-title",
|
17 |
+
"collection-number",
|
18 |
+
"genre",
|
19 |
+
"authority",
|
20 |
+
"URL",
|
21 |
+
"DOI",
|
22 |
+
"volume",
|
23 |
+
# "title-short" is a valid tag, but we ended up with only one occurrence of it in the dataset...
|
24 |
+
"number",
|
25 |
+
"note",
|
26 |
+
"archive",
|
27 |
+
"archive_location",
|
28 |
+
]
|
29 |
+
|
30 |
+
# Spans that may enclose other annotated spans. spaCy allows overlapping spans to be stored in doc.spans.
|
31 |
+
tags_span = [
|
32 |
+
"author",
|
33 |
+
"year",
|
34 |
+
"month",
|
35 |
+
"day",
|
36 |
+
"issued",
|
37 |
+
"url",
|
38 |
+
"bib",
|
39 |
+
] + tags_ent
|
40 |
+
|
41 |
+
# Span tag used for adding sentence-boundary annotations: an annotated CSL style encloses each bib item in <bib>..</bib>
|
42 |
+
tag_sentence_start = "bib"
|
43 |
+
|
44 |
+
spankey_sentence_start = "sc"
|