vitaly committed on
Commit
ff7710f
1 Parent(s): 3e82e04

Parsed references are displayed in an HTML table. Colors are set for some labels

Browse files
Files changed (2) hide show
  1. app.py +38 -9
  2. schema.py +44 -0
app.py CHANGED
@@ -4,6 +4,7 @@ import spacy
4
  from spacy import displacy
5
 
6
  from bib_tokenizers import create_references_tokenizer
 
7
 
8
 
9
  nlp = spacy.load("en_bib_references_trf")
@@ -121,22 +122,51 @@ def split_up_references(
121
  def text_analysis(text, is_eol_mode):
122
 
123
  if not text:
124
- return "<div style='max-width:100%; max-height:720px; overflow:auto; color:grey'><p>Unparsed Bibliography Section is empty</p></div>"
125
-
126
- html = ""
127
 
128
  doc_with_linebreaks = split_up_references(
129
  text, is_eol_mode=is_eol_mode, nlp=nlp, nlp_blank=nlp_blank
130
  )
131
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
  for i, sent in enumerate(doc_with_linebreaks.sents):
133
  bib_item_doc = sent.as_doc()
134
- bib_item_doc.user_data = {"title": f"***** Bib Item {i+1}: *****"}
135
- html += displacy.render(bib_item_doc, style="ent")
136
 
137
  html = (
138
- "<div style='max-width:100%; max-height:720px; overflow:auto'>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  + html
 
140
  + "</div>"
141
  )
142
 
@@ -154,7 +184,7 @@ with demo:
154
  is_eol_mode = gr.components.Checkbox(
155
  label="My Unparsed Bibliography does not contain more than one reference per line (Multiline references are supported regardless of this choice)"
156
  )
157
- html = gr.components.HTML(label="Parsed Bibliography Section: list of references")
158
  textbox.change(fn=text_analysis, inputs=[textbox, is_eol_mode], outputs=[html])
159
  is_eol_mode.change(fn=text_analysis, inputs=[textbox, is_eol_mode], outputs=[html])
160
 
@@ -189,8 +219,7 @@ CFR
189
  [Knu] Donald Knuth. Knuth: Computers and typesetting."""
190
  ],
191
  [
192
- """References
193
- Bartkiewicz, A., Szymczak, M., Cohen, R. J., & Richards, A. M. S. 2005, MN- RAS, 361, 623
194
  Bartkiewicz, A., Szymczak, M., & van Langevelde, H. J. 2016, A&A, 587, A104
195
  Benjamin, R. A., Churchwell, E., Babler, B. L., et al. 2003, PASP, 115, 953
196
  Beuther, H., Mottram, J. C., Ahmadi, A., et al. 2018, A&A, 617, A100
 
4
  from spacy import displacy
5
 
6
  from bib_tokenizers import create_references_tokenizer
7
+ from schema import tags_ent
8
 
9
 
10
  nlp = spacy.load("en_bib_references_trf")
 
122
  def text_analysis(text, is_eol_mode):
123
 
124
  if not text:
125
+ return "<div style='max-width:100%; overflow:auto; color:grey'><p>Unparsed Bibliography Section is empty</p></div>"
 
 
126
 
127
  doc_with_linebreaks = split_up_references(
128
  text, is_eol_mode=is_eol_mode, nlp=nlp, nlp_blank=nlp_blank
129
  )
130
 
131
+ html = ""
132
+ options = {
133
+ "ents": tags_ent,
134
+ "colors": {
135
+ "citation-number": "yellow",
136
+ "citation-label": "yellow",
137
+ "family": "DeepSkyBlue",
138
+ "given": "LightSkyBlue",
139
+ "title": "PeachPuff",
140
+ "container-title": "Moccasin",
141
+ "publisher": "PaleTurquoise",
142
+ "issued": "Gold",
143
+ },
144
+ }
145
  for i, sent in enumerate(doc_with_linebreaks.sents):
146
  bib_item_doc = sent.as_doc()
147
+ ref = displacy.render(bib_item_doc, style="ent", options=options)
148
+ html += f"<tr><td>{i}</td><td>{ref}</td></tr>"
149
 
150
  html = (
151
+ """<div style='max-width:100%; max-height:720px; overflow:auto'>
152
+ <style>table {
153
+ font-family: arial, sans-serif;
154
+ border-collapse: collapse;
155
+ width: 100%;
156
+ }
157
+
158
+ td, th {
159
+ border: 1px solid #b0b0b0;
160
+ text-align: left;
161
+ padding: 8px;
162
+ }
163
+
164
+ tr:nth-child(even) {
165
+ background-color: #f2f2f2;
166
+ }</style>"""
167
+ + "<table><tr><th>Index</th><th>Parsed Reference</th></tr>"
168
  + html
169
+ + "</table>"
170
  + "</div>"
171
  )
172
 
 
184
  is_eol_mode = gr.components.Checkbox(
185
  label="My Unparsed Bibliography does not contain more than one reference per line (Multiline references are supported regardless of this choice)"
186
  )
187
+ html = gr.components.HTML(label="Parsed References")
188
  textbox.change(fn=text_analysis, inputs=[textbox, is_eol_mode], outputs=[html])
189
  is_eol_mode.change(fn=text_analysis, inputs=[textbox, is_eol_mode], outputs=[html])
190
 
 
219
  [Knu] Donald Knuth. Knuth: Computers and typesetting."""
220
  ],
221
  [
222
+ """Bartkiewicz, A., Szymczak, M., Cohen, R. J., & Richards, A. M. S. 2005, MN- RAS, 361, 623
 
223
  Bartkiewicz, A., Szymczak, M., & van Langevelde, H. J. 2016, A&A, 587, A104
224
  Benjamin, R. A., Churchwell, E., Babler, B. L., et al. 2003, PASP, 115, 953
225
  Beuther, H., Mottram, J. C., Ahmadi, A., et al. 2018, A&A, 617, A100
schema.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Non-overlapping spans generated by CSL. They can be treated as annotations for the NER task.
2
+ tags_ent = [
3
+ "citation-number",
4
+ "citation-label",
5
+ "family",
6
+ "given",
7
+ "title",
8
+ "container-title",
9
+ "issued",
10
+ "url",
11
+ "publisher",
12
+ "page",
13
+ "doi",
14
+ "publisher-place",
15
+ "number-of-pages",
16
+ "collection-title",
17
+ "collection-number",
18
+ "genre",
19
+ "authority",
20
+ "URL",
21
+ "DOI",
22
+ "volume",
23
+ # "title-short", it is a valid tag, but we ended up with only one instance of it in the dataset...
24
+ "number",
25
+ "note",
26
+ "archive",
27
+ "archive_location",
28
+ ]
29
+
30
+ # Spans which may enclose other annotated spans. spaCy allows overlapping spans to be stored in doc.spans.
31
+ tags_span = [
32
+ "author",
33
+ "year",
34
+ "month",
35
+ "day",
36
+ "issued",
37
+ "url",
38
+ "bib",
39
+ ] + tags_ent
40
+
41
+ # Span tag used for adding sentence-boundary annotations: an annotated CSL style encloses each bib item in <bib>..</bib>
42
+ tag_sentence_start = "bib"
43
+
44
+ spankey_sentence_start = "sc"