Spaces:

vitaly
/

bibliography-parser

Build error

App Files Files Community

vitaly committed on Jul 18, 2022

Commit

ff7710f

•

1 Parent(s): 3e82e04

Parsed references are displayed in an HTML table. Added colors for some labels

Browse files

Files changed (2) hide show

app.py +38 -9
schema.py +44 -0

app.py CHANGED Viewed

@@ -4,6 +4,7 @@ import spacy
 from spacy import displacy
 from bib_tokenizers import create_references_tokenizer
 nlp = spacy.load("en_bib_references_trf")
@@ -121,22 +122,51 @@ def split_up_references(
 def text_analysis(text, is_eol_mode):
     if not text:
-        return "<div style='max-width:100%; max-height:720px; overflow:auto; color:grey'><p>Unparsed Bibliography Section is empty</p></div>"
-    html = ""
     doc_with_linebreaks = split_up_references(
         text, is_eol_mode=is_eol_mode, nlp=nlp, nlp_blank=nlp_blank
     )
     for i, sent in enumerate(doc_with_linebreaks.sents):
         bib_item_doc = sent.as_doc()
-        bib_item_doc.user_data = {"title": f"***** Bib Item {i+1}: *****"}
-        html += displacy.render(bib_item_doc, style="ent")
     html = (
-        "<div style='max-width:100%; max-height:720px; overflow:auto'>"
         + html
         + "</div>"
     )
@@ -154,7 +184,7 @@ with demo:
     is_eol_mode = gr.components.Checkbox(
         label="My Unparsed Bibliography does not contain more than one reference per line (Multiline references are supported regardless of this choice)"
     )
-    html = gr.components.HTML(label="Parsed Bibliography Section: list of references")
     textbox.change(fn=text_analysis, inputs=[textbox, is_eol_mode], outputs=[html])
     is_eol_mode.change(fn=text_analysis, inputs=[textbox, is_eol_mode], outputs=[html])
@@ -189,8 +219,7 @@ CFR
 [Knu] Donald Knuth. Knuth: Computers and typesetting."""
             ],
             [
-                """References
-Bartkiewicz, A., Szymczak, M., Cohen, R. J., & Richards, A. M. S. 2005, MN- RAS, 361, 623
 Bartkiewicz, A., Szymczak, M., & van Langevelde, H. J. 2016, A&A, 587, A104
 Benjamin, R. A., Churchwell, E., Babler, B. L., et al. 2003, PASP, 115, 953
 Beuther, H., Mottram, J. C., Ahmadi, A., et al. 2018, A&A, 617, A100

 from spacy import displacy
 from bib_tokenizers import create_references_tokenizer
+from schema import tags_ent
 nlp = spacy.load("en_bib_references_trf")
 def text_analysis(text, is_eol_mode):
     if not text:
+        return "<div style='max-width:100%; overflow:auto; color:grey'><p>Unparsed Bibliography Section is empty</p></div>"
     doc_with_linebreaks = split_up_references(
         text, is_eol_mode=is_eol_mode, nlp=nlp, nlp_blank=nlp_blank
     )
+    html = ""
+    options = {
+        "ents": tags_ent,
+        "colors": {
+            "citation-number": "yellow",
+            "citation-label": "yellow",
+            "family": "DeepSkyBlue",
+            "given": "LightSkyBlue",
+            "title": "PeachPuff",
+            "container-title": "Moccasin",
+            "publisher": "PaleTurquoise",
+            "issued": "Gold",
+        },
+    }
     for i, sent in enumerate(doc_with_linebreaks.sents):
         bib_item_doc = sent.as_doc()
+        ref = displacy.render(bib_item_doc, style="ent", options=options)
+        html += f"<tr><td>{i}</td><td>{ref}</td></tr>"
     html = (
+        """<div style='max-width:100%; max-height:720px; overflow:auto'>
+        <style>table {
+              font-family: arial, sans-serif;
+              border-collapse: collapse;
+              width: 100%;
+            }
+            td, th {
+              border: 1px solid #b0b0b0;
+              text-align: left;
+              padding: 8px;
+            }
+            tr:nth-child(even) {
+              background-color: #f2f2f2;
+            }</style>"""
+        + "<table><tr><th>Index</th><th>Parsed Reference</th></tr>"
         + html
+        + "</table>"
         + "</div>"
     )
     is_eol_mode = gr.components.Checkbox(
         label="My Unparsed Bibliography does not contain more than one reference per line (Multiline references are supported regardless of this choice)"
     )
+    html = gr.components.HTML(label="Parsed References")
     textbox.change(fn=text_analysis, inputs=[textbox, is_eol_mode], outputs=[html])
     is_eol_mode.change(fn=text_analysis, inputs=[textbox, is_eol_mode], outputs=[html])
 [Knu] Donald Knuth. Knuth: Computers and typesetting."""
             ],
             [
+                """Bartkiewicz, A., Szymczak, M., Cohen, R. J., & Richards, A. M. S. 2005, MN- RAS, 361, 623
 Bartkiewicz, A., Szymczak, M., & van Langevelde, H. J. 2016, A&A, 587, A104
 Benjamin, R. A., Churchwell, E., Babler, B. L., et al. 2003, PASP, 115, 953
 Beuther, H., Mottram, J. C., Ahmadi, A., et al. 2018, A&A, 617, A100

schema.py ADDED Viewed

	@@ -0,0 +1,44 @@

+# Non-overlapping spans generated by CSL. They can be treated as annotations for the NER task.
+tags_ent = [
+    "citation-number",
+    "citation-label",
+    "family",
+    "given",
+    "title",
+    "container-title",
+    "issued",
+    "url",
+    "publisher",
+    "page",
+    "doi",
+    "publisher-place",
+    "number-of-pages",
+    "collection-title",
+    "collection-number",
+    "genre",
+    "authority",
+    "URL",
+    "DOI",
+    "volume",
+    # "title-short" is a valid tag, but we ended up with only one occurrence of it in the dataset...
+    "number",
+    "note",
+    "archive",
+    "archive_location",
+]
+# Spans which may enclose other annotated spans. spaCy allows storing overlapping spans in doc.spans.
+tags_span = [
+    "author",
+    "year",
+    "month",
+    "day",
+    "issued",
+    "url",
+    "bib",
+] + tags_ent
+# Span tag used for adding sentence-boundary annotations: an annotated CSL style encloses each bib item in <bib>..</bib>.
+tag_sentence_start = "bib"
+spankey_sentence_start = "sc"