ArneBinder committed on
Commit
a347ab7
1 Parent(s): efae5be

https://github.com/ArneBinder/pie-document-level/pull/266

Browse files

and also https://github.com/ArneBinder/pie-document-level/pull/267

Files changed (3) hide show
  1. app.py +73 -0
  2. rendering_utils_displacy.py +2 -2
  3. requirements.txt +1 -0
app.py CHANGED
@@ -6,9 +6,12 @@ import tempfile
6
  from functools import partial
7
  from typing import List, Optional, Tuple, Union
8
 
 
9
  import gradio as gr
10
  import pandas as pd
 
11
  import torch
 
12
  from document_store import DocumentStore, get_annotation_from_document
13
  from embedding import EmbeddingModel
14
  from model_utils import annotate_document, create_document, load_models
@@ -37,6 +40,7 @@ DEFAULT_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
37
  DEFAULT_EMBEDDING_MAX_LENGTH = 512
38
  DEFAULT_EMBEDDING_BATCH_SIZE = 32
39
  DEFAULT_SPLIT_REGEX = "\n\n\n+"
 
40
 
41
  # Whether to handle segmented entities in the document. If True, labeled_spans are converted
42
  # to labeled_multi_spans and binary_relations with label "parts_of_same" are used to merge them.
@@ -215,6 +219,62 @@ def upload_processed_documents(
215
  return document_store.overview()
216
 
217
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
  def main():
219
 
220
  example_text = "Scholarly Argumentation Mining (SAM) has recently gained attention due to its potential to help scholars with the rapid growth of published scientific literature. It comprises two subtasks: argumentative discourse unit recognition (ADUR) and argumentative relation extraction (ARE), both of which are challenging since they require e.g. the integration of domain knowledge, the detection of implicit statements, and the disambiguation of argument structure. While previous work focused on dataset construction and baseline methods for specific document sections, such as abstract or results, full-text scholarly argumentation mining has seen little progress. In this work, we introduce a sequential pipeline model combining ADUR and ARE for full-text SAM, and provide a first analysis of the performance of pretrained language models (PLMs) on both subtasks. We establish a new SotA for ADUR on the Sci-Arg corpus, outperforming the previous best reported result by a large margin (+7% F1). We also present the first results for ARE, and thus for the full AM pipeline, on this benchmark dataset. Our detailed error analysis reveals that non-contiguous ADUs as well as the interpretation of discourse connectors pose major challenges and that data annotation needs to be more consistent."
@@ -281,6 +341,19 @@ def main():
281
  lines=20,
282
  value=example_text,
283
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
284
  with gr.Accordion("Model Configuration", open=False):
285
  model_name = gr.Textbox(
286
  label="Model Name",
 
6
  from functools import partial
7
  from typing import List, Optional, Tuple, Union
8
 
9
+ import arxiv
10
  import gradio as gr
11
  import pandas as pd
12
+ import requests
13
  import torch
14
+ from bs4 import BeautifulSoup
15
  from document_store import DocumentStore, get_annotation_from_document
16
  from embedding import EmbeddingModel
17
  from model_utils import annotate_document, create_document, load_models
 
40
  DEFAULT_EMBEDDING_MAX_LENGTH = 512
41
  DEFAULT_EMBEDDING_BATCH_SIZE = 32
42
  DEFAULT_SPLIT_REGEX = "\n\n\n+"
43
+ DEFAULT_ARXIV_ID = "1706.03762"
44
 
45
  # Whether to handle segmented entities in the document. If True, labeled_spans are converted
46
  # to labeled_multi_spans and binary_relations with label "parts_of_same" are used to merge them.
 
219
  return document_store.overview()
220
 
221
 
222
def clean_spaces(text: str) -> str:
    """Normalize whitespace: collapse space runs, cap blank lines, trim the ends."""
    # runs of spaces become a single space
    normalized = re.sub(" +", " ", text)
    # any run of two or more newlines is capped at exactly two (one blank line)
    normalized = re.sub("\n\n+", "\n\n", normalized)
    # drop leading/trailing whitespace
    return normalized.strip()
230
+
231
+
232
def get_cleaned_arxiv_paper_text(html_content: str) -> str:
    """Extract the main article text from an arXiv HTML paper page.

    Args:
        html_content: Raw HTML of an arXiv "/html/<id>" paper page.

    Returns:
        The plain text of the page's <article> element with collapsed space
        runs, at most one blank line between paragraphs, and no surrounding
        whitespace.
    """
    # parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(html_content, "html.parser")
    # the paper body lives in the "article" html element
    # (the previously fetched "package-alerts ltx_document" div was never used
    # and has been removed as dead code)
    article = soup.find("article")
    # NOTE(review): assumes an <article> element is always present; a page
    # without one would raise AttributeError here — TODO confirm for all
    # arXiv HTML renderings.
    article_text = article.get_text()
    # cleanup the extracted text (collapse spaces, cap blank lines, strip ends)
    article_text_clean = clean_spaces(article_text)
    return article_text_clean
243
+
244
+
245
def load_text_from_arxiv(arxiv_id: str, abstract_only: bool = False) -> Tuple[str, str]:
    """Fetch paper text from arXiv, either the abstract or the full HTML text.

    Args:
        arxiv_id: arXiv identifier (e.g. "1706.03762"). Falls back to
            DEFAULT_ARXIV_ID when blank.
        abstract_only: If True, return only the abstract instead of the
            full-text HTML rendering.

    Returns:
        A tuple of (cleaned text, source identifier) where the identifier is
        the entry id for abstracts or the HTML URL for full texts.

    Raises:
        gr.Error: If the arXiv lookup fails, no paper matches the id, the
            entry id has an unexpected format, or the HTML page cannot be
            fetched.
    """
    arxiv_id = arxiv_id.strip()
    if not arxiv_id:
        arxiv_id = DEFAULT_ARXIV_ID

    search_by_id = arxiv.Search(id_list=[arxiv_id])
    try:
        result = list(arxiv.Client().results(search_by_id))
    except arxiv.HTTPError as e:
        raise gr.Error(f"Failed to fetch arXiv data: {e}")
    if len(result) == 0:
        # typo fixed in user-facing message: "arXive" -> "arXiv"
        raise gr.Error(f"Could not find any paper with arXiv ID '{arxiv_id}'")
    first_result = result[0]
    if abstract_only:
        # arXiv abstracts contain hard line breaks; flatten them to one line
        abstract_clean = first_result.summary.replace("\n", " ")
        return abstract_clean, first_result.entry_id
    if "/abs/" not in first_result.entry_id:
        raise gr.Error(
            f"Could not create the HTML URL for arXiv ID '{arxiv_id}' because its entry ID has "
            f"an unexpected format: {first_result.entry_id}"
        )
    html_url = first_result.entry_id.replace("/abs/", "/html/")
    # timeout added so a stalled arXiv server cannot hang the request forever
    request_result = requests.get(html_url, timeout=30)
    if request_result.status_code != 200:
        raise gr.Error(
            f"Could not fetch the HTML content for arXiv ID '{arxiv_id}', status code: "
            f"{request_result.status_code}"
        )
    html_content = request_result.text
    text_clean = get_cleaned_arxiv_paper_text(html_content)
    return text_clean, html_url
276
+
277
+
278
  def main():
279
 
280
  example_text = "Scholarly Argumentation Mining (SAM) has recently gained attention due to its potential to help scholars with the rapid growth of published scientific literature. It comprises two subtasks: argumentative discourse unit recognition (ADUR) and argumentative relation extraction (ARE), both of which are challenging since they require e.g. the integration of domain knowledge, the detection of implicit statements, and the disambiguation of argument structure. While previous work focused on dataset construction and baseline methods for specific document sections, such as abstract or results, full-text scholarly argumentation mining has seen little progress. In this work, we introduce a sequential pipeline model combining ADUR and ARE for full-text SAM, and provide a first analysis of the performance of pretrained language models (PLMs) on both subtasks. We establish a new SotA for ADUR on the Sci-Arg corpus, outperforming the previous best reported result by a large margin (+7% F1). We also present the first results for ARE, and thus for the full AM pipeline, on this benchmark dataset. Our detailed error analysis reveals that non-contiguous ADUs as well as the interpretation of discourse connectors pose major challenges and that data annotation needs to be more consistent."
 
341
  lines=20,
342
  value=example_text,
343
  )
344
+ with gr.Accordion("Load Text from arXiv", open=False):
345
+ arxiv_id = gr.Textbox(
346
+ label="arXiv paper ID",
347
+ placeholder=f"e.g. {DEFAULT_ARXIV_ID}",
348
+ max_lines=1,
349
+ )
350
+ load_arxiv_only_abstract = gr.Checkbox(label="abstract only", value=False)
351
+ load_arxiv_btn = gr.Button("Load Text from arXiv", variant="secondary")
352
+ load_arxiv_btn.click(
353
+ fn=load_text_from_arxiv,
354
+ inputs=[arxiv_id, load_arxiv_only_abstract],
355
+ outputs=[doc_text, doc_id],
356
+ )
357
  with gr.Accordion("Model Configuration", open=False):
358
  model_name = gr.Textbox(
359
  label="Model Name",
rendering_utils_displacy.py CHANGED
@@ -197,7 +197,7 @@ class EntityRenderer(object):
197
  for i, fragment in enumerate(fragments):
198
  markup += escape_html(fragment)
199
  if len(fragments) > 1 and i != len(fragments) - 1:
200
- markup += "</br>"
201
  if self.ents is None or label.upper() in self.ents:
202
  color = self.colors.get(label.upper(), self.default_color)
203
  ent_settings = {"label": label, "text": entity, "bg": color}
@@ -210,7 +210,7 @@ class EntityRenderer(object):
210
  for i, fragment in enumerate(fragments):
211
  markup += escape_html(fragment)
212
  if len(fragments) > 1 and i != len(fragments) - 1:
213
- markup += "</br>"
214
  markup = TPL_ENTS.format(content=markup, dir=self.direction)
215
  if title:
216
  markup = TPL_TITLE.format(title=title) + markup
 
197
  for i, fragment in enumerate(fragments):
198
  markup += escape_html(fragment)
199
  if len(fragments) > 1 and i != len(fragments) - 1:
200
+ markup += "<br/>"
201
  if self.ents is None or label.upper() in self.ents:
202
  color = self.colors.get(label.upper(), self.default_color)
203
  ent_settings = {"label": label, "text": entity, "bg": color}
 
210
  for i, fragment in enumerate(fragments):
211
  markup += escape_html(fragment)
212
  if len(fragments) > 1 and i != len(fragments) - 1:
213
+ markup += "<br/>"
214
  markup = TPL_ENTS.format(content=markup, dir=self.direction)
215
  if title:
216
  markup = TPL_TITLE.format(title=title) + markup
requirements.txt CHANGED
@@ -8,3 +8,4 @@ datasets==2.14.4
8
  numpy==1.25.2
9
  qdrant-client==1.9.1
10
  scipy==1.13.0
 
 
8
  numpy==1.25.2
9
  qdrant-client==1.9.1
10
  scipy==1.13.0
11
+ arxiv==2.1.3