ArneBinder committed
Commit: a347ab7
1 Parent(s): efae5be
https://github.com/ArneBinder/pie-document-level/pull/266
and also https://github.com/ArneBinder/pie-document-level/pull/267
- app.py +73 -0
- rendering_utils_displacy.py +2 -2
- requirements.txt +1 -0
app.py
CHANGED
@@ -6,9 +6,12 @@ import tempfile
 from functools import partial
 from typing import List, Optional, Tuple, Union
 
+import arxiv
 import gradio as gr
 import pandas as pd
+import requests
 import torch
+from bs4 import BeautifulSoup
 from document_store import DocumentStore, get_annotation_from_document
 from embedding import EmbeddingModel
 from model_utils import annotate_document, create_document, load_models
@@ -37,6 +40,7 @@ DEFAULT_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 DEFAULT_EMBEDDING_MAX_LENGTH = 512
 DEFAULT_EMBEDDING_BATCH_SIZE = 32
 DEFAULT_SPLIT_REGEX = "\n\n\n+"
+DEFAULT_ARXIV_ID = "1706.03762"
 
 # Whether to handle segmented entities in the document. If True, labeled_spans are converted
 # to labeled_multi_spans and binary_relations with label "parts_of_same" are used to merge them.
@@ -215,6 +219,62 @@ def upload_processed_documents(
     return document_store.overview()
 
 
+def clean_spaces(text: str) -> str:
+    # replace all multiple spaces with a single space
+    text = re.sub(" +", " ", text)
+    # reduce more than two newlines to two newlines
+    text = re.sub("\n\n+", "\n\n", text)
+    # remove leading and trailing whitespaces
+    text = text.strip()
+    return text
+
+
+def get_cleaned_arxiv_paper_text(html_content: str) -> str:
+    # parse the HTML content with BeautifulSoup
+    soup = BeautifulSoup(html_content, "html.parser")
+    # get alerts (this is one div with classes "package-alerts" and "ltx_document")
+    alerts = soup.find("div", class_="package-alerts ltx_document")
+    # get the "article" html element
+    article = soup.find("article")
+    article_text = article.get_text()
+    # cleanup the text
+    article_text_clean = clean_spaces(article_text)
+    return article_text_clean
+
+
+def load_text_from_arxiv(arxiv_id: str, abstract_only: bool = False) -> Tuple[str, str]:
+    arxiv_id = arxiv_id.strip()
+    if not arxiv_id:
+        arxiv_id = DEFAULT_ARXIV_ID
+
+    search_by_id = arxiv.Search(id_list=[arxiv_id])
+    try:
+        result = list(arxiv.Client().results(search_by_id))
+    except arxiv.HTTPError as e:
+        raise gr.Error(f"Failed to fetch arXiv data: {e}")
+    if len(result) == 0:
+        raise gr.Error(f"Could not find any paper with arXiv ID '{arxiv_id}'")
+    first_result = result[0]
+    if abstract_only:
+        abstract_clean = first_result.summary.replace("\n", " ")
+        return abstract_clean, first_result.entry_id
+    if "/abs/" not in first_result.entry_id:
+        raise gr.Error(
+            f"Could not create the HTML URL for arXiv ID '{arxiv_id}' because its entry ID has "
+            f"an unexpected format: {first_result.entry_id}"
+        )
+    html_url = first_result.entry_id.replace("/abs/", "/html/")
+    request_result = requests.get(html_url)
+    if request_result.status_code != 200:
+        raise gr.Error(
+            f"Could not fetch the HTML content for arXiv ID '{arxiv_id}', status code: "
+            f"{request_result.status_code}"
+        )
+    html_content = request_result.text
+    text_clean = get_cleaned_arxiv_paper_text(html_content)
+    return text_clean, html_url
+
+
 def main():
 
     example_text = "Scholarly Argumentation Mining (SAM) has recently gained attention due to its potential to help scholars with the rapid growth of published scientific literature. It comprises two subtasks: argumentative discourse unit recognition (ADUR) and argumentative relation extraction (ARE), both of which are challenging since they require e.g. the integration of domain knowledge, the detection of implicit statements, and the disambiguation of argument structure. While previous work focused on dataset construction and baseline methods for specific document sections, such as abstract or results, full-text scholarly argumentation mining has seen little progress. In this work, we introduce a sequential pipeline model combining ADUR and ARE for full-text SAM, and provide a first analysis of the performance of pretrained language models (PLMs) on both subtasks. We establish a new SotA for ADUR on the Sci-Arg corpus, outperforming the previous best reported result by a large margin (+7% F1). We also present the first results for ARE, and thus for the full AM pipeline, on this benchmark dataset. Our detailed error analysis reveals that non-contiguous ADUs as well as the interpretation of discourse connectors pose major challenges and that data annotation needs to be more consistent."
@@ -281,6 +341,19 @@ def main():
                 lines=20,
                 value=example_text,
             )
+            with gr.Accordion("Load Text from arXiv", open=False):
+                arxiv_id = gr.Textbox(
+                    label="arXiv paper ID",
+                    placeholder=f"e.g. {DEFAULT_ARXIV_ID}",
+                    max_lines=1,
+                )
+                load_arxiv_only_abstract = gr.Checkbox(label="abstract only", value=False)
+                load_arxiv_btn = gr.Button("Load Text from arXiv", variant="secondary")
+                load_arxiv_btn.click(
+                    fn=load_text_from_arxiv,
+                    inputs=[arxiv_id, load_arxiv_only_abstract],
+                    outputs=[doc_text, doc_id],
+                )
             with gr.Accordion("Model Configuration", open=False):
                 model_name = gr.Textbox(
                     label="Model Name",
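For context, here is a minimal standalone sketch of the loading path these hunks add, without the Gradio error handling. The paper ID is just the example value from DEFAULT_ARXIV_ID, and the sketch assumes an HTML rendering exists under the paper's `/html/` URL (if it does not, `find("article")` returns None).

```python
# Standalone sketch of the new arXiv loading path (outside the Gradio app).
import re

import arxiv
import requests
from bs4 import BeautifulSoup

paper_id = "1706.03762"  # same example ID as DEFAULT_ARXIV_ID

# resolve the paper metadata and derive the HTML URL from the abstract URL
result = next(arxiv.Client().results(arxiv.Search(id_list=[paper_id])))
html_url = result.entry_id.replace("/abs/", "/html/")

response = requests.get(html_url)
response.raise_for_status()

# extract the <article> body and normalize whitespace, mirroring
# get_cleaned_arxiv_paper_text() and clean_spaces() from the diff above
article = BeautifulSoup(response.text, "html.parser").find("article")
text = re.sub("\n\n+", "\n\n", re.sub(" +", " ", article.get_text())).strip()
print(text[:200])
```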
rendering_utils_displacy.py
CHANGED
@@ -197,7 +197,7 @@ class EntityRenderer(object):
         for i, fragment in enumerate(fragments):
             markup += escape_html(fragment)
             if len(fragments) > 1 and i != len(fragments) - 1:
-                markup += "</br>"
+                markup += "<br/>"
         if self.ents is None or label.upper() in self.ents:
             color = self.colors.get(label.upper(), self.default_color)
             ent_settings = {"label": label, "text": entity, "bg": color}
@@ -210,7 +210,7 @@ class EntityRenderer(object):
         for i, fragment in enumerate(fragments):
             markup += escape_html(fragment)
             if len(fragments) > 1 and i != len(fragments) - 1:
-                markup += "</br>"
+                markup += "<br/>"
         markup = TPL_ENTS.format(content=markup, dir=self.direction)
         if title:
             markup = TPL_TITLE.format(title=title) + markup
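Both hunks replace the invalid closing tag `</br>` with a self-closing `<br/>` where multi-line entity fragments are joined. A tiny illustrative sketch of the markup that loop now produces, using `html.escape` as a stand-in for the module's `escape_html`:

```python
from html import escape as escape_html  # stand-in for the module's escape_html

fragments = ["spans the end of one line", "and the start of the next"]

markup = ""
for i, fragment in enumerate(fragments):
    markup += escape_html(fragment)
    if len(fragments) > 1 and i != len(fragments) - 1:
        markup += "<br/>"  # was "</br>", which is not a valid line-break tag

print(markup)
# spans the end of one line<br/>and the start of the next
```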
requirements.txt
CHANGED
@@ -8,3 +8,4 @@ datasets==2.14.4
 numpy==1.25.2
 qdrant-client==1.9.1
 scipy==1.13.0
+arxiv==2.1.3
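After installing the updated requirements (`pip install -r requirements.txt`), a quick smoke test for the newly pinned `arxiv` client might look like the sketch below; it assumes network access and uses the same example paper ID as app.py.

```python
import arxiv

# fetch metadata for the example paper to confirm the pinned client works
result = next(arxiv.Client().results(arxiv.Search(id_list=["1706.03762"])))
print(result.title, "->", result.entry_id)
```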