ArneBinder committed on
Commit
a347ab7
1 Parent(s): efae5be

https://github.com/ArneBinder/pie-document-level/pull/266

Browse files

and also https://github.com/ArneBinder/pie-document-level/pull/267

Files changed (3) hide show
  1. app.py +73 -0
  2. rendering_utils_displacy.py +2 -2
  3. requirements.txt +1 -0
app.py CHANGED
@@ -6,9 +6,12 @@ import tempfile
6
  from functools import partial
7
  from typing import List, Optional, Tuple, Union
8
 
 
9
  import gradio as gr
10
  import pandas as pd
 
11
  import torch
 
12
  from document_store import DocumentStore, get_annotation_from_document
13
  from embedding import EmbeddingModel
14
  from model_utils import annotate_document, create_document, load_models
@@ -37,6 +40,7 @@ DEFAULT_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
37
  DEFAULT_EMBEDDING_MAX_LENGTH = 512
38
  DEFAULT_EMBEDDING_BATCH_SIZE = 32
39
  DEFAULT_SPLIT_REGEX = "\n\n\n+"
 
40
 
41
  # Whether to handle segmented entities in the document. If True, labeled_spans are converted
42
  # to labeled_multi_spans and binary_relations with label "parts_of_same" are used to merge them.
@@ -215,6 +219,62 @@ def upload_processed_documents(
215
  return document_store.overview()
216
 
217
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
  def main():
219
 
220
  example_text = "Scholarly Argumentation Mining (SAM) has recently gained attention due to its potential to help scholars with the rapid growth of published scientific literature. It comprises two subtasks: argumentative discourse unit recognition (ADUR) and argumentative relation extraction (ARE), both of which are challenging since they require e.g. the integration of domain knowledge, the detection of implicit statements, and the disambiguation of argument structure. While previous work focused on dataset construction and baseline methods for specific document sections, such as abstract or results, full-text scholarly argumentation mining has seen little progress. In this work, we introduce a sequential pipeline model combining ADUR and ARE for full-text SAM, and provide a first analysis of the performance of pretrained language models (PLMs) on both subtasks. We establish a new SotA for ADUR on the Sci-Arg corpus, outperforming the previous best reported result by a large margin (+7% F1). We also present the first results for ARE, and thus for the full AM pipeline, on this benchmark dataset. Our detailed error analysis reveals that non-contiguous ADUs as well as the interpretation of discourse connectors pose major challenges and that data annotation needs to be more consistent."
@@ -281,6 +341,19 @@ def main():
281
  lines=20,
282
  value=example_text,
283
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
284
  with gr.Accordion("Model Configuration", open=False):
285
  model_name = gr.Textbox(
286
  label="Model Name",
 
6
  from functools import partial
7
  from typing import List, Optional, Tuple, Union
8
 
9
+ import arxiv
10
  import gradio as gr
11
  import pandas as pd
12
+ import requests
13
  import torch
14
+ from bs4 import BeautifulSoup
15
  from document_store import DocumentStore, get_annotation_from_document
16
  from embedding import EmbeddingModel
17
  from model_utils import annotate_document, create_document, load_models
 
40
  DEFAULT_EMBEDDING_MAX_LENGTH = 512
41
  DEFAULT_EMBEDDING_BATCH_SIZE = 32
42
  DEFAULT_SPLIT_REGEX = "\n\n\n+"
43
+ DEFAULT_ARXIV_ID = "1706.03762"
44
 
45
  # Whether to handle segmented entities in the document. If True, labeled_spans are converted
46
  # to labeled_multi_spans and binary_relations with label "parts_of_same" are used to merge them.
 
219
  return document_store.overview()
220
 
221
 
222
def clean_spaces(text: str) -> str:
    """Normalize whitespace: collapse space runs, cap blank lines, trim the ends."""
    # runs of spaces become a single space
    normalized = re.sub(" +", " ", text)
    # any run of two or more newlines is capped at exactly two (one blank line)
    normalized = re.sub("\n\n+", "\n\n", normalized)
    # drop leading/trailing whitespace
    return normalized.strip()
230
+
231
+
232
def get_cleaned_arxiv_paper_text(html_content: str) -> str:
    """Extract the main article text from an arXiv HTML paper page.

    Args:
        html_content: Raw HTML of an arXiv "/html/<id>" paper page.

    Returns:
        The plain text of the page's <article> element with collapsed space
        runs, at most one blank line between paragraphs, and no surrounding
        whitespace.
    """
    # parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(html_content, "html.parser")
    # the paper body lives in the "article" html element
    # (the previously fetched "package-alerts ltx_document" div was never used
    # and has been removed as dead code)
    article = soup.find("article")
    # NOTE(review): assumes an <article> element is always present; a page
    # without one would raise AttributeError here — TODO confirm for all
    # arXiv HTML renderings.
    article_text = article.get_text()
    # cleanup the extracted text (collapse spaces, cap blank lines, strip ends)
    article_text_clean = clean_spaces(article_text)
    return article_text_clean
243
+
244
+
245
def load_text_from_arxiv(arxiv_id: str, abstract_only: bool = False) -> Tuple[str, str]:
    """Fetch paper text from arXiv, either the abstract or the full HTML text.

    Args:
        arxiv_id: arXiv identifier (e.g. "1706.03762"). Falls back to
            DEFAULT_ARXIV_ID when blank.
        abstract_only: If True, return only the abstract instead of the
            full-text HTML rendering.

    Returns:
        A tuple of (cleaned text, source identifier) where the identifier is
        the entry id for abstracts or the HTML URL for full texts.

    Raises:
        gr.Error: If the arXiv lookup fails, no paper matches the id, the
            entry id has an unexpected format, or the HTML page cannot be
            fetched.
    """
    arxiv_id = arxiv_id.strip()
    if not arxiv_id:
        arxiv_id = DEFAULT_ARXIV_ID

    search_by_id = arxiv.Search(id_list=[arxiv_id])
    try:
        result = list(arxiv.Client().results(search_by_id))
    except arxiv.HTTPError as e:
        raise gr.Error(f"Failed to fetch arXiv data: {e}")
    if len(result) == 0:
        # typo fixed in user-facing message: "arXive" -> "arXiv"
        raise gr.Error(f"Could not find any paper with arXiv ID '{arxiv_id}'")
    first_result = result[0]
    if abstract_only:
        # arXiv abstracts contain hard line breaks; flatten them to one line
        abstract_clean = first_result.summary.replace("\n", " ")
        return abstract_clean, first_result.entry_id
    if "/abs/" not in first_result.entry_id:
        raise gr.Error(
            f"Could not create the HTML URL for arXiv ID '{arxiv_id}' because its entry ID has "
            f"an unexpected format: {first_result.entry_id}"
        )
    html_url = first_result.entry_id.replace("/abs/", "/html/")
    # timeout added so a stalled arXiv server cannot hang the request forever
    request_result = requests.get(html_url, timeout=30)
    if request_result.status_code != 200:
        raise gr.Error(
            f"Could not fetch the HTML content for arXiv ID '{arxiv_id}', status code: "
            f"{request_result.status_code}"
        )
    html_content = request_result.text
    text_clean = get_cleaned_arxiv_paper_text(html_content)
    return text_clean, html_url
276
+
277
+
278
  def main():
279
 
280
  example_text = "Scholarly Argumentation Mining (SAM) has recently gained attention due to its potential to help scholars with the rapid growth of published scientific literature. It comprises two subtasks: argumentative discourse unit recognition (ADUR) and argumentative relation extraction (ARE), both of which are challenging since they require e.g. the integration of domain knowledge, the detection of implicit statements, and the disambiguation of argument structure. While previous work focused on dataset construction and baseline methods for specific document sections, such as abstract or results, full-text scholarly argumentation mining has seen little progress. In this work, we introduce a sequential pipeline model combining ADUR and ARE for full-text SAM, and provide a first analysis of the performance of pretrained language models (PLMs) on both subtasks. We establish a new SotA for ADUR on the Sci-Arg corpus, outperforming the previous best reported result by a large margin (+7% F1). We also present the first results for ARE, and thus for the full AM pipeline, on this benchmark dataset. Our detailed error analysis reveals that non-contiguous ADUs as well as the interpretation of discourse connectors pose major challenges and that data annotation needs to be more consistent."
 
341
  lines=20,
342
  value=example_text,
343
  )
344
+ with gr.Accordion("Load Text from arXiv", open=False):
345
+ arxiv_id = gr.Textbox(
346
+ label="arXiv paper ID",
347
+ placeholder=f"e.g. {DEFAULT_ARXIV_ID}",
348
+ max_lines=1,
349
+ )
350
+ load_arxiv_only_abstract = gr.Checkbox(label="abstract only", value=False)
351
+ load_arxiv_btn = gr.Button("Load Text from arXiv", variant="secondary")
352
+ load_arxiv_btn.click(
353
+ fn=load_text_from_arxiv,
354
+ inputs=[arxiv_id, load_arxiv_only_abstract],
355
+ outputs=[doc_text, doc_id],
356
+ )
357
  with gr.Accordion("Model Configuration", open=False):
358
  model_name = gr.Textbox(
359
  label="Model Name",
rendering_utils_displacy.py CHANGED
@@ -197,7 +197,7 @@ class EntityRenderer(object):
197
  for i, fragment in enumerate(fragments):
198
  markup += escape_html(fragment)
199
  if len(fragments) > 1 and i != len(fragments) - 1:
200
- markup += "</br>"
201
  if self.ents is None or label.upper() in self.ents:
202
  color = self.colors.get(label.upper(), self.default_color)
203
  ent_settings = {"label": label, "text": entity, "bg": color}
@@ -210,7 +210,7 @@ class EntityRenderer(object):
210
  for i, fragment in enumerate(fragments):
211
  markup += escape_html(fragment)
212
  if len(fragments) > 1 and i != len(fragments) - 1:
213
- markup += "</br>"
214
  markup = TPL_ENTS.format(content=markup, dir=self.direction)
215
  if title:
216
  markup = TPL_TITLE.format(title=title) + markup
 
197
  for i, fragment in enumerate(fragments):
198
  markup += escape_html(fragment)
199
  if len(fragments) > 1 and i != len(fragments) - 1:
200
+ markup += "<br/>"
201
  if self.ents is None or label.upper() in self.ents:
202
  color = self.colors.get(label.upper(), self.default_color)
203
  ent_settings = {"label": label, "text": entity, "bg": color}
 
210
  for i, fragment in enumerate(fragments):
211
  markup += escape_html(fragment)
212
  if len(fragments) > 1 and i != len(fragments) - 1:
213
+ markup += "<br/>"
214
  markup = TPL_ENTS.format(content=markup, dir=self.direction)
215
  if title:
216
  markup = TPL_TITLE.format(title=title) + markup
requirements.txt CHANGED
@@ -8,3 +8,4 @@ datasets==2.14.4
8
  numpy==1.25.2
9
  qdrant-client==1.9.1
10
  scipy==1.13.0
 
 
8
  numpy==1.25.2
9
  qdrant-client==1.9.1
10
  scipy==1.13.0
11
+ arxiv==2.1.3