davanstrien HF staff committed on
Commit
39c1013
1 Parent(s): d371fc7

refactor: Improve arXiv PDF processing efficiency with caching

Browse files
Files changed (1) hide show
  1. app.py +11 -3
app.py CHANGED
@@ -5,6 +5,7 @@ import re
5
  import json
6
  from PyPDF2 import PdfReader
7
  import gradio as gr
 
8
 
9
 
10
  def extract_arxiv_id(input_string):
@@ -33,6 +34,14 @@ def extract_hyperlinks_from_pdf(pdf_file):
33
 
34
 
35
  def process_arxiv_input(input_string):
 
 
 
 
 
 
 
 
36
  arxiv_id = extract_arxiv_id(input_string)
37
  if not arxiv_id:
38
  raise gr.Error("Invalid input. Please provide a valid arXiv ID or URL.", "{}")
@@ -47,13 +56,12 @@ def process_arxiv_input(input_string):
47
  return f"No paper found with arXiv ID: {arxiv_id}", "{}"
48
 
49
  if pdf_file := download_pdf(paper.pdf_url):
50
- return _extracted_from_process_arxiv_input_(pdf_file, paper, arxiv_id)
51
  else:
52
  return "Couldn't download the PDF.", "{}"
53
 
54
 
55
- # TODO Rename this here and in `process_arxiv_input`
56
- def _extracted_from_process_arxiv_input_(pdf_file, paper, arxiv_id):
57
  hyperlinks = extract_hyperlinks_from_pdf(pdf_file)
58
 
59
  # Prepare markdown output
 
5
  import json
6
  from PyPDF2 import PdfReader
7
  import gradio as gr
8
+ from functools import lru_cache
9
 
10
 
11
  def extract_arxiv_id(input_string):
 
34
 
35
 
36
  def process_arxiv_input(input_string):
37
+ try:
38
+ return _process_arxiv_input(input_string)
39
+ except gr.Error as e:
40
+ return e.message, e.data
41
+
42
+
43
+ @lru_cache(maxsize=1000)
44
+ def _process_arxiv_input(input_string):
45
  arxiv_id = extract_arxiv_id(input_string)
46
  if not arxiv_id:
47
  raise gr.Error("Invalid input. Please provide a valid arXiv ID or URL.", "{}")
 
56
  return f"No paper found with arXiv ID: {arxiv_id}", "{}"
57
 
58
  if pdf_file := download_pdf(paper.pdf_url):
59
+ return core_extract(pdf_file, paper, arxiv_id)
60
  else:
61
  return "Couldn't download the PDF.", "{}"
62
 
63
 
64
+ def core_extract(pdf_file, paper, arxiv_id):
 
65
  hyperlinks = extract_hyperlinks_from_pdf(pdf_file)
66
 
67
  # Prepare markdown output