Spaces:

librarian-bots
/

arxiv-link-extractor

Sleeping

davanstrien HF staff committed on Jul 16

Commit

39c1013

•

1 Parent(s): d371fc7

refactor: Improve arXiv PDF processing efficiency with caching

Files changed (1) hide show

app.py CHANGED Viewed

@@ -5,6 +5,7 @@ import re
 import json
 from PyPDF2 import PdfReader
 import gradio as gr
 def extract_arxiv_id(input_string):
@@ -33,6 +34,14 @@ def extract_hyperlinks_from_pdf(pdf_file):
 def process_arxiv_input(input_string):
     arxiv_id = extract_arxiv_id(input_string)
     if not arxiv_id:
         raise gr.Error("Invalid input. Please provide a valid arXiv ID or URL.", "{}")
@@ -47,13 +56,12 @@ def process_arxiv_input(input_string):
         return f"No paper found with arXiv ID: {arxiv_id}", "{}"
     if pdf_file := download_pdf(paper.pdf_url):
-        return _extracted_from_process_arxiv_input_(pdf_file, paper, arxiv_id)
     else:
         return "Couldn't download the PDF.", "{}"
-# TODO Rename this here and in `process_arxiv_input`
-def _extracted_from_process_arxiv_input_(pdf_file, paper, arxiv_id):
     hyperlinks = extract_hyperlinks_from_pdf(pdf_file)
     # Prepare markdown output

 import json
 from PyPDF2 import PdfReader
 import gradio as gr
+from functools import lru_cache
 def extract_arxiv_id(input_string):
 def process_arxiv_input(input_string):
+    try:
+        return _process_arxiv_input(input_string)
+    except gr.Error as e:
+        return e.message, e.data
+@lru_cache(maxsize=1000)
+def _process_arxiv_input(input_string):
     arxiv_id = extract_arxiv_id(input_string)
     if not arxiv_id:
         raise gr.Error("Invalid input. Please provide a valid arXiv ID or URL.", "{}")
         return f"No paper found with arXiv ID: {arxiv_id}", "{}"
     if pdf_file := download_pdf(paper.pdf_url):
+        return core_extract(pdf_file, paper, arxiv_id)
     else:
         return "Couldn't download the PDF.", "{}"
+def core_extract(pdf_file, paper, arxiv_id):
     hyperlinks = extract_hyperlinks_from_pdf(pdf_file)
     # Prepare markdown output