Spaces:
Sleeping
Sleeping
Commit
•
39c1013
1
Parent(s):
d371fc7
refactor: Improve arXiv PDF processing efficiency with caching
Browse files
app.py
CHANGED
@@ -5,6 +5,7 @@ import re
|
|
5 |
import json
|
6 |
from PyPDF2 import PdfReader
|
7 |
import gradio as gr
|
|
|
8 |
|
9 |
|
10 |
def extract_arxiv_id(input_string):
|
@@ -33,6 +34,14 @@ def extract_hyperlinks_from_pdf(pdf_file):
|
|
33 |
|
34 |
|
35 |
def process_arxiv_input(input_string):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
arxiv_id = extract_arxiv_id(input_string)
|
37 |
if not arxiv_id:
|
38 |
raise gr.Error("Invalid input. Please provide a valid arXiv ID or URL.", "{}")
|
@@ -47,13 +56,12 @@ def process_arxiv_input(input_string):
|
|
47 |
return f"No paper found with arXiv ID: {arxiv_id}", "{}"
|
48 |
|
49 |
if pdf_file := download_pdf(paper.pdf_url):
|
50 |
-
return
|
51 |
else:
|
52 |
return "Couldn't download the PDF.", "{}"
|
53 |
|
54 |
|
55 |
-
|
56 |
-
def _extracted_from_process_arxiv_input_(pdf_file, paper, arxiv_id):
|
57 |
hyperlinks = extract_hyperlinks_from_pdf(pdf_file)
|
58 |
|
59 |
# Prepare markdown output
|
|
|
5 |
import json
|
6 |
from PyPDF2 import PdfReader
|
7 |
import gradio as gr
|
8 |
+
from functools import lru_cache
|
9 |
|
10 |
|
11 |
def extract_arxiv_id(input_string):
|
|
|
34 |
|
35 |
|
36 |
def process_arxiv_input(input_string):
|
37 |
+
try:
|
38 |
+
return _process_arxiv_input(input_string)
|
39 |
+
except gr.Error as e:
|
40 |
+
return e.message, e.data
|
41 |
+
|
42 |
+
|
43 |
+
@lru_cache(maxsize=1000)
|
44 |
+
def _process_arxiv_input(input_string):
|
45 |
arxiv_id = extract_arxiv_id(input_string)
|
46 |
if not arxiv_id:
|
47 |
raise gr.Error("Invalid input. Please provide a valid arXiv ID or URL.", "{}")
|
|
|
56 |
return f"No paper found with arXiv ID: {arxiv_id}", "{}"
|
57 |
|
58 |
if pdf_file := download_pdf(paper.pdf_url):
|
59 |
+
return core_extract(pdf_file, paper, arxiv_id)
|
60 |
else:
|
61 |
return "Couldn't download the PDF.", "{}"
|
62 |
|
63 |
|
64 |
+
def core_extract(pdf_file, paper, arxiv_id):
|
|
|
65 |
hyperlinks = extract_hyperlinks_from_pdf(pdf_file)
|
66 |
|
67 |
# Prepare markdown output
|