davanstrien's picture
davanstrien HF staff
refactor: Improve arXiv PDF processing efficiency with caching
39c1013
raw
history blame contribute delete
No virus
2.69 kB
import arxiv
import requests
import io
import re
import json
from PyPDF2 import PdfReader
import gradio as gr
from functools import lru_cache
def extract_arxiv_id(input_string):
    """Return the first new-style arXiv identifier found in *input_string*.

    New-style identifiers have the form ``YYMM.NNNN`` (four-digit sequence
    number) or ``YYMM.NNNNN`` (five-digit sequence number). The identifier may
    be embedded in a longer string such as an abs/pdf URL; a trailing version
    suffix (``v2``) or file extension is ignored.

    Args:
        input_string: Free-form text, typically a bare ID or a URL.

    Returns:
        The identifier as a string, or ``None`` when the input is empty or
        contains no identifier.
    """
    if not input_string:
        return None
    # The digit lookarounds stop the pattern from carving an "identifier" out
    # of a longer run of digits, which would silently point at another paper.
    match = re.search(r"(?<!\d)(\d{4}\.\d{4,5})(?!\d)", input_string)
    return match.group(1) if match else None
def download_pdf(url, timeout=30):
    """Download *url* and return its body as an in-memory binary stream.

    Args:
        url: Address of the file to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled server.

    Returns:
        An ``io.BytesIO`` holding the response body when the server answers
        with status 200; ``None`` on any other status or when the request
        itself fails (connection error, timeout, invalid URL, ...).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Callers already treat None as "download failed", so transport
        # errors are reported through the same channel instead of escaping.
        return None
    if response.status_code != 200:
        return None
    return io.BytesIO(response.content)
def extract_hyperlinks_from_pdf(pdf_file):
    """Collect the URI targets of link annotations in a PDF.

    Every page that has an ``/Annots`` entry is scanned; an annotation
    contributes its ``/A`` -> ``/URI`` value when its ``/Subtype`` is
    ``/Link`` and both keys are present.

    Args:
        pdf_file: Whatever ``PdfReader`` accepts as its source argument.

    Returns:
        A list of URI values in page and annotation order. Duplicates are
        kept.
    """
    pdf_reader = PdfReader(pdf_file)
    found_uris = []
    for pdf_page in pdf_reader.pages:
        if "/Annots" not in pdf_page:
            continue
        for annot_ref in pdf_page["/Annots"]:
            annotation = annot_ref.get_object()
            if annotation["/Subtype"] != "/Link":
                continue
            if "/A" not in annotation:
                continue
            action = annotation["/A"]
            if "/URI" in action:
                found_uris.append(action["/URI"])
    return found_uris
def process_arxiv_input(input_string):
    """Gradio callback: turn an arXiv ID or URL into displayable results.

    Delegates to ``_process_arxiv_input`` and converts a ``gr.Error`` raised
    there into ordinary output values, so the message is rendered in the
    result panes.

    Args:
        input_string: Text entered by the user.

    Returns:
        A ``(markdown_text, json_text)`` tuple. On error the first item is the
        error message and the second is the error's ``data`` attribute if it
        has one, otherwise the empty JSON object ``"{}"``.
    """
    try:
        return _process_arxiv_input(input_string)
    except gr.Error as e:
        # gr.Error does not define a `data` attribute itself, so reading it
        # unconditionally would raise AttributeError inside this handler.
        return getattr(e, "message", str(e)), getattr(e, "data", "{}")
def _lookup_error(message):
    """Build a ``gr.Error`` for *message* carrying an empty JSON payload."""
    error = gr.Error(message)
    # gr.Error's constructor has no payload parameter; attach the empty JSON
    # document as a plain attribute so a handler can read both `message` and
    # `data` from the exception.
    error.data = "{}"
    return error


@lru_cache(maxsize=1000)
def _process_arxiv_input(input_string):
    """Look up an arXiv paper and extract the hyperlinks from its PDF.

    Successful results are memoized per input string. Every failure is raised
    rather than returned: ``lru_cache`` does not store a call that raises, so
    a transient problem (paper not indexed yet, download hiccup) is retried
    on the next request instead of being served from the cache.

    Args:
        input_string: Text containing an arXiv ID or URL.

    Returns:
        The ``(markdown_text, json_text)`` tuple produced by ``core_extract``.

    Raises:
        gr.Error: If no arXiv ID can be extracted, no paper matches the ID, or
            the PDF cannot be downloaded. The exception has ``message`` and
            ``data`` (``"{}"``) attributes.
    """
    arxiv_id = extract_arxiv_id(input_string)
    if not arxiv_id:
        raise _lookup_error("Invalid input. Please provide a valid arXiv ID or URL.")

    client = arxiv.Client()
    search = arxiv.Search(id_list=[arxiv_id])
    paper = next(client.results(search), None)
    if paper is None:
        raise _lookup_error(f"No paper found with arXiv ID: {arxiv_id}")

    pdf_file = download_pdf(paper.pdf_url)
    if pdf_file is None:
        raise _lookup_error("Couldn't download the PDF.")
    return core_extract(pdf_file, paper, arxiv_id)
def core_extract(pdf_file, paper, arxiv_id):
    """Render the hyperlinks of a downloaded paper as Markdown and JSON.

    Args:
        pdf_file: PDF source handed to ``extract_hyperlinks_from_pdf``.
        paper: Object whose ``title`` attribute is used for the heading and
            the JSON ``title`` field.
        arxiv_id: Identifier shown in the report and used to build the
            ``https://arxiv.org/abs/`` link.

    Returns:
        A ``(markdown_text, json_text)`` tuple; ``json_text`` is a JSON
        document indented by two spaces with the keys ``title``, ``arxiv_id``
        and ``hyperlinks``.
    """
    links = extract_hyperlinks_from_pdf(pdf_file)

    # Markdown report: heading, arXiv link, then one bullet per hyperlink.
    pieces = [
        f"# {paper.title}\n\n",
        f"**arXiv ID**: [{arxiv_id}](https://arxiv.org/abs/{arxiv_id})\n\n",
        "## Hyperlinks found:\n\n",
    ]
    pieces.extend(f"- [{link}]({link})\n" for link in links)
    markdown_text = "".join(pieces)

    # JSON report with the same information in machine-readable form.
    payload = {
        "title": paper.title,
        "arxiv_id": arxiv_id,
        "hyperlinks": links,
    }
    json_text = json.dumps(payload, indent=2)

    return markdown_text, json_text
# Gradio Interface
# One text box in; the callback's two return values feed the Markdown and
# JSON output components, in that order.
iface = gr.Interface(
    title="arXiv PDF Hyperlink Extractor",
    description="Enter an arXiv ID or URL or Paper Pages URL to extract hyperlinks from the paper's PDF.",
    fn=process_arxiv_input,
    inputs=gr.Textbox(label="Enter arXiv ID or URL"),
    outputs=[
        gr.Markdown(label="Markdown Results"),
        gr.JSON(label="JSON Results"),
    ],
)

if __name__ == "__main__":
    # Launch the UI only when this file is executed as a script.
    iface.launch()