"""Gradio app that lists the hyperlinks embedded in an arXiv paper's PDF."""

import io
import json
import re
from functools import lru_cache
from typing import List, Optional, Tuple

import arxiv
import gradio as gr
import requests
from PyPDF2 import PdfReader

# arXiv identifiers are YYMM.NNNN (papers up to 2014) or YYMM.NNNNN (2015+).
ARXIV_ID_PATTERN = re.compile(r"(\d{4}\.\d{4,5})")

# Seconds to wait on the PDF download before treating it as failed.
DOWNLOAD_TIMEOUT = 30

# JSON payload shown next to an error message.
EMPTY_JSON = "{}"


def extract_arxiv_id(input_string: Optional[str]) -> Optional[str]:
    """Return the first arXiv identifier found in *input_string*.

    Args:
        input_string: A bare identifier or any text/URL containing one.

    Returns:
        The identifier (without version suffix), or ``None`` when the input
        is empty or contains no identifier.
    """
    if not input_string:
        return None
    match = ARXIV_ID_PATTERN.search(input_string)
    return match.group(1) if match else None


def download_pdf(url: str) -> Optional[io.BytesIO]:
    """Download *url* and return its body as an in-memory binary stream.

    Returns:
        A ``BytesIO`` with the response content, or ``None`` when the request
        fails, times out, or answers with a status other than 200.
    """
    try:
        response = requests.get(url, timeout=DOWNLOAD_TIMEOUT)
    except requests.RequestException:
        # Network failures are reported the same way as a bad status code.
        return None
    if response.status_code != 200:
        return None
    return io.BytesIO(response.content)


def extract_hyperlinks_from_pdf(pdf_file) -> List[str]:
    """Collect the URI of every link annotation in the PDF, in page order.

    Args:
        pdf_file: A path or binary file-like object accepted by ``PdfReader``.

    Returns:
        The URIs in the order encountered; duplicates are kept.
    """
    reader = PdfReader(pdf_file)
    hyperlinks = []
    for page in reader.pages:
        if "/Annots" not in page:
            continue
        for annot in page["/Annots"]:
            obj = annot.get_object()
            # Skip annotations without a subtype instead of raising KeyError.
            if "/Subtype" not in obj or obj["/Subtype"] != "/Link":
                continue
            if "/A" not in obj:
                continue
            action = obj["/A"]
            if "/URI" in action:
                hyperlinks.append(action["/URI"])
    return hyperlinks


def process_arxiv_input(input_string: str) -> Tuple[str, str]:
    """Gradio callback: turn user input into ``(markdown, json)`` outputs.

    Any ``gr.Error`` raised while processing is converted into its message
    plus an empty JSON object, so the UI always receives both outputs.
    """
    try:
        return _process_arxiv_input(input_string)
    except gr.Error as e:
        # gr.Error carries no JSON payload, so the JSON output falls back to
        # an empty object.
        return e.message, EMPTY_JSON


@lru_cache(maxsize=1000)
def _process_arxiv_input(input_string: str) -> Tuple[str, str]:
    """Look up the paper named in *input_string* and extract its hyperlinks.

    Results are memoised per input string. Failures are raised rather than
    returned so that ``lru_cache`` does not remember them (for example a
    download that failed only temporarily).

    Returns:
        ``(markdown, json_string)`` as produced by ``core_extract``.

    Raises:
        gr.Error: If no arXiv ID can be extracted, no paper matches the ID,
            or the PDF cannot be downloaded.
    """
    arxiv_id = extract_arxiv_id(input_string)
    if not arxiv_id:
        raise gr.Error("Invalid input. Please provide a valid arXiv ID or URL.")

    client = arxiv.Client()
    search = arxiv.Search(id_list=[arxiv_id])
    paper = next(client.results(search), None)
    if paper is None:
        raise gr.Error(f"No paper found with arXiv ID: {arxiv_id}")

    pdf_file = download_pdf(paper.pdf_url)
    if pdf_file is None:
        raise gr.Error("Couldn't download the PDF.")
    return core_extract(pdf_file, paper, arxiv_id)


def core_extract(pdf_file, paper, arxiv_id: str) -> Tuple[str, str]:
    """Build the Markdown and JSON reports for one paper.

    Args:
        pdf_file: Binary stream holding the paper's PDF.
        paper: Search result object; only its ``title`` attribute is read.
        arxiv_id: Identifier used for the abstract link and the JSON output.

    Returns:
        ``(markdown, json_string)`` describing the title, ID and hyperlinks.
    """
    hyperlinks = extract_hyperlinks_from_pdf(pdf_file)

    # Prepare markdown output
    markdown_parts = [
        f"# {paper.title}\n\n",
        f"**arXiv ID**: [{arxiv_id}](https://arxiv.org/abs/{arxiv_id})\n\n",
        "## Hyperlinks found:\n\n",
    ]
    markdown_parts.extend(f"- [{link}]({link})\n" for link in hyperlinks)

    # Prepare JSON output
    json_result = {
        "title": paper.title,
        "arxiv_id": arxiv_id,
        "hyperlinks": hyperlinks,
    }

    return "".join(markdown_parts), json.dumps(json_result, indent=2)


# Gradio Interface
iface = gr.Interface(
    fn=process_arxiv_input,
    inputs=gr.Textbox(label="Enter arXiv ID or URL"),
    outputs=[gr.Markdown(label="Markdown Results"), gr.JSON(label="JSON Results")],
    title="arXiv PDF Hyperlink Extractor",
    description="Enter an arXiv ID or URL or Paper Pages URL to extract hyperlinks from the paper's PDF.",
)

if __name__ == "__main__":
    iface.launch()