|
import arxiv |
|
import requests |
|
import io |
|
import re |
|
import json |
|
from PyPDF2 import PdfReader |
|
import gradio as gr |
|
from functools import lru_cache |
|
|
|
|
|
def extract_arxiv_id(input_string):
    """Return the first new-style arXiv identifier found in *input_string*.

    The input may be a bare identifier or any text containing one, such as
    an ``arxiv.org/abs/...`` URL.  Both sequence-number widths are accepted:
    four digits (``1412.6980``) and five digits (``2301.12345``).  A version
    suffix such as ``v2`` is not included in the returned value.

    Args:
        input_string: Text to search.  ``None`` or an empty string yields
            ``None`` instead of raising.

    Returns:
        The matched identifier as a string, or ``None`` when nothing matches.
    """
    if not input_string:
        return None
    # {4,5} is greedy, so a five-digit sequence number is never cut to four.
    match = re.search(r"(\d{4}\.\d{4,5})", input_string)
    return match.group(1) if match else None
|
|
|
|
|
def download_pdf(url, timeout=30):
    """Fetch *url* and return the response body as an in-memory binary stream.

    Args:
        url: Address to download.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument.  Without one, a server that stops responding would
            block the calling worker indefinitely.

    Returns:
        An ``io.BytesIO`` wrapping the response body when the status code is
        200; ``None`` for any other status code.

    Raises:
        requests.RequestException: Transport-level failures (including a
            timeout) are not caught here and propagate to the caller.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return None
    return io.BytesIO(response.content)
|
|
|
|
|
def extract_hyperlinks_from_pdf(pdf_file):
    """Collect the URI targets of the link annotations in a PDF.

    Args:
        pdf_file: Whatever ``PdfReader`` accepts; it is passed through
            unchanged.

    Returns:
        A list with one entry per ``/Link`` annotation that has an ``/A``
        action containing a ``/URI``.  Entries appear in page order, then in
        annotation order within each page.  Duplicates are kept.
    """
    reader = PdfReader(pdf_file)
    hyperlinks = []

    for page in reader.pages:
        if "/Annots" not in page:
            continue
        for annot in page["/Annots"]:
            obj = annot.get_object()
            # The membership tests and key lookups below need a mapping.
            # NOTE(review): a dangling annotation reference presumably
            # resolves to a non-dict placeholder; skip it rather than crash.
            if not isinstance(obj, dict):
                continue
            # Guard /Subtype the same way /A and /URI are guarded, so an
            # annotation without it is skipped instead of raising KeyError.
            if "/Subtype" not in obj or obj["/Subtype"] != "/Link":
                continue
            if "/A" not in obj or "/URI" not in obj["/A"]:
                continue
            hyperlinks.append(obj["/A"]["/URI"])

    return hyperlinks
|
|
|
|
|
def process_arxiv_input(input_string):
    """Run the extraction and convert a ``gr.Error`` into ordinary output.

    Args:
        input_string: Raw text to process; forwarded unchanged to
            ``_process_arxiv_input``.

    Returns:
        The ``(markdown, json_text)`` pair from ``_process_arxiv_input``.
        When that call raises ``gr.Error``, the pair is the error's message
        and an empty JSON object (``"{}"``), unless the exception carries a
        ``data`` attribute, in which case that value is used.
    """
    try:
        return _process_arxiv_input(input_string)
    except gr.Error as e:
        # NOTE(review): gr.Error is documented with a ``message`` attribute
        # but no ``data`` attribute, so reading ``e.data`` directly turns the
        # handled error into an AttributeError.  Fall back to an empty JSON
        # object, matching the other failure paths in this file.
        return e.message, getattr(e, "data", "{}")
|
|
|
|
|
class _PaperUnavailable(Exception):
    """Private signal that a lookup or download failed.

    It is raised, not returned, because ``lru_cache`` stores return values
    only; a raised failure is therefore retried on the next request.
    """


@lru_cache(maxsize=1000)
def _extract_for_arxiv_id(arxiv_id):
    """Look up one paper, download its PDF and extract its hyperlinks.

    Successful results are memoized per identifier.

    Args:
        arxiv_id: Identifier to query, as returned by ``extract_arxiv_id``.

    Returns:
        The ``(markdown, json_text)`` pair produced by ``core_extract``.

    Raises:
        _PaperUnavailable: No search result was returned, or
            ``download_pdf`` returned ``None``.
    """
    client = arxiv.Client()
    search = arxiv.Search(id_list=[arxiv_id])
    paper = next(client.results(search), None)
    if paper is None:
        raise _PaperUnavailable(f"No paper found with arXiv ID: {arxiv_id}")

    pdf_file = download_pdf(paper.pdf_url)
    if pdf_file is None:
        raise _PaperUnavailable("Couldn't download the PDF.")

    return core_extract(pdf_file, paper, arxiv_id)


def _process_arxiv_input(input_string):
    """Resolve user input to an arXiv ID and return the extraction result.

    Caching happens in ``_extract_for_arxiv_id`` and is keyed on the
    extracted identifier, so a URL and a bare ID for the same paper share
    one cache entry.

    Args:
        input_string: Text containing an arXiv identifier.

    Returns:
        A ``(markdown, json_text)`` pair.  When the paper cannot be found or
        its PDF cannot be downloaded, the first element is a plain error
        message and the second is ``"{}"``; those failures are not cached.

    Raises:
        gr.Error: ``extract_arxiv_id`` found no identifier in the input.
    """
    arxiv_id = extract_arxiv_id(input_string)
    if not arxiv_id:
        # Only the message is passed.  NOTE(review): the second positional
        # parameter of gr.Error is not a data payload (it is ``duration`` in
        # current Gradio releases) - confirm against the installed version.
        raise gr.Error("Invalid input. Please provide a valid arXiv ID or URL.")

    try:
        return _extract_for_arxiv_id(arxiv_id)
    except _PaperUnavailable as exc:
        return str(exc), "{}"
|
|
|
|
|
def core_extract(pdf_file, paper, arxiv_id):
    """Build the Markdown report and the JSON text for one paper.

    Args:
        pdf_file: PDF content, handed to ``extract_hyperlinks_from_pdf``.
        paper: Object whose ``title`` attribute supplies the heading and the
            ``"title"`` field.
        arxiv_id: Identifier used for the abstract-page link and the
            ``"arxiv_id"`` field.

    Returns:
        A ``(markdown, json_text)`` pair.  ``markdown`` holds a title
        heading, a linked arXiv ID line and one bullet per hyperlink.
        ``json_text`` is a two-space-indented JSON object with the keys
        ``title``, ``arxiv_id`` and ``hyperlinks``.
    """
    links = extract_hyperlinks_from_pdf(pdf_file)

    # Gather the fixed header pieces first, then one bullet per link.
    pieces = [
        f"# {paper.title}\n\n",
        f"**arXiv ID**: [{arxiv_id}](https://arxiv.org/abs/{arxiv_id})\n\n",
        "## Hyperlinks found:\n\n",
    ]
    pieces.extend(f"- [{link}]({link})\n" for link in links)
    markdown_result = "".join(pieces)

    payload = {
        "title": paper.title,
        "arxiv_id": arxiv_id,
        "hyperlinks": links,
    }
    return markdown_result, json.dumps(payload, indent=2)
|
|
|
|
|
|
|
# Gradio UI: one text box in; rendered Markdown and a JSON view out.
# The two outputs line up with the (markdown, json_text) pair returned by
# process_arxiv_input.
iface = gr.Interface(
    title="arXiv PDF Hyperlink Extractor",
    description="Enter an arXiv ID or URL or Paper Pages URL to extract hyperlinks from the paper's PDF.",
    fn=process_arxiv_input,
    inputs=gr.Textbox(label="Enter arXiv ID or URL"),
    outputs=[
        gr.Markdown(label="Markdown Results"),
        gr.JSON(label="JSON Results"),
    ],
)


# Launch only when executed as a script, not when imported.
if __name__ == "__main__":
    iface.launch()
|
|