File size: 2,692 Bytes
3732d9f
 
 
 
 
 
 
39c1013
3732d9f
706986b
3732d9f
706986b
 
 
3732d9f
 
 
706986b
 
3732d9f
 
 
 
 
 
706986b
 
3732d9f
706986b
 
 
3732d9f
 
 
706986b
3732d9f
39c1013
 
 
 
 
 
 
 
3732d9f
 
d371fc7
3732d9f
 
 
 
706986b
3732d9f
 
 
 
 
706986b
39c1013
d371fc7
 
706986b
 
39c1013
d371fc7
706986b
d371fc7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3732d9f
706986b
3732d9f
 
 
 
d371fc7
3732d9f
d371fc7
3732d9f
 
 
706986b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import arxiv
import requests
import io
import re
import json
from PyPDF2 import PdfReader
import gradio as gr
from functools import lru_cache


def extract_arxiv_id(input_string):
    """Return the first new-style arXiv identifier found in *input_string*.

    Both forms of the post-2007 scheme are recognised: ``YYMM.NNNN``
    (4-digit sequence numbers, used for 0704 through 1412) and
    ``YYMM.NNNNN`` (5-digit sequence numbers, used from 1501 onwards).
    A trailing version suffix such as ``v2`` is not part of the result.

    Args:
        input_string: Free text, typically a bare ID or a URL containing one.

    Returns:
        The identifier as a string, or ``None`` when nothing matches.
    """
    # ``{4,5}`` is greedy, so a five-digit sequence number is captured whole
    # while older four-digit identifiers are no longer rejected.
    pattern = r"(\d{4}\.\d{4,5})"
    match = re.search(pattern, input_string)
    return match.group(1) if match else None


def download_pdf(url, timeout=30):
    """Fetch *url* and return its body as an in-memory binary stream.

    Args:
        url: Address of the document to download.
        timeout: Value forwarded to ``requests.get(timeout=...)`` so a
            stalled server cannot block the caller indefinitely.

    Returns:
        An ``io.BytesIO`` wrapping the response body when the status code
        is 200, otherwise ``None``.

    Raises:
        requests.RequestException: Connection failures and timeouts are
            not swallowed here; they propagate to the caller.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return None
    return io.BytesIO(response.content)


def extract_hyperlinks_from_pdf(pdf_file):
    """Collect the URI targets of link annotations in a PDF.

    Args:
        pdf_file: Input handed straight to ``PdfReader``.

    Returns:
        A list of the ``/URI`` values found, in page and annotation order.
        Duplicates are kept.
    """
    found = []
    for pdf_page in PdfReader(pdf_file).pages:
        if "/Annots" not in pdf_page:
            continue
        for annot_ref in pdf_page["/Annots"]:
            annotation = annot_ref.get_object()
            is_link = annotation["/Subtype"] == "/Link"
            if not is_link or "/A" not in annotation:
                continue
            # Only actions that carry a /URI entry contribute a hyperlink.
            action = annotation["/A"]
            if "/URI" in action:
                found.append(action["/URI"])
    return found


def process_arxiv_input(input_string):
    """Gradio callback: turn user input into a (markdown, JSON string) pair.

    Delegates to the ``lru_cache``-decorated ``_process_arxiv_input`` and
    converts a ``gr.Error`` raised there into an ordinary return value, so
    the error message becomes the first output.

    Args:
        input_string: The text entered by the user.

    Returns:
        A 2-tuple ``(markdown_text, json_text)``.
    """
    try:
        return _process_arxiv_input(input_string)
    except gr.Error as e:
        # gr.Error does not define a ``data`` attribute itself, so reading
        # ``e.data`` unconditionally fails with AttributeError. Fall back to
        # the empty-JSON placeholder that the other failure paths return.
        return e.message, getattr(e, "data", "{}")


@lru_cache(maxsize=1000)
def _process_arxiv_input(input_string):
    """Resolve *input_string* to an arXiv paper and extract its hyperlinks.

    Results are memoised per distinct input string (up to 1000 entries).

    Args:
        input_string: Text expected to contain an arXiv identifier.

    Returns:
        A 2-tuple ``(markdown_text, json_text)``. When the paper cannot be
        found or its PDF cannot be downloaded, the first element is a
        plain message and the second is ``"{}"``.

    Raises:
        gr.Error: If no arXiv identifier can be extracted from the input.
            The raised error carries a ``data`` attribute set to ``"{}"``.
    """
    arxiv_id = extract_arxiv_id(input_string)
    if not arxiv_id:
        # The second positional parameter of gr.Error is not a data payload,
        # so build the error from the message alone and attach the JSON
        # placeholder explicitly for callers that read ``.data``.
        error = gr.Error("Invalid input. Please provide a valid arXiv ID or URL.")
        error.data = "{}"
        raise error

    client = arxiv.Client()
    search = arxiv.Search(id_list=[arxiv_id])
    paper = next(client.results(search), None)
    if paper is None:
        return f"No paper found with arXiv ID: {arxiv_id}", "{}"

    # NOTE(review): the failure tuples returned here are memoised by
    # lru_cache too, so a transient download failure is replayed for the
    # same input until the entry is evicted.
    pdf_file = download_pdf(paper.pdf_url)
    if not pdf_file:
        return "Couldn't download the PDF.", "{}"
    return core_extract(pdf_file, paper, arxiv_id)


def core_extract(pdf_file, paper, arxiv_id):
    """Build the markdown and JSON reports for one paper.

    Args:
        pdf_file: PDF input passed to ``extract_hyperlinks_from_pdf``.
        paper: Object exposing a ``title`` attribute.
        arxiv_id: Identifier used in the report and in the abs-page link.

    Returns:
        A 2-tuple ``(markdown_text, json_text)`` where ``json_text`` is a
        JSON document with ``title``, ``arxiv_id`` and ``hyperlinks`` keys.
    """
    found_links = extract_hyperlinks_from_pdf(pdf_file)
    title = paper.title

    # Markdown report: fixed header followed by one bullet per hyperlink.
    header_parts = [
        f"# {title}\n\n",
        f"**arXiv ID**: [{arxiv_id}](https://arxiv.org/abs/{arxiv_id})\n\n",
        "## Hyperlinks found:\n\n",
    ]
    bullet_parts = [f"- [{url}]({url})\n" for url in found_links]
    markdown_text = "".join(header_parts + bullet_parts)

    # JSON report mirrors the same information in structured form.
    payload = {
        "title": title,
        "arxiv_id": arxiv_id,
        "hyperlinks": found_links,
    }
    return markdown_text, json.dumps(payload, indent=2)


# Gradio Interface: one text box feeds process_arxiv_input, which returns
# two values corresponding to the two output components declared below
# (a Markdown view and a JSON view).
iface = gr.Interface(
    fn=process_arxiv_input,
    inputs=gr.Textbox(label="Enter arXiv ID or URL"),
    outputs=[gr.Markdown(label="Markdown Results"), gr.JSON(label="JSON Results")],
    title="arXiv PDF Hyperlink Extractor",
    description="Enter an arXiv ID or URL or Paper Pages URL to extract hyperlinks from the paper's PDF.",
)

# Launch the interface only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()