Spaces:

librarian-bots
/

arxiv-link-extractor

Sleeping

App Files Files Community

davanstrien HF staff committed on Jul 16

Commit

706986b

•

1 Parent(s): 65b2393

formatting

Browse files

Files changed (1) hide show

app.py +22 -28

app.py CHANGED Viewed

@@ -6,34 +6,32 @@ import json
 from PyPDF2 import PdfReader
 import gradio as gr
 def extract_arxiv_id(input_string):
-    pattern = r'(\d{4}\.\d{5})'
-    match = re.search(pattern, input_string)
-    if match:
-        return match.group(1)
-    return None
 def download_pdf(url):
     response = requests.get(url)
-    if response.status_code == 200:
-        return io.BytesIO(response.content)
-    return None
 def extract_hyperlinks_from_pdf(pdf_file):
     reader = PdfReader(pdf_file)
     hyperlinks = []
     for page in reader.pages:
-        if '/Annots' in page:
-            for annot in page['/Annots']:
                 obj = annot.get_object()
-                if obj['/Subtype'] == '/Link' and '/A' in obj:
-                    if '/URI' in obj['/A']:
-                        uri = obj['/A']['/URI']
-                        hyperlinks.append(uri)
     return hyperlinks
 def process_arxiv_input(input_string):
     arxiv_id = extract_arxiv_id(input_string)
     if not arxiv_id:
@@ -42,43 +40,39 @@ def process_arxiv_input(input_string):
     client = arxiv.Client()
     search = arxiv.Search(id_list=[arxiv_id])
     results = client.results(search)
     try:
         paper = next(results)
     except StopIteration:
         return f"No paper found with arXiv ID: {arxiv_id}", "{}"
-    pdf_file = download_pdf(paper.pdf_url)
-    if pdf_file:
         hyperlinks = extract_hyperlinks_from_pdf(pdf_file)
         # Prepare text output
         text_result = f"Title: {paper.title}\n\nHyperlinks found:\n"
         text_result += "\n".join([f"- {link}" for link in hyperlinks])
         # Prepare JSON output
         json_result = {
             "title": paper.title,
             "arxiv_id": arxiv_id,
-            "hyperlinks": hyperlinks
         }
         return text_result, json.dumps(json_result, indent=2)
     else:
         return "Couldn't download the PDF.", "{}"
 # Gradio Interface
 iface = gr.Interface(
     fn=process_arxiv_input,
     inputs=gr.Textbox(label="Enter arXiv ID or URL"),
-    outputs=[
-        gr.Textbox(label="Text Results"),
-        gr.JSON(label="JSON Results")
-    ],
     title="arXiv PDF Hyperlink Extractor",
-    description="Enter an arXiv ID or URL to extract hyperlinks from the paper's PDF."
 )
 if __name__ == "__main__":
-    iface.launch()

 from PyPDF2 import PdfReader
 import gradio as gr
 def extract_arxiv_id(input_string):
+    pattern = r"(\d{4}\.\d{5})"
+    return match.group(1) if (match := re.search(pattern, input_string)) else None
 def download_pdf(url):
     response = requests.get(url)
+    return io.BytesIO(response.content) if response.status_code == 200 else None
 def extract_hyperlinks_from_pdf(pdf_file):
     reader = PdfReader(pdf_file)
     hyperlinks = []
     for page in reader.pages:
+        if "/Annots" in page:
+            for annot in page["/Annots"]:
                 obj = annot.get_object()
+                if obj["/Subtype"] == "/Link" and "/A" in obj and "/URI" in obj["/A"]:
+                    uri = obj["/A"]["/URI"]
+                    hyperlinks.append(uri)
     return hyperlinks
 def process_arxiv_input(input_string):
     arxiv_id = extract_arxiv_id(input_string)
     if not arxiv_id:
     client = arxiv.Client()
     search = arxiv.Search(id_list=[arxiv_id])
     results = client.results(search)
     try:
         paper = next(results)
     except StopIteration:
         return f"No paper found with arXiv ID: {arxiv_id}", "{}"
+    if pdf_file := download_pdf(paper.pdf_url):
         hyperlinks = extract_hyperlinks_from_pdf(pdf_file)
         # Prepare text output
         text_result = f"Title: {paper.title}\n\nHyperlinks found:\n"
         text_result += "\n".join([f"- {link}" for link in hyperlinks])
         # Prepare JSON output
         json_result = {
             "title": paper.title,
             "arxiv_id": arxiv_id,
+            "hyperlinks": hyperlinks,
         }
         return text_result, json.dumps(json_result, indent=2)
     else:
         return "Couldn't download the PDF.", "{}"
 # Gradio Interface
 iface = gr.Interface(
     fn=process_arxiv_input,
     inputs=gr.Textbox(label="Enter arXiv ID or URL"),
+    outputs=[gr.Textbox(label="Text Results"), gr.JSON(label="JSON Results")],
     title="arXiv PDF Hyperlink Extractor",
+    description="Enter an arXiv ID or URL to extract hyperlinks from the paper's PDF.",
 )
 if __name__ == "__main__":
+    iface.launch()