davanstrien HF staff committed on
Commit
706986b
1 Parent(s): 65b2393

formatting

Browse files
Files changed (1) hide show
  1. app.py +22 -28
app.py CHANGED
@@ -6,34 +6,32 @@ import json
6
  from PyPDF2 import PdfReader
7
  import gradio as gr
8
 
 
9
  def extract_arxiv_id(input_string):
10
- pattern = r'(\d{4}\.\d{5})'
11
- match = re.search(pattern, input_string)
12
- if match:
13
- return match.group(1)
14
- return None
15
 
16
  def download_pdf(url):
17
  response = requests.get(url)
18
- if response.status_code == 200:
19
- return io.BytesIO(response.content)
20
- return None
21
 
22
  def extract_hyperlinks_from_pdf(pdf_file):
23
  reader = PdfReader(pdf_file)
24
  hyperlinks = []
25
 
26
  for page in reader.pages:
27
- if '/Annots' in page:
28
- for annot in page['/Annots']:
29
  obj = annot.get_object()
30
- if obj['/Subtype'] == '/Link' and '/A' in obj:
31
- if '/URI' in obj['/A']:
32
- uri = obj['/A']['/URI']
33
- hyperlinks.append(uri)
34
 
35
  return hyperlinks
36
 
 
37
  def process_arxiv_input(input_string):
38
  arxiv_id = extract_arxiv_id(input_string)
39
  if not arxiv_id:
@@ -42,43 +40,39 @@ def process_arxiv_input(input_string):
42
  client = arxiv.Client()
43
  search = arxiv.Search(id_list=[arxiv_id])
44
  results = client.results(search)
45
-
46
  try:
47
  paper = next(results)
48
  except StopIteration:
49
  return f"No paper found with arXiv ID: {arxiv_id}", "{}"
50
 
51
- pdf_file = download_pdf(paper.pdf_url)
52
-
53
- if pdf_file:
54
  hyperlinks = extract_hyperlinks_from_pdf(pdf_file)
55
-
56
  # Prepare text output
57
  text_result = f"Title: {paper.title}\n\nHyperlinks found:\n"
58
  text_result += "\n".join([f"- {link}" for link in hyperlinks])
59
-
60
  # Prepare JSON output
61
  json_result = {
62
  "title": paper.title,
63
  "arxiv_id": arxiv_id,
64
- "hyperlinks": hyperlinks
65
  }
66
-
67
  return text_result, json.dumps(json_result, indent=2)
68
  else:
69
  return "Couldn't download the PDF.", "{}"
70
 
 
71
  # Gradio Interface
72
  iface = gr.Interface(
73
  fn=process_arxiv_input,
74
  inputs=gr.Textbox(label="Enter arXiv ID or URL"),
75
- outputs=[
76
- gr.Textbox(label="Text Results"),
77
- gr.JSON(label="JSON Results")
78
- ],
79
  title="arXiv PDF Hyperlink Extractor",
80
- description="Enter an arXiv ID or URL to extract hyperlinks from the paper's PDF."
81
  )
82
 
83
  if __name__ == "__main__":
84
- iface.launch()
 
6
  from PyPDF2 import PdfReader
7
  import gradio as gr
8
 
9
+
10
  def extract_arxiv_id(input_string):
11
+ pattern = r"(\d{4}\.\d{5})"
12
+ return match.group(1) if (match := re.search(pattern, input_string)) else None
13
+
 
 
14
 
15
  def download_pdf(url):
16
  response = requests.get(url)
17
+ return io.BytesIO(response.content) if response.status_code == 200 else None
18
+
 
19
 
20
  def extract_hyperlinks_from_pdf(pdf_file):
21
  reader = PdfReader(pdf_file)
22
  hyperlinks = []
23
 
24
  for page in reader.pages:
25
+ if "/Annots" in page:
26
+ for annot in page["/Annots"]:
27
  obj = annot.get_object()
28
+ if obj["/Subtype"] == "/Link" and "/A" in obj and "/URI" in obj["/A"]:
29
+ uri = obj["/A"]["/URI"]
30
+ hyperlinks.append(uri)
 
31
 
32
  return hyperlinks
33
 
34
+
35
  def process_arxiv_input(input_string):
36
  arxiv_id = extract_arxiv_id(input_string)
37
  if not arxiv_id:
 
40
  client = arxiv.Client()
41
  search = arxiv.Search(id_list=[arxiv_id])
42
  results = client.results(search)
43
+
44
  try:
45
  paper = next(results)
46
  except StopIteration:
47
  return f"No paper found with arXiv ID: {arxiv_id}", "{}"
48
 
49
+ if pdf_file := download_pdf(paper.pdf_url):
 
 
50
  hyperlinks = extract_hyperlinks_from_pdf(pdf_file)
51
+
52
  # Prepare text output
53
  text_result = f"Title: {paper.title}\n\nHyperlinks found:\n"
54
  text_result += "\n".join([f"- {link}" for link in hyperlinks])
55
+
56
  # Prepare JSON output
57
  json_result = {
58
  "title": paper.title,
59
  "arxiv_id": arxiv_id,
60
+ "hyperlinks": hyperlinks,
61
  }
62
+
63
  return text_result, json.dumps(json_result, indent=2)
64
  else:
65
  return "Couldn't download the PDF.", "{}"
66
 
67
+
68
  # Gradio Interface
69
  iface = gr.Interface(
70
  fn=process_arxiv_input,
71
  inputs=gr.Textbox(label="Enter arXiv ID or URL"),
72
+ outputs=[gr.Textbox(label="Text Results"), gr.JSON(label="JSON Results")],
 
 
 
73
  title="arXiv PDF Hyperlink Extractor",
74
+ description="Enter an arXiv ID or URL to extract hyperlinks from the paper's PDF.",
75
  )
76
 
77
  if __name__ == "__main__":
78
+ iface.launch()