rbiswasfc committed on
Commit
2a910d7
·
1 Parent(s): 14e2e93
Files changed (2) hide show
  1. app.py +61 -0
  2. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import gradio as gr
4
+ import requests
5
+ import spaces
6
+ from marker.convert import convert_single_pdf
7
+ from marker.logger import configure_logging
8
+ from marker.models import load_all_models
9
+
10
# Set up logging through marker's own helper before anything else runs.
configure_logging()

# Load the marker models once at import time; the resulting object is shared
# by every conversion request handled by this process.
MARKER_MODEL_LST = load_all_models()
12
+
13
+
14
@spaces.GPU
def extract_from_pdf(arxiv_id):
    """Download an arXiv PDF and convert it to text with marker.

    Args:
        arxiv_id: arXiv identifier; it is interpolated into
            ``https://arxiv.org/pdf/{arxiv_id}.pdf``.

    Returns:
        The ``full_text`` value returned by ``convert_single_pdf`` (called
        with ``max_pages=20``).

    Raises:
        requests.HTTPError: If the download does not answer with HTTP 200.
        requests.RequestException: On connection errors or timeout.
    """
    import tempfile  # local import: only this function needs it

    pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
    # A timeout keeps a stalled connection from blocking the worker forever.
    response = requests.get(pdf_url, timeout=60)
    if response.status_code != 200:
        # Stop here instead of converting a missing or stale file.
        raise requests.HTTPError(
            f"Failed to download PDF. Status code: {response.status_code}",
            response=response,
        )

    # A unique file per call so concurrent requests cannot overwrite each other.
    fd, tmp_pdf = tempfile.mkstemp(suffix=".pdf")
    try:
        with os.fdopen(fd, "wb") as file:
            file.write(response.content)
        print("PDF downloaded and saved as ", tmp_pdf)

        full_text, _doc_images, _out_meta = convert_single_pdf(
            tmp_pdf, MARKER_MODEL_LST, max_pages=20
        )
    finally:
        # Always clean up, even when the conversion raises.
        os.remove(tmp_pdf)
        print("Temporary PDF file removed.")

    return full_text
34
+
35
+
36
def extract(arxiv_id):
    """Return the extracted text for an arXiv paper as a JSON-friendly dict.

    Args:
        arxiv_id: arXiv identifier typed by the user. Surrounding whitespace
            is ignored.

    Returns:
        ``{"arxiv_id": ..., "text": ...}`` on success, or ``{"error": ...}``
        when the ID is missing/blank or the extraction raises.
    """
    # Trim pasted whitespace so it neither ends up in the download URL nor
    # lets a blank entry slip past the "required" check.
    cleaned_id = arxiv_id.strip() if isinstance(arxiv_id, str) else arxiv_id
    if not cleaned_id:
        return {"error": "ArXiv ID is required"}

    try:
        full_text = extract_from_pdf(cleaned_id)
    except Exception as e:
        # UI boundary: report the failure in the JSON output instead of
        # raising. Fall back to the exception type when the message is empty.
        return {"error": str(e) or type(e).__name__}

    return {"arxiv_id": cleaned_id, "text": full_text}
47
+
48
+
49
with gr.Blocks() as app:
    # Single-line text box where the user types the arXiv identifier.
    text_input = gr.Textbox(label="Enter arxiv id")

    # JSON viewer that shows the dict returned by `extract`.
    output = gr.JSON(label="Extracted text")

    # Submitting the text box runs `extract` on its value and displays the
    # result in the JSON component.
    text_input.submit(fn=extract, inputs=[text_input], outputs=[output])
58
+
59
+
60
if __name__ == "__main__":
    # Enable request queuing, then serve on all interfaces at port 7860.
    queued_app = app.queue()
    queued_app.launch(server_name="0.0.0.0", server_port=7860, show_error=True)
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ srsly
3
+ python-dotenv
4
+ transformers
5
+ torch
6
+ beautifulsoup4
7
+ marker-pdf
8
+ retry