Spaces:
Runtime error
Runtime error
BhagatSurya
committed on
Commit
•
1c8ed3e
1
Parent(s):
efa3f91
Update app.py
Browse files
app.py
CHANGED
@@ -27,31 +27,39 @@ def image_to_latex(image):
|
|
27 |
result = subprocess.run(["pix2tex", image_path], capture_output=True, text=True)
|
28 |
return result.stdout
|
29 |
|
30 |
-
|
31 |
|
32 |
def pdf_to_text(file):
    """Extract the text of a PDF into a ``.txt`` file in the working directory.

    Each page's text is passed through ``clean_text``; pages that still have
    more than five whitespace-separated words are written out, each preceded
    by a ``## Metadata: Page Number N`` header line and followed by a blank
    line.

    Args:
        file: Object exposing a ``name`` attribute holding the PDF path.

    Returns:
        A ``(output_file_name, page_number)`` tuple, where ``page_number`` is
        the 1-based number of the last page that was kept, or ``0`` when no
        page had enough text.
    """
    # Initialised up front so the return below cannot hit an unbound local
    # when no page passes the word-count filter.
    page_number = 0
    kept_pages = []

    # The context manager closes the document once extraction is done.
    with fitz.open(file.name) as doc:
        for i, page in enumerate(doc):
            page_text = clean_text(page.get_text())
            if len(page_text.split()) > 5:
                page_number = i + 1
                page_text = "## Metadata: Page Number " + str(page_number) + "\n" + page_text
                kept_pages.append(page_text + "\n\n")

    full_text = "".join(kept_pages)

    base_name = os.path.splitext(os.path.basename(file.name))[0]
    output_file_name = base_name + ".txt"
    # Explicit UTF-8 so non-ASCII PDF text does not depend on the platform
    # default encoding.
    with open(output_file_name, 'w', encoding='utf-8') as f:
        f.write(full_text)

    return output_file_name, page_number
|
53 |
|
54 |
|
|
|
55 |
iface = gr.Interface(fn=pdf_to_text,
|
56 |
inputs=gr.inputs.File(label="Your PDF"),
|
57 |
outputs=gr.outputs.File(label="Download TXT"),
|
|
|
27 |
result = subprocess.run(["pix2tex", image_path], capture_output=True, text=True)
|
28 |
return result.stdout
|
29 |
|
30 |
+
|
31 |
|
32 |
def pdf_to_text(file):
    """Extract a PDF's text, plus LaTeX for its embedded images, into a ``.txt`` file.

    For every page the plain text is read first; each image referenced by the
    page is then decoded and passed to ``image_to_latex``, and the returned
    string is appended to that page's text. The combined text goes through
    ``clean_text``; pages that still have more than five whitespace-separated
    words are written out, each preceded by a ``## Metadata: Page Number N``
    header line and followed by a blank line.

    Args:
        file: Object exposing a ``name`` attribute holding the PDF path.

    Returns:
        A ``(output_file_name, page_number)`` tuple, where ``page_number`` is
        the 1-based number of the last page that was kept, or ``0`` when no
        page had enough text.
    """
    # Initialised up front so the return below cannot hit an unbound local
    # when no page passes the word-count filter.
    page_number = 0
    kept_pages = []

    # The context manager closes the document once extraction is done.
    with fitz.open(file.name) as doc:
        for i, page in enumerate(doc):
            # Extract text
            page_text = page.get_text()

            # Extract images and convert to LaTeX
            for img in page.get_images(full=True):
                # Each entry is a multi-field tuple whose first item is the
                # image xref; the raw bytes must be fetched from the document.
                xref = img[0]
                image_data = doc.extract_image(xref)["image"]
                image = Image.open(io.BytesIO(image_data))
                latex_code = image_to_latex(image)
                page_text += "\n" + latex_code  # Add LaTeX code to page text

            page_text = clean_text(page_text)
            if len(page_text.split()) > 5:
                page_number = i + 1
                page_text = "## Metadata: Page Number " + str(page_number) + "\n" + page_text
                kept_pages.append(page_text + "\n\n")

    full_text = "".join(kept_pages)

    base_name = os.path.splitext(os.path.basename(file.name))[0]
    output_file_name = base_name + ".txt"
    # Explicit UTF-8 so non-ASCII PDF text and LaTeX output do not depend on
    # the platform default encoding.
    with open(output_file_name, 'w', encoding='utf-8') as f:
        f.write(full_text)

    return output_file_name, page_number
|
60 |
|
61 |
|
62 |
+
|
63 |
iface = gr.Interface(fn=pdf_to_text,
|
64 |
inputs=gr.inputs.File(label="Your PDF"),
|
65 |
outputs=gr.outputs.File(label="Download TXT"),
|