Spaces:
Runtime error
Runtime error
import gradio as gr | |
import tempfile | |
import re | |
import os | |
import spacy | |
import pytesseract | |
import pdf2image | |
import subprocess | |
from pdf2image.exceptions import ( | |
PDFInfoNotInstalledError, | |
PDFPageCountError, | |
PDFSyntaxError | |
) | |
import fitz # PyMuPDF | |
from PIL import Image, UnidentifiedImageError | |
import io | |
import base64 | |
def clean_text(text):
    """Collapse all whitespace in *text* to single spaces and trim the ends.

    Every run of whitespace characters (spaces, tabs, newlines, ...) is
    replaced by one space, so the result is always a single line.

    Args:
        text: Raw text, for example the text extracted from one PDF page.

    Returns:
        The normalized string; empty when *text* is empty or whitespace only.
    """
    # One \s+ pass is sufficient: it also matches runs of newlines, so no
    # separate newline-collapsing step (and no NLP model) is required.
    return re.sub(r'\s+', ' ', text).strip()
def image_to_latex(image):
    """Run the ``pix2tex`` command-line tool on *image* and return its output.

    Args:
        image: Object with a PIL-style ``save(path)`` method; it is written
            out as ``equation.png`` so the external tool can read it.

    Returns:
        The text that ``pix2tex`` wrote to standard output. The exit status
        of the tool is not inspected.

    Raises:
        OSError: If the image cannot be saved or the ``pix2tex`` executable
            cannot be started (e.g. ``FileNotFoundError`` when it is not on
            ``PATH``).
    """
    # A private temporary directory gives each call its own file, so
    # concurrent requests cannot overwrite one another's image, and the file
    # is removed again as soon as pix2tex has finished with it.
    with tempfile.TemporaryDirectory() as work_dir:
        image_path = os.path.join(work_dir, "equation.png")
        image.save(image_path)
        result = subprocess.run(
            ["pix2tex", image_path], capture_output=True, text=True
        )
    return result.stdout
def pdf_to_text(file):
    """Convert a PDF into a cleaned ``.txt`` file with per-page headers.

    For every page the embedded text is extracted, each embedded image is
    passed through ``image_to_latex`` and the result appended, and the
    combined text is normalized with ``clean_text``. Pages whose cleaned
    text has more than five words are written out, each preceded by a
    ``## Metadata: Page Number N`` header line.

    Args:
        file: Either an object exposing the PDF path as ``.name`` (such as an
            uploaded-file wrapper) or the path itself.

    Returns:
        A tuple ``(output_file_name, page_number)``: the name of the text
        file written to the current working directory (PDF base name plus
        ``.txt``) and the 1-based number of the last page that was kept, or
        ``0`` when no page had enough text.
    """
    # Accept both a file-like object carrying ``.name`` and a bare path.
    pdf_path = getattr(file, "name", file)

    page_chunks = []
    page_number = 0  # remains 0 if no page passes the word-count filter

    # The context manager closes the document even if a page fails.
    with fitz.open(pdf_path) as doc:
        for i, page in enumerate(doc):
            # Extract text
            page_text = page.get_text()

            # Extract images and convert to LaTeX
            for img in page.get_images(full=True):
                # The first tuple entry is the image's xref; the raw image
                # bytes are not part of the tuple and have to be fetched
                # from the document separately.
                xref = img[0]
                extracted = doc.extract_image(xref)
                if not extracted:
                    # Nothing could be extracted for this xref.
                    continue
                try:
                    image = Image.open(io.BytesIO(extracted["image"]))
                except UnidentifiedImageError:
                    print(f"Could not identify image on page {i+1}")
                    continue
                page_text += "\n" + image_to_latex(image)  # Add LaTeX code to page text

            page_text = clean_text(page_text)
            # Skip pages that are (nearly) empty after cleaning.
            if len(page_text.split()) > 5:
                page_number = i + 1
                page_chunks.append(
                    "## Metadata: Page Number " + str(page_number) + "\n"
                    + page_text + "\n\n"
                )

    full_text = "".join(page_chunks)

    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_file_name = base_name + ".txt"
    # Explicit UTF-8 so non-ASCII PDF text does not depend on the locale.
    with open(output_file_name, 'w', encoding="utf-8") as f:
        f.write(full_text)
    return output_file_name, page_number
# Web UI: upload a PDF, get back the generated text file plus the number of
# the last page that was processed. Components come from the top-level
# ``gr.File`` / ``gr.Textbox`` classes because the ``gr.inputs`` /
# ``gr.outputs`` namespaces are deprecated in Gradio 3 and removed in Gradio 4.
iface = gr.Interface(
    fn=pdf_to_text,
    inputs=gr.File(label="Your PDF"),
    outputs=[
        gr.File(label="Download TXT"),
        gr.Textbox(label="Last Page Processed"),
    ],
    title="PDF to TXT",
    description="Convert your PDF files to clean text",
)
iface.launch()