# BhagatSurya's picture
# Update app.py
# dd899a3
# raw
# history blame contribute delete
# No virus
# 2.75 kB
import gradio as gr
import tempfile
import re
import os
import spacy
import pytesseract
import pdf2image
import subprocess
from pdf2image.exceptions import (
PDFInfoNotInstalledError,
PDFPageCountError,
PDFSyntaxError
)
import fitz # PyMuPDF
from PIL import Image, UnidentifiedImageError
import io
import base64
def clean_text(text):
    """Collapse all whitespace in *text* and trim the ends.

    Every run of whitespace (spaces, tabs, newlines, ...) is replaced by a
    single space, then leading/trailing whitespace is stripped.

    Args:
        text: The raw string to normalise.

    Returns:
        The normalised, single-line string.
    """
    # A single ``\s+`` pass already covers runs of newlines, so no separate
    # newline-collapsing step is needed. No language model is loaded here:
    # the cleaning is purely regex based.
    return re.sub(r'\s+', ' ', text).strip()
def safe_base64_decode(s):
    """Decode a base64 string, tolerating missing ``=`` padding.

    Args:
        s: Base64 text whose trailing padding may have been dropped.

    Returns:
        The decoded bytes, or ``None`` if the input is not valid base64
        (the decoding error is printed).
    """
    # Imported here so the except clause below can always resolve the
    # exception type; the module is not imported at file level.
    import binascii

    # add missing padding if necessary
    missing_padding = len(s) % 4
    if missing_padding:
        s += '=' * (4 - missing_padding)
    try:
        return base64.b64decode(s)
    except binascii.Error as e:
        print("Error decoding base64 string:", e)
        return None
def image_to_latex(image):
    """Run the ``pix2tex`` command on *image* and return what it prints.

    The image is saved as a PNG inside a private temporary directory, so
    concurrent calls cannot overwrite each other's file, and the directory
    is removed once the command has finished.

    Args:
        image: An object with a ``save(path)`` method (a PIL image in this
            file's usage).

    Returns:
        The captured standard output of the ``pix2tex`` process as text.
    """
    with tempfile.TemporaryDirectory() as work_dir:
        image_path = os.path.join(work_dir, "equation.png")
        image.save(image_path)
        # The subprocess must complete inside the ``with`` block, while the
        # temporary file still exists.
        result = subprocess.run(["pix2tex", image_path], capture_output=True, text=True)
    return result.stdout
def pdf_to_text(file):
    """Convert an uploaded PDF into a cleaned text file.

    For each page the embedded text is extracted, every embedded image is
    passed through ``image_to_latex`` and the result appended, and the
    combined text is normalised with ``clean_text``. Pages with more than
    five words are written out, each prefixed by a
    ``## Metadata: Page Number N`` header.

    Args:
        file: An object whose ``name`` attribute is the path of the PDF.

    Returns:
        The name of the ``.txt`` file that was written (same base name as
        the PDF, created in the current working directory).
    """
    page_chunks = []

    # Open as a context manager so the document is closed even on errors.
    with fitz.open(file.name) as doc:
        for i, page in enumerate(doc):
            page_number = i + 1

            # Extract text
            page_text = page.get_text()

            # Extract images and convert to LaTeX
            for img in page.get_images(full=True):
                # Entries from get_images() carry metadata only; the first
                # field is the image's xref. The actual bytes have to be
                # fetched from the document by that xref.
                xref = img[0]
                image_info = doc.extract_image(xref)
                image_data = image_info.get("image") if image_info else None
                if not image_data:
                    print(f"Could not identify image on page {page_number}")
                    continue

                try:
                    image = Image.open(io.BytesIO(image_data))
                except UnidentifiedImageError:
                    print(f"Could not identify image on page {page_number}")
                    continue

                latex_code = image_to_latex(image)
                page_text += "\n" + latex_code  # Add LaTeX code to page text

            page_text = clean_text(page_text)
            # Skip near-empty pages (five words or fewer).
            if len(page_text.split()) > 5:
                page_chunks.append(
                    "## Metadata: Page Number " + str(page_number) + "\n" + page_text + "\n\n"
                )

    full_text = "".join(page_chunks)

    base_name = os.path.splitext(os.path.basename(file.name))[0]
    output_file_name = base_name + ".txt"
    # PDF text routinely contains non-ASCII characters; write UTF-8
    # explicitly instead of relying on the platform's default encoding.
    with open(output_file_name, 'w', encoding="utf-8") as f:
        f.write(full_text)
    return output_file_name
# Build the web UI: one PDF upload in, one downloadable text file out.
# NOTE(review): ``gr.inputs`` / ``gr.outputs`` are version-dependent
# namespaces - confirm the pinned Gradio release still provides them.
_pdf_upload = gr.inputs.File(label="Your PDF")
_txt_download = gr.outputs.File(label="Download TXT")

iface = gr.Interface(
    fn=pdf_to_text,
    inputs=_pdf_upload,
    outputs=_txt_download,
    title="PDF to TXT",
    description="Convert your PDF files to clean text",
)

# Start serving the interface.
iface.launch()