"""Gradio app that converts an uploaded PDF into a cleaned plain-text file.

For every page the text layer is extracted with PyMuPDF, each embedded image
is passed to the ``pix2tex`` command-line tool and its output appended to the
page text, and the result is whitespace-normalised and written to a ``.txt``
file named after the PDF.
"""

import base64
import binascii
import io
import os
import re
import subprocess
import tempfile

import fitz  # PyMuPDF
import gradio as gr
import pdf2image
import pytesseract
import spacy
from pdf2image.exceptions import (
    PDFInfoNotInstalledError,
    PDFPageCountError,
    PDFSyntaxError,
)
from PIL import Image, UnidentifiedImageError

# Compiled once; used by clean_text() for every page.
_WHITESPACE_RUN = re.compile(r"\s+")

# A page is kept only when its cleaned text has MORE words than this.
_MIN_WORDS_PER_PAGE = 5


def clean_text(text: str) -> str:
    """Collapse every run of whitespace (newlines included) to one space.

    Args:
        text: Raw text to normalise.

    Returns:
        The text with whitespace runs replaced by a single space and with
        leading/trailing whitespace removed.
    """
    # The previous version also loaded a spaCy model here on every call and
    # never used it; that load was pure overhead and has been dropped.
    return _WHITESPACE_RUN.sub(" ", text).strip()


def safe_base64_decode(s):
    """Decode a base64 string, adding missing ``=`` padding first.

    Args:
        s: Base64 text, possibly without its trailing padding.

    Returns:
        The decoded bytes, or ``None`` when the input cannot be decoded
        (the error is printed).
    """
    missing_padding = len(s) % 4
    if missing_padding:
        s += "=" * (4 - missing_padding)
    try:
        return base64.b64decode(s)
    except binascii.Error as e:
        print("Error decoding base64 string:", e)
        return None


def image_to_latex(image: Image.Image) -> str:
    """Run the ``pix2tex`` CLI on an image and return what it prints.

    Args:
        image: PIL image to convert.

    Returns:
        The captured standard output of ``pix2tex``.
    """
    # A private temporary directory instead of a fixed /tmp/equation.png:
    # concurrent requests no longer overwrite each other's image, and the
    # location is valid on platforms without /tmp.
    with tempfile.TemporaryDirectory() as tmp_dir:
        image_path = os.path.join(tmp_dir, "equation.png")
        image.save(image_path)
        result = subprocess.run(
            ["pix2tex", image_path], capture_output=True, text=True
        )
    return result.stdout


def pdf_to_text(file) -> str:
    """Convert a PDF to a text file and return the text file's name.

    Each page's text is extended with the ``pix2tex`` output for the images
    on that page, cleaned with :func:`clean_text`, and kept only when it has
    more than ``_MIN_WORDS_PER_PAGE`` words. Kept pages are prefixed with a
    ``## Metadata: Page Number N`` line and separated by a blank line.

    Args:
        file: Either a filesystem path (``str``) or an object whose ``name``
            attribute is the path of the PDF.

    Returns:
        The name of the written file: the PDF's base name with a ``.txt``
        extension, created in the current working directory.
    """
    pdf_path = file if isinstance(file, str) else file.name

    page_chunks = []
    # The context manager closes the document even if a page fails.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            page_text = page.get_text()

            for image_entry in page.get_images(full=True):
                # Entries from get_images() carry metadata only; the first
                # item is the xref, which is what extract_image() needs to
                # return the actual encoded image bytes.
                xref = image_entry[0]
                image_info = doc.extract_image(xref)
                image_data = image_info.get("image") if image_info else None
                if not image_data:
                    continue
                try:
                    with Image.open(io.BytesIO(image_data)) as image:
                        latex_code = image_to_latex(image)
                except UnidentifiedImageError:
                    print(f"Could not identify image on page {page_number}")
                    continue
                page_text += "\n" + latex_code

            page_text = clean_text(page_text)
            if len(page_text.split()) > _MIN_WORDS_PER_PAGE:
                page_chunks.append(
                    f"## Metadata: Page Number {page_number}\n{page_text}\n\n"
                )

    full_text = "".join(page_chunks)

    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_file_name = base_name + ".txt"
    # Explicit UTF-8 so non-ASCII PDF text or LaTeX output cannot fail on
    # platforms whose default encoding is narrower.
    with open(output_file_name, "w", encoding="utf-8") as f:
        f.write(full_text)

    return output_file_name


# NOTE(review): gr.inputs / gr.outputs are the legacy Gradio component
# namespaces; kept as-is to match the Gradio version this file was written
# for -- confirm against the installed version before upgrading.
iface = gr.Interface(
    fn=pdf_to_text,
    inputs=gr.inputs.File(label="Your PDF"),
    outputs=gr.outputs.File(label="Download TXT"),
    title="PDF to TXT",
    description="Convert your PDF files to clean text",
)
iface.launch()