Spaces:
Runtime error
Runtime error
File size: 2,749 Bytes
2342442 e1bd857 26072cc f9aff1d 26072cc 185443c 2342442 e1bd857 c91ff76 e1bd857 5abf32d 2342442 26072cc 1c8ed3e 2e6fb2c 1c8ed3e 95e9b45 ea69e31 e787464 f9aff1d 1c8ed3e 26072cc 1c8ed3e e1bd857 2342442 1c8ed3e ea69e31 fa29ba8 dd899a3 e1bd857 ea69e31 e1bd857 2342442 e1bd857 dd899a3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 |
import gradio as gr
import tempfile
import re
import os
import spacy
import pytesseract
import pdf2image
import subprocess
from pdf2image.exceptions import (
PDFInfoNotInstalledError,
PDFPageCountError,
PDFSyntaxError
)
import fitz # PyMuPDF
from PIL import Image, UnidentifiedImageError
import io
import base64
def clean_text(text):
    """Collapse all whitespace in *text* to single spaces and trim the ends.

    Every run of whitespace characters (spaces, tabs, newlines, ...) is
    replaced by one space, so the result is a single line without leading
    or trailing whitespace.

    Args:
        text: Raw text, e.g. as extracted from a PDF page.

    Returns:
        The normalized string; empty when *text* holds only whitespace.
    """
    # Normalization is purely regex based, so no NLP model is loaded here.
    # A separate newline pass is unnecessary: `\s+` already matches newlines.
    return re.sub(r'\s+', ' ', text).strip()
def safe_base64_decode(s):
    """Decode a base64 string, tolerating missing ``=`` padding.

    Args:
        s: Base64 text, possibly with its trailing padding stripped.

    Returns:
        The decoded bytes, or ``None`` (after printing the error) when the
        padded input still cannot be decoded.
    """
    # Imported locally: the except clause below needs binascii.Error, and the
    # module's top-level imports do not provide binascii.
    import binascii

    # -len(s) % 4 is the number of '=' needed to reach a multiple of four
    # (zero when the length is already aligned).
    s += '=' * (-len(s) % 4)
    try:
        return base64.b64decode(s)
    except binascii.Error as e:
        print("Error decoding base64 string:", e)
        return None
def image_to_latex(image):
    """Run the ``pix2tex`` command-line tool on *image* and return its output.

    The image is written as ``equation.png`` into a fresh temporary directory
    so that concurrent calls never overwrite each other's file; the directory
    is removed again once the tool has finished.

    Args:
        image: Object with a ``save(path)`` method (e.g. a PIL image).

    Returns:
        Whatever ``pix2tex`` printed to standard output, as text.
    """
    with tempfile.TemporaryDirectory() as work_dir:
        # Keep the .png suffix so the image format is inferred from the name.
        image_path = os.path.join(work_dir, "equation.png")
        image.save(image_path)
        # Argument list (no shell) keeps the path from being shell-interpreted.
        result = subprocess.run(["pix2tex", image_path], capture_output=True, text=True)
    return result.stdout
def pdf_to_text(file):
    """Convert a PDF into a cleaned-up ``.txt`` file and return its path.

    For every page the plain text is extracted, each embedded image is passed
    to ``image_to_latex`` and the result appended to the page text, and the
    page is then normalized with ``clean_text``. Pages with more than five
    words are written out under a ``## Metadata: Page Number N`` header.

    Args:
        file: Object whose ``name`` attribute is the path of the PDF to read.

    Returns:
        The name of the written text file: the PDF's base name with a
        ``.txt`` extension, created relative to the current directory.
    """
    page_chunks = []
    doc = fitz.open(file.name)
    try:
        for i, page in enumerate(doc):
            # Extract text
            page_text = page.get_text()
            # Extract images and convert to LaTeX
            for img in page.get_images(full=True):
                # get_images() entries carry metadata only; the first item is
                # the xref, which is what extract_image() needs to fetch the
                # actual image bytes.
                xref = img[0]
                image_info = doc.extract_image(xref) or {}
                image_data = image_info.get("image")
                if not image_data:
                    print(f"Could not identify image on page {i+1}")
                    continue
                try:
                    image = Image.open(io.BytesIO(image_data))
                except UnidentifiedImageError:
                    print(f"Could not identify image on page {i+1}")
                    continue
                latex_code = image_to_latex(image)
                page_text += "\n" + latex_code  # Add LaTeX code to page text
            page_text = clean_text(page_text)
            # NOTE(review): the file's indentation was lost; pages with five
            # words or fewer are assumed to be skipped entirely - confirm.
            if len(page_text.split()) > 5:
                page_number = i + 1
                page_text = "## Metadata: Page Number " + str(page_number) + "\n" + page_text
                page_chunks.append(page_text + "\n\n")
    finally:
        # Release the underlying file handle even if a page fails to process.
        doc.close()
    base_name = os.path.splitext(os.path.basename(file.name))[0]
    output_file_name = base_name + ".txt"
    # Explicit UTF-8 so non-ASCII PDF text does not depend on the locale.
    with open(output_file_name, 'w', encoding='utf-8') as f:
        f.write("".join(page_chunks))
    return output_file_name
# Gradio UI: a single file upload goes in, a single downloadable file comes out.
pdf_input = gr.inputs.File(label="Your PDF")
txt_output = gr.outputs.File(label="Download TXT")

iface = gr.Interface(
    fn=pdf_to_text,
    inputs=pdf_input,
    outputs=txt_output,
    title="PDF to TXT",
    description="Convert your PDF files to clean text",
)

# Launch the interface.
iface.launch()
|