Spaces:

BhagatSurya
/

convet_pdf_to_txt

Runtime error

App Files Files Community

convet_pdf_to_txt / app.py

BhagatSurya

Update app.py

dd899a3 about 1 year ago

raw

history blame contribute delete

No virus

2.75 kB

	import gradio as gr
	import tempfile
	import re
	import os
	import spacy
	import pytesseract
	import pdf2image
	import subprocess
	from pdf2image.exceptions import (
	PDFInfoNotInstalledError,
	PDFPageCountError,
	PDFSyntaxError
	)
	import fitz # PyMuPDF
	from PIL import Image, UnidentifiedImageError
	import io
	import base64

	def clean_text(text):
	    """Collapse every run of whitespace in *text* into a single space.
	    
	    Newlines, tabs and repeated spaces are all treated as whitespace, and
	    leading/trailing whitespace is removed.
	    
	    Args:
	        text: The raw string to normalise.
	    
	    Returns:
	        The normalised, stripped string.
	    """
	    # The previous version loaded a spaCy model on every call and never used
	    # it, and ran a newline-collapsing pass that the `\s+` substitution
	    # already subsumes. Both were dropped; the returned text is unchanged.
	    return re.sub(r'\s+', ' ', text).strip()

	def safe_base64_decode(s):
	    """Decode base64 data, adding any missing ``=`` padding first.
	    
	    Args:
	        s: Base64 text as ``str`` or ``bytes``.
	    
	    Returns:
	        The decoded bytes, or ``None`` when the input cannot be decoded
	        (the error is printed).
	    """
	    # Imported here because the module's import block does not provide
	    # binascii; without it the except clause below raised NameError instead
	    # of handling the decode failure.
	    import binascii
	    # Pick a padding character of the same type as the input so that bytes
	    # input does not fail on concatenation.
	    pad_char = b'=' if isinstance(s, (bytes, bytearray)) else '='
	    missing_padding = len(s) % 4
	    if missing_padding:
	        s = s + pad_char * (4 - missing_padding)
	    try:
	        return base64.b64decode(s)
	    except binascii.Error as e:
	        print("Error decoding base64 string:", e)
	        return None

	def image_to_latex(image):
	    """Run the ``pix2tex`` command on *image* and return what it prints.
	    
	    The image is saved as a PNG file in a fresh temporary directory, the
	    ``pix2tex`` executable is invoked with that file's path, and the
	    directory is removed afterwards.
	    
	    Args:
	        image: An object with a ``save(path)`` method (a PIL image in this
	            file's usage).
	    
	    Returns:
	        The captured standard output of ``pix2tex`` as text. The exit status
	        is not checked, so the result may be empty when the tool fails.
	    """
	    # A private temporary directory replaces the fixed "/tmp/equation.png"
	    # path, which concurrent requests would overwrite and which was never
	    # cleaned up.
	    with tempfile.TemporaryDirectory() as tmp_dir:
	        image_path = os.path.join(tmp_dir, "equation.png")
	        image.save(image_path)
	        result = subprocess.run(["pix2tex", image_path], capture_output=True, text=True)
	    return result.stdout

	def pdf_to_text(file):
	    """Convert a PDF into a text file and return the text file's name.
	    
	    For every page the plain text is extracted, each embedded image is
	    passed through ``image_to_latex`` and the result appended, and the
	    combined text is normalised with ``clean_text``. Pages with more than
	    five words are written out under a "## Metadata: Page Number N" header.
	    
	    Args:
	        file: Either an object whose ``name`` attribute is the PDF path, or
	            the path itself (``str`` / ``os.PathLike``).
	    
	    Returns:
	        The name of the written ``.txt`` file: the PDF's base name with a
	        ``.txt`` extension, created in the current working directory.
	    """
	    # Accept a bare path as well as a file wrapper; presumably some Gradio
	    # versions pass the upload as a path string -- confirm against the
	    # installed version.
	    pdf_path = file if isinstance(file, (str, os.PathLike)) else file.name
	    page_chunks = []
	    doc = fitz.open(pdf_path)
	    try:
	        for i, page in enumerate(doc):
	            # Extract text
	            page_text = page.get_text()
	            # Extract images and convert to LaTeX
	            for img in page.get_images(full=True):
	                # Only the first tuple entry (the xref) is needed. The tuple
	                # describes the image but does not contain its bytes; the
	                # old code mistook the image's name field for base64 data,
	                # so no image was ever decoded.
	                xref = img[0]
	                try:
	                    image_info = doc.extract_image(xref)
	                    image_bytes = image_info.get("image") if image_info else None
	                    if not image_bytes:
	                        continue
	                    image = Image.open(io.BytesIO(image_bytes))
	                    latex_code = image_to_latex(image)
	                    page_text += "\n" + latex_code  # Add LaTeX code to page text
	                except UnidentifiedImageError:
	                    print(f"Could not identify image on page {i+1}")
	                except OSError as e:
	                    # Image conversion is best-effort: a missing pix2tex
	                    # binary or an unsavable image must not abort the text
	                    # extraction for the whole document.
	                    print(f"Could not convert image on page {i+1}: {e}")
	            page_text = clean_text(page_text)
	            if len(page_text.split()) > 5:
	                page_number = i + 1
	                page_chunks.append(
	                    "## Metadata: Page Number " + str(page_number) + "\n" + page_text + "\n\n"
	                )
	    finally:
	        # Release the PDF handle even if a page fails to process.
	        doc.close()
	    full_text = "".join(page_chunks)
	    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
	    output_file_name = base_name + ".txt"
	    # Explicit UTF-8 so non-ASCII PDF text does not fail under a narrow
	    # platform default encoding.
	    with open(output_file_name, 'w', encoding='utf-8') as f:
	        f.write(full_text)
	    return output_file_name

	# Build and start the web UI: one PDF upload in, one downloadable TXT out.
	# gr.File is used for both ends because the gr.inputs / gr.outputs
	# namespaces are deprecated and absent from current Gradio releases.
	iface = gr.Interface(
	    fn=pdf_to_text,
	    inputs=gr.File(label="Your PDF"),
	    outputs=gr.File(label="Download TXT"),
	    title="PDF to TXT",
	    description="Convert your PDF files to clean text",
	)
	iface.launch()