Spaces:

crowles
/

PDFToTxT

Sleeping

App Files Files Community

PDFToTxT / app.py

crowles

Update app.py

1121811 verified 5 months ago

raw

history blame contribute delete

1.84 kB

	import os
	import subprocess
	import tempfile

	import gradio as gr

	# Best-effort installation of the command-line tools this app shells out to:
	# pdftoppm (from poppler-utils) and tesseract with its English language data.
	# Failure is reported but never fatal, so the app can still start when the
	# tools are already present or the process may not install packages.
	try:
	    # Update the package lists
	    subprocess.run(['apt-get', 'update'], check=True)

	    # Install the required packages in a single apt-get transaction
	    subprocess.run(
	        ['apt-get', 'install', '-y',
	         'poppler-utils', 'tesseract-ocr', 'tesseract-ocr-eng'],
	        check=True,
	    )
	except (subprocess.CalledProcessError, OSError) as e:
	    # OSError covers a missing or non-executable apt-get binary, which
	    # subprocess.run reports as FileNotFoundError / PermissionError.
	    print(f"An error occurred: {e}")
	else:
	    print("Packages installed successfully!")

	def process_pdf(file):
	    """Run OCR over every page of an uploaded PDF and return a text file path.

	    The PDF is rasterised to one PNG per page with ``pdftoppm``, each page is
	    passed through ``tesseract`` (invoked without ``-l``, so tesseract's
	    default language is used), and the per-page text is concatenated in page
	    order into a ``.txt`` file placed next to the input PDF.

	    Args:
	        file: The uploaded PDF, either an object exposing the path as
	            ``.name`` (a Gradio File object) or a plain path string.

	    Returns:
	        Path of the combined text file.

	    Raises:
	        ValueError: If no file was provided.
	        subprocess.CalledProcessError: If ``pdftoppm`` fails to convert the PDF.
	        OSError: If ``pdftoppm`` or ``tesseract`` cannot be executed.
	    """
	    if file is None:
	        raise ValueError("No PDF file was provided.")

	    # Accept both file-like uploads (path in .name) and plain path strings.
	    input_pdf = getattr(file, "name", file)
	    # splitext drops whatever extension is present instead of assuming ".pdf".
	    output_txt_file = os.path.splitext(input_pdf)[0] + ".txt"

	    # A private scratch directory keeps concurrent requests from clobbering
	    # each other's page images and guarantees cleanup, even on errors,
	    # without touching unrelated files in the working directory.
	    with tempfile.TemporaryDirectory() as work_dir:
	        # Argument lists (no shell) so the uploaded filename is never
	        # interpreted as shell syntax.
	        subprocess.run(
	            ["pdftoppm", "-png", input_pdf, os.path.join(work_dir, "img")],
	            check=True,
	        )

	        # Order pages numerically: names share prefix and suffix, so sorting
	        # by (length, name) is correct whether or not the page numbers are
	        # zero-padded.
	        page_images = sorted(
	            (
	                name
	                for name in os.listdir(work_dir)
	                if name.startswith("img") and name.endswith(".png")
	            ),
	            key=lambda name: (len(name), name),
	        )

	        with open(output_txt_file, "w", encoding="utf-8") as output_file:
	            for image in page_images:
	                image_path = os.path.join(work_dir, image)
	                # tesseract appends ".txt" to the output base it is given.
	                ocr_base = os.path.join(work_dir, f"ocr_{image}")
	                text_path = f"{ocr_base}.txt"

	                result = subprocess.run(["tesseract", image_path, ocr_base])
	                if result.returncode != 0 or not os.path.isfile(text_path):
	                    # Per-page OCR stays best-effort: report and move on.
	                    print(f"OCR failed for {image}; skipping this page.")
	                    continue

	                with open(text_path, "r", encoding="utf-8") as page_text:
	                    output_file.write(page_text.read())
	                output_file.write("\n")  # Separate consecutive pages

	    return output_txt_file



	# Example Gradio Interface
	# Both ends of the pipeline are plain file widgets: a PDF upload goes in,
	# the combined OCR text file comes out as a download.
	pdf_upload = gr.File()
	text_download = gr.File()

	interface = gr.Interface(
	    fn=process_pdf,
	    inputs=pdf_upload,
	    outputs=text_download,
	    title="PDF to Text with OCR",
	    description="Upload a PDF, perform OCR on it.",
	)

	# Start serving the app with debug output enabled
	interface.launch(debug=True)