# PDFToTxT / app.py
# crowles's picture
# Update app.py
# 1121811 verified
import os
import subprocess
import tempfile

import gradio as gr
# Best-effort installation of the command-line tools this script shells out to
# (pdftoppm from poppler-utils, and tesseract with its English language data).
# A failure here is reported but does not stop the app from starting.
try:
    # Update the package lists
    subprocess.run(['apt-get', 'update'], check=True)
    # Install the required packages one at a time; because of check=True the
    # first failing install aborts the remaining ones.
    for _package in ('poppler-utils', 'tesseract-ocr', 'tesseract-ocr-eng'):
        subprocess.run(['apt-get', 'install', '-y', _package], check=True)
    print("Packages installed successfully!")
except (subprocess.CalledProcessError, OSError) as e:
    # CalledProcessError: apt-get ran but exited with a non-zero status.
    # OSError (e.g. FileNotFoundError): apt-get could not be started at all,
    # which previously escaped this handler and crashed the script on import.
    print(f"An error occurred: {e}")
def process_pdf(file):
    """Run OCR over an uploaded PDF and return the path of the combined text file.

    The PDF is rasterised to one PNG per page with ``pdftoppm``, each page is
    recognised with ``tesseract``, and the per-page text is concatenated in
    page order into ``<pdf path without its extension>.txt``.

    Args:
        file: The upload to process. Either an object whose ``name`` attribute
            is the path of the PDF on disk, or that path itself as a string.

    Returns:
        Path of the text file holding the OCR output. A newline is written
        after the text of every page.

    Raises:
        ValueError: If ``file`` is ``None`` (nothing was uploaded).
        subprocess.CalledProcessError: If ``pdftoppm`` or ``tesseract`` exits
            with a non-zero status.
        OSError: If either tool cannot be started (e.g. it is not installed).
    """
    if file is None:
        raise ValueError("No PDF file was uploaded.")

    # Accept an object exposing ``.name`` (what the original code required)
    # as well as a plain path string.
    input_pdf = getattr(file, "name", file)
    # splitext instead of slicing off four characters, so the result does not
    # depend on the extension being exactly ".pdf"-sized.
    output_txt_file = f"{os.path.splitext(input_pdf)[0]}.txt"

    # All intermediate files live in a private scratch directory: concurrent
    # requests cannot clobber each other's pages, unrelated "img*" files in the
    # working directory are never deleted, and cleanup happens even on error.
    with tempfile.TemporaryDirectory() as work_dir:
        # Argument lists (no shell) keep a hostile upload name from being
        # interpreted as shell syntax.
        subprocess.run(
            ["pdftoppm", "-png", input_pdf, os.path.join(work_dir, "img")],
            check=True,
        )

        # os.listdir() order is arbitrary. Sorting by (length, name) yields
        # numeric page order whether or not the page numbers are zero-padded.
        page_images = sorted(
            (
                name
                for name in os.listdir(work_dir)
                if name.startswith("img") and name.endswith(".png")
            ),
            key=lambda name: (len(name), name),
        )

        # Tesseract writes UTF-8 text, so read and write it as UTF-8 rather
        # than with the locale-dependent default encoding.
        with open(output_txt_file, "w", encoding="utf-8") as output_file:
            for image in page_images:
                # tesseract appends ".txt" to the output base name itself.
                # No -l option is given, so its default language (English)
                # is used.
                ocr_base = os.path.join(work_dir, f"ocr_{image}")
                subprocess.run(
                    ["tesseract", os.path.join(work_dir, image), ocr_base],
                    check=True,
                )
                with open(f"{ocr_base}.txt", encoding="utf-8") as page_text:
                    output_file.write(page_text.read())
                output_file.write("\n")  # Separate consecutive pages

    return output_txt_file
# Gradio front end: a single file upload is passed to process_pdf and the
# file path it returns is offered back through a file output component.
interface = gr.Interface(
    title="PDF to Text with OCR",
    description="Upload a PDF, perform OCR on it.",
    fn=process_pdf,
    inputs=gr.File(),
    outputs=gr.File(),
)

# Start the app with Gradio's debug flag switched on.
interface.launch(debug=True)