Spaces:

crowles
/

PDFToTxT

Sleeping

File size: 1,843 Bytes

9e29afb
ec9c8aa
 
 
6f49f90
41ac0fe
1121811
41ac0fe
 
1121811
 
 
41ac0fe
6f49f90
 
 
9e29afb

import os
import subprocess
import tempfile

import gradio as gr

# Best-effort installation of the system packages this app relies on:
# process_pdf shells out to the `pdftoppm` and `tesseract` commands.
# Any failure is reported on stdout and the app keeps starting.
try:
    # Update the package lists
    subprocess.run(['apt-get', 'update'], check=True)

    # Install the required packages, one apt-get invocation per package
    for package in ('poppler-utils', 'tesseract-ocr', 'tesseract-ocr-eng'):
        subprocess.run(['apt-get', 'install', '-y', package], check=True)

    print("Packages installed successfully!")
except (subprocess.CalledProcessError, OSError) as e:
    # CalledProcessError: apt-get ran but exited with a non-zero status.
    # OSError: apt-get could not be started at all (not installed, or not
    # executable); without this the script would crash at startup.
    print(f"An error occurred: {e}")

def process_pdf(file):
    """Convert an uploaded PDF into a single plain-text file using OCR.

    Every page is rendered to a PNG with ``pdftoppm`` and then recognised
    with ``tesseract``. The per-page text is concatenated in page order, with
    a newline written after each page.

    Args:
        file: The uploaded PDF. Either a filesystem path (``str`` or
            ``os.PathLike``) or an object whose ``name`` attribute holds the
            path.

    Returns:
        Path of the combined text file. It is written next to the input PDF,
        with the PDF's extension replaced by ``.txt``.

    Raises:
        ValueError: If ``file`` is ``None`` (nothing was uploaded).
        FileNotFoundError: If the PDF path does not exist, or if ``pdftoppm``
            / ``tesseract`` is not installed.
        subprocess.CalledProcessError: If ``pdftoppm`` or ``tesseract`` exits
            with a non-zero status.
    """
    if file is None:
        raise ValueError("No PDF file was uploaded.")

    # Accept both a plain path and a file-like wrapper exposing `.name`.
    # Path-like inputs are checked first because pathlib's `.name` is only
    # the final path component, not the full path.
    if isinstance(file, (str, os.PathLike)):
        input_pdf = os.fspath(file)
    else:
        input_pdf = file.name

    if not os.path.isfile(input_pdf):
        raise FileNotFoundError(f"PDF file not found: {input_pdf}")

    # splitext copes with any extension length (or none at all).
    output_txt_file = os.path.splitext(input_pdf)[0] + ".txt"

    page_texts = []
    # A private scratch directory keeps concurrent requests from clobbering
    # each other's page images and guarantees cleanup even on failure.
    with tempfile.TemporaryDirectory() as work_dir:
        # Argument lists (no shell) so the upload path is never interpreted
        # by a shell.
        subprocess.run(
            ["pdftoppm", "-png", input_pdf, os.path.join(work_dir, "img")],
            check=True,
        )

        # os.listdir() order is arbitrary. Sorting by (length, name) yields
        # page order whether or not the page numbers are zero-padded.
        page_images = sorted(
            (
                entry
                for entry in os.listdir(work_dir)
                if entry.startswith("img") and entry.endswith(".png")
            ),
            key=lambda entry: (len(entry), entry),
        )

        # Perform OCR using Tesseract on each PNG image (default language)
        for page_image in page_images:
            ocr_base = os.path.join(work_dir, f"ocr_{page_image}")
            subprocess.run(
                ["tesseract", os.path.join(work_dir, page_image), ocr_base],
                check=True,
            )
            # tesseract appends ".txt" to the output base it is given.
            with open(f"{ocr_base}.txt", encoding="utf-8") as page_file:
                page_texts.append(page_file.read())

    # Written only after every page succeeded, so a failed run never leaves
    # a truncated result behind.
    with open(output_txt_file, "w", encoding="utf-8") as output_file:
        output_file.writelines(f"{text}\n" for text in page_texts)

    return output_txt_file



# Web UI: a single file input wired to process_pdf, whose return value is
# handed to a file output component.
interface = gr.Interface(
    title="PDF to Text with OCR",
    description="Upload a PDF, perform OCR on it.",
    fn=process_pdf,
    inputs=gr.File(),
    outputs=gr.File(),
)

# Start serving the UI with Gradio's debug flag switched on.
interface.launch(debug=True)