|
import os
import subprocess
import tempfile

import gradio as gr
|
|
|
# Best-effort installation of the command-line tools this script shells out
# to (pdftoppm comes from poppler-utils; tesseract from tesseract-ocr).
# Failures are reported but do not abort start-up.
try:
    # Refresh the package index first so the installs resolve current versions.
    subprocess.run(['apt-get', 'update'], check=True)

    # One invocation per package, in the original order; check=True stops at
    # the first package that fails to install.
    for package in ('poppler-utils', 'tesseract-ocr', 'tesseract-ocr-eng'):
        subprocess.run(['apt-get', 'install', '-y', package], check=True)

    print("Packages installed successfully!")
except (subprocess.CalledProcessError, OSError) as e:
    # CalledProcessError: apt-get ran but exited non-zero.
    # OSError: apt-get could not be started at all (e.g. it is not installed
    # on this host, or is not executable), which would otherwise crash the app
    # before the UI is even built.
    print(f"An error occurred: {e}")
|
|
|
def process_pdf(file):
    """Run OCR over every page of a PDF and return the path of the text file.

    The PDF is rasterised to one PNG per page with ``pdftoppm``, each page is
    passed through ``tesseract``, and the recognised text of all pages is
    concatenated (one newline after each page) into a ``.txt`` file that sits
    next to the input PDF and shares its base name.

    Args:
        file: The uploaded PDF, either as a filesystem path (``str`` or
            ``os.PathLike``) or as an object whose ``name`` attribute holds
            the path.

    Returns:
        The path of the generated text file.

    Raises:
        ValueError: If ``file`` is ``None`` (nothing was uploaded).
        subprocess.CalledProcessError: If ``pdftoppm`` or ``tesseract`` exits
            with a non-zero status.
        OSError: If either tool cannot be started or a file cannot be
            read or written.
    """
    if file is None:
        raise ValueError("No PDF file was provided.")

    # Accept both plain paths and file-like wrappers that expose the path as
    # ``.name``. Path-like objects are handled first because ``Path.name`` is
    # only the final path component, not the full path.
    if isinstance(file, (str, os.PathLike)):
        input_pdf = os.fspath(file)
    else:
        input_pdf = file.name

    # splitext removes whatever extension is present instead of blindly
    # chopping the last four characters.
    output_txt_file = f"{os.path.splitext(input_pdf)[0]}.txt"

    # All intermediates live in a private scratch directory: concurrent
    # requests cannot clash, unrelated files in the working directory are
    # never touched, and cleanup happens even when a step fails.
    with tempfile.TemporaryDirectory() as work_dir:
        # Argument lists (no shell) keep odd characters in the uploaded file
        # name from being interpreted as shell syntax.
        subprocess.run(
            ['pdftoppm', '-png', input_pdf, os.path.join(work_dir, 'img')],
            check=True,
        )

        # Sorted so pages are concatenated in a deterministic order; pdftoppm
        # is expected to zero-pad page numbers, making name order page order.
        page_images = sorted(
            name
            for name in os.listdir(work_dir)
            if name.startswith('img') and name.endswith('.png')
        )

        with open(output_txt_file, 'w', encoding='utf-8') as output_file:
            for image in page_images:
                # tesseract appends ".txt" to the output base it is given.
                ocr_base = os.path.join(work_dir, f"ocr_{image}")
                subprocess.run(
                    ['tesseract', os.path.join(work_dir, image), ocr_base],
                    check=True,
                )
                with open(f"{ocr_base}.txt", 'r', encoding='utf-8') as f:
                    output_file.write(f.read())
                output_file.write("\n")

    return output_txt_file
|
|
|
|
|
|
|
|
|
# Gradio front end: one file upload in, one downloadable file out, with
# process_pdf doing the conversion in between.
interface = gr.Interface(
    title="PDF to Text with OCR",
    description="Upload a PDF, perform OCR on it.",
    fn=process_pdf,
    inputs=gr.File(),
    outputs=gr.File(),
)

# Start serving the app with Gradio's debug flag switched on.
interface.launch(debug=True)
|
|