"""Gradio app that converts a PDF to plain text with Tesseract OCR.

Each PDF page is rendered to a PNG with PyMuPDF, recognised with
tesserocr, and the per-page texts are joined into a downloadable
``.txt`` file.
"""
import logging
import os
import tempfile
import threading
from multiprocessing.pool import Pool

import fitz  # PyMuPDF, imported as fitz for backward compatibility reasons
import gradio as gr
import tesserocr
from PIL import Image

logging.basicConfig(level=logging.INFO)

# One long-lived Tesseract handle per supported language. "pol" loads its
# trained data from ./tessdata; "eng" uses tesserocr's defaults.
APIs = {
    "pol": tesserocr.PyTessBaseAPI(lang="pol", path="./tessdata"),
    "eng": tesserocr.PyTessBaseAPI(),
}

# The handles above are shared by every request and the app is queued with
# concurrency_count=10, so each SetImage/GetUTF8Text pair must run under a
# lock; otherwise one request can read the text of another request's image.
_API_LOCKS = {lang: threading.Lock() for lang in APIs}


def pdf_to_image(pdf_file, path, progress, max_pages):
    """Render pages of a PDF to PNG files using PyMuPDF.

    Args:
        pdf_file: Object whose ``name`` attribute is the path of the PDF.
        path: Existing directory that receives ``page-<n>.png`` files
            (numbered from 1).
        progress: Object exposing a ``tqdm(iterable, desc=..., total=...)``
            method used to report progress.
        max_pages: Maximum number of pages to render. ``0``, ``None`` or a
            negative value renders every page. May be a float (it comes
            from a UI number field); it is truncated to an int.

    Returns:
        List of the PNG file paths that were written, in page order.
    """
    limit = max(int(max_pages or 0), 0)
    fnames = []
    # The context manager closes the document even if rendering fails.
    with fitz.open(pdf_file.name) as doc:
        page_count = len(doc)
        # Never report more pages than the document actually has.
        total = min(limit, page_count) if limit else page_count
        for page_no in progress.tqdm(
            range(total), desc="Converting PDF to image", total=total
        ):
            pix = doc.load_page(page_no).get_pixmap()
            output = os.path.join(path, f"page-{page_no + 1}.png")
            pix.save(output)
            fnames.append(output)
    return fnames


def tesseract_ocr(image, language, max_pages, progress=gr.Progress()):
    """Run Tesseract OCR over a PDF and return the path of a text file.

    Args:
        image: Uploaded PDF file object (its ``name`` is the file path).
        language: Key into ``APIs`` selecting the OCR language.
        max_pages: Maximum number of pages to process; ``0`` means all.
        progress: Gradio progress tracker (injected by Gradio).

    Returns:
        Path of a temporary UTF-8 ``.txt`` file holding the recognised
        text of each page, separated by newlines. The file is not deleted
        automatically.

    Raises:
        gr.Error: If no PDF file was supplied.
        KeyError: If ``language`` is not a key of ``APIs``.
    """
    if image is None:
        raise gr.Error("Please upload a PDF file.")

    api = APIs[language]
    api_lock = _API_LOCKS[language]

    text_res = []
    # Page images only live for the duration of the OCR pass.
    with tempfile.TemporaryDirectory() as path:
        images = pdf_to_image(image, path, progress, max_pages)
        for img_path in progress.tqdm(images, desc="Running OCR"):
            with Image.open(img_path) as img:
                img.load()
                # SetImage and GetUTF8Text must not interleave with
                # another request using the same shared handle.
                with api_lock:
                    api.SetImage(img)
                    text_res.append(api.GetUTF8Text())

    # Explicit UTF-8 so non-ASCII (e.g. Polish) text cannot fail on a
    # platform whose default encoding is narrower.
    with tempfile.NamedTemporaryFile(
        mode="w", delete=False, suffix=".txt", encoding="utf-8"
    ) as file:
        file.write("\n".join(text_res))
    return file.name


if __name__ == "__main__":
    logging.info("Starting Tesseract OCR")
    iface = gr.Interface(
        fn=tesseract_ocr,
        inputs=[
            gr.File(label="PDF file"),
            gr.Dropdown(["eng", "pol"], label="Language", value="eng"),
            gr.Number(label="Number of pages", value=0),
        ],
        outputs=gr.File(label="Text file", type="file"),
        title="PDF to Text Converter",
        description="Converts a PDF file to text using Tesseract OCR.",
    ).queue(concurrency_count=10)
    iface.launch(server_port=7860, server_name="0.0.0.0")