|
|
"""OCR engine initializers and runners with safer Tesseract handling.""" |
|
|
|
|
|
import os |
|
|
import sys |
|
|
import tempfile |
|
|
import numpy as np |
|
|
|
|
|
try: |
|
|
import easyocr |
|
|
except Exception: |
|
|
easyocr = None |
|
|
|
|
|
try: |
|
|
from doctr.io import DocumentFile |
|
|
from doctr.models import ocr_predictor |
|
|
except Exception: |
|
|
DocumentFile = None |
|
|
ocr_predictor = None |
|
|
|
|
|
try: |
|
|
from paddleocr import PaddleOCR |
|
|
except Exception: |
|
|
PaddleOCR = None |
|
|
|
|
|
try: |
|
|
import pytesseract |
|
|
except Exception: |
|
|
pytesseract = None |
|
|
|
|
|
try: |
|
|
import cv2 |
|
|
except Exception: |
|
|
cv2 = None |
|
|
|
|
|
|
|
|
def initialize_ocr_models(ocr_models, language_code, device):
    """Create reader objects for the requested OCR engines.

    Engines whose Python package failed to import (module-level name is
    ``None``) are silently skipped.

    Args:
        ocr_models: Container of engine names; the names checked are
            ``"EasyOCR"``, ``"DocTR"``, ``"PaddleOCR"`` and ``"Tesseract"``.
        language_code: Language identifier forwarded to EasyOCR and PaddleOCR.
        device: Device label; the exact string ``"GPU (CUDA)"`` enables GPU
            use for EasyOCR and PaddleOCR, anything else means CPU.

    Returns:
        dict mapping engine name to its initialized reader. Tesseract gets no
        entry: it has no reader object, only the binary path is configured.
    """
    use_gpu = device == "GPU (CUDA)"
    ocr_readers = {}

    if "EasyOCR" in ocr_models and easyocr is not None:
        ocr_readers["EasyOCR"] = easyocr.Reader([language_code], gpu=use_gpu)

    if "DocTR" in ocr_models and ocr_predictor is not None:
        # NOTE(review): `device` is not forwarded to DocTR here - confirm
        # whether the predictor should be moved to the GPU explicitly.
        ocr_readers["DocTR"] = ocr_predictor(pretrained=True)

    if "PaddleOCR" in ocr_models and PaddleOCR is not None:
        ocr_readers["PaddleOCR"] = PaddleOCR(lang=language_code, use_gpu=use_gpu)

    if "Tesseract" in ocr_models and pytesseract is not None:
        if sys.platform.startswith("win"):
            candidates = (r"C:\Program Files\Tesseract-OCR\tesseract.exe",)
        else:
            candidates = ("/usr/bin/tesseract", "/usr/local/bin/tesseract")

        # Only override pytesseract's command when a known install location
        # actually exists; otherwise keep its current setting so a binary
        # reachable through PATH (or configured by the caller) still works.
        for candidate in candidates:
            if os.path.exists(candidate):
                pytesseract.pytesseract.tesseract_cmd = candidate
                break

    return ocr_readers
|
|
|
|
|
|
|
|
def _easyocr_text(reader, image):
    """Run an EasyOCR reader and join the recognized strings with newlines."""
    detections = reader.readtext(np.array(image))
    # Each detection is indexed as (box, text, confidence); keep the text.
    return "\n".join(detection[1] for detection in detections)


def _doctr_text(predictor, image):
    """Run a DocTR predictor on *image* via a temporary PNG file."""
    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
    file_path = tmp_file.name
    try:
        # Close the handle before DocTR re-opens the path.
        with tmp_file:
            image.save(tmp_file, format="PNG")
        result = predictor(DocumentFile.from_images(file_path))
    finally:
        # Always remove the temp file, even when saving or inference fails.
        try:
            os.unlink(file_path)
        except OSError:
            pass

    pages = []
    for page in result.pages:
        blocks = [
            "\n".join(
                " ".join(word.value for word in line.words)
                for line in block.lines
            )
            for block in page.blocks
        ]
        pages.append("\n\n".join(blocks))
    return "\n\n".join(pages)


def _paddle_text(result):
    """Join the recognized strings from a PaddleOCR-style nested result.

    Expects ``result`` to be a list of pages, each page a list of
    ``[box, (text, confidence)]`` entries. A ``None`` result, ``None`` or
    empty pages, and malformed entries are skipped instead of raising.
    """
    tokens = []
    for page in result or []:
        if not page:
            continue
        for entry in page:
            if not isinstance(entry, (list, tuple)) or len(entry) < 2:
                continue
            recognized = entry[1]
            if (
                isinstance(recognized, (list, tuple))
                and recognized
                and isinstance(recognized[0], str)
            ):
                tokens.append(recognized[0])
    return "\n".join(tokens)


def _tesseract_text(image, language_code):
    """Run Tesseract on *image*, honoring *language_code* when possible."""
    try:
        if image.mode != "RGB":
            image = image.convert("RGB")
    except Exception:
        # Best effort: inputs without a PIL-style mode/convert (for example
        # plain arrays) are passed through unchanged.
        pass

    # pytesseract treats arrays as RGB, so hand it the RGB data directly
    # rather than a BGR-swapped copy.
    rgb_array = np.array(image)

    config = "--oem 3 --psm 6"
    if language_code:
        config = f"{config} -l {language_code}"

    try:
        return pytesseract.image_to_string(rgb_array, config=config)
    except Exception:
        # The configured run can fail, e.g. when the language code is not an
        # installed Tesseract language; fall back to Tesseract's defaults.
        pass

    try:
        return pytesseract.image_to_string(rgb_array)
    except Exception as e:
        return f"[Tesseract error: {e}]"


def perform_ocr(model_name, ocr_readers, image, language_code):
    """Extract text from *image* with the OCR engine named *model_name*.

    Args:
        model_name: One of ``"EasyOCR"``, ``"DocTR"``, ``"PaddleOCR"`` or
            ``"Tesseract"``. Any other value yields an empty string.
        ocr_readers: Mapping of engine name to initialized reader; looked up
            for EasyOCR, DocTR and PaddleOCR.
        image: Image to read. It is converted with ``np.array`` for EasyOCR,
            PaddleOCR and Tesseract, and saved through ``image.save`` for
            DocTR (presumably a PIL image - confirm against callers).
        language_code: Language passed to Tesseract; unused by other engines
            here because their readers are already language-bound.

    Returns:
        The recognized text, or a bracketed placeholder such as
        ``"[EasyOCR not available]"`` when the engine is missing, or
        ``"[Tesseract error: ...]"`` when Tesseract fails.
    """
    if model_name == "EasyOCR":
        reader = ocr_readers.get("EasyOCR")
        if reader is None:
            return "[EasyOCR not available]"
        return _easyocr_text(reader, image)

    if model_name == "DocTR":
        predictor = ocr_readers.get("DocTR")
        if predictor is None or DocumentFile is None:
            return "[DocTR not available]"
        return _doctr_text(predictor, image)

    if model_name == "PaddleOCR":
        reader = ocr_readers.get("PaddleOCR")
        if reader is None:
            return "[PaddleOCR not available]"
        return _paddle_text(reader.ocr(np.array(image)))

    if model_name == "Tesseract":
        if pytesseract is None:
            return "[pytesseract not available]"
        return _tesseract_text(image, language_code)

    return ""
|
|
|