Shirochi committed on
Commit f631cce · verified · 1 parent: cb72edf

Upload 5 files

manga_integration.py ADDED
The diff for this file is too large to render. See raw diff
 
manga_translator.py ADDED
The diff for this file is too large to render. See raw diff
 
memory_usage_reporter.py ADDED
@@ -0,0 +1,244 @@
# memory_usage_reporter.py
"""
Background memory usage reporter.
- Logs process RSS, VMS, peak (if available), GC counts, and optional tracemalloc stats
- Writes to logs/memory.log only (it does not propagate to the root logger or console)
- Designed to be lightweight and safe in GUI apps
"""
import os
import sys
import time
import threading
import logging
import gc
from logging.handlers import RotatingFileHandler

try:
    import psutil
except Exception:
    psutil = None

# Global singletons
_GLOBAL_THREAD = None
_GLOBAL_STOP = threading.Event()
_GLOBAL_LOCK = threading.Lock()  # shared lock guarding start of the singleton thread

def _ensure_logs_dir() -> str:
    # Prefer explicit override from main app
    try:
        env_dir = os.environ.get("GLOSSARION_LOG_DIR")
        if env_dir:
            dir_path = os.path.expanduser(env_dir)
            os.makedirs(dir_path, exist_ok=True)
            return dir_path
    except Exception:
        pass

    def _can_write(p: str) -> bool:
        try:
            os.makedirs(p, exist_ok=True)
            test_file = os.path.join(p, ".write_test")
            with open(test_file, "w", encoding="utf-8") as f:
                f.write("ok")
            os.remove(test_file)
            return True
        except Exception:
            return False

    # Frozen exe: try next to the executable first
    try:
        if getattr(sys, 'frozen', False) and hasattr(sys, 'executable'):
            exe_dir = os.path.dirname(sys.executable)
            candidate = os.path.join(exe_dir, "logs")
            if _can_write(candidate):
                return candidate
    except Exception:
        pass

    # User-local app data (persistent and writable)
    try:
        base = os.environ.get('LOCALAPPDATA') or os.environ.get('APPDATA') or os.path.expanduser('~')
        candidate = os.path.join(base, 'Glossarion', 'logs')
        if _can_write(candidate):
            return candidate
    except Exception:
        pass

    # Development fallback: next to this file
    try:
        base_dir = os.path.abspath(os.path.dirname(__file__))
        candidate = os.path.join(base_dir, "logs")
        if _can_write(candidate):
            return candidate
    except Exception:
        pass

    # Final fallback: CWD
    fallback = os.path.join(os.getcwd(), "logs")
    os.makedirs(fallback, exist_ok=True)
    return fallback


def _make_logger() -> logging.Logger:
    logger = logging.getLogger("memory")
    logger.setLevel(logging.INFO)

    # Avoid duplicate handlers if called more than once
    if not any(isinstance(h, RotatingFileHandler) for h in logger.handlers):
        logs_dir = _ensure_logs_dir()
        file_path = os.path.join(logs_dir, "memory.log")
        fh = RotatingFileHandler(file_path, maxBytes=2 * 1024 * 1024, backupCount=3, encoding="utf-8")
        fmt = logging.Formatter(
            fmt="%(asctime)s %(levelname)s [%(process)d:%(threadName)s] %(name)s: %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
        )
        fh.setFormatter(fmt)
        logger.addHandler(fh)

    # Do NOT propagate to root; keep memory logs out of console and only in memory.log
    logger.propagate = False
    return logger


def _get_process() -> "psutil.Process | None":
    if psutil is None:
        return None
    try:
        return psutil.Process()
    except Exception:
        return None


def _format_bytes(num: int) -> str:
    try:
        for unit in ["B", "KB", "MB", "GB", "TB"]:
            if num < 1024.0:
                return f"{num:,.1f}{unit}"
            num /= 1024.0
        return f"{num:,.1f}PB"
    except Exception:
        return str(num)


def _collect_stats(proc) -> dict:
    stats = {}
    try:
        if proc is not None:
            mi = proc.memory_info()
            stats["rss"] = mi.rss
            stats["vms"] = getattr(mi, "vms", 0)
            # Peak RSS on Windows via psutil.Process.memory_info() may expose peak_wset in private API; skip for portability
        else:
            stats["rss"] = 0
            stats["vms"] = 0
    except Exception:
        stats["rss"] = stats.get("rss", 0)
        stats["vms"] = stats.get("vms", 0)

    # GC stats
    try:
        counts = gc.get_count()
        stats["gc"] = counts
    except Exception:
        stats["gc"] = (0, 0, 0)

    return stats


def _worker(interval_sec: float, include_tracemalloc: bool):
    """Memory usage monitoring worker - runs in a background thread."""
    try:
        log = _make_logger()
        proc = _get_process()

        # Optional tracemalloc
        if include_tracemalloc:
            try:
                import tracemalloc
                if not tracemalloc.is_tracing():
                    tracemalloc.start()
                tm_enabled = True
            except Exception:
                tm_enabled = False
        else:
            tm_enabled = False
    except Exception:
        # If initialization fails, exit the thread gracefully
        return

    # Main monitoring loop with additional safety
    while not _GLOBAL_STOP.is_set():
        try:
            st = _collect_stats(proc)
            rss = st.get("rss", 0)
            vms = st.get("vms", 0)
            gc0, gc1, gc2 = st.get("gc", (0, 0, 0))

            msg = (
                f"RSS={_format_bytes(rss)} VMS={_format_bytes(vms)} "
                f"GC={gc0}/{gc1}/{gc2}"
            )

            if tm_enabled:
                try:
                    import tracemalloc
                    cur, peak = tracemalloc.get_traced_memory()
                    msg += f" TM_CUR={_format_bytes(cur)} TM_PEAK={_format_bytes(peak)}"
                except Exception:
                    pass

            log.info(msg)
        except Exception as e:
            try:
                log.warning("memory reporter error: %s", e)
            except Exception:
                pass
        finally:
            # Use a single sleep with timeout instead of multiple small sleeps
            # This reduces thread switching overhead that can cause GIL issues
            try:
                _GLOBAL_STOP.wait(timeout=interval_sec)
            except Exception:
                # Fallback to regular sleep if wait fails
                time.sleep(interval_sec)


def start_global_memory_logger(interval_sec: float = 3.0, include_tracemalloc: bool = False) -> None:
    """Start the background memory logger once per process.

    interval_sec: how often to log
    include_tracemalloc: if True, also log tracemalloc current/peak
    """
    global _GLOBAL_THREAD

    # Thread-safe check (must use the shared module-level lock; a fresh
    # threading.Lock() created here would be a no-op)
    with _GLOBAL_LOCK:
        if _GLOBAL_THREAD and _GLOBAL_THREAD.is_alive():
            return

        # Clear the stop event before starting
        _GLOBAL_STOP.clear()

        try:
            t = threading.Thread(
                target=_worker,
                args=(interval_sec, include_tracemalloc),
                name="mem-logger",
                daemon=True
            )
            t.start()
            _GLOBAL_THREAD = t
        except Exception:
            # Do not raise, to avoid breaking GUI startup
            _GLOBAL_THREAD = None


def stop_global_memory_logger() -> None:
    try:
        _GLOBAL_STOP.set()
        if _GLOBAL_THREAD and _GLOBAL_THREAD.is_alive():
            # Give it a moment to exit
            _GLOBAL_THREAD.join(timeout=2.0)
    except Exception:
        pass
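
A minimal usage sketch of the two public functions above (run_gui is a hypothetical stand-in for the application's main loop):

# Sketch: wire the reporter into app startup/shutdown.
from memory_usage_reporter import start_global_memory_logger, stop_global_memory_logger

start_global_memory_logger(interval_sec=5.0, include_tracemalloc=False)  # no-op if already running
try:
    run_gui()  # hypothetical main loop
finally:
    stop_global_memory_logger()  # sets the stop event and joins for up to 2s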
ocr_manager.py ADDED
@@ -0,0 +1,1968 @@
# ocr_manager.py
"""
OCR Manager for handling multiple OCR providers
Handles installation, model downloading, and OCR processing
Updated with HuggingFace donut model and proper bubble detection integration
"""
import os
import sys
import cv2
import json
import subprocess
import threading
import traceback
from typing import List, Dict, Optional, Tuple, Any
import numpy as np
from dataclasses import dataclass
from PIL import Image
import logging
import time
import random
import base64
import io
import requests

try:
    import gptqmodel
    HAS_GPTQ = True
except ImportError:
    try:
        import auto_gptq
        HAS_GPTQ = True
    except ImportError:
        HAS_GPTQ = False

try:
    import optimum
    HAS_OPTIMUM = True
except ImportError:
    HAS_OPTIMUM = False

try:
    import accelerate
    HAS_ACCELERATE = True
except ImportError:
    HAS_ACCELERATE = False

logger = logging.getLogger(__name__)

@dataclass
class OCRResult:
    """Unified OCR result format with built-in sanitization to prevent data corruption."""
    text: str
    bbox: Tuple[int, int, int, int]  # x, y, w, h
    confidence: float
    vertices: Optional[List[Tuple[int, int]]] = None

    def __post_init__(self):
        """
        Called automatically after the object is created.
        Acts as a final safeguard to ensure the 'text' attribute is ALWAYS a clean string.
        """
        # Defensive fix: if the text we received is a tuple, extract the first
        # element, so a tuple can never survive into a finished object.
        if isinstance(self.text, tuple):
            # Log that we are fixing a critical data error.
            print(f"CRITICAL WARNING: Corrupted tuple detected in OCRResult. Sanitizing '{self.text}' to '{self.text[0]}'.")
            self.text = self.text[0]

        # Ensure the final result is always a stripped string.
        self.text = str(self.text).strip()

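A quick sketch of the sanitization guarantee above (uses only the dataclass just defined; the sample strings are illustrative):

# A corrupted (text, score) tuple is coerced to its first element...
r = OCRResult(text=("こんにちは", 0.93), bbox=(10, 20, 120, 40), confidence=0.9)
assert r.text == "こんにちは"
# ...and plain strings are always stripped.
r2 = OCRResult(text="  hello  ", bbox=(0, 0, 50, 20), confidence=0.8)
assert r2.text == "hello"
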
class OCRProvider:
    """Base class for OCR providers"""

    def __init__(self, log_callback=None):
        # Set thread limits early if environment indicates single-threaded mode
        try:
            if os.environ.get('OMP_NUM_THREADS') == '1':
                # Already in single-threaded mode, ensure it's applied to this process
                try:
                    import sys
                    if 'torch' in sys.modules:
                        import torch
                        torch.set_num_threads(1)
                except (ImportError, RuntimeError, AttributeError):
                    pass
                try:
                    import cv2
                    cv2.setNumThreads(1)
                except (ImportError, AttributeError):
                    pass
        except Exception:
            pass

        self.log_callback = log_callback
        self.is_installed = False
        self.is_loaded = False
        self.model = None
        self.stop_flag = None
        self._stopped = False

    def _log(self, message: str, level: str = "info"):
        """Log message with stop suppression"""
        # Suppress logs when stopped (allow only essential stop confirmation messages)
        if self._check_stop():
            essential_stop_keywords = [
                "⏹️ Translation stopped by user",
                "⏹️ OCR processing stopped",
                "cleanup", "🧹"
            ]
            if not any(keyword in message for keyword in essential_stop_keywords):
                return

        if self.log_callback:
            self.log_callback(message, level)
        else:
            print(f"[{level.upper()}] {message}")

    def set_stop_flag(self, stop_flag):
        """Set the stop flag for checking interruptions"""
        self.stop_flag = stop_flag
        self._stopped = False

    def _check_stop(self) -> bool:
        """Check if stop has been requested"""
        if self._stopped:
            return True
        if self.stop_flag and self.stop_flag.is_set():
            self._stopped = True
            return True
        # Check global manga translator cancellation
        try:
            from manga_translator import MangaTranslator
            if MangaTranslator.is_globally_cancelled():
                self._stopped = True
                return True
        except Exception:
            pass
        return False

    def reset_stop_flags(self):
        """Reset stop flags when starting new processing"""
        self._stopped = False

    def check_installation(self) -> bool:
        """Check if provider is installed"""
        raise NotImplementedError

    def install(self, progress_callback=None) -> bool:
        """Install the provider"""
        raise NotImplementedError

    def load_model(self, **kwargs) -> bool:
        """Load the OCR model"""
        raise NotImplementedError

    def detect_text(self, image: np.ndarray, **kwargs) -> List[OCRResult]:
        """Detect text in image"""
        raise NotImplementedError

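The four NotImplementedError methods are the whole provider contract; a minimal hypothetical subclass (a stub, not a real OCR engine) would look like:

# Hypothetical stub provider illustrating the OCRProvider contract.
class EchoOCRProvider(OCRProvider):
    def check_installation(self) -> bool:
        self.is_installed = True  # nothing external to check
        return True

    def install(self, progress_callback=None) -> bool:
        return True  # nothing to install

    def load_model(self, **kwargs) -> bool:
        self.is_loaded = True  # nothing to load
        return True

    def detect_text(self, image: np.ndarray, **kwargs) -> List[OCRResult]:
        h, w = image.shape[:2]
        # A real provider would run OCR here; the stub returns one full-frame box.
        return [OCRResult(text="stub", bbox=(0, 0, w, h), confidence=1.0)]
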
class CustomAPIProvider(OCRProvider):
    """Custom API OCR provider that uses existing GUI variables"""

    def __init__(self, log_callback=None):
        super().__init__(log_callback)

        # Use EXISTING environment variables from TranslatorGUI
        self.api_url = os.environ.get('OPENAI_CUSTOM_BASE_URL', '')
        self.api_key = os.environ.get('API_KEY', '') or os.environ.get('OPENAI_API_KEY', '')
        self.model_name = os.environ.get('MODEL', 'gpt-4o-mini')

        # OCR prompt - use system prompt or a dedicated OCR prompt variable
        self.ocr_prompt = os.environ.get('OCR_SYSTEM_PROMPT',
            os.environ.get('SYSTEM_PROMPT',
                "YOU ARE A TEXT EXTRACTION MACHINE. EXTRACT EXACTLY WHAT YOU SEE.\n\n"
                "ABSOLUTE RULES:\n"
                "1. OUTPUT ONLY THE VISIBLE TEXT/SYMBOLS - NOTHING ELSE\n"
                "2. NEVER TRANSLATE OR MODIFY\n"
                "3. NEVER EXPLAIN, DESCRIBE, OR COMMENT\n"
                "4. NEVER SAY \"I can't\" or \"I cannot\" or \"no text\" or \"blank image\"\n"
                "5. IF YOU SEE DOTS, OUTPUT THE DOTS: .\n"
                "6. IF YOU SEE PUNCTUATION, OUTPUT THE PUNCTUATION\n"
                "7. IF YOU SEE A SINGLE CHARACTER, OUTPUT THAT CHARACTER\n"
                "8. IF YOU SEE NOTHING, OUTPUT NOTHING (empty response)\n\n"
                "LANGUAGE PRESERVATION:\n"
                "- Korean text → Output in Korean\n"
                "- Japanese text → Output in Japanese\n"
                "- Chinese text → Output in Chinese\n"
                "- English text → Output in English\n\n"
                "FORMATTING:\n"
                "- OUTPUT ALL TEXT ON A SINGLE LINE WITH NO LINE BREAKS\n"
                "- NEVER use \\n or line breaks in your output\n\n"
                "FORBIDDEN RESPONSES:\n"
                "- \"I can see this appears to be...\"\n"
                "- \"I cannot make out any clear text...\"\n"
                "- \"This appears to be blank...\"\n"
                "- \"If there is text present...\"\n"
                "- ANY explanatory text\n\n"
                "YOUR ONLY OUTPUT: The exact visible text. Nothing more. Nothing less.\n"
                "If image has a dot → Output: .\n"
                "If image has two dots → Output: . .\n"
                "If image has text → Output: [that text]\n"
                "If image is truly blank → Output: [empty/no response]"
            ))

        # Use existing temperature and token settings
        self.temperature = float(os.environ.get('TRANSLATION_TEMPERATURE', '0.01'))
        # NOTE: max_tokens is NOT cached here - it's read fresh from the environment
        # to ensure we always get the latest value from the GUI

        # Image settings from existing compression variables
        self.image_format = 'jpeg' if os.environ.get('IMAGE_COMPRESSION_FORMAT', 'auto') != 'png' else 'png'
        self.image_quality = int(os.environ.get('JPEG_QUALITY', '100'))

        # Simple defaults
        self.api_format = 'openai'  # Most custom endpoints are OpenAI-compatible
        self.timeout = int(os.environ.get('CHUNK_TIMEOUT', '30'))
        self.api_headers = {}  # Additional custom headers

        # Retry configuration for Custom API OCR calls
        self.max_retries = int(os.environ.get('CUSTOM_OCR_MAX_RETRIES', '3'))
        self.retry_initial_delay = float(os.environ.get('CUSTOM_OCR_RETRY_INITIAL_DELAY', '0.8'))
        self.retry_backoff = float(os.environ.get('CUSTOM_OCR_RETRY_BACKOFF', '1.8'))
        self.retry_jitter = float(os.environ.get('CUSTOM_OCR_RETRY_JITTER', '0.4'))
        self.retry_on_empty = os.environ.get('CUSTOM_OCR_RETRY_ON_EMPTY', '1') == '1'

    def check_installation(self) -> bool:
        """Always installed - uses UnifiedClient"""
        self.is_installed = True
        return True

    def install(self, progress_callback=None) -> bool:
        """No installation needed for API-based provider"""
        return self.check_installation()

    def load_model(self, **kwargs) -> bool:
        """Initialize UnifiedClient with current settings"""
        try:
            from unified_api_client import UnifiedClient

            # Support passing API key from GUI if available
            if 'api_key' in kwargs:
                api_key = kwargs['api_key']
            else:
                api_key = os.environ.get('API_KEY', '') or os.environ.get('OPENAI_API_KEY', '')

            if 'model' in kwargs:
                model = kwargs['model']
            else:
                model = os.environ.get('MODEL', 'gpt-4o-mini')

            if not api_key:
                self._log("❌ No API key configured", "error")
                return False

            # Create UnifiedClient just like translations do
            self.client = UnifiedClient(model=model, api_key=api_key)

            #self._log(f"✅ Using {model} for OCR via UnifiedClient")
            self.is_loaded = True
            return True

        except Exception as e:
            self._log(f"❌ Failed to initialize UnifiedClient: {str(e)}", "error")
            return False

    def _test_connection(self) -> bool:
        """Test API connection with a simple request"""
        try:
            # Create a small test image
            test_image = np.ones((100, 100, 3), dtype=np.uint8) * 255
            cv2.putText(test_image, "TEST", (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 0), 2)

            # Encode image
            image_base64 = self._encode_image(test_image)

            # Prepare test request based on API format
            if self.api_format == 'openai':
                test_payload = {
                    "model": self.model_name,
                    "messages": [
                        {
                            "role": "user",
                            "content": [
                                {"type": "text", "text": "What text do you see?"},
                                {"type": "image_url", "image_url": {"url": f"data:image/{self.image_format};base64,{image_base64}"}}
                            ]
                        }
                    ],
                    "max_tokens": 50
                }
            else:
                # For other formats, just try a basic health check
                return True

            headers = self._prepare_headers()
            response = requests.post(
                self.api_url,
                headers=headers,
                json=test_payload,
                timeout=10
            )

            return response.status_code == 200

        except Exception:
            return False

    def _encode_image(self, image: np.ndarray) -> str:
        """Encode numpy array to base64 string"""
        # Convert BGR to RGB if needed
        if len(image.shape) == 3 and image.shape[2] == 3:
            image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        else:
            image_rgb = image

        # Convert to PIL Image
        pil_image = Image.fromarray(image_rgb)

        # Save to bytes buffer
        buffer = io.BytesIO()
        if self.image_format.lower() == 'png':
            pil_image.save(buffer, format='PNG')
        else:
            pil_image.save(buffer, format='JPEG', quality=self.image_quality)

        # Encode to base64
        buffer.seek(0)
        image_base64 = base64.b64encode(buffer.read()).decode('utf-8')

        return image_base64

    def _prepare_headers(self) -> dict:
        """Prepare request headers"""
        headers = {
            "Content-Type": "application/json"
        }

        # Add API key if configured
        if self.api_key:
            if self.api_format == 'anthropic':
                headers["x-api-key"] = self.api_key
            else:
                headers["Authorization"] = f"Bearer {self.api_key}"

        # Add any custom headers
        headers.update(self.api_headers)

        return headers

    def _prepare_request_payload(self, image_base64: str) -> dict:
        """Prepare request payload based on API format"""
        # max_tokens is intentionally not cached on the instance; read it fresh
        # so the latest GUI value applies (mirrors detect_text)
        max_tokens = int(os.environ.get('MAX_OUTPUT_TOKENS', '8192'))

        if self.api_format == 'openai':
            return {
                "model": self.model_name,
                "messages": [
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": self.ocr_prompt},
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/{self.image_format};base64,{image_base64}"
                                }
                            }
                        ]
                    }
                ],
                "max_tokens": max_tokens,
                "temperature": self.temperature
            }

        elif self.api_format == 'anthropic':
            return {
                "model": self.model_name,
                "max_tokens": max_tokens,
                "temperature": self.temperature,
                "messages": [
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": self.ocr_prompt
                            },
                            {
                                "type": "image",
                                "source": {
                                    "type": "base64",
                                    "media_type": f"image/{self.image_format}",
                                    "data": image_base64
                                }
                            }
                        ]
                    }
                ]
            }

        else:
            # Custom format - use environment variable for template
            template = os.environ.get('CUSTOM_OCR_REQUEST_TEMPLATE', '{}')
            payload = json.loads(template)

            # Replace placeholders
            payload_str = json.dumps(payload)
            payload_str = payload_str.replace('{{IMAGE_BASE64}}', image_base64)
            payload_str = payload_str.replace('{{PROMPT}}', self.ocr_prompt)
            payload_str = payload_str.replace('{{MODEL}}', self.model_name)
            payload_str = payload_str.replace('{{MAX_TOKENS}}', str(max_tokens))
            payload_str = payload_str.replace('{{TEMPERATURE}}', str(self.temperature))

            return json.loads(payload_str)

    def _extract_text_from_response(self, response_data: dict) -> str:
        """Extract text from API response based on format"""
        try:
            if self.api_format == 'openai':
                # OpenAI format: response.choices[0].message.content
                return response_data.get('choices', [{}])[0].get('message', {}).get('content', '')

            elif self.api_format == 'anthropic':
                # Anthropic format: response.content[0].text
                content = response_data.get('content', [])
                if content and isinstance(content, list):
                    return content[0].get('text', '')
                return ''

            else:
                # Custom format - use environment variable for path
                response_path = os.environ.get('CUSTOM_OCR_RESPONSE_PATH', 'text')

                # Navigate through the response using the path
                result = response_data
                for key in response_path.split('.'):
                    if isinstance(result, dict):
                        result = result.get(key, '')
                    elif isinstance(result, list) and key.isdigit():
                        idx = int(key)
                        result = result[idx] if idx < len(result) else ''
                    else:
                        result = ''
                        break

                return str(result)

        except Exception as e:
            self._log(f"Failed to extract text from response: {e}", "error")
            return ''

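For the custom-format branch, the request template and the response path both come from environment variables; a sketch of plausible values (the endpoint shape is hypothetical):

    # Placeholders are substituted textually into the JSON template...
    os.environ['CUSTOM_OCR_REQUEST_TEMPLATE'] = json.dumps({
        "model": "{{MODEL}}",
        "prompt": "{{PROMPT}}",
        "image": "{{IMAGE_BASE64}}",
        "max_tokens": "{{MAX_TOKENS}}"  # note: arrives as a string after substitution
    })
    # ...and dotted paths with numeric segments index into lists (choices[0].text):
    os.environ['CUSTOM_OCR_RESPONSE_PATH'] = 'choices.0.text'
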
    def detect_text(self, image: np.ndarray, **kwargs) -> List[OCRResult]:
        """Process image using UnifiedClient.send()"""
        results = []

        try:
            # CRITICAL: Reload the OCR prompt from the environment before each detection.
            # This ensures we use the latest prompt set by manga_integration.py
            self.ocr_prompt = os.environ.get('OCR_SYSTEM_PROMPT', self.ocr_prompt)

            # Get fresh max_tokens from environment - the GUI will have set this
            max_tokens = int(os.environ.get('MAX_OUTPUT_TOKENS', '8192'))
            if not self.is_loaded:
                if not self.load_model():
                    return results

            import cv2
            from PIL import Image
            import base64
            import io

            # Validate and resize image if too small (consistent with Google/Azure logic)
            h, w = image.shape[:2]
            MIN_SIZE = 50    # Minimum dimension for good OCR quality
            MIN_AREA = 2500  # Minimum area (50x50)

            # Skip completely invalid/corrupted images (0 or negative dimensions)
            if h <= 0 or w <= 0:
                self._log(f"⚠️ Invalid image dimensions ({w}x{h}px), skipping", "warning")
                return results

            if h < MIN_SIZE or w < MIN_SIZE or h * w < MIN_AREA:
                # Image too small - resize it
                scale_w = MIN_SIZE / w if w < MIN_SIZE else 1.0
                scale_h = MIN_SIZE / h if h < MIN_SIZE else 1.0
                scale = max(scale_w, scale_h)

                if scale > 1.0:
                    new_w = int(w * scale)
                    new_h = int(h * scale)
                    image = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_CUBIC)
                    self._log(f"🔍 Image resized from {w}x{h}px to {new_w}x{new_h}px for Custom API OCR", "debug")
                    h, w = new_h, new_w

            # Convert numpy array to PIL Image
            image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            pil_image = Image.fromarray(image_rgb)

            # Convert PIL Image to base64 string
            buffer = io.BytesIO()

            # Use the image format from settings
            if self.image_format.lower() == 'png':
                pil_image.save(buffer, format='PNG')
            else:
                pil_image.save(buffer, format='JPEG', quality=self.image_quality)

            buffer.seek(0)
            image_base64 = base64.b64encode(buffer.read()).decode('utf-8')

            # For OpenAI vision models, we need BOTH:
            # 1. A system prompt with the instructions
            # 2. A user message that includes the image
            messages = [
                {
                    "role": "system",
                    "content": self.ocr_prompt  # The OCR instruction as the system prompt
                },
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Image:"  # Minimal text, just to have something
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{image_base64}"
                            }
                        }
                    ]
                }
            ]

            # Send this message with the regular send() - NOT send_image;
            # the UnifiedClient should handle this format correctly.

            # Retry-aware call
            from unified_api_client import UnifiedClientError  # local import to avoid hard dependency at module import time
            max_attempts = max(1, self.max_retries)
            attempt = 0
            last_error = None

            # Common refusal/error phrases that indicate a non-OCR response (expanded list)
            refusal_phrases = [
                "I can't extract", "I cannot extract",
                "I'm sorry", "I am sorry",
                "I'm unable", "I am unable",
                "cannot process images",
                "I can't help with that",
                "cannot view images",
                "no text in the image",
                "I can see this appears",
                "I cannot make out",
                "appears to be blank",
                "appears to be a mostly blank",
                "mostly blank or white image",
                "If there is text present",
                "too small, faint, or unclear",
                "cannot accurately extract",
                "may be too",
                "However, I cannot",
                "I don't see any",
                "no clear text",
                "no visible text",
                "does not contain",
                "doesn't contain",
                "I do not see"
            ]

            while attempt < max_attempts:
                # Check for stop before each attempt
                if self._check_stop():
                    self._log("⏹️ OCR processing stopped by user", "warning")
                    return results

                try:
                    response = self.client.send(
                        messages=messages,
                        temperature=self.temperature,
                        max_tokens=max_tokens
                    )

                    # Extract content from the response object
                    content, finish_reason = response

                    # Validate content
                    has_content = bool(content and str(content).strip())
                    refused = False
                    if has_content:
                        # Filter out explicit failure markers
                        if "[" in content and "FAILED]" in content:
                            refused = True
                        elif any(phrase.lower() in content.lower() for phrase in refusal_phrases):
                            refused = True

                    # Decide between success and retry
                    if has_content and not refused:
                        text = str(content).strip()
                        results.append(OCRResult(
                            text=text,
                            bbox=(0, 0, w, h),
                            confidence=kwargs.get('confidence', 0.85),
                            vertices=[(0, 0), (w, 0), (w, h), (0, h)]
                        ))
                        self._log(f"✅ Detected: {text[:50]}...")
                        break  # success
                    else:
                        reason = "empty result" if not has_content else "refusal/non-OCR response"
                        last_error = f"{reason} (finish_reason: {finish_reason})"
                        # Check whether we should retry on empty or refusal
                        should_retry = (not has_content and self.retry_on_empty) or refused
                        attempt += 1
                        if attempt >= max_attempts or not should_retry:
                            # No more retries, or we shouldn't retry
                            if not has_content:
                                self._log(f"⚠️ No text detected (finish_reason: {finish_reason})")
                            else:
                                self._log(f"❌ Model returned non-OCR response: {str(content)[:120]}", "warning")
                            break
                        # Back off before retrying
                        delay = self.retry_initial_delay * (self.retry_backoff ** (attempt - 1)) + random.uniform(0, self.retry_jitter)
                        self._log(f"🔄 Retry {attempt}/{max_attempts - 1} after {delay:.1f}s due to {reason}...", "warning")
                        time.sleep(delay)
                        time.sleep(0.1)  # Brief pause for stability
                        self._log("💤 OCR retry pausing briefly for stability", "debug")
                        continue

                except UnifiedClientError as ue:
                    msg = str(ue)
                    last_error = msg
                    # Do not retry on explicit user cancellation
                    if 'cancelled' in msg.lower() or 'stopped by user' in msg.lower():
                        self._log(f"❌ OCR cancelled: {msg}", "error")
                        break
                    attempt += 1
                    if attempt >= max_attempts:
                        self._log(f"❌ OCR failed after {attempt} attempts: {msg}", "error")
                        break
                    delay = self.retry_initial_delay * (self.retry_backoff ** (attempt - 1)) + random.uniform(0, self.retry_jitter)
                    self._log(f"🔄 API error, retry {attempt}/{max_attempts - 1} after {delay:.1f}s: {msg}", "warning")
                    time.sleep(delay)
                    time.sleep(0.1)  # Brief pause for stability
                    self._log("💤 OCR API error retry pausing briefly for stability", "debug")
                    continue
                except Exception as e_inner:
                    last_error = str(e_inner)
                    attempt += 1
                    if attempt >= max_attempts:
                        self._log(f"❌ OCR exception after {attempt} attempts: {last_error}", "error")
                        break
                    delay = self.retry_initial_delay * (self.retry_backoff ** (attempt - 1)) + random.uniform(0, self.retry_jitter)
                    self._log(f"🔄 Exception, retry {attempt}/{max_attempts - 1} after {delay:.1f}s: {last_error}", "warning")
                    time.sleep(delay)
                    time.sleep(0.1)  # Brief pause for stability
                    self._log("💤 OCR exception retry pausing briefly for stability", "debug")
                    continue

        except Exception as e:
            self._log(f"❌ Error: {str(e)}", "error")
            import traceback
            self._log(traceback.format_exc(), "debug")

        return results

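The retry delay above grows geometrically with jitter; with the defaults (initial 0.8s, backoff 1.8, jitter up to 0.4s), the waits work out roughly as follows:

    # delay = retry_initial_delay * retry_backoff**(attempt - 1) + uniform(0, retry_jitter)
    # after the 1st failure: 0.8 * 1.8**0 + jitter ≈ 0.8-1.2s
    # after the 2nd failure: 0.8 * 1.8**1 + jitter ≈ 1.44-1.84s
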
class MangaOCRProvider(OCRProvider):
    """Manga OCR provider using the HuggingFace model directly"""

    def __init__(self, log_callback=None):
        super().__init__(log_callback)
        self.processor = None
        self.model = None
        self.tokenizer = None

    def check_installation(self) -> bool:
        """Check if transformers is installed"""
        try:
            import transformers
            import torch
            self.is_installed = True
            return True
        except ImportError:
            return False

    def install(self, progress_callback=None) -> bool:
        """Install transformers and torch"""
        pass  # no-op here (installation appears to be handled elsewhere)

    def _is_valid_local_model_dir(self, path: str) -> bool:
        """Check that a local HF model directory has the required files."""
        try:
            if not path or not os.path.isdir(path):
                return False
            needed_any_weights = any(
                os.path.exists(os.path.join(path, name)) for name in (
                    'pytorch_model.bin',
                    'model.safetensors'
                )
            )
            has_config = os.path.exists(os.path.join(path, 'config.json'))
            has_processor = (
                os.path.exists(os.path.join(path, 'preprocessor_config.json')) or
                os.path.exists(os.path.join(path, 'processor_config.json'))
            )
            has_tokenizer = (
                os.path.exists(os.path.join(path, 'tokenizer.json')) or
                os.path.exists(os.path.join(path, 'tokenizer_config.json'))
            )
            return has_config and needed_any_weights and has_processor and has_tokenizer
        except Exception:
            return False

    def load_model(self, **kwargs) -> bool:
        """Load the manga-ocr model, preferring a local directory to avoid re-downloading"""
        print("\n>>> MangaOCRProvider.load_model() called")
        try:
            if not self.is_installed and not self.check_installation():
                print("ERROR: Transformers not installed")
                self._log("❌ Transformers not installed", "error")
                return False

            # Always disable progress bars to avoid tqdm issues in some environments
            import os
            os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1")

            from transformers import VisionEncoderDecoderModel, AutoTokenizer, AutoImageProcessor
            import torch

            # Prefer a local model directory if present to avoid any Hub access
            candidates = []
            env_local = os.environ.get("MANGA_OCR_LOCAL_DIR")
            if env_local:
                candidates.append(env_local)

            # Project root one level up from this file
            root_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
            candidates.append(os.path.join(root_dir, 'models', 'manga-ocr-base'))
            candidates.append(os.path.join(root_dir, 'models', 'kha-white', 'manga-ocr-base'))

            model_source = None
            local_only = False
            # Find a valid local dir
            for cand in candidates:
                if self._is_valid_local_model_dir(cand):
                    model_source = cand
                    local_only = True
                    break

            # If there is no valid local dir, use the Hub
            if not model_source:
                model_source = "kha-white/manga-ocr-base"
                # Make sure we are not forcing offline mode
                if os.environ.get("HF_HUB_OFFLINE") == "1":
                    try:
                        del os.environ["HF_HUB_OFFLINE"]
                    except Exception:
                        pass
                self._log("🔥 Loading manga-ocr model from Hugging Face Hub")
                self._log(f"   Repo: {model_source}")
            else:
                # Only set offline mode when the local dir is fully valid
                os.environ.setdefault("HF_HUB_OFFLINE", "1")
                self._log("🔥 Loading manga-ocr model from local directory")
                self._log(f"   Local path: {model_source}")

            # Decide the target device once; we move after a full CPU load to avoid meta tensors
            use_cuda = torch.cuda.is_available()

            # Try loading components, falling back to the Hub if local-only fails
            def _load_components(source: str, local_flag: bool):
                self._log("   Loading tokenizer...")
                tok = AutoTokenizer.from_pretrained(source, local_files_only=local_flag)

                self._log("   Loading image processor...")
                try:
                    from transformers import AutoProcessor
                except Exception:
                    AutoProcessor = None
                try:
                    proc = AutoImageProcessor.from_pretrained(source, local_files_only=local_flag)
                except Exception as e_proc:
                    if AutoProcessor is not None:
                        self._log(f"   ⚠️ AutoImageProcessor failed: {e_proc}. Trying AutoProcessor...", "warning")
                        proc = AutoProcessor.from_pretrained(source, local_files_only=local_flag)
                    else:
                        raise

                self._log("   Loading model...")
                # Prevent meta tensors by forcing full materialization on CPU at load time
                os.environ.setdefault('TORCHDYNAMO_DISABLE', '1')
                mdl = VisionEncoderDecoderModel.from_pretrained(
                    source,
                    local_files_only=local_flag,
                    low_cpu_mem_usage=False,
                    device_map=None,
                    torch_dtype=torch.float32  # Use torch_dtype instead of dtype
                )
                return tok, proc, mdl

            try:
                self.tokenizer, self.processor, self.model = _load_components(model_source, local_only)
            except Exception as e_local:
                if local_only:
                    # Fall back to the Hub once if the local load fails
                    self._log(f"   ⚠️ Local model load failed: {e_local}", "warning")
                    try:
                        if os.environ.get("HF_HUB_OFFLINE") == "1":
                            del os.environ["HF_HUB_OFFLINE"]
                    except Exception:
                        pass
                    model_source = "kha-white/manga-ocr-base"
                    local_only = False
                    self._log("   Retrying from Hugging Face Hub...")
                    self.tokenizer, self.processor, self.model = _load_components(model_source, local_only)
                else:
                    raise

            # Move to CUDA only after full CPU materialization
            target_device = 'cpu'
            if use_cuda:
                try:
                    self.model = self.model.to('cuda')
                    target_device = 'cuda'
                except Exception as move_err:
                    self._log(f"   ⚠️ Could not move model to CUDA: {move_err}", "warning")
                    target_device = 'cpu'

            # Finalize eval mode
            self.model.eval()

            # Sanity check: ensure no parameter remains on the 'meta' device
            try:
                for n, p in self.model.named_parameters():
                    dev = getattr(p, 'device', None)
                    if dev is not None and getattr(dev, 'type', '') == 'meta':
                        raise RuntimeError(f"Parameter {n} is on 'meta' after load")
            except Exception as sanity_err:
                self._log(f"❌ Manga-OCR model load sanity check failed: {sanity_err}", "error")
                return False

            print(f"SUCCESS: Model loaded on {target_device.upper()}")
            self._log(f"   ✅ Model loaded on {target_device.upper()}")
            self.is_loaded = True
            self._log("✅ Manga OCR model ready")
            print(">>> Returning True from load_model()")
            return True

        except Exception as e:
            print(f"\nEXCEPTION in load_model: {e}")
            import traceback
            print(traceback.format_exc())
            self._log(f"❌ Failed to load manga-ocr model: {str(e)}", "error")
            self._log(traceback.format_exc(), "error")
            try:
                if 'local_only' in locals() and local_only:
                    self._log("Hint: Local load failed. Ensure your models/manga-ocr-base contains the required files (config.json, preprocessor_config.json, tokenizer.json or tokenizer_config.json, and model weights).", "warning")
            except Exception:
                pass
            return False

    def _run_ocr(self, pil_image):
        """Run OCR on a PIL image using the HuggingFace model"""
        import torch

        # Process image (keyword arg for broader compatibility across transformers versions)
        inputs = self.processor(images=pil_image, return_tensors="pt")
        pixel_values = inputs["pixel_values"]

        # Move to the same device as the model
        try:
            model_device = next(self.model.parameters()).device
        except StopIteration:
            model_device = torch.device('cpu')
        pixel_values = pixel_values.to(model_device)

        # Generate text
        with torch.no_grad():
            generated_ids = self.model.generate(pixel_values)

        # Decode
        generated_text = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

        return generated_text

    def detect_text(self, image: np.ndarray, **kwargs) -> List[OCRResult]:
        """
        Process the image region passed to it.
        This could be a bubble region or the full image.
        """
        results = []

        # Check for stop at start
        if self._check_stop():
            self._log("⏹️ Manga-OCR processing stopped by user", "warning")
            return results

        try:
            if not self.is_loaded:
                if not self.load_model():
                    return results

            import cv2
            from PIL import Image

            # Get confidence from kwargs
            confidence = kwargs.get('confidence', 0.7)

            # Convert numpy array to PIL
            image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            pil_image = Image.fromarray(image_rgb)
            h, w = image.shape[:2]

            self._log("🔍 Processing region with manga-ocr...")

            # Check for stop before inference
            if self._check_stop():
                self._log("⏹️ Manga-OCR inference stopped by user", "warning")
                return results

            # Run OCR on the image region
            text = self._run_ocr(pil_image)

            if text and text.strip():
                # Return a result for this region with its actual bbox
                results.append(OCRResult(
                    text=text.strip(),
                    bbox=(0, 0, w, h),  # Relative to the region passed in
                    confidence=confidence,
                    vertices=[(0, 0), (w, 0), (w, h), (0, h)]
                ))
                self._log(f"✅ Detected text: {text[:50]}...")

        except Exception as e:
            self._log(f"❌ Error in manga-ocr: {str(e)}", "error")

        return results

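A minimal usage sketch (the image path is hypothetical; assumes transformers and torch are installed):

    # Sketch: run manga-ocr on a cropped bubble region loaded with OpenCV.
    provider = MangaOCRProvider(log_callback=None)
    bubble = cv2.imread("bubble_crop.png")  # hypothetical path; BGR ndarray
    for r in provider.detect_text(bubble, confidence=0.7):
        print(r.text, r.bbox, r.confidence)
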
class Qwen2VL(OCRProvider):
    """OCR using Qwen2-VL - a vision language model that can read Korean text"""

    def __init__(self, log_callback=None):
        super().__init__(log_callback)
        self.processor = None
        self.model = None
        self.tokenizer = None

        # Get the OCR prompt from the environment or use the default
        self.ocr_prompt = os.environ.get('OCR_SYSTEM_PROMPT',
            "YOU ARE A TEXT EXTRACTION MACHINE. EXTRACT EXACTLY WHAT YOU SEE.\n\n"
            "ABSOLUTE RULES:\n"
            "1. OUTPUT ONLY THE VISIBLE TEXT/SYMBOLS - NOTHING ELSE\n"
            "2. NEVER TRANSLATE OR MODIFY\n"
            "3. NEVER EXPLAIN, DESCRIBE, OR COMMENT\n"
            "4. NEVER SAY \"I can't\" or \"I cannot\" or \"no text\" or \"blank image\"\n"
            "5. IF YOU SEE DOTS, OUTPUT THE DOTS: .\n"
            "6. IF YOU SEE PUNCTUATION, OUTPUT THE PUNCTUATION\n"
            "7. IF YOU SEE A SINGLE CHARACTER, OUTPUT THAT CHARACTER\n"
            "8. IF YOU SEE NOTHING, OUTPUT NOTHING (empty response)\n\n"
            "LANGUAGE PRESERVATION:\n"
            "- Korean text → Output in Korean\n"
            "- Japanese text → Output in Japanese\n"
            "- Chinese text → Output in Chinese\n"
            "- English text → Output in English\n\n"
            "FORMATTING:\n"
            "- OUTPUT ALL TEXT ON A SINGLE LINE WITH NO LINE BREAKS\n"
            "- NEVER use \\n or line breaks in your output\n\n"
            "FORBIDDEN RESPONSES:\n"
            "- \"I can see this appears to be...\"\n"
            "- \"I cannot make out any clear text...\"\n"
            "- \"This appears to be blank...\"\n"
            "- \"If there is text present...\"\n"
            "- ANY explanatory text\n\n"
            "YOUR ONLY OUTPUT: The exact visible text. Nothing more. Nothing less.\n"
            "If image has a dot → Output: .\n"
            "If image has two dots → Output: . .\n"
            "If image has text → Output: [that text]\n"
            "If image is truly blank → Output: [empty/no response]"
        )

    def set_ocr_prompt(self, prompt: str):
        """Allow setting the OCR prompt dynamically"""
        self.ocr_prompt = prompt

    def check_installation(self) -> bool:
        """Check if the required packages are installed"""
        try:
            import transformers
            import torch
            self.is_installed = True
            return True
        except ImportError:
            return False

    def install(self, progress_callback=None) -> bool:
        """Install requirements for Qwen2-VL"""
        pass  # no-op here (installation appears to be handled elsewhere)

    def load_model(self, model_size=None, **kwargs) -> bool:
        """Load a Qwen2-VL model with size selection"""
        self._log(f"DEBUG: load_model called with model_size={model_size}")

        try:
            if not self.is_installed and not self.check_installation():
                self._log("❌ Not installed", "error")
                return False

            self._log("🔥 Loading Qwen2-VL for Advanced OCR...")

            from transformers import AutoProcessor, AutoTokenizer
            import torch

            # Model options
            model_options = {
                "1": "Qwen/Qwen2-VL-2B-Instruct",
                "2": "Qwen/Qwen2-VL-7B-Instruct",
                "3": "Qwen/Qwen2-VL-72B-Instruct",
                "4": "custom"
            }
            # Check for a saved preference first; the default is option "1" (2B)
            if model_size is None:
                # Try to get it from the environment or config
                import os
                model_size = os.environ.get('QWEN2VL_MODEL_SIZE', '1')

            # Determine which model to load
            if model_size and str(model_size).startswith("custom:"):
                # Custom model passed with an ID
                model_id = str(model_size).replace("custom:", "")
                self.loaded_model_size = "Custom"
                self.model_id = model_id
                self._log(f"Loading custom model: {model_id}")
            elif model_size == "4":
                # Custom option selected but no ID - shouldn't happen
                self._log("❌ Custom model selected but no ID provided", "error")
                return False
            elif model_size and str(model_size) in model_options:
                # Standard model option
                option = model_options[str(model_size)]
                if option == "custom":
                    self._log("❌ Custom model needs an ID", "error")
                    return False
                model_id = option
                # Set loaded_model_size for the status display
                if model_size == "1":
                    self.loaded_model_size = "2B"
                elif model_size == "2":
                    self.loaded_model_size = "7B"
                elif model_size == "3":
                    self.loaded_model_size = "72B"
            else:
                # Fall back to the 2B model (option "1")
                model_id = model_options["1"]
                self.loaded_model_size = "2B"
                self._log("No model size specified, defaulting to 2B")

            self._log(f"Loading model: {model_id}")

            # Load the processor and tokenizer
            self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
            self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

            # Load the model - let it figure out the class dynamically
            if torch.cuda.is_available():
                self._log(f"GPU: {torch.cuda.get_device_name(0)}")
                # Use the auto model class
                from transformers import AutoModelForVision2Seq
                self.model = AutoModelForVision2Seq.from_pretrained(
                    model_id,
                    dtype=torch.float16,
                    device_map="auto",
                    trust_remote_code=True
                )
                self._log("✅ Model loaded on GPU")
            else:
                self._log("Loading on CPU...")
                from transformers import AutoModelForVision2Seq
                self.model = AutoModelForVision2Seq.from_pretrained(
                    model_id,
                    dtype=torch.float32,
                    trust_remote_code=True
                )
                self._log("✅ Model loaded on CPU")

            self.model.eval()
            self.is_loaded = True
            self._log("✅ Qwen2-VL ready for Advanced OCR!")
            return True

        except Exception as e:
            self._log(f"❌ Failed to load: {str(e)}", "error")
            import traceback
            self._log(traceback.format_exc(), "debug")
            return False

1100
+ def detect_text(self, image: np.ndarray, **kwargs) -> List[OCRResult]:
1101
+ """Process image with Qwen2-VL for Korean text extraction"""
1102
+ results = []
1103
+ if hasattr(self, 'model_id'):
1104
+ self._log(f"DEBUG: Using model: {self.model_id}", "debug")
1105
+
1106
+ # Check if OCR prompt was passed in kwargs (for dynamic updates)
1107
+ if 'ocr_prompt' in kwargs:
1108
+ self.ocr_prompt = kwargs['ocr_prompt']
1109
+
1110
+ try:
1111
+ if not self.is_loaded:
1112
+ if not self.load_model():
1113
+ return results
1114
+
1115
+ import cv2
1116
+ from PIL import Image
1117
+ import torch
1118
+
1119
+ # Convert to PIL
1120
+ image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
1121
+ pil_image = Image.fromarray(image_rgb)
1122
+ h, w = image.shape[:2]
1123
+
1124
+ self._log(f"🔍 Processing with Qwen2-VL ({w}x{h} pixels)...")
1125
+
1126
+ # Use the configurable OCR prompt
1127
+ messages = [
1128
+ {
1129
+ "role": "user",
1130
+ "content": [
1131
+ {
1132
+ "type": "image",
1133
+ "image": pil_image,
1134
+ },
1135
+ {
1136
+ "type": "text",
1137
+ "text": self.ocr_prompt # Use the configurable prompt
1138
+ }
1139
+ ]
1140
+ }
1141
+ ]
1142
+
1143
+ # Alternative simpler prompt if the above still causes issues:
1144
+ # "text": "OCR: Extract text as-is"
1145
+
1146
+ # Process with Qwen2-VL
1147
+ text = self.processor.apply_chat_template(
1148
+ messages,
1149
+ tokenize=False,
1150
+ add_generation_prompt=True
1151
+ )
1152
+
1153
+ inputs = self.processor(
1154
+ text=[text],
1155
+ images=[pil_image],
1156
+ padding=True,
1157
+ return_tensors="pt"
1158
+ )
1159
+
1160
+ # Get the device and dtype the model is currently on
1161
+ model_device = next(self.model.parameters()).device
1162
+ model_dtype = next(self.model.parameters()).dtype
1163
+
1164
+ # Move inputs to the same device as the model and cast float tensors to model dtype
1165
+ try:
1166
+ # Move first
1167
+ inputs = inputs.to(model_device)
1168
+ # Then align dtypes only for floating tensors (e.g., pixel_values)
1169
+ for k, v in inputs.items():
1170
+ if isinstance(v, torch.Tensor) and torch.is_floating_point(v):
1171
+ inputs[k] = v.to(model_dtype)
1172
+ except Exception:
1173
+ # Fallback: ensure at least pixel_values is correct if present
1174
+ try:
1175
+ if isinstance(inputs, dict) and "pixel_values" in inputs:
1176
+ pv = inputs["pixel_values"].to(model_device)
1177
+ if torch.is_floating_point(pv):
1178
+ inputs["pixel_values"] = pv.to(model_dtype)
1179
+ except Exception:
1180
+ pass
1181
+
1182
+ # Ensure pixel_values explicitly matches model dtype if present
1183
+ try:
1184
+ if isinstance(inputs, dict) and "pixel_values" in inputs:
1185
+ inputs["pixel_values"] = inputs["pixel_values"].to(device=model_device, dtype=model_dtype)
1186
+ except Exception:
1187
+ pass
1188
+
1189
+ # Generate text with stricter parameters to avoid creative responses
1190
+ use_amp = (hasattr(torch, 'cuda') and model_device.type == 'cuda' and model_dtype in (torch.float16, torch.bfloat16))
1191
+ autocast_dev = 'cuda' if model_device.type == 'cuda' else 'cpu'
1192
+ autocast_dtype = model_dtype if model_dtype in (torch.float16, torch.bfloat16) else None
1193
+
1194
+ with torch.no_grad():
1195
+ if use_amp and autocast_dtype is not None:
1196
+ with torch.autocast(autocast_dev, dtype=autocast_dtype):
1197
+ generated_ids = self.model.generate(
1198
+ **inputs,
1199
+ max_new_tokens=128, # Reduced from 512 - manga bubbles are typically short
1200
+ do_sample=False, # Keep deterministic
1201
+ temperature=0.01, # Keep your very low temperature
1202
+ top_p=1.0, # Keep no nucleus sampling
1203
+ repetition_penalty=1.0, # Keep no repetition penalty
1204
+ num_beams=1, # Ensure greedy decoding (faster than beam search)
1205
+ use_cache=True, # Enable KV cache for speed
1206
+ early_stopping=True, # Stop at EOS token
1207
+ pad_token_id=self.tokenizer.pad_token_id, # Proper padding
1208
+ eos_token_id=self.tokenizer.eos_token_id, # Proper stopping
1209
+ )
1210
+ else:
1211
+ generated_ids = self.model.generate(
1212
+ **inputs,
1213
+ max_new_tokens=128, # Reduced from 512 - manga bubbles are typically short
1214
+ do_sample=False, # Keep deterministic
1215
+ temperature=0.01, # Keep your very low temperature
1216
+ top_p=1.0, # Keep no nucleus sampling
1217
+ repetition_penalty=1.0, # Keep no repetition penalty
1218
+ num_beams=1, # Ensure greedy decoding (faster than beam search)
1219
+ use_cache=True, # Enable KV cache for speed
1220
+ early_stopping=True, # Stop at EOS token
1221
+ pad_token_id=self.tokenizer.pad_token_id, # Proper padding
1222
+ eos_token_id=self.tokenizer.eos_token_id, # Proper stopping
1223
+ )
1224
+
1225
+ # Decode the output
1226
+ generated_ids_trimmed = [
1227
+ out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
1228
+ ]
1229
+ output_text = self.processor.batch_decode(
1230
+ generated_ids_trimmed,
1231
+ skip_special_tokens=True,
1232
+ clean_up_tokenization_spaces=False
1233
+ )[0]
1234
+
1235
+ if output_text and output_text.strip():
1236
+ text = output_text.strip()
1237
+
1238
+ # ADDED: Filter out any response that looks like an explanation or apology
1239
+ # Common patterns that indicate the model is being "helpful" instead of just extracting
1240
+ unwanted_patterns = [
1241
+ "죄송합니다", # "I apologize"
1242
+ "sorry",
1243
+ "apologize",
1244
+ "이미지에는", # "in this image"
1245
+ "텍스트가 없습니다", # "there is no text"
1246
+ "I cannot",
1247
+ "I don't see",
1248
+ "There is no",
1249
+ "질문이 있으시면", # "if you have questions"
1250
+ ]
1251
+
1252
+ # Check if response contains unwanted patterns
1253
+ text_lower = text.lower()
1254
+ is_explanation = any(pattern.lower() in text_lower for pattern in unwanted_patterns)
1255
+
1256
+ # Also check if the response is suspiciously long for a bubble
1257
+ # Most manga bubbles are short, if we get 50+ chars it might be an explanation
1258
+ is_too_long = len(text) > 100 and ('.' in text or ',' in text or '!' in text)
1259
+
1260
+ if is_explanation or is_too_long:
1261
+ self._log(f"⚠️ Model returned explanation instead of text, ignoring", "warning")
1262
+ # Return empty result or just skip this region
1263
+ return results
+
+                 # Check language by Unicode block: Hangul syllables, then kana;
+                 # kanji-only text falls through to the CJK range counted as Chinese
+                 has_korean = any('\uAC00' <= c <= '\uD7AF' for c in text)
+                 has_japanese = any('\u3040' <= c <= '\u309F' or '\u30A0' <= c <= '\u30FF' for c in text)
+                 has_chinese = any('\u4E00' <= c <= '\u9FFF' for c in text)
+
+                 if has_korean:
+                     self._log(f"✅ Korean detected: {text[:50]}...")
+                 elif has_japanese:
+                     self._log(f"✅ Japanese detected: {text[:50]}...")
+                 elif has_chinese:
+                     self._log(f"✅ Chinese detected: {text[:50]}...")
+                 else:
+                     self._log(f"✅ Text: {text[:50]}...")
+
+                 results.append(OCRResult(
+                     text=text,
+                     bbox=(0, 0, w, h),
+                     confidence=0.9,
+                     vertices=[(0, 0), (w, 0), (w, h), (0, h)]
+                 ))
+             else:
+                 self._log("⚠️ No text detected", "warning")
+
+         except Exception as e:
+             self._log(f"❌ Error: {str(e)}", "error")
+             import traceback
+             self._log(traceback.format_exc(), "debug")
+
+         return results
+
+ class EasyOCRProvider(OCRProvider):
+     """EasyOCR provider for multiple languages"""
+
+     def __init__(self, log_callback=None, languages=None):
+         super().__init__(log_callback)
+         # Default to a safe language combination
+         self.languages = languages or ['ja', 'en']
+         self._validate_language_combination()
+
+     def _validate_language_combination(self):
+         """Validate and fix EasyOCR language combinations"""
+         # EasyOCR language compatibility rules
+         incompatible_pairs = [
+             (['ja', 'ko'], 'Japanese and Korean cannot be used together'),
+             (['ja', 'zh'], 'Japanese and Chinese cannot be used together'),
+             (['ko', 'zh'], 'Korean and Chinese cannot be used together')
+         ]
+
+         for incompatible, reason in incompatible_pairs:
+             if all(lang in self.languages for lang in incompatible):
+                 self._log(f"⚠️ EasyOCR: {reason}", "warning")
+                 # Keep the first language + English
+                 self.languages = [self.languages[0], 'en']
+                 self._log(f"🔧 Auto-adjusted to: {self.languages}", "info")
+                 break
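+         # Example (illustrative): EasyOCRProvider(languages=['ja', 'ko']) logs a
+         # warning and auto-adjusts to ['ja', 'en'], keeping the first language.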
+
+     def check_installation(self) -> bool:
+         """Check if easyocr is installed"""
+         try:
+             import easyocr
+             self.is_installed = True
+             return True
+         except ImportError:
+             return False
+
+     def install(self, progress_callback=None) -> bool:
+         """Install easyocr (manual installation required)"""
+         if progress_callback:
+             progress_callback("EasyOCR requires manual pip installation")
+         self._log("Run: pip install easyocr", "info")
+         return False
+
+     def load_model(self, **kwargs) -> bool:
+         """Load easyocr model"""
+         try:
+             if not self.is_installed and not self.check_installation():
+                 self._log("❌ easyocr not installed", "error")
+                 return False
+
+             self._log(f"🔥 Loading easyocr model for languages: {self.languages}...")
+             import easyocr
+
+             # This will download models on first run
+             self.model = easyocr.Reader(self.languages, gpu=True)
+             self.is_loaded = True
+
+             self._log("✅ easyocr model loaded successfully")
+             return True
+
+         except Exception as e:
+             self._log(f"❌ Failed to load easyocr: {str(e)}", "error")
+             # Fall back to CPU mode if GPU initialization fails
+             try:
+                 import easyocr
+                 self.model = easyocr.Reader(self.languages, gpu=False)
+                 self.is_loaded = True
+                 self._log("✅ easyocr loaded in CPU mode")
+                 return True
+             except Exception:
+                 return False
+
+     def detect_text(self, image: np.ndarray, **kwargs) -> List[OCRResult]:
+         """Detect text using easyocr"""
+         results = []
+
+         try:
+             if not self.is_loaded:
+                 if not self.load_model():
+                     return results
+
+             # EasyOCR can work directly with numpy arrays
+             ocr_results = self.model.readtext(image, detail=1)
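+             # Shape note (illustrative values): with detail=1 each entry is a
+             # (bbox, text, confidence) triple such as
+             #   ([[10, 12], [88, 12], [88, 40], [10, 40]], 'こんにちは', 0.87)
+             # The loop below reduces the 4-point bbox to axis-aligned x/y/w/h.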
+
+             # Parse results
+             for (bbox, text, confidence) in ocr_results:
+                 # bbox is a list of 4 corner points
+                 xs = [point[0] for point in bbox]
+                 ys = [point[1] for point in bbox]
+                 x_min, x_max = min(xs), max(xs)
+                 y_min, y_max = min(ys), max(ys)
+
+                 results.append(OCRResult(
+                     text=text,
+                     bbox=(int(x_min), int(y_min), int(x_max - x_min), int(y_max - y_min)),
+                     confidence=confidence,
+                     vertices=[(int(p[0]), int(p[1])) for p in bbox]
+                 ))
+
+             self._log(f"✅ Detected {len(results)} text regions")
+
+         except Exception as e:
+             self._log(f"❌ Error in easyocr detection: {str(e)}", "error")
+
+         return results
+
+
+ class PaddleOCRProvider(OCRProvider):
+     """PaddleOCR provider with memory safety measures"""
+
+     def check_installation(self) -> bool:
+         """Check if paddleocr is installed"""
+         try:
+             from paddleocr import PaddleOCR
+             self.is_installed = True
+             return True
+         except ImportError:
+             return False
+
+     def install(self, progress_callback=None) -> bool:
+         """Install paddleocr (manual installation required)"""
+         if progress_callback:
+             progress_callback("PaddleOCR requires manual pip installation")
+         self._log("Run: pip install paddleocr", "info")
+         return False
+
+     def load_model(self, **kwargs) -> bool:
+         """Load paddleocr model with memory-safe configurations"""
+         try:
+             if not self.is_installed and not self.check_installation():
+                 self._log("❌ paddleocr not installed", "error")
+                 return False
+
+             self._log("🔥 Loading PaddleOCR model...")
+
+             # Set memory-safe environment variables BEFORE importing paddle
+             import os
+             os.environ['OMP_NUM_THREADS'] = '1'       # Prevent OpenMP conflicts
+             os.environ['MKL_NUM_THREADS'] = '1'       # Prevent MKL conflicts
+             os.environ['OPENBLAS_NUM_THREADS'] = '1'  # Prevent OpenBLAS conflicts
+             os.environ['FLAGS_use_mkldnn'] = '0'      # Disable MKL-DNN
+
+             from paddleocr import PaddleOCR
+
+             # Try configurations from most to least constrained
+             configs_to_try = [
+                 # Config 1: most memory-safe configuration
+                 {
+                     'use_angle_cls': False,   # Disable angle classification to save memory
+                     'lang': 'ch',
+                     'rec_batch_num': 1,       # Process one crop at a time
+                     'max_text_length': 100,   # Limit text length
+                     'drop_score': 0.5,        # Higher threshold to reduce detections
+                     'cpu_threads': 1,         # Single thread to avoid conflicts
+                 },
+                 # Config 2: minimal memory footprint
+                 {
+                     'lang': 'ch',
+                     'rec_batch_num': 1,
+                     'cpu_threads': 1,
+                 },
+                 # Config 3: absolute minimal
+                 {
+                     'lang': 'ch'
+                 },
+                 # Config 4: library defaults
+                 {}
+             ]
+
+             for i, config in enumerate(configs_to_try):
+                 try:
+                     self._log(f"   Trying configuration {i+1}/{len(configs_to_try)}: {config}")
+
+                     # Force garbage collection before loading
+                     import gc
+                     gc.collect()
+
+                     self.model = PaddleOCR(**config)
+                     self.is_loaded = True
+                     self.current_config = config
+                     self._log(f"✅ PaddleOCR loaded successfully with config: {config}")
+                     return True
+                 except Exception as e:
+                     error_str = str(e)
+                     self._log(f"   Config {i+1} failed: {error_str}", "debug")
+
+                     # Clean up on failure before trying the next config
+                     if hasattr(self, 'model'):
+                         del self.model
+                     gc.collect()
+                     continue
+
+             self._log("❌ PaddleOCR failed to load with any configuration", "error")
+             return False
+
+         except Exception as e:
+             self._log(f"❌ Failed to load paddleocr: {str(e)}", "error")
+             import traceback
+             self._log(traceback.format_exc(), "debug")
+             return False
+
+     def detect_text(self, image: np.ndarray, **kwargs) -> List[OCRResult]:
+         """Detect text with memory safety measures"""
+         results = []
+
+         try:
+             if not self.is_loaded:
+                 if not self.load_model():
+                     return results
+
+             import cv2
+             import numpy as np
+             import gc
+
+             # Memory safety: ensure the image isn't too large
+             h, w = image.shape[:2] if len(image.shape) >= 2 else (0, 0)
+
+             # Limit image size to prevent memory issues
+             MAX_DIMENSION = 1500
+             if h > MAX_DIMENSION or w > MAX_DIMENSION:
+                 scale = min(MAX_DIMENSION/h, MAX_DIMENSION/w)
+                 new_h, new_w = int(h*scale), int(w*scale)
+                 self._log(f"⚠️ Resizing large image from {w}x{h} to {new_w}x{new_h} for memory safety", "warning")
+                 image = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_AREA)
+                 scale_factor = 1/scale
+             else:
+                 scale_factor = 1.0
+
+             # Ensure correct format
+             if len(image.shape) == 2:    # Grayscale
+                 image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
+             elif len(image.shape) == 4:  # Batch
+                 image = image[0]
+
+             # Ensure uint8 type
+             if image.dtype != np.uint8:
+                 if image.max() <= 1.0:
+                     image = (image * 255).astype(np.uint8)
+                 else:
+                     image = image.astype(np.uint8)
+
+             # Make a copy to avoid memory corruption
+             image_copy = image.copy()
+
+             # Force garbage collection before OCR
+             gc.collect()
+
+             # Process with timeout protection: run OCR in a worker thread
+             import threading
+
+             ocr_results = None
+             ocr_error = None
+
+             def run_ocr():
+                 nonlocal ocr_results, ocr_error
+                 try:
+                     ocr_results = self.model.ocr(image_copy)
+                 except Exception as e:
+                     ocr_error = e
+
+             ocr_thread = threading.Thread(target=run_ocr)
+             ocr_thread.daemon = True
+             ocr_thread.start()
+             ocr_thread.join(timeout=30)  # 30 second timeout
+
+             if ocr_thread.is_alive():
+                 self._log("❌ PaddleOCR timeout - taking too long", "error")
+                 return results
+
+             if ocr_error:
+                 raise ocr_error
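+             # The same pattern as a reusable helper (a sketch, not used here;
+             # note the worker thread cannot be killed, only abandoned):
+             #
+             #   def call_with_timeout(fn, timeout_s=30):
+             #       box = {}
+             #       def runner():
+             #           try:
+             #               box['result'] = fn()
+             #           except Exception as e:
+             #               box['error'] = e
+             #       t = threading.Thread(target=runner, daemon=True)
+             #       t.start()
+             #       t.join(timeout_s)
+             #       if t.is_alive():
+             #           raise TimeoutError(f"call exceeded {timeout_s}s")
+             #       if 'error' in box:
+             #           raise box['error']
+             #       return box.get('result')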
+
+             # Parse results
+             results = self._parse_ocr_results(ocr_results)
+
+             # Scale coordinates back if the image was resized
+             if scale_factor != 1.0 and results:
+                 for r in results:
+                     x, y, width, height = r.bbox
+                     r.bbox = (int(x*scale_factor), int(y*scale_factor),
+                               int(width*scale_factor), int(height*scale_factor))
+                     r.vertices = [(int(v[0]*scale_factor), int(v[1]*scale_factor))
+                                   for v in r.vertices]
+
+             if results:
+                 self._log(f"✅ Detected {len(results)} text regions", "info")
+             else:
+                 self._log("No text regions found", "debug")
+
+             # Clean up
+             del image_copy
+             gc.collect()
+
+         except Exception as e:
+             error_msg = str(e) if str(e) else type(e).__name__
+
+             if "memory" in error_msg.lower() or "0x" in error_msg:
+                 self._log("❌ Memory access violation in PaddleOCR", "error")
+                 self._log("   This is a known Windows issue with PaddleOCR", "info")
+                 self._log("   Please switch to EasyOCR or manga-ocr instead", "warning")
+             elif "trace_order.size()" in error_msg:
+                 self._log("❌ PaddleOCR internal error", "error")
+                 self._log("   Please switch to EasyOCR or manga-ocr", "warning")
+             else:
+                 self._log(f"❌ Error in paddleocr detection: {error_msg}", "error")
+
+             import traceback
+             self._log(traceback.format_exc(), "debug")
+
+         return results
+
+     def _parse_ocr_results(self, ocr_results) -> List[OCRResult]:
+         """Parse OCR results safely"""
+         results = []
+
+         if ocr_results is False:
+             return results
+
+         if ocr_results is None or not isinstance(ocr_results, list):
+             return results
+
+         if len(ocr_results) == 0:
+             return results
+
+         # Unwrap batch format (a single-image result nested one level deep)
+         if isinstance(ocr_results[0], list) and len(ocr_results[0]) > 0:
+             first_item = ocr_results[0][0]
+             if isinstance(first_item, list) and len(first_item) > 0:
+                 if isinstance(first_item[0], (list, tuple)) and len(first_item[0]) == 2:
+                     ocr_results = ocr_results[0]
+
+         # Parse detections
+         for detection in ocr_results:
+             if not detection or isinstance(detection, bool):
+                 continue
+
+             if not isinstance(detection, (list, tuple)) or len(detection) < 2:
+                 continue
+
+             try:
+                 bbox_points = detection[0]
+                 text_data = detection[1]
+
+                 if not isinstance(bbox_points, (list, tuple)) or len(bbox_points) != 4:
+                     continue
+
+                 if not isinstance(text_data, (tuple, list)) or len(text_data) < 2:
+                     continue
+
+                 text = str(text_data[0]).strip()
+                 confidence = float(text_data[1])
+
+                 if not text or confidence < 0.3:
+                     continue
+
+                 xs = [float(p[0]) for p in bbox_points]
+                 ys = [float(p[1]) for p in bbox_points]
+                 x_min, x_max = min(xs), max(xs)
+                 y_min, y_max = min(ys), max(ys)
+
+                 # Skip degenerate boxes smaller than 5px in either dimension
+                 if (x_max - x_min) < 5 or (y_max - y_min) < 5:
+                     continue
+
+                 results.append(OCRResult(
+                     text=text,
+                     bbox=(int(x_min), int(y_min), int(x_max - x_min), int(y_max - y_min)),
+                     confidence=confidence,
+                     vertices=[(int(p[0]), int(p[1])) for p in bbox_points]
+                 ))
+
+             except Exception:
+                 continue
+
+         return results
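+         # Shape note (illustrative; the exact nesting varies across PaddleOCR
+         # versions, which is why the batch unwrap above exists). A typical
+         # single-image result looks like:
+         #   [ [ [[x1,y1],[x2,y2],[x3,y3],[x4,y4]], ('text', 0.98) ], ... ]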
+
+ class DocTROCRProvider(OCRProvider):
+     """DocTR OCR provider"""
+
+     def check_installation(self) -> bool:
+         """Check if doctr is installed"""
+         try:
+             from doctr.models import ocr_predictor
+             self.is_installed = True
+             return True
+         except ImportError:
+             return False
+
+     def install(self, progress_callback=None) -> bool:
+         """Install doctr (manual installation required)"""
+         if progress_callback:
+             progress_callback("DocTR requires manual pip installation")
+         self._log("Run: pip install python-doctr", "info")
+         return False
+
+     def load_model(self, **kwargs) -> bool:
+         """Load doctr model"""
+         try:
+             if not self.is_installed and not self.check_installation():
+                 self._log("❌ doctr not installed", "error")
+                 return False
+
+             self._log("🔥 Loading DocTR model...")
+             from doctr.models import ocr_predictor
+
+             # Load pretrained model
+             self.model = ocr_predictor(pretrained=True)
+             self.is_loaded = True
+
+             self._log("✅ DocTR model loaded successfully")
+             return True
+
+         except Exception as e:
+             self._log(f"❌ Failed to load doctr: {str(e)}", "error")
+             return False
+
+     def detect_text(self, image: np.ndarray, **kwargs) -> List[OCRResult]:
+         """Detect text using doctr"""
+         results = []
+
+         try:
+             if not self.is_loaded:
+                 if not self.load_model():
+                     return results
+
+             from doctr.io import DocumentFile
+
+             # DocTR expects document input, so write the numpy array to a
+             # temporary PNG first. Close the handle before writing: on Windows,
+             # a NamedTemporaryFile cannot be reopened while it is still open.
+             import tempfile
+             import cv2
+
+             with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
+                 tmp_path = tmp.name
+             cv2.imwrite(tmp_path, image)
+             doc = DocumentFile.from_images(tmp_path)
+
+             # Run OCR
+             result = self.model(doc)
+
+             # Parse results
+             h, w = image.shape[:2]
+             for page in result.pages:
+                 for block in page.blocks:
+                     for line in block.lines:
+                         for word in line.words:
+                             # Handle different geometry formats
+                             geometry = word.geometry
+
+                             if len(geometry) == 4:
+                                 # Flat format: (x1, y1, x2, y2)
+                                 x1, y1, x2, y2 = geometry
+                             elif len(geometry) == 2:
+                                 # Nested format: ((x1, y1), (x2, y2))
+                                 (x1, y1), (x2, y2) = geometry
+                             else:
+                                 self._log(f"Unexpected geometry format: {geometry}", "warning")
+                                 continue
+
+                             # Convert relative coordinates to absolute pixels
+                             x1, x2 = int(x1 * w), int(x2 * w)
+                             y1, y2 = int(y1 * h), int(y2 * h)
+
+                             results.append(OCRResult(
+                                 text=word.value,
+                                 bbox=(x1, y1, x2 - x1, y2 - y1),
+                                 confidence=word.confidence,
+                                 vertices=[(x1, y1), (x2, y1), (x2, y2), (x1, y2)]
+                             ))
+
+             # Clean up temp file
+             try:
+                 os.unlink(tmp_path)
+             except Exception:
+                 pass
+
+             self._log(f"DocTR detected {len(results)} text regions")
+
+         except Exception as e:
+             self._log(f"Error in doctr detection: {str(e)}", "error")
+             import traceback
+             self._log(traceback.format_exc(), "error")
+
+         return results
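+         # Coordinate note (illustrative): DocTR geometries are relative to page
+         # size (0..1). A word at ((0.10, 0.20), (0.35, 0.27)) on a 1000x800
+         # image maps to x=100..350, y=160..216 via the scaling above.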
+
+
+ class RapidOCRProvider(OCRProvider):
+     """RapidOCR provider for fast local OCR"""
+
+     def check_installation(self) -> bool:
+         """Check if rapidocr is installed"""
+         try:
+             import rapidocr_onnxruntime
+             self.is_installed = True
+             return True
+         except ImportError:
+             return False
+
+     def install(self, progress_callback=None) -> bool:
+         """Install rapidocr (manual installation required)"""
+         if progress_callback:
+             progress_callback("RapidOCR requires manual pip installation")
+         self._log("Run: pip install rapidocr-onnxruntime", "info")
+         return False  # Always False since we can't auto-install
+
+     def load_model(self, **kwargs) -> bool:
+         """Load RapidOCR model"""
+         try:
+             if not self.is_installed and not self.check_installation():
+                 self._log("RapidOCR not installed", "error")
+                 return False
+
+             self._log("Loading RapidOCR...")
+             from rapidocr_onnxruntime import RapidOCR
+
+             self.model = RapidOCR()
+             self.is_loaded = True
+
+             self._log("RapidOCR model loaded successfully")
+             return True
+
+         except Exception as e:
+             self._log(f"Failed to load RapidOCR: {str(e)}", "error")
+             return False
+
+     def detect_text(self, image: np.ndarray, **kwargs) -> List[OCRResult]:
+         """Detect text using RapidOCR"""
+         if not self.is_loaded:
+             self._log("RapidOCR model not loaded", "error")
+             return []
+
+         results = []
+
+         try:
+             import cv2  # Local import, matching the other providers
+
+             # Convert BGR (OpenCV convention) to RGB for RapidOCR
+             if len(image.shape) == 3:
+                 image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+             else:
+                 image_rgb = image
+
+             # RapidOCR accepts a numpy array directly
+             ocr_results, _ = self.model(image_rgb)
+
+             if ocr_results:
+                 for result in ocr_results:
+                     # RapidOCR returns [bbox, text, confidence]
+                     bbox_points = result[0]  # 4 corner points
+                     text = result[1]
+                     confidence = float(result[2])
+
+                     if not text or not text.strip():
+                         continue
+
+                     # Convert the 4-point bbox to x,y,w,h format
+                     xs = [point[0] for point in bbox_points]
+                     ys = [point[1] for point in bbox_points]
+                     x_min, x_max = min(xs), max(xs)
+                     y_min, y_max = min(ys), max(ys)
+
+                     results.append(OCRResult(
+                         text=text.strip(),
+                         bbox=(int(x_min), int(y_min), int(x_max - x_min), int(y_max - y_min)),
+                         confidence=confidence,
+                         vertices=[(int(p[0]), int(p[1])) for p in bbox_points]
+                     ))
+
+             self._log(f"Detected {len(results)} text regions")
+
+         except Exception as e:
+             self._log(f"Error in RapidOCR detection: {str(e)}", "error")
+
+         return results
+
+ class OCRManager:
+     """Manager for multiple OCR providers"""
+
+     def __init__(self, log_callback=None):
+         self.log_callback = log_callback
+         self.providers = {
+             'custom-api': CustomAPIProvider(log_callback),
+             'manga-ocr': MangaOCRProvider(log_callback),
+             'easyocr': EasyOCRProvider(log_callback),
+             'paddleocr': PaddleOCRProvider(log_callback),
+             'doctr': DocTROCRProvider(log_callback),
+             'rapidocr': RapidOCRProvider(log_callback),
+             'Qwen2-VL': Qwen2VL(log_callback)
+         }
+         self.current_provider = None
+         self.stop_flag = None
+
+     def get_provider(self, name: str) -> Optional[OCRProvider]:
+         """Get OCR provider by name"""
+         return self.providers.get(name)
+
+     def set_current_provider(self, name: str) -> bool:
+         """Set the current active provider"""
+         if name in self.providers:
+             self.current_provider = name
+             return True
+         return False
+
+     def check_provider_status(self, name: str) -> Dict[str, bool]:
+         """Check installation and loading status of a provider"""
+         provider = self.providers.get(name)
+         if not provider:
+             return {'installed': False, 'loaded': False}
+
+         result = {
+             'installed': provider.check_installation(),
+             'loaded': provider.is_loaded
+         }
+         if self.log_callback:
+             self.log_callback(f"DEBUG: check_provider_status({name}) returning loaded={result['loaded']}", "debug")
+         return result
+
+     def install_provider(self, name: str, progress_callback=None) -> bool:
+         """Install a provider"""
+         provider = self.providers.get(name)
+         if not provider:
+             return False
+
+         return provider.install(progress_callback)
+
+     def load_provider(self, name: str, **kwargs) -> bool:
+         """Load a provider's model, forwarding kwargs (e.g. model_size)"""
+         provider = self.providers.get(name)
+         if not provider:
+             return False
+
+         return provider.load_model(**kwargs)
+
+     def shutdown(self):
+         """Release models/processors/tokenizers for all providers and clear caches."""
+         try:
+             import gc
+             for name, provider in list(self.providers.items()):
+                 try:
+                     if hasattr(provider, 'model'):
+                         provider.model = None
+                     if hasattr(provider, 'processor'):
+                         provider.processor = None
+                     if hasattr(provider, 'tokenizer'):
+                         provider.tokenizer = None
+                     if hasattr(provider, 'reader'):
+                         provider.reader = None
+                     if hasattr(provider, 'is_loaded'):
+                         provider.is_loaded = False
+                 except Exception:
+                     pass
+             gc.collect()
+             try:
+                 import torch
+                 torch.cuda.empty_cache()
+             except Exception:
+                 pass
+         except Exception:
+             pass
+
+     def detect_text(self, image: np.ndarray, provider_name: str = None, **kwargs) -> List[OCRResult]:
+         """Detect text using the specified or current provider"""
+         provider_name = provider_name or self.current_provider
+         if not provider_name:
+             return []
+
+         provider = self.providers.get(provider_name)
+         if not provider:
+             return []
+
+         return provider.detect_text(image, **kwargs)
+
+     def set_stop_flag(self, stop_flag):
+         """Set stop flag for all providers"""
+         self.stop_flag = stop_flag
+         for provider in self.providers.values():
+             if hasattr(provider, 'set_stop_flag'):
+                 provider.set_stop_flag(stop_flag)
+
+     def reset_stop_flags(self):
+         """Reset stop flags for all providers"""
+         for provider in self.providers.values():
+             if hasattr(provider, 'reset_stop_flags'):
+                 provider.reset_stop_flags()
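+
+ # Usage sketch (hypothetical wiring; names match the classes above):
+ #
+ #   import cv2
+ #   manager = OCRManager(log_callback=print)
+ #   manager.set_current_provider('easyocr')
+ #   if manager.check_provider_status('easyocr')['installed']:
+ #       image = cv2.imread('page_001.png')  # hypothetical input path
+ #       for r in manager.detect_text(image):
+ #           print(r.bbox, r.confidence, r.text)
+ #   manager.shutdown()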
translator_gui.py ADDED
The diff for this file is too large to render. See raw diff