|
from docx import Document |
|
import subprocess, sys, re, ast, os |
|
from pathlib import Path |
|
from typing import List, Tuple |
|
try: |
|
from gemini_post import clean_and_extract, post_check |
|
_HAS_GEMINI = True |
|
except ImportError: |
|
_HAS_GEMINI = False |
|
|
|
|
|
def _save_docx_from_text(text: str, dest: Path) -> None:
    """Write *text* into a new .docx document saved at *dest*.

    The text is cut into paragraphs at blank lines (``"\\n\\n"``).  Inside a
    paragraph each source line becomes its own run, and every run except the
    last is followed by a line break so the original line layout is kept.
    A falsy *text* still produces a document with one empty paragraph.
    """
    document = Document()
    source = text if text else ""

    for chunk in source.split("\n\n"):
        paragraph = document.add_paragraph()

        rows = chunk.splitlines()
        if not rows:
            # An empty chunk still gets one (empty) run.
            rows = [""]

        last_index = len(rows) - 1
        for index, row in enumerate(rows):
            run = paragraph.add_run(row)
            if index != last_index:
                run.add_break()

    document.save(dest)
|
|
|
|
|
# Matches the start of a ``'rec_texts': [`` entry in the CLI's printed output.
_REC_TEXTS_HEAD_RE = re.compile(r"['\"]rec_texts['\"]:\s*\[")

# Upper bound on closing brackets tried per entry, so that a list which never
# parses cannot make the scan expensive.
_MAX_LIST_END_TRIES = 64


def _natural_sort_key(path: Path) -> list:
    """Return a sort key that compares digit runs in the file name numerically."""
    parts = re.split(r"([0-9]+)", path.name)
    # re.split with a capture group puts the digit runs at the odd indices.
    return [int(part) if idx % 2 else part for idx, part in enumerate(parts)]


def _build_ocr_command(in_path, out_dir, lang,
                       use_doc_unwarping, use_doc_orientation, use_textline_orientation,
                       use_server_rec, server_rec_dir_str,
                       use_gpu, gpu_id) -> List[str]:
    """Assemble the ``python -m paddleocr ocr`` argument list for one input."""
    cmd = [sys.executable, "-m", "paddleocr", "ocr",
           "-i", str(in_path), "--lang", str(lang), "--save_path", str(out_dir)]

    switches = (
        (use_doc_unwarping, "--use_doc_unwarping"),
        (use_doc_orientation, "--use_doc_orientation_classify"),
        (use_textline_orientation, "--use_textline_orientation"),
    )
    for enabled, flag in switches:
        if enabled:
            cmd += [flag, "true"]

    # Path("") is the current directory, which always exists, so an empty
    # setting has to be rejected before the existence check.
    if use_server_rec and server_rec_dir_str:
        server_rec_dir = Path(server_rec_dir_str)
        if server_rec_dir.exists():
            cmd += ["--rec_model_dir", str(server_rec_dir)]

    if use_gpu:
        try:
            gid = int(gpu_id)
        except (TypeError, ValueError, OverflowError):
            gid = 0  # unparsable id: fall back to gpu:0
        cmd += ["--device", f"gpu:{gid}"]

    return cmd


def _run_and_capture(cmd: List[str], log_file: Path) -> Tuple[int, str]:
    """Run *cmd*, copying its combined stdout/stderr into *log_file*.

    Returns the process exit code and the captured output as one string.
    """
    captured = []
    # errors="replace" keeps one undecodable byte in the child's output from
    # aborting the whole run; the context manager closes the pipe and waits.
    with log_file.open("w", encoding="utf-8") as lf, subprocess.Popen(
        cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
        text=True, errors="replace",
    ) as proc:
        for line in proc.stdout:
            captured.append(line)
            lf.write(line)
    return proc.returncode, "".join(captured)


def _find_page_texts(out_dir: Path, stem: str) -> List[Path]:
    """Return ``<stem>_page*.txt`` files in page order, else ``<stem>.txt``."""
    import glob  # function-scope import: only needed here

    # The stem comes from a user-supplied file name and may contain glob
    # metacharacters such as "[" or "*".
    safe_stem = glob.escape(stem)
    pages = sorted(out_dir.glob(f"{safe_stem}_page*.txt"), key=_natural_sort_key)
    return pages or sorted(out_dir.glob(f"{safe_stem}.txt"))


def _extract_rec_texts(log: str) -> List[str]:
    """Collect the strings of every ``rec_texts`` list literal found in *log*."""
    texts: List[str] = []
    for head in _REC_TEXTS_HEAD_RE.finditer(log):
        start = head.end() - 1  # position of the opening "["
        end = log.find("]", start)
        # A "]" inside a recognised string is not the end of the list, so
        # keep extending to the next "]" until the literal parses.
        for _ in range(_MAX_LIST_END_TRIES):
            if end == -1:
                break
            try:
                arr = ast.literal_eval(log[start:end + 1])
            except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
                end = log.find("]", end + 1)
                continue
            if isinstance(arr, list):
                texts.extend(str(item) for item in arr)
            break
    return texts


def _gemini_postprocess(merged_text: str, out_dir: Path, stem: str) -> Tuple[List[str], List[str]]:
    """Run the optional Gemini clean-up (and post-check) on *merged_text*.

    Reads ``GEMINI_MODEL`` and ``GEMINI_POSTCHECK`` from the environment.
    Returns ``(log_fragments, created_file_paths)``; whatever was produced
    before a failure is still returned.
    """
    gemini_model = os.getenv("GEMINI_MODEL", "gemini-1.5-pro")
    do_postcheck = os.getenv("GEMINI_POSTCHECK", "0") == "1"

    notes: List[str] = []
    created: List[str] = []
    try:
        clean_md, data_json = clean_and_extract(merged_text, model_name=gemini_model)

        clean_path = out_dir / f"{stem}_clean.md"
        data_path = out_dir / f"{stem}_extracted.json"
        clean_path.write_text(clean_md or "", encoding="utf-8")
        data_path.write_text(data_json or "{}", encoding="utf-8")
        created += [str(clean_path), str(data_path)]

        if clean_md:
            docx_clean = out_dir / f"{stem}_clean.docx"
            _save_docx_from_text(clean_md, docx_clean)
            created.append(str(docx_clean))
            notes.append(f"\n[DOCX] CLEAN_MARKDOWN -> {docx_clean}")

        notes.append("\n[Gemini] CLEAN_MARKDOWN и EXTRACTED_JSON готовы.")

        if do_postcheck:
            report_json = post_check(clean_md, data_json, model_name=gemini_model)
            report_path = out_dir / f"{stem}_postcheck.json"
            report_path.write_text(report_json or "{}", encoding="utf-8")
            created.append(str(report_path))
            notes.append("\n[Gemini] Post-check отчёт готов.")
    except Exception as e:
        # Deliberate best-effort boundary: a failure in this optional step is
        # reported in the log and must not discard the OCR results.
        notes.append(f"\n[Gemini ERROR] {e}")
    return notes, created


def run_ocr(in_file, out_dir_str, lang,
            use_doc_unwarping, use_doc_orientation, use_textline_orientation,
            use_server_rec, server_rec_dir_str,
            use_gpu, gpu_id) -> Tuple[str, str, List[str]]:
    """Run the PaddleOCR CLI on one file and gather its text output.

    Args:
        in_file: Path (``str`` / ``os.PathLike``) of the input, or an object
            whose ``name`` attribute holds that path.
        out_dir_str: Output directory; created if missing.
        lang: Value passed to ``--lang``.
        use_doc_unwarping, use_doc_orientation, use_textline_orientation:
            When truthy, the matching CLI switch is passed as ``true``.
        use_server_rec, server_rec_dir_str: When enabled and the directory is
            given and exists, it is passed as ``--rec_model_dir``.
        use_gpu, gpu_id: When enabled, ``--device gpu:<id>`` is passed; an
            unparsable id becomes 0.

    Returns:
        ``(log_text, merged_text, files)``: the command line plus captured CLI
        output and status notes, the merged recognised text (``"(пусто)"``
        when nothing was recognised), and the paths of the files produced.

    The text comes from ``<stem>_page*.txt`` / ``<stem>.txt`` files in the
    output directory, or, if none exist, from ``rec_texts`` lists printed in
    the CLI output.  Non-empty text is also exported as .docx.  When the
    ``gemini_post`` helpers were imported and ``USE_GEMINI=1`` is set, the
    Gemini clean-up step runs as well.
    """
    if not in_file:
        return "Файл не выбран.", "", []

    in_path = Path(in_file if isinstance(in_file, (str, os.PathLike)) else getattr(in_file, "name"))
    out_dir = Path(out_dir_str)
    out_dir.mkdir(parents=True, exist_ok=True)
    stem = in_path.stem

    cmd = _build_ocr_command(
        in_path, out_dir, lang,
        use_doc_unwarping, use_doc_orientation, use_textline_orientation,
        use_server_rec, server_rec_dir_str,
        use_gpu, gpu_id,
    )

    log_file = out_dir / f"{stem}_paddleocr.log"
    # Collected as fragments and joined once at the end.
    log_parts = ["[CMD] " + " ".join(cmd) + "\n"]

    rc, cli_output = _run_and_capture(cmd, log_file)
    log_parts.append(cli_output)
    if rc != 0:
        log_parts.append(f"\n[ERROR] CLI вернул код {rc}. Лог: {log_file}\n")

    merged_path = out_dir / f"{stem}_ALL.txt"
    merged_text = ""
    files_out = [str(log_file)]

    pages = _find_page_texts(out_dir, stem)
    if pages:
        merged_path.write_text(
            "\n\n".join(p.read_text(encoding="utf-8", errors="replace") for p in pages),
            encoding="utf-8",
        )
        # Read back so the returned text has the same newline normalisation
        # as a later read of the file.
        merged_text = merged_path.read_text(encoding="utf-8")
        files_out.append(str(merged_path))
    else:
        texts = _extract_rec_texts(cli_output)
        if texts:
            merged_path.write_text("\n".join(texts), encoding="utf-8")
            merged_text = merged_path.read_text(encoding="utf-8")
            files_out.append(str(merged_path))
        else:
            log_parts.append("\n[WARN] Не нашёл *.txt и не смог извлечь rec_texts из лога.")

    if merged_text:
        docx_path = out_dir / f"{stem}_ALL.docx"
        _save_docx_from_text(merged_text, docx_path)
        files_out.append(str(docx_path))
        log_parts.append(f"\n[DOCX] Добавлен: {docx_path}")

    use_gemini_env = os.getenv("USE_GEMINI", "0") == "1"
    if merged_text and merged_text != "(пусто)" and use_gemini_env and _HAS_GEMINI:
        notes, created = _gemini_postprocess(merged_text, out_dir, stem)
        log_parts.extend(notes)
        files_out.extend(created)

    return "".join(log_parts), (merged_text if merged_text else "(пусто)"), files_out
|
|