# Hosting-page header captured together with the file (not part of the program):
#   rodnoika's picture
#   Update ocr.py
#   b2efba7 verified
from docx import Document
import subprocess, sys, re, ast, os
from pathlib import Path
from typing import List, Tuple
# Optional dependency: the Gemini post-processing helpers live in a separate
# module. _HAS_GEMINI records whether that module could be imported so callers
# can skip the post-processing step when it is unavailable.
try:
    from gemini_post import clean_and_extract, post_check
except ImportError:
    _HAS_GEMINI = False
else:
    _HAS_GEMINI = True
def _save_docx_from_text(text: str, dest: Path) -> None:
    """Write *text* into a new .docx document saved at *dest*.

    Chunks separated by a blank line ("\\n\\n") each become one paragraph.
    Inside a chunk, every line is added as its own run, and a break is added
    after each run except the last one. A falsy *text* is treated as "".
    """
    document = Document()
    source = text if text else ""
    for chunk in source.split("\n\n"):
        paragraph = document.add_paragraph()
        rows = chunk.splitlines()
        if not rows:
            # An empty chunk still gets one (empty) run.
            rows = [""]
        last_index = len(rows) - 1
        for index, row in enumerate(rows):
            piece = paragraph.add_run(row)
            if index != last_index:
                piece.add_break()
    document.save(dest)
def _natural_sort_key(name: str) -> List[object]:
    """Return a sort key that compares digit runs in *name* numerically.

    ``re.split`` with a capturing group alternates text and digit runs, so
    odd positions are always digit runs; keys therefore never compare an
    ``int`` with a ``str``.
    """
    return [int(part) if idx % 2 else part
            for idx, part in enumerate(re.split(r"(\d+)", name))]


def _build_ocr_command(in_path: Path, out_dir: Path, lang,
                       use_doc_unwarping, use_doc_orientation,
                       use_textline_orientation,
                       use_server_rec, server_rec_dir_str,
                       use_gpu, gpu_id) -> List[str]:
    """Assemble the ``python -m paddleocr ocr`` argument list from the options."""
    cmd = [sys.executable, "-m", "paddleocr", "ocr",
           "-i", str(in_path), "--lang", str(lang), "--save_path", str(out_dir)]
    if use_doc_unwarping:
        cmd += ["--use_doc_unwarping", "true"]
    if use_doc_orientation:
        cmd += ["--use_doc_orientation_classify", "true"]
    if use_textline_orientation:
        cmd += ["--use_textline_orientation", "true"]
    # An empty setting must not be turned into Path(""): that is the current
    # directory, which always exists, and would be passed as the model dir.
    if use_server_rec and server_rec_dir_str:
        server_rec_dir = Path(server_rec_dir_str)
        if server_rec_dir.exists():
            cmd += ["--rec_model_dir", str(server_rec_dir)]
    if use_gpu:
        try:
            gid = int(gpu_id)
        except (TypeError, ValueError, OverflowError):
            # Unparseable GPU id: fall back to the first device.
            gid = 0
        cmd += ["--device", f"gpu:{gid}"]
    return cmd


def _find_ocr_text_files(out_dir: Path, stem: str) -> List[Path]:
    """Return ``<stem>_page*.txt`` files in page order, else ``<stem>.txt``.

    The stem is matched literally (it may contain glob metacharacters such as
    ``[`` or ``*``), and pages are ordered numerically so that ``_page10``
    follows ``_page9`` instead of ``_page1``.
    """
    page_name = re.compile(re.escape(stem) + r"_page.*\.txt", flags=re.S)
    pages = [p for p in out_dir.iterdir()
             if p.is_file() and page_name.fullmatch(p.name)]
    if pages:
        return sorted(pages, key=lambda p: (_natural_sort_key(p.name), p.name))
    single = out_dir / f"{stem}.txt"
    return [single] if single.is_file() else []


def _extract_rec_texts(log: str) -> List[str]:
    """Collect the strings of every ``'rec_texts': [...]`` list found in *log*.

    NOTE(review): the pattern stops at the first ``]``, so a recognised text
    that itself contains ``]`` makes that list unparseable and it is skipped.
    """
    texts: List[str] = []
    for m in re.finditer(r"['\"]rec_texts['\"]:\s*(\[[^\]]*\])", log, flags=re.S):
        try:
            arr = ast.literal_eval(m.group(1))
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # Not a valid Python literal: ignore this occurrence.
            continue
        if isinstance(arr, list):
            texts.extend(str(x) for x in arr)
    return texts


def _gemini_postprocess(merged_text: str, out_dir: Path, stem: str,
                        files_out: List[str], log_parts: List[str]) -> None:
    """Run the optional Gemini clean-up / extraction / post-check step.

    Controlled by the USE_GEMINI, GEMINI_POSTCHECK and GEMINI_MODEL environment
    variables. Appends produced file paths to *files_out* and messages to
    *log_parts* in place, so partial results survive a failure midway.
    """
    use_gemini_env = os.getenv("USE_GEMINI", "0") == "1"
    do_postcheck_env = os.getenv("GEMINI_POSTCHECK", "0") == "1"
    gemini_model = os.getenv("GEMINI_MODEL", "gemini-1.5-pro")
    if not (use_gemini_env and _HAS_GEMINI and merged_text != "(пусто)"):
        return
    try:
        clean_md, data_json = clean_and_extract(merged_text, model_name=gemini_model)
        clean_path = out_dir / f"{stem}_clean.md"
        data_path = out_dir / f"{stem}_extracted.json"
        clean_path.write_text(clean_md or "", encoding="utf-8")
        data_path.write_text(data_json or "{}", encoding="utf-8")
        files_out += [str(clean_path), str(data_path)]
        if clean_md:
            docx_clean = out_dir / f"{stem}_clean.docx"
            _save_docx_from_text(clean_md, docx_clean)
            files_out.append(str(docx_clean))
            log_parts.append(f"\n[DOCX] CLEAN_MARKDOWN -> {docx_clean}")
        log_parts.append("\n[Gemini] CLEAN_MARKDOWN и EXTRACTED_JSON готовы.")
        if do_postcheck_env:
            report_json = post_check(clean_md, data_json, model_name=gemini_model)
            report_path = out_dir / f"{stem}_postcheck.json"
            report_path.write_text(report_json or "{}", encoding="utf-8")
            files_out.append(str(report_path))
            log_parts.append("\n[Gemini] Post-check отчёт готов.")
    except Exception as e:
        # Best-effort boundary around an optional external step: report the
        # failure in the log instead of losing the OCR result.
        log_parts.append(f"\n[Gemini ERROR] {e}")


def run_ocr(in_file, out_dir_str, lang,
            use_doc_unwarping, use_doc_orientation, use_textline_orientation,
            use_server_rec, server_rec_dir_str,
            use_gpu, gpu_id) -> Tuple[str, str, List[str]]:
    """Run the PaddleOCR CLI on *in_file* and gather its text output.

    Args:
        in_file: Path-like/str, or an object exposing the path as ``.name``.
        out_dir_str: Output directory; created if missing.
        lang: Value passed to ``--lang``.
        use_doc_unwarping, use_doc_orientation, use_textline_orientation:
            When truthy, the matching CLI switch is passed with ``true``.
        use_server_rec, server_rec_dir_str: When enabled and the directory is
            given and exists, it is passed as ``--rec_model_dir``.
        use_gpu, gpu_id: When enabled, ``--device gpu:<id>`` is passed; an
            unparseable id falls back to 0.

    Returns:
        ``(log_text, merged_text, files)``: the command line plus CLI output
        and status messages; the merged recognised text (``"(пусто)"`` when
        nothing was recognised); and the paths of all files produced (log,
        merged .txt, .docx and optional Gemini artefacts). If *in_file* is
        falsy, returns ``("Файл не выбран.", "", [])`` without side effects.
    """
    if not in_file:
        return "Файл не выбран.", "", []

    in_path = Path(in_file if isinstance(in_file, (str, os.PathLike))
                   else getattr(in_file, "name"))
    out_dir = Path(out_dir_str)
    out_dir.mkdir(parents=True, exist_ok=True)
    stem = in_path.stem

    cmd = _build_ocr_command(in_path, out_dir, lang,
                             use_doc_unwarping, use_doc_orientation,
                             use_textline_orientation,
                             use_server_rec, server_rec_dir_str,
                             use_gpu, gpu_id)

    log_file = out_dir / f"{stem}_paddleocr.log"
    log_parts = ["[CMD] " + " ".join(cmd) + "\n"]
    output: List[str] = []
    # Popen as a context manager closes the pipe and reaps the child even if
    # streaming fails; errors="replace" keeps undecodable bytes from aborting.
    with log_file.open("w", encoding="utf-8") as lf, \
            subprocess.Popen(cmd, stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             text=True, errors="replace") as proc:
        for line in proc.stdout:
            output.append(line)
            lf.write(line)
    rc = proc.returncode
    log_parts.extend(output)
    if rc != 0:
        log_parts.append(f"\n[ERROR] CLI вернул код {rc}. Лог: {log_file}\n")

    merged_path = out_dir / f"{stem}_ALL.txt"
    merged_text = ""
    files_out = [str(log_file)]

    candidates = _find_ocr_text_files(out_dir, stem)
    if candidates:
        merged_text = "\n\n".join(p.read_text(encoding="utf-8") for p in candidates)
        merged_path.write_text(merged_text, encoding="utf-8")
        files_out.append(str(merged_path))
    else:
        # No text files were saved: fall back to the rec_texts lists that the
        # CLI printed to its output.
        texts = _extract_rec_texts("".join(output))
        if texts:
            merged_text = "\n".join(texts)
            merged_path.write_text(merged_text, encoding="utf-8")
            files_out.append(str(merged_path))
        else:
            log_parts.append("\n[WARN] Не нашёл *.txt и не смог извлечь rec_texts из лога.")

    if not merged_text:
        return "".join(log_parts), "(пусто)", files_out

    docx_path = out_dir / f"{stem}_ALL.docx"
    _save_docx_from_text(merged_text, docx_path)
    files_out.append(str(docx_path))
    log_parts.append(f"\n[DOCX] Добавлен: {docx_path}")

    _gemini_postprocess(merged_text, out_dir, stem, files_out, log_parts)

    return "".join(log_parts), merged_text, files_out