| """ | |
| HandwrittenOCR - تصدير بيانات التدريب ورفع إلى HuggingFace v4.0 | |
| ==================================================================== | |
| المحسنات: | |
| - auto_export(): CSV + XLSX + نص كامل + JSONL تدريب | |
| - create_backup(): نسخ احتياطي شامل | |
| - push_to_huggingface(): مع commit_message يحتوي التاريخ | |
| """ | |
import os
import json
import random
import shutil
import tempfile
import logging
from datetime import datetime

import pandas as pd

from src.logger import log_step

logger = logging.getLogger("HandwrittenOCR")
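
# Inferred from usage in this module (not a formal interface): `config` is
# expected to expose the path attributes exports_dir, backups_dir, artifacts_dir,
# db_path, feedback_csv, stats_json, correction_dict_path and events_jsonl;
# `db` is expected to expose get_all() / get_verified() returning dicts with at
# least image_id, image_data, predicted_text, status, page_num and confidence.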


def auto_export(
    db,
    run_id: str,
    output_dir: str | None = None,
    config=None,
) -> dict:
    """
    Comprehensive automatic export: CSV + XLSX + full text + training JSONL.

    Parameters:
        db: database object
        run_id: run identifier
        output_dir: output directory (optional)
        config: settings object (optional)

    Returns:
        Export summary {files, total_words, verified, ...}
    """
    if output_dir is None:
        if config:
            output_dir = os.path.join(config.exports_dir, "auto", run_id)
        else:
            output_dir = os.path.join("exports", "auto", run_id)
    os.makedirs(output_dir, exist_ok=True)

    # Fetch the data
    words = db.get_all()
    if not words:
        logger.warning("No data to export")
        return {}

    df_all = pd.DataFrame(words)
    df_verified = df_all[
        df_all["status"].isin(["verified", "sentence_corrected"])
    ]
    # image_data holds raw bytes; keep it out of the tabular exports
    df_csv = df_all.drop(columns=["image_data"], errors="ignore")
    exported = {}

    # --- CSV ---
    csv_path = os.path.join(output_dir, "all_words.csv")
    df_csv.to_csv(csv_path, index=False, encoding="utf-8-sig")
    exported["csv"] = csv_path
    # --- XLSX (one sheet per page) ---
    try:
        xlsx_path = os.path.join(output_dir, "all_words.xlsx")
        with pd.ExcelWriter(xlsx_path, engine="openpyxl") as writer:
            df_csv.to_excel(writer, sheet_name="All", index=False)
            for pg in sorted(df_csv["page_num"].dropna().unique()):
                page_df = df_csv[df_csv["page_num"] == pg]
                page_df.to_excel(writer, sheet_name=f"P{int(pg)}", index=False)
        exported["xlsx"] = xlsx_path
    except ImportError:
        logger.warning("openpyxl is not installed - skipping XLSX")
    # --- Reconstructed full text ---
    try:
        from src.reconstruction import reconstruct_sentences_direct
        text_lines = reconstruct_sentences_direct(df_all)
        text_path = os.path.join(output_dir, "reconstructed_text.txt")
        with open(text_path, "w", encoding="utf-8") as f:
            f.write("\n".join(text_lines))
        exported["text"] = text_path
    except Exception as e:
        logger.warning(f"Text reconstruction failed: {e}")
    # --- Training JSONL ---
    if not df_verified.empty:
        img_dir = os.path.join(output_dir, "training_images")
        os.makedirs(img_dir, exist_ok=True)
        records = []
        for _, row in df_verified.iterrows():
            fname = f"img_{row['image_id']}.png"
            with open(os.path.join(img_dir, fname), "wb") as f:
                f.write(row["image_data"])
            txt = (row["predicted_text"] or "").strip()
            if txt:
                records.append({"image": fname, "text": txt})
        jsonl_path = os.path.join(output_dir, "training_data.jsonl")
        with open(jsonl_path, "w", encoding="utf-8") as f:
            for rec in records:
                f.write(json.dumps(rec, ensure_ascii=False) + "\n")
        exported["jsonl"] = jsonl_path
        exported["training_samples"] = len(records)
    summary = {
        "run_id": run_id,
        "exported_at": datetime.now().isoformat(),
        "total_words": len(df_all),
        "verified": len(df_verified),
        "dir": output_dir,
        "files": exported,
    }
    summary_path = os.path.join(output_dir, "export_summary.json")
    with open(summary_path, "w", encoding="utf-8") as f:
        json.dump(summary, f, ensure_ascii=False, indent=2)
    logger.info(f"Automatic export complete: {output_dir}")
    return summary
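
# Usage sketch (hypothetical `db` and `cfg` objects supplied by the rest of the
# application; this module does not construct them):
#
#     summary = auto_export(db, run_id="run_001", config=cfg)
#     print(summary.get("total_words"), summary.get("verified"))
#     print(summary.get("files", {}).get("jsonl"))  # training_data.jsonl, if written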


def export_finetuning_dataset(
    db,
    output_dir: str,
    val_ratio: float = 0.1,
) -> str | None:
    """
    Export verified data as JSONL training data with a train/val split.

    Parameters:
        db: database object
        output_dir: output directory
        val_ratio: validation-data fraction

    Returns:
        Path of the output directory, or None
    """
    verified = db.get_verified()
    verified = [
        w for w in verified
        if w.get("status") in ("verified", "sentence_corrected")
    ]
    if not verified:
        logger.warning("No verified data to export")
        return None

    os.makedirs(output_dir, exist_ok=True)
    img_dir = os.path.join(output_dir, "images")
    os.makedirs(img_dir, exist_ok=True)

    jsonl_records = []
    for row in verified:
        filename = f"img_{row['image_id']}.png"
        filepath = os.path.join(img_dir, filename)
        with open(filepath, "wb") as f:
            f.write(row["image_data"])
        text = (row["predicted_text"] or "").strip()
        if text:
            jsonl_records.append({"image": filename, "text": text})
    if not jsonl_records:
        return None

    random.shuffle(jsonl_records)
    split_idx = int(len(jsonl_records) * (1 - val_ratio))
    train_data = jsonl_records[:split_idx]
    val_data = jsonl_records[split_idx:]

    def save_jsonl(data, fname):
        path = os.path.join(output_dir, fname)
        with open(path, "w", encoding="utf-8") as f:
            for rec in data:
                f.write(json.dumps(rec, ensure_ascii=False) + "\n")
        return path

    save_jsonl(train_data, "train.jsonl")
    save_jsonl(val_data, "val.jsonl")
    logger.info(
        f"Exported {len(jsonl_records)} samples "
        f"(train={len(train_data)}, val={len(val_data)})"
    )
    return output_dir
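
# Usage sketch (hypothetical `db`): writes images/ plus train.jsonl and
# val.jsonl, with a 90/10 split by default:
#
#     out = export_finetuning_dataset(db, "exports/finetune", val_ratio=0.1)
#     if out:
#         print("dataset written to", out)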


def create_backup(config) -> str:
    """
    Create a comprehensive backup.
    No `!cp` shell magic is used (fix #1).
    """
    label = datetime.now().strftime("%Y%m%d_%H%M%S")
    bk_dir = os.path.join(config.backups_dir, f"backup_{label}")
    os.makedirs(bk_dir, exist_ok=True)

    files_to_backup = [
        config.db_path,
        config.feedback_csv,
        config.stats_json,
        config.correction_dict_path,
        config.events_jsonl,
    ]
    for p in files_to_backup:
        if os.path.exists(p):
            shutil.copy2(p, os.path.join(bk_dir, os.path.basename(p)))

    # Copy the artifacts directory if it exists
    artifacts = config.artifacts_dir
    if os.path.isdir(artifacts):
        dest = os.path.join(bk_dir, "artifacts")
        if not os.path.exists(dest):
            shutil.copytree(artifacts, dest)

    logger.info(f"Backup created: {bk_dir}")
    return bk_dir
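
# Usage sketch (hypothetical `cfg` exposing the path attributes listed near the
# top of this module):
#
#     bk_dir = create_backup(cfg)
#     print("backup written to", bk_dir)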


def push_to_huggingface(
    local_dataset_dir: str,
    hf_repo_id: str,
    hf_token: str = "",
    commit_message: str = "",
) -> bool:
    """
    Upload the verified dataset to the HuggingFace Hub,
    with a commit_message that includes the date.
    """
    try:
        from huggingface_hub import HfApi, login
    except ImportError:
        logger.error("huggingface_hub is not installed")
        return False

    if not os.path.exists(local_dataset_dir):
        logger.error(f"Directory not found: {local_dataset_dir}")
        return False

    if hf_token:
        try:
            login(token=hf_token)
        except Exception as e:
            logger.error(f"Login failed: {e}")
            return False

    api = HfApi()
    try:
        api.create_repo(
            repo_id=hf_repo_id, repo_type="dataset", exist_ok=True
        )
    except Exception:
        # Ignore failures here; upload_folder below will surface real errors.
        pass

    # commit_message with the date
    if not commit_message:
        commit_message = f"Update dataset - {datetime.now().strftime('%Y-%m-%d %H:%M')}"

    try:
        api.upload_folder(
            folder_path=local_dataset_dir,
            repo_id=hf_repo_id,
            repo_type="dataset",
            commit_message=commit_message,
        )
        url = f"https://huggingface.co/datasets/{hf_repo_id}"
        logger.info(f"Dataset uploaded to {url}")
        return True
    except Exception as e:
        logger.error(f"Upload failed: {e}")
        return False
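
# Usage sketch: the repo id below is a placeholder; with an empty hf_token,
# huggingface_hub falls back to a cached login or the HF_TOKEN environment
# variable:
#
#     ok = push_to_huggingface(
#         "exports/finetune",
#         hf_repo_id="username/handwritten-ocr-dataset",  # hypothetical repo
#         hf_token=os.environ.get("HF_TOKEN", ""),
#     )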


def export_pdf_report(db, output_path: str, title: str = "Handwriting OCR Report") -> str:
    """
    Export a PDF report containing the word images and the corrected texts.
    Requires: pip install fpdf2
    """
    try:
        from fpdf import FPDF
    except ImportError:
        logger.warning("fpdf2 is not installed - install with: pip install fpdf2")
        return ""

    log_step(logger, "export_pdf_report", {"output_path": output_path})
    words = db.get_all()
    if not words:
        logger.info("No data for the PDF export")
        return ""

    class OCRReport(FPDF):
        def header(self):
            self.set_font('Helvetica', 'B', 14)
            self.cell(0, 10, title, 0, 1, 'C')
            self.ln(5)

        def footer(self):
            self.set_y(-15)
            self.set_font('Helvetica', 'I', 8)
            self.cell(0, 10, f'Page {self.page_no()}', 0, 0, 'C')

    pdf = OCRReport()
    pdf.set_auto_page_break(auto=True, margin=15)

    for i, word in enumerate(words):
        if i % 4 == 0:  # four words per page
            pdf.add_page()

        # Word image
        if word.get("image_data"):
            img_bytes = word["image_data"]
            if isinstance(img_bytes, str):
                # Image bytes stored as a latin-1 string (e.g. by SQLite)
                img_bytes = bytes(img_bytes, 'latin-1')
            tmp = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
            tmp.write(img_bytes)
            tmp.close()
            try:
                pdf.image(tmp.name, x=10, y=pdf.get_y(), w=40)
            except Exception:
                pass
            os.unlink(tmp.name)

        # Text
        pdf.set_xy(55, pdf.get_y())
        pdf.set_font('Helvetica', '', 10)
        text = str(word.get("predicted_text", ""))
        pdf.multi_cell(0, 8, txt=f"Text: {text}")
        pdf.set_font('Helvetica', 'I', 8)
        conf = word.get("confidence") or 0  # guard against a missing/None confidence
        page_num = word.get("page_num", "?")
        status = word.get("status", "?")
        pdf.cell(0, 5, f"Conf: {conf:.2%} | Page: {page_num} | Status: {status}", 0, 1)
        pdf.ln(5)

    # dirname is empty when output_path is a bare filename
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
    pdf.output(output_path)
    file_size = os.path.getsize(output_path)
    logger.info(f"PDF exported: {output_path} ({file_size} bytes, {len(words)} words)")
    return output_path
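
# Usage sketch (hypothetical `db`):
#
#     path = export_pdf_report(db, "exports/reports/report.pdf")
#     if path:
#         print("PDF report:", path)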