# Docx-FAISS / app.py
# Hugging Face Space by mostafa202025 (commit 116f242, verified)
import os
import faiss
import numpy as np
import json
from docx import Document
from sentence_transformers import SentenceTransformer
import gradio as gr
import tempfile
# ---------- Configuration ----------
OUTPUT_DIR = "/tmp/output_faiss" # Intended output directory (unused in this file; outputs go to a fresh temp dir instead)
EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
# ---------- Convert a DOCX file into a JSON-style section structure ----------
def docx_to_sections(docx_path):
    """Parse a .docx file into a flat list of section dicts.

    Paragraphs styled "Heading 1" open a top-level section; "Heading 2"
    paragraphs open a sub-section whose ``parent`` is the enclosing Heading 1.
    Body paragraphs accumulate into the most recently opened section.

    Args:
        docx_path: Path to a .docx file readable by python-docx.

    Returns:
        list[dict]: Each dict has ``heading``, ``content`` and ``full_content``
        keys; Heading-2 sections additionally carry ``parent``. Text that
        appears before the first heading is ignored (no heading to attach to).

    Fixes vs. the previous version:
        - Body text under a Heading 1 that precedes the first Heading 2 is no
          longer silently dropped; it is emitted as its own H1 section.
        - A trailing Heading 1 with no body no longer produces an
          empty-content section (mid-file behavior already skipped those).
    """
    doc = Document(docx_path)
    sections = []
    current_h1 = None
    current_h2 = None
    buffer = ""

    def _flush():
        # Emit the currently open section, if any, and reset the H2/body state.
        # An open H2 always wins; an open H1 is emitted only with real content.
        nonlocal current_h2, buffer
        text = buffer.strip()
        if current_h2 is not None:
            sections.append({
                "heading": current_h2,
                "content": text,
                "full_content": text,
                "parent": current_h1,
            })
        elif current_h1 is not None and text:
            sections.append({
                "heading": current_h1,
                "content": text,
                "full_content": text,
            })
        current_h2 = None
        buffer = ""

    for para in doc.paragraphs:
        style = para.style.name
        text = para.text.strip()
        if not text:
            continue  # skip blank paragraphs entirely
        if style.startswith("Heading 1"):
            _flush()
            current_h1 = text
        elif style.startswith("Heading 2"):
            # Flushing here also preserves H1 intro text that precedes the
            # first H2 under that heading.
            _flush()
            current_h2 = text
        else:
            buffer += text + "\n"

    _flush()  # emit whatever section is still open at end of document
    return sections
# ---------- Embedding generation ----------
def generate_embeddings(sections, model):
    """Encode each section's ``content`` field into a float32 matrix.

    Args:
        sections: List of dicts, each with a ``content`` key.
        model: A SentenceTransformer-like object exposing ``encode``.

    Returns:
        numpy.ndarray of shape (len(sections), dim), dtype float32.
    """
    contents = [section["content"] for section in sections]
    vectors = model.encode(
        contents,
        convert_to_numpy=True,
        # L2-normalised vectors make FAISS L2 distance behave like cosine.
        normalize_embeddings=True,
        show_progress_bar=True,
    )
    return vectors.astype("float32")
# ---------- Save FAISS index + metadata ----------
def save_faiss_and_metadata(embeddings, sections, base_name):
    """Persist a FAISS L2 index and its JSON metadata into a fresh temp dir.

    Args:
        embeddings: float32 array of shape (n, d), one row per section.
        sections: Section dicts stored verbatim as the JSON metadata.
        base_name: File-name stem (usually the source document's name).

    Returns:
        tuple[str, str]: (path to the ``.bin`` index, path to the metadata
        ``.json``), both inside a newly created temporary directory.
    """
    # mkdtemp() both picks and creates the directory, so the previous
    # redundant os.makedirs() call was removed.
    temp_dir = tempfile.mkdtemp()
    d = embeddings.shape[1]
    # Exact (non-quantized) L2 index; embeddings are pre-normalized upstream.
    index = faiss.IndexFlatL2(d)
    index.add(embeddings)
    faiss_path = os.path.join(temp_dir, f"faiss_index_{base_name}.bin")
    metadata_path = os.path.join(temp_dir, f"metadata_{base_name}.json")
    faiss.write_index(index, faiss_path)
    with open(metadata_path, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-Latin (e.g. Persian) text readable.
        json.dump(sections, f, ensure_ascii=False, indent=2)
    print(f"✅ ذخیره شد:\n - {faiss_path}\n - {metadata_path}")
    return faiss_path, metadata_path
def build_from_docx(docx_file_path):
    """Run the full pipeline: DOCX -> sections -> embeddings -> FAISS files.

    Args:
        docx_file_path: Path of the DOCX file to process.

    Returns:
        tuple[str, str, str]: (status message, FAISS index path,
        metadata JSON path).
    """
    print(f"📄 پردازش فایل: {docx_file_path}")
    sections = docx_to_sections(docx_file_path)
    print(f"🧩 {len(sections)} بخش استخراج شد.")
    # Model is loaded lazily per request; the Space only handles one upload
    # at a time, so there is no caching layer here.
    embedder = SentenceTransformer(EMBEDDING_MODEL_NAME)
    vectors = generate_embeddings(sections, embedder)
    stem = os.path.splitext(os.path.basename(docx_file_path))[0]
    faiss_path, metadata_path = save_faiss_and_metadata(
        vectors, sections, stem.lower()
    )
    message = "فایل‌های FAISS و متادیتا ایجاد شدند."
    return message, faiss_path, metadata_path
def process_docx(file):
    """Gradio callback: process an uploaded DOCX and return downloadables.

    Fix: the input component is declared with ``type="filepath"`` (see the
    Interface below), so current Gradio passes a plain path string here and
    ``file.name`` raised AttributeError. Older Gradio versions passed a
    tempfile wrapper exposing ``.name`` — accept both shapes.

    Args:
        file: Upload value from gr.File — a path string, or a legacy
            file-like wrapper with a ``.name`` attribute.

    Returns:
        tuple: (status message, gr.File for the index, gr.File for metadata).
    """
    docx_path = file if isinstance(file, str) else file.name
    message, faiss_path, metadata_path = build_from_docx(docx_path)
    return message, gr.File(faiss_path), gr.File(metadata_path)
# Gradio UI wiring: one DOCX upload in; a status message plus two
# downloadable files (FAISS index, JSON metadata) out.
iface = gr.Interface(
    fn=process_docx,
    # type="filepath" means the callback receives the upload as a path string.
    inputs=gr.File(file_count="single", type="filepath", label="Upload DOCX File"),
    outputs=[
        gr.Textbox(label="Output"),
        gr.File(label="Download FAISS Index"),
        gr.File(label="Download Metadata")
    ],
    title="Docx to FAISS & Metadata Generator",
    description="Upload a DOCX file, and it will process the contents to generate FAISS index and metadata."
)
if __name__ == "__main__":
    # Start the Gradio server only when run as a script (not on import).
    iface.launch()