# Docx-FAISS / app.py
# Hugging Face Space by mostafa202025 (commit 116f242, verified)
import os
import faiss
import numpy as np
import json
from docx import Document
from sentence_transformers import SentenceTransformer
import gradio as gr
import tempfile
# ---------- Configuration ----------
OUTPUT_DIR = "/tmp/output_faiss" # Intended output directory (unused in this file; outputs go to a fresh temp dir instead)
EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
# ---------- Convert a DOCX file into a JSON-style section structure ----------
def docx_to_sections(docx_path):
    """Parse a .docx file into a flat list of section dicts.

    Paragraphs styled "Heading 1" open a top-level section; "Heading 2"
    paragraphs open a sub-section whose ``parent`` is the enclosing Heading 1.
    Body paragraphs accumulate into the most recently opened section.

    Args:
        docx_path: Path to a .docx file readable by python-docx.

    Returns:
        list[dict]: Each dict has ``heading``, ``content`` and ``full_content``
        keys; Heading-2 sections additionally carry ``parent``. Text that
        appears before the first heading is ignored (no heading to attach to).

    Fixes vs. the previous version:
        - Body text under a Heading 1 that precedes the first Heading 2 is no
          longer silently dropped; it is emitted as its own H1 section.
        - A trailing Heading 1 with no body no longer produces an
          empty-content section (mid-file behavior already skipped those).
    """
    doc = Document(docx_path)
    sections = []
    current_h1 = None
    current_h2 = None
    buffer = ""

    def _flush():
        # Emit the currently open section, if any, and reset the H2/body state.
        # An open H2 always wins; an open H1 is emitted only with real content.
        nonlocal current_h2, buffer
        text = buffer.strip()
        if current_h2 is not None:
            sections.append({
                "heading": current_h2,
                "content": text,
                "full_content": text,
                "parent": current_h1,
            })
        elif current_h1 is not None and text:
            sections.append({
                "heading": current_h1,
                "content": text,
                "full_content": text,
            })
        current_h2 = None
        buffer = ""

    for para in doc.paragraphs:
        style = para.style.name
        text = para.text.strip()
        if not text:
            continue  # skip blank paragraphs entirely
        if style.startswith("Heading 1"):
            _flush()
            current_h1 = text
        elif style.startswith("Heading 2"):
            # Flushing here also preserves H1 intro text that precedes the
            # first H2 under that heading.
            _flush()
            current_h2 = text
        else:
            buffer += text + "\n"

    _flush()  # emit whatever section is still open at end of document
    return sections
# ---------- Embedding generation ----------
def generate_embeddings(sections, model):
    """Encode each section's ``content`` field into a float32 matrix.

    Args:
        sections: List of dicts, each with a ``content`` key.
        model: A SentenceTransformer-like object exposing ``encode``.

    Returns:
        numpy.ndarray of shape (len(sections), dim), dtype float32.
    """
    contents = [section["content"] for section in sections]
    vectors = model.encode(
        contents,
        convert_to_numpy=True,
        # L2-normalised vectors make FAISS L2 distance behave like cosine.
        normalize_embeddings=True,
        show_progress_bar=True,
    )
    return vectors.astype("float32")
# ---------- Save FAISS index + metadata ----------
def save_faiss_and_metadata(embeddings, sections, base_name):
    """Persist a FAISS L2 index and its JSON metadata into a fresh temp dir.

    Args:
        embeddings: float32 array of shape (n, d), one row per section.
        sections: Section dicts stored verbatim as the JSON metadata.
        base_name: File-name stem (usually the source document's name).

    Returns:
        tuple[str, str]: (path to the ``.bin`` index, path to the metadata
        ``.json``), both inside a newly created temporary directory.
    """
    # mkdtemp() both picks and creates the directory, so the previous
    # redundant os.makedirs() call was removed.
    temp_dir = tempfile.mkdtemp()
    d = embeddings.shape[1]
    # Exact (non-quantized) L2 index; embeddings are pre-normalized upstream.
    index = faiss.IndexFlatL2(d)
    index.add(embeddings)
    faiss_path = os.path.join(temp_dir, f"faiss_index_{base_name}.bin")
    metadata_path = os.path.join(temp_dir, f"metadata_{base_name}.json")
    faiss.write_index(index, faiss_path)
    with open(metadata_path, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-Latin (e.g. Persian) text readable.
        json.dump(sections, f, ensure_ascii=False, indent=2)
    print(f"✅ ذخیره شد:\n - {faiss_path}\n - {metadata_path}")
    return faiss_path, metadata_path
def build_from_docx(docx_file_path):
    """Run the full pipeline: DOCX -> sections -> embeddings -> FAISS files.

    Args:
        docx_file_path: Path of the DOCX file to process.

    Returns:
        tuple[str, str, str]: (status message, FAISS index path,
        metadata JSON path).
    """
    print(f"📄 پردازش فایل: {docx_file_path}")
    sections = docx_to_sections(docx_file_path)
    print(f"🧩 {len(sections)} بخش استخراج شد.")
    # Model is loaded lazily per request; the Space only handles one upload
    # at a time, so there is no caching layer here.
    embedder = SentenceTransformer(EMBEDDING_MODEL_NAME)
    vectors = generate_embeddings(sections, embedder)
    stem = os.path.splitext(os.path.basename(docx_file_path))[0]
    faiss_path, metadata_path = save_faiss_and_metadata(
        vectors, sections, stem.lower()
    )
    message = "فایل‌های FAISS و متادیتا ایجاد شدند."
    return message, faiss_path, metadata_path
def process_docx(file):
    """Gradio callback: process an uploaded DOCX and return downloadables.

    Fix: the input component is declared with ``type="filepath"`` (see the
    Interface below), so current Gradio passes a plain path string here and
    ``file.name`` raised AttributeError. Older Gradio versions passed a
    tempfile wrapper exposing ``.name`` — accept both shapes.

    Args:
        file: Upload value from gr.File — a path string, or a legacy
            file-like wrapper with a ``.name`` attribute.

    Returns:
        tuple: (status message, gr.File for the index, gr.File for metadata).
    """
    docx_path = file if isinstance(file, str) else file.name
    message, faiss_path, metadata_path = build_from_docx(docx_path)
    return message, gr.File(faiss_path), gr.File(metadata_path)
# Gradio UI wiring: one DOCX upload in; a status message plus two
# downloadable files (FAISS index, JSON metadata) out.
iface = gr.Interface(
    fn=process_docx,
    # type="filepath" means the callback receives the upload as a path string.
    inputs=gr.File(file_count="single", type="filepath", label="Upload DOCX File"),
    outputs=[
        gr.Textbox(label="Output"),
        gr.File(label="Download FAISS Index"),
        gr.File(label="Download Metadata")
    ],
    title="Docx to FAISS & Metadata Generator",
    description="Upload a DOCX file, and it will process the contents to generate FAISS index and metadata."
)
if __name__ == "__main__":
    # Start the Gradio server only when run as a script (not on import).
    iface.launch()