import os
import json
import tempfile

import faiss
import numpy as np
from docx import Document
from sentence_transformers import SentenceTransformer
import gradio as gr
# ---------- Settings ----------
OUTPUT_DIR = "/tmp/output_faiss"  # output directory (currently unused; files are written to a temp dir)
EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
# ---------- Convert a DOCX file into a list of sections ----------
def docx_to_sections(docx_path):
    doc = Document(docx_path)
    sections = []
    current_h1 = None
    current_h2 = None
    buffer = ""
    for para in doc.paragraphs:
        style = para.style.name
        text = para.text.strip()
        if not text:
            continue
        if style.startswith("Heading 1"):
            # Flush any open Heading 2 section before starting a new chapter.
            if current_h2:
                sections.append({
                    "heading": current_h2,
                    "content": buffer.strip(),
                    "full_content": buffer.strip(),
                    "parent": current_h1
                })
                current_h2 = None
                buffer = ""
            # Flush text that sat directly under the previous Heading 1.
            if current_h1 and buffer:
                sections.append({
                    "heading": current_h1,
                    "content": buffer.strip(),
                    "full_content": buffer.strip()
                })
            current_h1 = text
            buffer = ""
        elif style.startswith("Heading 2"):
            if current_h2:
                sections.append({
                    "heading": current_h2,
                    "content": buffer.strip(),
                    "full_content": buffer.strip(),
                    "parent": current_h1
                })
            elif current_h1 and buffer:
                # Text between a Heading 1 and its first Heading 2 would
                # otherwise be lost; save it as the Heading 1 section.
                sections.append({
                    "heading": current_h1,
                    "content": buffer.strip(),
                    "full_content": buffer.strip()
                })
            current_h2 = text
            buffer = ""
        else:
            buffer += text + "\n"
    # Flush whatever is still buffered at the end of the document.
    if current_h2:
        sections.append({
            "heading": current_h2,
            "content": buffer.strip(),
            "full_content": buffer.strip(),
            "parent": current_h1
        })
    elif current_h1:
        sections.append({
            "heading": current_h1,
            "content": buffer.strip(),
            "full_content": buffer.strip()
        })
    return sections
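# Illustrative shape of the returned list (headings invented for the example):
#   [
#     {"heading": "Chapter 1", "content": "...", "full_content": "..."},
#     {"heading": "Background", "content": "...", "full_content": "...",
#      "parent": "Chapter 1"},
#   ]
# Heading 1 sections carry no "parent" key; Heading 2 sections record the
# Heading 1 they fall under.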
# ---------- Generate embeddings ----------
def generate_embeddings(sections, model):
    texts = [s["content"] for s in sections]
    embeddings = model.encode(
        texts,
        convert_to_numpy=True,
        normalize_embeddings=True,  # unit-norm vectors for better FAISS retrieval
        show_progress_bar=True
    )
    return embeddings.astype("float32")
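# Why normalization helps here: for unit vectors a and b,
# ||a - b||^2 = ||a||^2 + ||b||^2 - 2(a . b) = 2 - 2 * cos(a, b),
# so the L2 index built below ranks neighbors exactly as cosine
# similarity would.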
# ---------- Save FAISS index + metadata ----------
def save_faiss_and_metadata(embeddings, sections, base_name):
    # Write into a fresh temporary directory (mkdtemp creates it for us).
    temp_dir = tempfile.mkdtemp()
    d = embeddings.shape[1]
    index = faiss.IndexFlatL2(d)
    index.add(embeddings)
    faiss_path = os.path.join(temp_dir, f"faiss_index_{base_name}.bin")
    metadata_path = os.path.join(temp_dir, f"metadata_{base_name}.json")
    faiss.write_index(index, faiss_path)
    with open(metadata_path, "w", encoding="utf-8") as f:
        json.dump(sections, f, ensure_ascii=False, indent=2)
    print(f"✅ Saved:\n - {faiss_path}\n - {metadata_path}")
    return faiss_path, metadata_path
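# ---------- Example: querying a saved index (illustrative) ----------
# A minimal sketch, not wired into the Gradio app: it assumes the index and
# metadata produced above and the same embedding model. The name
# `search_sections` and the `top_k` parameter are illustrative choices.
def search_sections(query, faiss_path, metadata_path, model, top_k=3):
    index = faiss.read_index(faiss_path)
    with open(metadata_path, encoding="utf-8") as f:
        sections = json.load(f)
    # Encode the query the same way the sections were encoded.
    query_vec = model.encode([query], convert_to_numpy=True,
                             normalize_embeddings=True).astype("float32")
    distances, ids = index.search(query_vec, top_k)
    return [(sections[i]["heading"], float(d))
            for i, d in zip(ids[0], distances[0]) if i != -1]
# e.g. hits = search_sections("refund policy", faiss_path, metadata_path, model)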
def build_from_docx(docx_file_path):
    print(f"📄 Processing file: {docx_file_path}")
    sections = docx_to_sections(docx_file_path)
    print(f"🧩 Extracted {len(sections)} sections.")
    model = SentenceTransformer(EMBEDDING_MODEL_NAME)
    embeddings = generate_embeddings(sections, model)
    base_name = os.path.splitext(os.path.basename(docx_file_path))[0].lower()
    faiss_path, metadata_path = save_faiss_and_metadata(embeddings, sections, base_name)
    return "FAISS index and metadata files were created.", faiss_path, metadata_path
def process_docx(file_path):
    # With type="filepath", Gradio passes the uploaded file's path as a string,
    # so it is used directly; file paths returned here populate the File outputs.
    message, faiss_path, metadata_path = build_from_docx(file_path)
    return message, faiss_path, metadata_path
iface = gr.Interface(
    fn=process_docx,
    inputs=gr.File(file_count="single", type="filepath", label="Upload DOCX File"),
    outputs=[
        gr.Textbox(label="Output"),
        gr.File(label="Download FAISS Index"),
        gr.File(label="Download Metadata")
    ],
    title="Docx to FAISS & Metadata Generator",
    description="Upload a DOCX file; its contents are split into sections and turned into a FAISS index plus a metadata JSON file."
)
if __name__ == "__main__":
    iface.launch()
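# On Hugging Face Spaces the interface is served automatically. For local
# testing, run the script (e.g. `python app.py`, assuming that filename)
# and open the URL that launch() prints.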