Spaces:

ramysaidagieb
/

Answer1

Sleeping

File size: 4,463 Bytes

import gradio as gr
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pdfminer.high_level import extract_text
import docx

# Module-level state shared by process_files() and answer_question().
# Sentence-embedding model, created once when the module is loaded; it encodes
# both the book chunks and the incoming questions.
embedding_model = SentenceTransformer('CAMeL-Lab/bert-base-arabic-camelbert-mix')
# FAISS index over the chunk embeddings; stays None until process_files()
# builds one.
index = None
# Text store; after a successful process_files() run it holds the chunks in
# the same order their embeddings were added to `index`.
texts = []

def extract_text_from_pdf(file_path):
    """Return the text extracted from the PDF at *file_path*.

    Extraction is best-effort: if ``extract_text`` raises, the error is
    printed and an empty string is returned instead of propagating.
    """
    content = ""
    try:
        content = extract_text(file_path)
    except Exception as err:
        print(f"Error extracting from PDF: {err}")
    return content

def extract_text_from_docx(file_path):
    """Return the paragraphs of the DOCX at *file_path*, one per line.

    Extraction is best-effort: if opening or reading the document raises, the
    error is printed and an empty string is returned instead of propagating.
    """
    try:
        document = docx.Document(file_path)
        paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
        joined = "\n".join(paragraph_texts)
    except Exception as err:
        print(f"Error extracting from DOCX: {err}")
        joined = ""
    return joined

def process_files(files, progress=gr.Progress()):
    """Extract, chunk, embed and index the uploaded books.

    Args:
        files: Uploaded entries from the file input. Each entry may be a path
            string or an object exposing its path as ``.name`` (some Gradio
            versions pass tempfile-like wrappers). Entries whose extension is
            not .pdf/.docx/.doc (case-insensitive) are skipped.
        progress: Gradio progress tracker used to report the current stage.

    Returns:
        A user-facing status message: success, a warning when nothing usable
        was uploaded/extracted, or an error message if any stage raised.

    Side effects:
        Replaces the module-level ``index`` and ``texts``. Any previous
        training is discarded before work starts, and the new pair is only
        published once the index is fully built, so the two never disagree.
    """
    global index, texts

    if not files:
        return "⚠️ لم يتم رفع أي ملفات. الرجاء رفع كتاب واحد على الأقل."

    # Reset BOTH globals together. Clearing only `texts` would leave a stale
    # index whose ids no longer match the text store if this run fails.
    index = None
    texts = []

    try:
        # Step 1: Extract text
        progress(0.1, desc="جاري استخراج النصوص من الكتب...")
        documents = []
        for entry in files:
            # Accept plain paths as well as file-like objects carrying `.name`.
            path = entry if isinstance(entry, str) else getattr(entry, "name", None)
            if not isinstance(path, str):
                continue

            lowered = path.lower()  # match ".PDF" / ".Docx" too
            if lowered.endswith(".pdf"):
                text = extract_text_from_pdf(path)
            elif lowered.endswith((".docx", ".doc")):
                text = extract_text_from_docx(path)
            else:
                continue

            if text:
                documents.append(text)

        if not documents:
            return "⚠️ لم يتم استخراج نصوص صالحة من الملفات."

        # Step 2: Chunk the text
        progress(0.4, desc="تقطيع النصوص إلى فقرات...")
        splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
        chunks = [
            chunk
            for document in documents
            for chunk in splitter.split_text(document)
        ]

        if not chunks:
            return "⚠️ لا يوجد محتوى نصي كافٍ للتدريب."

        # Step 3: Embed the text
        progress(0.7, desc="تحويل الفقرات إلى متجهات...")
        embeddings = embedding_model.encode(chunks, show_progress_bar=True)

        # Step 4: Build FAISS index (on a local first, so a failure here
        # cannot leave a half-built index visible to answer_question).
        progress(0.9, desc="بناء قاعدة بيانات البحث...")
        embeddings = np.array(embeddings).astype(np.float32)
        new_index = faiss.IndexFlatL2(embeddings.shape[1])
        new_index.add(embeddings)

        # Publish index and chunk store together: row i of the index
        # corresponds to texts[i].
        index = new_index
        texts = chunks

        return "✅ النظام جاهز للإجابة على أسئلتك"
    except Exception as e:
        # UI boundary: report the failure as a status message instead of raising.
        return f"❌ حدث خطأ أثناء التدريب: {str(e)}"

def answer_question(question):
    """Return the indexed chunk closest to *question*.

    Reads the module-level ``index`` and ``texts``. If no index has been
    built yet, or the text store is empty, a warning message is returned.
    Otherwise the question is embedded, the single nearest neighbour is
    looked up in the index, and the matching chunk text is returned. Any
    exception raised along the way is reported as an error message string.
    """
    not_trained = index is None or len(texts) == 0
    if not_trained:
        return "⚠️ الرجاء رفع كتبك وتدريب النظام أولاً."

    try:
        query_vector = np.array(embedding_model.encode([question])).astype(np.float32)

        _distances, neighbours = index.search(query_vector, k=1)
        best_match = neighbours[0][0]
        # An id of -1 is treated as "no neighbour found".
        if best_match == -1:
            return "❌ لم يتم العثور على إجابة."

        return texts[best_match]
    except Exception as err:
        return f"❌ حدث خطأ أثناء الإجابة: {str(err)}"

# Build the Gradio interface: file upload and training controls first, then
# the question/answer text boxes. Components are declared in display order.
with gr.Blocks() as demo:
    gr.Markdown("# 📚 نظام محاكاة دماغ المؤلف العربي\nارفع كتبك ودرب النظام للإجابة على أسئلتك باللغة العربية فقط.")

    # Multi-file upload restricted to the extensions process_files() handles.
    with gr.Row():
        file_input = gr.File(label="📄 ارفع ملفات الكتب (PDF أو DOCX)", file_types=['.pdf', '.docx', '.doc'], file_count="multiple")

    with gr.Row():
        train_button = gr.Button("🚀 ابدأ التدريب على الكتب")

    # Shows the status message returned by process_files().
    output_text = gr.Textbox(label="🔵 حالة التدريب")

    with gr.Row():
        question_input = gr.Textbox(label="✍️ اكتب سؤالك هنا")
        answer_output = gr.Textbox(label="🧠 إجابة النظام")

    # Clicking the train button runs process_files() on the uploaded files.
    train_button.click(fn=process_files, inputs=[file_input], outputs=[output_text])
    # The question textbox's submit event runs answer_question(); no separate
    # "ask" button is wired up.
    question_input.submit(fn=answer_question, inputs=[question_input], outputs=[answer_output])

# Start the app; this runs unconditionally whenever the module is executed.
demo.launch()