# RAG pipeline for a Hugging Face Space: embeds uploaded documents with a
# multilingual E5 model, indexes them in Chroma, and answers questions with
# Falcon-7B-Instruct conditioned on the retrieved chunks.
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

from utils import extract_text_from_files
class RAGPipeline:
    """Retrieval-augmented generation over user-supplied documents.

    Embeds document chunks with a multilingual E5 model, stores them in a
    Chroma vector index, and answers questions with Falcon-7B-Instruct
    conditioned on the top-k retrieved chunks.
    """

    def __init__(self):
        # NOTE: downloads/loads two large models; the first call is slow and
        # device_map="auto" expects an accelerator for reasonable latency.
        print("[RAG] جاري تحميل النموذج والمحول...")
        self.embedding_model = HuggingFaceEmbeddings(
            model_name="intfloat/multilingual-e5-base"
        )
        self.generator = pipeline(
            "text-generation",
            model="tiiuae/falcon-7b-instruct",
            trust_remote_code=True,
            device_map="auto",
        )
        # Chroma vector index; built lazily by load_and_index().
        self.db = None
        print("[RAG] تم التحميل بنجاح.")

    def load_and_index(self, files):
        """Extract text from *files*, split it into chunks, and build the index.

        Returns an Arabic status message reporting the chunk count, or a
        warning when no text could be extracted.
        """
        text = extract_text_from_files(files)
        splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
        chunks = splitter.split_text(text)
        # Guard: Chroma.from_texts raises on an empty list, so fail soft when
        # the uploaded files contained no extractable text.
        if not chunks:
            return "⚠️ لم يتم العثور على نص في الملفات."
        self.db = Chroma.from_texts(chunks, embedding=self.embedding_model)
        return f"[RAG] تم بناء الفهرس لـ {len(chunks)} مقاطع."

    def answer_question(self, question, k=3):
        """Answer *question* grounded in the top-*k* retrieved chunks.

        Returns ``(answer, sources)`` where ``sources`` is the list of chunk
        texts used as context. ``k`` is new but defaulted, so existing
        callers are unaffected.
        """
        if self.db is None:
            return "⚠️ لم يتم تحميل مستندات.", []
        docs = self.db.similarity_search(question, k=k)
        # Compute the source texts once; reused for both the prompt context
        # and the returned references.
        sources = [doc.page_content for doc in docs]
        context = "\n".join(sources)
        prompt = (
            f"أجب عن السؤال التالي بناءً على المراجع التالية فقط:\n"
            f"{context}\n\nالسؤال: {question}\nالإجابة:"
        )
        result = self.generator(prompt, max_new_tokens=200)[0]["generated_text"]
        # The pipeline echoes the full prompt; keep only what follows the
        # final "الإجابة:" marker.
        answer = result.split("الإجابة:")[-1].strip()
        return answer, sources