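"""RAG pipeline built on LlamaIndex with a FAISS vector store.

Embeds processed document chunks (see document_processor) with a multilingual
MiniLM model, persists the index to disk, and exposes a query engine that
returns answers together with their source chunks.
"""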
from llama_index.core import (
    VectorStoreIndex,
    StorageContext,
    Settings,
    load_index_from_storage,
)
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.response_synthesizers import ResponseMode, get_response_synthesizer
from llama_index.core.postprocessor import SimilarityPostprocessor
from document_processor import (
    create_llama_documents,
    process_single_document,
    save_processed_chunks,
    load_processed_chunks,
)
import faiss
import pickle
import os

EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
EMBED_DIM = 384  # output dimension of the MiniLM embedding model above
RETRIEVER_TOP_K = 10
RETRIEVER_SIMILARITY_CUTOFF = 0.7
RAG_FILES_DIR = "processed_data"
PROCESSED_DATA_FILE = "processed_data/processed_chunks.csv"

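# Register the HuggingFace embedding model globally so every index and query
# built in this module uses the same multilingual encoder.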
def setup_llm_settings():
    embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL)
    Settings.embed_model = embed_model

def create_vector_index_with_faiss(documents):
    setup_llm_settings()

    # Exact inner-product search; HuggingFaceEmbedding normalizes embeddings
    # by default, so inner product behaves as cosine similarity here.
    faiss_index = faiss.IndexFlatIP(EMBED_DIM)
    vector_store = FaissVectorStore(faiss_index=faiss_index)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    index = VectorStoreIndex.from_documents(
        documents,
        storage_context=storage_context
    )

    return index, faiss_index

def create_retriever(index):
    # The similarity cutoff is not a VectorIndexRetriever argument (it would
    # be silently ignored); it is applied as a node postprocessor in
    # create_query_engine below.
    return VectorIndexRetriever(
        index=index,
        similarity_top_k=RETRIEVER_TOP_K
    )

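# TREE_SUMMARIZE builds the answer bottom-up over the retrieved chunks,
# which handles all ten top-k chunks without stuffing them into one prompt.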
def create_response_synthesizer():
    return get_response_synthesizer(
        response_mode=ResponseMode.TREE_SUMMARIZE,
        streaming=False
    )

def create_query_engine(index):
    retriever = create_retriever(index)
    response_synthesizer = create_response_synthesizer()

    return RetrieverQueryEngine(
        retriever=retriever,
        response_synthesizer=response_synthesizer,
        node_postprocessors=[
            SimilarityPostprocessor(similarity_cutoff=RETRIEVER_SIMILARITY_CUTOFF)
        ]
    )

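# Persist everything needed to rebuild the query engine without re-embedding:
# the raw FAISS index, the LlamaIndex storage context (docstore/index store),
# the original documents, per-chunk metadata, and the build configuration.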
def save_rag_system(index, faiss_index, documents):
    os.makedirs(RAG_FILES_DIR, exist_ok=True)
    
    faiss.write_index(faiss_index, os.path.join(RAG_FILES_DIR, 'faiss_index.index'))
    
    index.storage_context.persist(persist_dir=RAG_FILES_DIR)
    
    with open(os.path.join(RAG_FILES_DIR, 'documents.pkl'), 'wb') as f:
        pickle.dump(documents, f)
    
    metadata_dict = {}
    for doc in documents:
        metadata_dict[doc.id_] = doc.metadata
    
    with open(os.path.join(RAG_FILES_DIR, 'chunk_metadata.pkl'), 'wb') as f:
        pickle.dump(metadata_dict, f)
    
    config = {
        'embed_model_name': EMBEDDING_MODEL,
        'vector_dim': EMBED_DIM,
        'total_documents': len(documents),
        'index_type': 'faiss_flat_ip'
    }
    
    with open(os.path.join(RAG_FILES_DIR, 'config.pkl'), 'wb') as f:
        pickle.dump(config, f)

def load_rag_system():
    if not os.path.exists(os.path.join(RAG_FILES_DIR, 'faiss_index.index')):
        return None

    try:
        setup_llm_settings()

        # Reload the persisted vector store and index directly; this avoids
        # re-embedding the corpus and inserting duplicate vectors into the
        # already-populated FAISS index.
        vector_store = FaissVectorStore.from_persist_dir(RAG_FILES_DIR)
        storage_context = StorageContext.from_defaults(
            vector_store=vector_store,
            persist_dir=RAG_FILES_DIR
        )
        index = load_index_from_storage(storage_context=storage_context)

        query_engine = create_query_engine(index)
        return query_engine

    except Exception as e:
        print(f"Error loading RAG system: {str(e)}")
        return None

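# Full pipeline: wrap processed chunks as Documents, embed them into a fresh
# FAISS index, persist everything, and return a ready-to-use query engine.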
def build_rag_system(processed_chunks):
    setup_llm_settings()
    
    documents = create_llama_documents(processed_chunks)
    print(f"Created {len(documents)} documents for RAG system")
    
    index, faiss_index = create_vector_index_with_faiss(documents)
    query_engine = create_query_engine(index)
    
    save_rag_system(index, faiss_index, documents)
    
    return query_engine

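# Adding a document rebuilds the whole index from the merged chunk list.
# This keeps the persisted files consistent at the cost of re-embedding the
# full corpus; for large collections an incremental index.insert() per new
# document would be cheaper.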
def add_new_document_to_system(file_path, existing_query_engine):
    try:
        new_chunks = process_single_document(file_path)
        
        if not new_chunks:
            return existing_query_engine
        
        if os.path.exists(PROCESSED_DATA_FILE):
            existing_df = load_processed_chunks(PROCESSED_DATA_FILE)
            existing_chunks = existing_df.to_dict('records')
        else:
            existing_chunks = []
        
        all_chunks = existing_chunks + new_chunks
        save_processed_chunks(all_chunks, PROCESSED_DATA_FILE)
        
        query_engine = build_rag_system(all_chunks)
        
        print(f"Added {len(new_chunks)} new chunks from {os.path.basename(file_path)}")
        return query_engine
        
    except Exception as e:
        print(f"Error adding new document: {str(e)}")
        return existing_query_engine

def query_documents(query_engine, question):
    response = query_engine.query(question)
    return response

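# Flatten retrieval provenance into plain dicts: section/chunk identifiers
# from node metadata, a 200-character text preview, and the retrieval score.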
def get_response_sources(response):
    sources = []
    for i, node in enumerate(response.source_nodes):
        source_info = {
            'chunk_number': i + 1,
            'section': node.metadata.get('section', 'Not specified'),
            'subsection': node.metadata.get('subsection', 'Not specified'),
            'chunk_id': node.metadata.get('chunk_id', 'Not specified'),
            'document_id': node.metadata.get('document_id', 'Not specified'),
            'txt_file_id': node.metadata.get('txt_file_id', 'Not specified'),
            'file_link': node.metadata.get('file_link', 'Not specified'),
            'text_preview': node.text[:200] + "..." if len(node.text) > 200 else node.text,
            'score': node.score if node.score is not None else 0.0
        }
        sources.append(source_info)
    return sources

def format_response_with_sources(response):
    formatted_response = {
        'answer': response.response,
        'sources': get_response_sources(response)
    }
    return formatted_response

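# Smoke test: run each question through the engine, print the answer with its
# supporting chunks, and collect the results for offline inspection.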
def test_rag_system(query_engine, test_questions):
    results = []
    
    for question in test_questions:
        print(f"Question: {question}")
        response = query_documents(query_engine, question)
        formatted_response = format_response_with_sources(response)
        
        print(f"Answer: {formatted_response['answer']}")
        print("Sources:")
        for source in formatted_response['sources']:
            print(f"  - Chunk {source['chunk_number']}: {source['document_id']}")
            print(f"    Section: {source['section']}, Subsection: {source['subsection']}")
            print(f"    Preview: {source['text_preview']}")
        print("=" * 80)
        
        results.append({
            'question': question,
            'response': formatted_response
        })
    
    return results
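
if __name__ == "__main__":
    # Minimal usage sketch. It assumes processed_chunks.csv was already
    # produced by document_processor, and that an LLM is configured for
    # Settings.llm (LlamaIndex falls back to OpenAI, which needs
    # OPENAI_API_KEY). The test question is a placeholder.
    engine = load_rag_system()
    if engine is None and os.path.exists(PROCESSED_DATA_FILE):
        chunks = load_processed_chunks(PROCESSED_DATA_FILE).to_dict('records')
        engine = build_rag_system(chunks)
    if engine is not None:
        test_rag_system(engine, ["What topics do these documents cover?"])
    else:
        print(f"No processed chunks found at {PROCESSED_DATA_FILE}")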