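# Streamlit RAG chatbot for INSIEL: embeds the user's question, retrieves the
# most relevant chunks from a local Chroma vectorstore, and answers with
# TinyLlama running on CPU.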
import streamlit as st
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import chromadb
from chromadb.config import Settings
import torch
st.set_page_config(page_title="RAG Chatbot", page_icon="🤖", layout="wide")
st.title("🤖 RAG Chatbot - INSIEL")
import os
import subprocess

def run_ingest_if_needed():
    # Build the Chroma vectorstore on first launch if it does not exist yet
    if not os.path.exists("vectorstore"):
        st.info("Inizializzazione: generazione vectorstore in corso...")
        try:
            subprocess.run(["python", "rag_ingest.py"], check=True)
            st.success("Vectorstore generata correttamente ✅")
        except subprocess.CalledProcessError:
            st.error("Errore durante la generazione della vectorstore.")

run_ingest_if_needed()
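
# --- MODEL LOADING ---
# Both models are loaded once and cached across Streamlit reruns via st.cache_resource.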
@st.cache_resource
def load_models():
    # Embedding model (multilingual, so it handles Italian queries)
    embedder = SentenceTransformer("sentence-transformers/distiluse-base-multilingual-cased-v1")
    # LLM (TinyLlama on CPU)
    model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id).to(torch.device("cpu"))
    rag_chat = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=300, device=-1)
    return embedder, rag_chat

embedder, rag_chat = load_models()
# --- CHROMA DB SETUP ---
client = chromadb.PersistentClient(path="./vectorstore")
collection = client.get_or_create_collection(name="insiel_chunks")
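# NOTE: rag_ingest.py is assumed to populate this same ./vectorstore path and
# the "insiel_chunks" collection; the names here must stay in sync with it.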
# --- RESPONSE FUNCTION ---
def generate_rag_response_local(query, retrieved_chunks):
    context = "\n\n".join(retrieved_chunks)
    context = context[:3000]  # truncate to avoid overflowing the model's context window
    prompt = (
        "Rispondi alla domanda usando solo le informazioni nel contesto. "
        "Se la risposta non è presente, di' chiaramente che non è specificato nel documento.\n\n"
        f"Contesto:\n{context}\n\n"
        f"Domanda: \n{query}\n"
        "Risposta:"
    )
    result = rag_chat(prompt)[0]["generated_text"]
    # The pipeline returns prompt + completion, so keep only the text after "Risposta:"
    return result.split("Risposta:")[-1].strip()
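
# Hypothetical usage (question and chunk texts are for illustration only):
#   generate_rag_response_local("Domanda di esempio?", ["testo del chunk 1", "testo del chunk 2"])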
# --- USER INTERFACE ---
if "history" not in st.session_state:
    st.session_state.history = []

query = st.text_input("💬 Inserisci la tua domanda qui:")

if query:
    # 1. Embed the query
    query_embedding = embedder.encode([query])
    # 2. Retrieve the most similar chunks from Chroma
    results = collection.query(query_embeddings=query_embedding, n_results=3)
    retrieved_chunks = results["documents"][0]
    # 3. Generate the answer with the local model
    response = generate_rag_response_local(query, retrieved_chunks)
    # 4. Update the chat history (keep only the first two lines of the answer)
    st.session_state.history.append(("🧑‍💻 Tu", query))
    response_preview = "\n".join(response.strip().split("\n")[:2])
    st.session_state.history.append(("🤖 RAG Bot", response_preview))
# --- CHAT OUTPUT ---
if st.session_state.history:
    for speaker, msg in st.session_state.history:
        st.markdown(f"**{speaker}**: {msg}")

# --- SHOW THE CHUNKS USED ---
if query:
    with st.expander("📄 Mostra i documenti/chunk usati"):
        for i, chunk in enumerate(retrieved_chunks):
            st.markdown(f"**Chunk {i+1}**\n\n{chunk}\n\n---")