|
import numpy as np |
|
import faiss |
|
from datasets import load_dataset |
|
from sentence_transformers import SentenceTransformer |
|
from transformers import pipeline |
|
import streamlit as st |
|
|
|
|
|
# Load the training split of the Bitext customer-support dataset.
dataset = load_dataset("bitext/Bitext-customer-support-llm-chatbot-training-dataset", split="train")

# Keep only the first 1000 rows; zip() against a bounded range stops the
# iteration as soon as that many rows have been taken.
_rows = [row for _, row in zip(range(1000), dataset)]

# Parallel lists: questions[k] is the customer instruction and answers[k]
# is the response stored with it.
questions = [row["instruction"] for row in _rows]
answers = [row["response"] for row in _rows]
|
|
|
|
|
@st.cache_resource
def _load_models(corpus):
    """Build the embedder, question embeddings, FAISS index and generator.

    Streamlit re-executes the whole script on every user interaction.
    Caching this function keeps the models and the index alive across
    reruns, so they are not reloaded and re-embedded on each button click.

    Args:
        corpus: List of question strings to embed and index.

    Returns:
        A tuple ``(embedder, question_embeddings, index, generator)``.
    """
    embedder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

    # FAISS operates on C-contiguous float32 matrices, so make that explicit.
    question_embeddings = np.ascontiguousarray(
        embedder.encode(corpus, convert_to_tensor=False), dtype=np.float32
    )

    # Exact (brute-force) L2 index sized to the embedding width.
    index = faiss.IndexFlatL2(question_embeddings.shape[1])
    index.add(question_embeddings)

    generator = pipeline("text-generation", model="openai-community/gpt2")
    return embedder, question_embeddings, index, generator


# Module-level names are kept so the rest of the script can use them as before.
embedder, question_embeddings, index, generator = _load_models(questions)
dimension = question_embeddings.shape[1]
|
|
|
|
|
|
|
def retrieve_documents(query, top_k=2):
    """Return the stored answers whose questions are closest to *query*.

    Args:
        query: The user's question as a string.
        top_k: Maximum number of neighbours to fetch from the FAISS index.

    Returns:
        A list of dicts, nearest first, each with ``"text"`` (the stored
        answer) and ``"score"`` (the distance reported by the index, where
        smaller means closer). The list may hold fewer than ``top_k``
        entries if the index has fewer vectors than requested.
    """
    # FAISS expects a 2-D, C-contiguous float32 matrix with one row per query.
    query_embedding = np.ascontiguousarray(embedder.encode([query]), dtype=np.float32)
    distances, indices = index.search(query_embedding, top_k)

    # FAISS fills missing neighbours with index -1. Skip those slots so that
    # answers[-1] (the last answer) is never returned by accident.
    return [
        {"text": answers[idx], "score": dist}
        for dist, idx in zip(distances[0], indices[0])
        if idx >= 0
    ]
|
|
|
|
|
def generate_answer(query):
    """Generate an answer to *query* using retrieved answers as context.

    Args:
        query: The user's question as a string.

    Returns:
        A tuple ``(generated_text, confidence_score)``. ``generated_text``
        is the model's continuation of the prompt. ``confidence_score`` is
        the mean retrieval score of the documents used as context.
    """
    retrieved_docs = retrieve_documents(query)
    context = " ".join(doc["text"] for doc in retrieved_docs)

    input_text = f"Contexto: {context} Pergunta: {query}"

    # max_new_tokens budgets only the continuation. The previous
    # max_length=50 also counted the prompt tokens, and the retrieved
    # context alone normally exceeds that, leaving no room to generate.
    # return_full_text=False keeps the prompt out of the returned answer.
    outputs = generator(
        input_text,
        max_new_tokens=50,
        do_sample=False,
        return_full_text=False,
    )

    # NOTE: the scores are distances from the index, so a lower value means
    # the retrieved questions were closer to the query.
    confidence_score = np.mean([doc["score"] for doc in retrieved_docs])

    return outputs[0]['generated_text'], confidence_score
|
|
|
|
|
# Streamlit front end: a title, one text box and a button that runs the
# retrieval + generation pipeline and prints the result.
st.title("Assistente de Suporte ao Cliente com RAG - Hugging Face")

question = st.text_input("Digite sua pergunta:")

if st.button("Obter Resposta"):
    if question.strip():
        answer, confidence = generate_answer(question)
        st.write("Resposta:", answer)
        # Cast to a plain float so the rounded value renders as a number.
        st.write("Pontuação de Confiança:", round(float(confidence), 2))
    else:
        # Do not run retrieval and generation on an empty question.
        st.warning("Digite uma pergunta antes de solicitar a resposta.")
|
|
|
|
|
|