import pandas as pd
import pickle
import random
from sentence_transformers import SentenceTransformer
from utils import (
    encode,
    cosine_sim,
    top_candidates,
    candidates_reranking,
    intent_classification,
)
from collections import deque
from transformers import pipeline, AutoTokenizer
import torch
from dialog_tag import DialogTag


low_scoring_list = [
    "What does it mean?",
    "You have two strikes. Three strikes and you’re out. It’s a sports metaphor. Explain again!",
    "Again, urban slang. In which, I believe I’m gaining remarkable fluency. So, could you repeat?",
    "I’m confused.",
    "I can’t comment without violating our agreement that I don’t criticize you.",
    "Oh!",
    "I need to use the restroom.",
    "Move. Move. Move!",
    "I was going to mention it at the time, but then I thought, some day maybe...",
    "Well...",
    "Apparently... I have no idea!?",
    "I’m not sure...",
    "Nothing. I say nothing.",
    "Well, my friend. Focus and repeat!",
]


class ChatBot:
    def __init__(self):
        self.vect_data = []
        self.scripts = []
        self.conversation_history = deque([], maxlen=5)
        self.tag_model = None
        self.ranking_model = None
        self.reranking_model = None
        self.device = None
        self.tokenizer_reranking = None

    def load(self):
        """This method is called first to load all datasets and models
        used by the chat bot; all data files are expected in the data
        folder and the models are loaded from Hugging Face."""

        with open("data/scripts_vectors.pkl", "rb") as fp:
            self.vect_data = pickle.load(fp)
        self.scripts = pd.read_pickle("data/scripts.pkl")
        self.tag_model = DialogTag("distilbert-base-uncased")
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.ranking_model = SentenceTransformer(
            "Shakhovak/chatbot_sentence-transformer"
        )

        self.tokenizer_reranking = AutoTokenizer.from_pretrained("bert-base-uncased")
        self.reranking_model = pipeline(
            model="Shakhovak/RerankerModel_chat_bot",
            device=self.device,
            tokenizer=self.tokenizer_reranking,
        )

    def generate_response(self, utterance: str) -> str:
        """This function identifies potential candidates
        for the answer and ranks them."""

        intent = intent_classification(utterance, utterance, self.tag_model)
        query_encoding = encode(
            texts=utterance,
            intent=intent,
            model=self.ranking_model,
            contexts=self.conversation_history,
        )
        bot_cosine_scores = cosine_sim(
            self.vect_data,
            query_encoding,
        )
        top_scores, top_indexes = top_candidates(
            bot_cosine_scores, intent=intent, initial_data=self.scripts, top=10
        )
        print(top_scores)
        if top_scores[0] < 0.9:
            # No candidate is similar enough: fall back to a scripted reply
            # and reset the conversation history.
            answer = random.choice(low_scoring_list)
            self.conversation_history.clear()
        else:
            reranked_dict = candidates_reranking(
                top_indexes,
                self.conversation_history,
                utterance,
                self.scripts,
                self.reranking_model,
            )
            if len(reranked_dict) >= 1:
                # Sort the reranked candidates by score and answer with the
                # first one; otherwise fall back to the best cosine-similarity match.
                updated_top_candidates = dict(
                    sorted(reranked_dict.items(), key=lambda item: item[1])
                )
                answer = self.scripts.iloc[list(updated_top_candidates.keys())[0]][
                    "answer"
                ]
                print(self.scripts.iloc[top_indexes[0]]["answer"])
            else:
                answer = self.scripts.iloc[top_indexes[0]]["answer"]

        self.conversation_history.append(utterance)
        self.conversation_history.append(answer)

        return answer
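

# Minimal usage sketch (an assumption, not part of the original module): it
# expects the pickled files under data/ and the Hugging Face models referenced
# in load() to be available, and simply wires load() and generate_response()
# together with a hypothetical test utterance.
if __name__ == "__main__":
    bot = ChatBot()
    bot.load()
    print(bot.generate_response("Hello, how are you?"))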