# import parsing  # uncomment to download the data from the website and parse it
from string import punctuation
from tqdm.auto import tqdm, trange

import torch
from transformers import AutoTokenizer, AutoModel
import datasets

import pandas as pd
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import wordnet  # for lemmatization
from nltk import pos_tag  # for part-of-speech tagging
nltk.download('omw-1.4')  # Open Multilingual WordNet, needed by the lemmatizer
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')


import numpy as np
import os
import re  # regular expressions
import time

from sklearn.feature_extraction.text import CountVectorizer  # bag of words (BoW)
from sklearn.feature_extraction.text import TfidfVectorizer  # TF-IDF
from sklearn.metrics import pairwise_distances  # cosine distance
from sklearn.metrics.pairwise import cosine_similarity

from gensim.models import Word2Vec, KeyedVectors
import gensim.downloader as api

import gradio as gr




# Take Rachel as the main character
df = pd.read_csv("rachel_friends.csv")  # read the question/answer pairs into a DataFrame


#-------------------------------------TF-IDF------------------------------------------#

# Define function for text normalization
def text_normalization(text):
    text = str(text).lower()  # convert to lowercase
    spl_char_text = re.sub(r'[^a-z]', ' ', text)  # remove special characters and numbers
    tokens = nltk.word_tokenize(spl_char_text)  # tokenize into words
    lema = wordnet.WordNetLemmatizer()  # initialize the lemmatizer
    tags_list = pos_tag(tokens, tagset=None)  # part-of-speech tags
    lema_words = []
    for token, pos_token in tags_list:
        if pos_token.startswith('V'):  # verb
            pos_val = 'v'
        elif pos_token.startswith('J'):  # adjective
            pos_val = 'a'
        elif pos_token.startswith('R'):  # adverb
            pos_val = 'r'
        else:  # treat everything else as a noun
            pos_val = 'n'
        lema_token = lema.lemmatize(token, pos_val)  # lemmatize with the POS hint
        lema_words.append(lema_token)  # add the lemmatized word to the list
    return " ".join(lema_words)  # join the lemmas back into a sentence

# Preprocess data and insert to dataframe
question_normalized = df['question'].apply(text_normalization)
df.insert(2, 'Normalized question', question_normalized, True)

# Define function to delete stop words from the sentences
stop = stopwords.words('english')  # the English stop-word list
stop = []  # overridden with an empty list, so stop words are actually kept
def removeStopWords(text):
    Q = []
    for w in text.split():  # for every word in the sentence
        if w in stop:  # skip stop words
            continue
        Q.append(w)  # otherwise keep the word
    return " ".join(Q)  # rebuild the sentence from the kept words

# Preprocess data and insert to dataframe
question_norm_and_stop = df['Normalized question'].apply(removeStopWords)
df.insert(3, 'Normalized and StopWords question', question_norm_and_stop, True)

tfidf = TfidfVectorizer(ngram_range=(1, 3), max_features=5024)  # initialize TF-IDF over uni-, bi- and trigrams
x_tfidf = tfidf.fit_transform(df['Normalized and StopWords question']).toarray()  # vectorize the questions
features_tfidf = tfidf.get_feature_names_out()  # the learned n-gram vocabulary
df_tfidf = pd.DataFrame(x_tfidf, columns=features_tfidf)  # one TF-IDF weight per n-gram per question

# bot TF-IDF algorithm without context
def chat_tfidf(question):
    tidy_question = text_normalization(removeStopWords(question))  # clean & lemmatize the question
    tf = tfidf.transform([tidy_question]).toarray()  # convert the question into a vector
    cos = 1 - pairwise_distances(df_tfidf, tf, metric='cosine')  # cosine similarity to every stored question
    index_value = cos.argmax()  # index of the most similar question
    answer = df['answer'].loc[index_value]
    return answer
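
# Quick usage sketch (hypothetical input; the reply depends on the loaded dataset):
#   chat_tfidf("How are you doing?")
#   # -> Rachel's scripted answer to the most TF-IDF-similar question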

# bot TF-IDF algorithm with context
def chat_tfidf_context(question, history):

    if len(history) > 1:
        memory_weights = np.array([0.1, 0.3, 1.0])
        # keep only the last two turns, matching the bot's memory span
        history = history[-2:]
    else:
        memory_weights = np.array([0.3, 1.0])

    # one row per remembered turn plus one for the current question
    history_sentence = np.zeros(shape=(len(history) + 1, 5024))

    for ind, h in enumerate(history):
        # normalize the user side of each remembered turn
        tidy_question = text_normalization(removeStopWords(h[0]))
        # pass it through TF-IDF
        tf = tfidf.transform([tidy_question]).toarray()
        # weight older turns less than recent ones
        history_sentence[ind] = tf * memory_weights[ind]

    tidy_question = text_normalization(removeStopWords(question))
    tf = tfidf.transform([tidy_question]).toarray()

    history_sentence[-1] = tf
    history_sentence = history_sentence.mean(axis=0).reshape(1, -1)

    cos = 1 - pairwise_distances(df_tfidf, history_sentence, metric='cosine')
    index_value = cos.argmax()
    answer = df['answer'].loc[index_value]

    return answer
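
# The context trick used here (and in the W2V and BERT variants below): the final
# query vector is the mean of [0.1 * turn_{t-2}, 0.3 * turn_{t-1}, 1.0 * question],
# so the current question dominates while older turns still nudge the retrieval.
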
#-------------------------------------W2V------------------------------------------#
punkt = list(punctuation) + ["``", "''"]  # punctuation plus NLTK's quote tokens

def tokenize(sent: str) -> str:
    tokens = nltk.word_tokenize(sent.lower())  # tokenize words
    return ' '.join([word for word in tokens if word not in stop and word not in punkt])

questions_preprocessed = []
for question in df["question"].tolist() + df["answer"].tolist():
    questions_preprocessed.append(tokenize(question))

questions_w2v = [sent.split(" ") for sent in questions_preprocessed]

w2v = KeyedVectors.load('w2v.bin')  # pre-trained 25-dimensional word vectors
unknown_vector = np.random.uniform(low=-0.2, high=0.2, size=(25,))  # fallback for out-of-vocabulary words

# define function to embed a sentence as the mean of its word vectors
def w2v_get_vector_for_sentence(sentence):
    sent = nltk.word_tokenize(sentence.lower())
    sent = [word for word in sent if word not in punkt]
    sentence_vector = []
    if len(sent) == 0:
        sentence_vector.append(unknown_vector)  # empty sentence: fall back to the unknown vector
    else:
        for word in sent:
            if word in w2v.key_to_index:
                sentence_vector.append(w2v[word])
            else:
                sentence_vector.append(unknown_vector)  # out-of-vocabulary word

    return np.array(sentence_vector).mean(axis=0)

# create the retrieval base: one w2v sentence vector per question
base = np.zeros(shape=(len(df.question), 25))
for ind, sentence in enumerate(df['question']):
    base[ind] = w2v_get_vector_for_sentence(sentence)

# bot w2v algorithm without context
def chat_word2vec(question):
    question = [w2v_get_vector_for_sentence(question)]
    cos = 1 - pairwise_distances(base, question, metric='cosine')  # cosine similarity to every stored question
    index_value = cos.argmax()  # index of the most similar question
    answer = df['answer'].loc[index_value]
    return answer
    
# bot w2v algorithm with context
def chat_word2vec_context(question, history):

    if len(history) > 1:
        memory_weights = np.array([0.1, 0.3, 1.0])
        # keep only the last two turns, matching the bot's memory span
        history = history[-2:]
    else:
        memory_weights = np.array([0.3, 1.0])

    # one row per remembered turn plus one for the current question
    history_sentence = np.zeros(shape=(len(history) + 1, 25))

    for ind, h in enumerate(history):
        sentence = w2v_get_vector_for_sentence(h[0])
        history_sentence[ind] = sentence * memory_weights[ind]

    question = w2v_get_vector_for_sentence(question)

    history_sentence[-1] = question
    history_sentence = history_sentence.mean(axis=0).reshape(1, -1)

    cos = 1 - pairwise_distances(base, history_sentence, metric='cosine')
    index_value = cos.argmax()
    answer = df['answer'].loc[index_value]

    return answer

#-------------------------------------BERT------------------------------------------#
 
# Use a plain DistilBERT encoder as the retriever
model_name = "distilbert/distilbert-base-uncased"
device = "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

class BERTSearchEngine:
    def __init__(self, model, tokenizer, text_database):
        self.raw_processed_data = [self.preprocess(sample, tokenizer) for sample in text_database]
        self.base = []
        self.retriever = None
        self.inverted_index = {}
        self._init_retriever(model, tokenizer, text_database)
        self._init_inverted_index(text_database)

    @staticmethod
    def preprocess(sentence: str, tokenizer):
        return tokenizer(sentence, padding=True, truncation=True, return_tensors='pt')

    def _embed_bert_cls(self, tokenized_text: dict[str, torch.Tensor]) -> np.ndarray:
        with torch.no_grad():
            model_output = self.retriever(**{k: v.to(self.retriever.device) for k, v in tokenized_text.items()})
        embeddings = model_output.last_hidden_state[:, 0, :]  # [CLS] token embedding
        embeddings = torch.nn.functional.normalize(embeddings)
        return embeddings[0].cpu().numpy()

    def _init_retriever(self, model, tokenizer, text_database):
        self.retriever = model
        self.tokenizer = tokenizer
        # precomputed question embeddings; to rebuild them from scratch:
        # self.base = np.array([self._embed_bert_cls(self.preprocess(text, tokenizer)) for text in tqdm(text_database)])
        self.base = np.load("bert_base.npy")

    def retrieve(self, query: str) -> np.ndarray:
        return self._embed_bert_cls(self.preprocess(query, self.tokenizer))

    def retrieve_documents(self, query: str, top_k=3) -> list[int]:
        query_vector = self.retrieve(query)
        cosine_similarities = cosine_similarity([query_vector], self.base).flatten()
        relevant_indices = np.argsort(cosine_similarities, axis=0)[::-1][:top_k]
        return relevant_indices.tolist()

    def _init_inverted_index(self, text_database: list[str]):
        self.inverted_index = dict(enumerate(text_database))

    def display_relevant_docs(self, query, full_database, top_k=3) -> list[str]:
        docs_indexes = self.retrieve_documents(query, top_k=top_k)
        return [self.inverted_index[ind] for ind in docs_indexes]

    def find_answer(self, query: str) -> int:
        query_vector = self.retrieve(query)
        cosine_similarities = cosine_similarity([query_vector], self.base).flatten()
        return np.argmax(cosine_similarities, axis=0)

simple_search_engine = BERTSearchEngine(model, tokenizer, df["question"])
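
# Usage sketch (hypothetical query; results depend on the corpus):
#   simple_search_engine.display_relevant_docs("We were on a break!", df["question"], top_k=3)
#   # -> the three stored questions closest to the query in [CLS]-embedding space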

# bot bert algorithm without context
def chat_bert(question):
    ind = simple_search_engine.find_answer(question)
    answer = df['answer'].iloc[ind]
    return answer

# bot bert algorithm with context
def chat_bert_context(question, history):

    if len(history) > 1:
        memory_weights = np.array([0.1, 0.3, 1.0])
        # keep only the last two turns, matching the bot's memory span
        history = history[-2:]
    else:
        memory_weights = np.array([0.3, 1.0])

    # one row per remembered turn plus one for the current question
    history_sentence = np.zeros(shape=(len(history) + 1, 768))

    for ind, h in enumerate(history):
        sentence = simple_search_engine.retrieve(h[0])  # embed the user side of the turn
        history_sentence[ind] = sentence * memory_weights[ind]

    question = simple_search_engine.retrieve(question)

    history_sentence[-1] = question
    history_sentence = history_sentence.mean(axis=0).reshape(1, -1)

    cosine_similarities = cosine_similarity(history_sentence, simple_search_engine.base).flatten()
    relevant_index = np.argmax(cosine_similarities, axis=0)
    answer = df['answer'].loc[relevant_index]

    return answer
    
#-------------------------------------Bi-BERT-Encoder------------------------------------------#
MAX_LENGTH = 128
inverted_answer = dict(enumerate(df.answer.tolist()))  # answer lookup by row index
# Define function for mean pooling over token embeddings, ignoring padding
def mean_pool(token_embeds: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    in_mask = attention_mask.unsqueeze(-1).expand(token_embeds.size()).float()
    pool = torch.sum(token_embeds * in_mask, 1) / torch.clamp(in_mask.sum(1), min=1e-9)
    return pool
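
# For token embeddings of shape (batch, seq_len, hidden) and a 0/1 attention mask
# of shape (batch, seq_len), mean_pool averages only the unmasked token vectors,
# so padding positions do not dilute the sentence embedding.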

# Define function that tokenizes sentences and encodes them into pooled embeddings
def encode(input_texts: list[str], tokenizer: AutoTokenizer, model: AutoModel,
           device: str = "cpu") -> torch.Tensor:
    model.eval()
    tokenized_texts = tokenizer(input_texts, max_length=MAX_LENGTH,
                                padding='max_length', truncation=True, return_tensors="pt")
    token_embeds = model(tokenized_texts["input_ids"].to(device),
                         tokenized_texts["attention_mask"].to(device)).last_hidden_state
    pooled_embeds = mean_pool(token_embeds, tokenized_texts["attention_mask"].to(device))
    return pooled_embeds

# Define architecture for bi-bert-encoder
class Sbert(torch.nn.Module):
    def __init__(self, max_length: int = 128):
        super().__init__()
        self.max_length = max_length
        self.bert_model = AutoModel.from_pretrained('distilbert-base-uncased')
        self.bert_tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
        self.linear = torch.nn.Linear(self.bert_model.config.hidden_size * 3, 1)
        # self.sigmoid = torch.nn.Sigmoid()

    def forward(self, data: datasets.arrow_dataset.Dataset) -> torch.Tensor:
        question_input_ids = data["question_input_ids"].to(device)
        question_attention_mask = data["question_attention_mask"].to(device)
        answer_input_ids = data["answer_input_ids"].to(device)
        answer_attention_mask = data["answer_attention_mask"].to(device)

        out_question = self.bert_model(question_input_ids, question_attention_mask)
        out_answer = self.bert_model(answer_input_ids, answer_attention_mask)
        question_embeds = out_question.last_hidden_state
        answer_embeds = out_answer.last_hidden_state

        pooled_question_embeds = mean_pool(question_embeds, question_attention_mask)
        pooled_answer_embeds = mean_pool(answer_embeds, answer_attention_mask)

        embeds = torch.cat([pooled_question_embeds, pooled_answer_embeds,
                            torch.abs(pooled_question_embeds - pooled_answer_embeds)],
                           dim=-1)
        # return self.sigmoid(self.linear(embeds))
        return self.linear(embeds)
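
# The (u, v, |u - v|) concatenation in forward() is the classification head from
# Sentence-BERT (Reimers & Gurevych, 2019): u and v are the pooled question and
# answer embeddings, and the linear layer learns how well they match.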

# Initialize the model
model_bi_encoder = Sbert().to(device)
# Load the fine-tuned weights from the training step
# (note: from_pretrained returns a new model rather than loading in place, so assign the result)
model_bi_encoder.bert_model = AutoModel.from_pretrained("models/friends_bi_encoder").to(device)

# Load question embeds
question_embeds = np.load("bi_bert_question.npy")

def chat_bi_bert(question, history):
    question = encode(question, model_bi_encoder.bert_tokenizer, model_bi_encoder.bert_model, device).squeeze().cpu().detach().numpy()
    cosine_similarities = cosine_similarity([question], question_embeds).flatten()
    top_index = np.argmax(cosine_similarities, axis=0)
    answer = inverted_answer[top_index]
    return answer



#-------------------------------------Bi+Cross-BERT-Encoder------------------------------------------#


# Define class for the cross-encoder: scores a (question, answer) pair jointly
class CrossEncoderBert(torch.nn.Module):
    def __init__(self, max_length: int = MAX_LENGTH):
        super().__init__()
        self.max_length = max_length
        self.bert_model = AutoModel.from_pretrained('distilbert-base-uncased')
        self.bert_tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
        self.linear = torch.nn.Linear(self.bert_model.config.hidden_size, 1)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.bert_model(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0]  # Use the CLS token's output
        return self.linear(pooled_output)

model_cross_encoder = CrossEncoderBert().to(device)
# Load the fine-tuned weights (from_pretrained returns a new model, so assign the result)
model_cross_encoder.bert_model = AutoModel.from_pretrained("models/friends_cross_encoder").to(device)

def chat_cross_bert(question, history):
    # Stage 1: bi-encoder retrieves the top-5 candidate answers
    question_encoded = encode(question, model_bi_encoder.bert_tokenizer, model_bi_encoder.bert_model, device).squeeze().cpu().detach().numpy()
    cosine_similarities = cosine_similarity([question_encoded], question_embeds).flatten()
    topk_indices = np.argsort(cosine_similarities, axis=0)[::-1][:5].tolist()
    corpus = [inverted_answer[ind] for ind in topk_indices]

    # Stage 2: cross-encoder re-ranks the candidates jointly with the question
    queries = [question] * len(corpus)
    tokenized_texts = model_cross_encoder.bert_tokenizer(
        queries, corpus, max_length=MAX_LENGTH, padding=True, truncation=True, return_tensors="pt"
    ).to(device)

    # Finetuned cross-encoder model scoring
    with torch.no_grad():
        ce_scores = model_cross_encoder(tokenized_texts['input_ids'], tokenized_texts['attention_mask']).squeeze(-1)
        ce_scores = torch.sigmoid(ce_scores)  # map logits to (0, 1)

    # Pick the candidate with the highest cross-encoder score
    scores = ce_scores.cpu().numpy()
    ix = np.argmax(scores)
    return corpus[ix]
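
# Two-stage retrieve-then-rerank: the cheap bi-encoder narrows the answer pool to
# a handful of candidates, then the slower but more precise cross-encoder, which
# attends over question and answer jointly, picks the final reply.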

# gradio part
def echo(message, history, model):

    if model == "TF-IDF":
        # answer = chat_tfidf(message)
        answer = chat_tfidf_context(message, history)
        return answer

    elif model == "W2V":
        # answer = chat_word2vec(message)
        answer = chat_word2vec_context(message, history)
        return answer

    elif model == "BERT":
        answer = chat_bert_context(message, history)
        return answer

    elif model == "Bi-BERT-Encoder":
        answer = chat_bi_bert(message, history)
        return answer

    elif model == "Bi+Cross-BERT-Encoder":
        answer = chat_cross_bert(message, history)
        return answer




title = "Chatbot who speaks like Rachel from Friends"
description = "You have a good opportunity to have a dialog with actress from Friends - Rachel Green"

# model = gr.CheckboxGroup(["TF-IDF", "W2V", "BERT", "BI-Encoder", "Cross-Encoder"], label="Model", info="What model do you want to use?", value="TF-IDF")
model = gr.Dropdown(["TF-IDF", "W2V", "BERT", "Bi-BERT-Encoder", "Bi+Cross-BERT-Encoder"], label="Retrieval model", info="What model do you want to use?", value="TF-IDF")

with gr.Blocks() as demo:

    gr.ChatInterface(
        fn=echo,
        title=title,
        description=description,
        additional_inputs=[model],
        retry_btn=None,
        undo_btn=None,
        clear_btn=None,
    )

demo.launch(debug=False, share=True)