#loading tfidf dataset (pre-clustered into 25 groups)
import pandas as pd
newsdf_sample = pd.read_excel("complete_tfidf_25.xlsx", engine="openpyxl")

print("number of documents:", len(newsdf_sample))

#preprocessing for better tokenization (needed for tfidf)
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
stopwords_list = stopwords.words('english')
stopwords_list

def process_row(row):
    import re
    from textblob import Word
    from string import punctuation
    from nltk.stem.snowball import SnowballStemmer    

    
    #Remove mail addresses (raw strings so \S is not treated as a string escape)
    row = re.sub(r'(\S+@\S+)(com|\s+com)', ' ', row)
    
    #Remove remaining usernames
    row = re.sub(r'(\S+@\S+)', ' ', row)
#     print('username',len(row.split()))
    
    #punctuation, curly quotes, dashes, digits and tabs
    #(special characters written as unicode escapes to avoid encoding issues)
    punctuation = punctuation + '\n' + '\u2014\u201c,\u201d\u2018-\u2019' + '0123456789' + "\t"
    row = ''.join(char for char in row if char not in punctuation)
#     print('punctuation',len(row.split()))
#     print('punctuation',row)
    
    #Lower case 
    row = row.lower()
#     print('lower',len(row.split()))
    
    #Stopwords
    stop = stopwords_list
    row = ' '.join(word for word in row.split() if word not in stop ) 
#     print('stop',len(row.split()))
#     print('stop',row)
    
#     Lemma
    row = " ".join([Word(word).lemmatize() for word in row.split()])
#     print('lemma',len(row.split()))
#     print('lemma',row)
    
    #Stemming
    stemmer = SnowballStemmer(language='english')
    row = " ".join([stemmer.stem(word) for word in row.split()])
#     print('stem',len(row.split()))
#     print('stem',row)
    
    #Collapse extra whitespace
    row = re.sub(r'\s+', ' ', row)
#     print('extra white',len(row.split()))
    
    row = " ".join([word for word in row.split() if len(word) > 2])

    return row
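# Illustrative sanity check (hypothetical input; exact output depends on the
# installed NLTK/TextBlob data): the e-mail address and stopwords are removed,
# tokens are lemmatized and stemmed, and words shorter than three characters dropped.
# print(process_row("Contact me at user@example.com about the running cars!"))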

import pickle
# Load the fitted KMeans model and TF-IDF vectorizer (files are closed after reading)
with open("kmeans_tfidf_25_complete.p", "rb") as f:
    kmeans_tfidf = pickle.load(f)
with open("tfidf_vectorizer_complete.p", "rb") as f:
    vectorizer = pickle.load(f)

import matplotlib.pyplot as plt
from wordcloud import WordCloud


dictt_cluster_words={}

for i in range(0,25):
  # print(i)
  temp_df = newsdf_sample[newsdf_sample.exp25==i]
  text_list= temp_df["tfidf_cleaned"].values
  text_list = [element for element in text_list if str(element) != "nan"]
  single_text = " ".join(text_list)
  wordcloud = WordCloud(width = 1000, height = 500, max_words=1000).generate(single_text)
  dictt_cluster_words[i] = wordcloud.words_
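# Each dictt_cluster_words[i] maps a word to its relative frequency within
# cluster i (WordCloud.words_ normalizes the most frequent word to 1.0).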


#summarization model


from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from transformers import pipeline
import torch



model_name = 'google/pegasus-cnn_dailymail'
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)

def return_summary(text):
  # Tokenize, generate with Pegasus, then decode and strip the <n> sentence markers
  src_text = [text]
  batch = tokenizer(src_text, truncation=True, padding="longest", return_tensors="pt").to(device)
  translated = model.generate(**batch)
  tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
  tgt_text = tgt_text[0].replace("<n>", " ")
  return tgt_text
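# Illustrative usage (assumes the "cleaned_doc" column holds raw document text):
# print(return_summary(newsdf_sample["cleaned_doc"].iloc[0]))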

############



def return_squad_answer(question, relevant_text):

  # Extractive QA pipeline (re-created on every call; could be hoisted out and reused)
  # Alternative checkpoint: "csarron/bert-base-uncased-squad-v1"
  qa_pipeline = pipeline(
    "question-answering",
    model="mvonwyl/distilbert-base-uncased-finetuned-squad2",
    tokenizer="mvonwyl/distilbert-base-uncased-finetuned-squad2",
  )

  predictions = qa_pipeline({
    'context': relevant_text,
    'question': question
  })

  print(predictions)
  return predictions["answer"]
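# The pipeline returns a dict with 'answer', 'score', 'start' and 'end';
# only the answer span is returned. Illustrative call:
# return_squad_answer("Who scored the goal?", "Gretzky scored the winning goal in overtime.")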

#keyword-based cluster selection (using the word-cloud frequencies),
#then document selection within the cluster based on the tfidf vector

import numpy as np
import math
def l2_norm(a):
    return math.sqrt(np.dot(a,a))

def cosine_similarity(a,b):
    denom = l2_norm(a) * l2_norm(b)
    return abs(np.dot(a,b) / denom) if denom else 0.0  # guard against zero vectors
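# Worked example: cosine_similarity(np.array([1.0, 0.0]), np.array([1.0, 1.0]))
# = 1 / (1 * sqrt(2)) ≈ 0.707; abs() keeps the score non-negative.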

def return_selected_cluster(ques):
  # Score each cluster by how strongly the question's tokens appear in its word cloud
  ques_clean = process_row(ques)
  count_tokens = len(ques_clean.split())
  if count_tokens == 0:
    return -1
  cluster_selected = -1
  cluster_score = 0
  for clus_id in dictt_cluster_words:
    score_temp = 0
    matched_token = 0
    dictt_temp = dictt_cluster_words[clus_id]
    for word in ques_clean.split():
      if word in dictt_temp:
        matched_token += 1
        score_temp += dictt_temp[word]
    # Weight the summed word frequencies by the fraction of question tokens matched
    score_temp *= (matched_token / count_tokens)
    if score_temp > cluster_score:
      cluster_selected = clus_id
      cluster_score = score_temp
  return cluster_selected


def get_summary_answer(Question):
  print("question: ", Question)
  cluster_selected = return_selected_cluster(Question)
  print("cluster selected - ",cluster_selected)
  temp_df = newsdf_sample[newsdf_sample.exp25==cluster_selected].copy()  # copy to avoid SettingWithCopyWarning
  # Vectorize the question and rank the cluster's documents by cosine similarity
  tfidf_ques = vectorizer.transform([process_row(Question)]).todense()
  cosine_score = []
  for sent in temp_df["tfidf_cleaned"].values:
    # Skip rows with missing cleaned text
    if not isinstance(sent, str):
      cosine_score.append(0.0)
      continue
    val = vectorizer.transform([sent]).todense()
    # print(np.array(tfidf_ques)[0], np.array(val)[0])
    cos_score = cosine_similarity(np.array(tfidf_ques)[0], np.array(val)[0])
    cosine_score.append(cos_score)

  temp_df["cos_score"] = cosine_score
  temp_df = temp_df.sort_values(by=['cos_score'], ascending=False)

  relevant_docs = temp_df["cleaned_doc"][:20]
  relevant_text = " ".join(relevant_docs)
  print("relevant_text", relevant_text)

  # print("summary - ",return_summary(relevant_text))
  # print("squad answer- ",return_squad_answer(ques, relevant_text))

  summary = return_summary(relevant_text)
  squad_answer = return_squad_answer(Question, relevant_text)

  relevant_text = " ".join(relevant_text.split()[:min(250,len(relevant_text.split()))])


  return relevant_text, summary, squad_answer
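# Illustrative end-to-end call (any natural-language question works):
# relevant_text, summary, answer = get_summary_answer("Which team won the Stanley Cup?")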


import gradio as gr
iface = gr.Interface(fn = get_summary_answer,
                     inputs = gr.Textbox(type="text", label="Type your question"),
                     outputs = [
                         gr.Textbox(type="text", label="Relevant text"),
                         gr.Textbox(type="text", label="Answer from Generative Model"),
                         gr.Textbox(type="text", label="Answer from SQuAD model"),
                     ],
                     title = "20NewsGroup_QA",
                     description = "Returns answers from the 20NewsGroup dataset")
iface.launch(inline = False, debug = True)
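# launch() serves the interface locally and blocks; pass share=True to get a
# temporary public link when running from a hosted notebook.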