yogi commited on
Commit
0655e6e
1 Parent(s): edfe156

Upload amazon_text_sum.py

Browse files
Files changed (1) hide show
  1. amazon_text_sum.py +317 -0
amazon_text_sum.py ADDED
@@ -0,0 +1,317 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """Amazon_text_sum.ipynb
3
+
4
+ Automatically generated by Colaboratory.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1CD8zIL9GykU2qs8bHI-7l5akqA62b-jr
8
+ """
9
+
10
+ #import all the required libraries
11
+ import numpy as np
12
+ import pandas as pd
13
+ import pickle
14
+ from statistics import mode
15
+ import nltk
16
+ from nltk import word_tokenize
17
+ from nltk.stem import LancasterStemmer
18
+ nltk.download('wordnet')
19
+ nltk.download('stopwords')
20
+ nltk.download('punkt')
21
+ from nltk.corpus import stopwords
22
+ from tensorflow.keras.models import Model
23
+ from tensorflow.keras import models
24
+ from tensorflow.keras import backend as K
25
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
26
+ from tensorflow.keras.preprocessing.text import Tokenizer
27
+ from tensorflow.keras.utils import plot_model
28
+ from tensorflow.keras.layers import Input,LSTM,Embedding,Dense,Concatenate,Attention
29
+ from sklearn.model_selection import train_test_split
30
+ from bs4 import BeautifulSoup
31
+
32
# Load the first 100000 rows of the Amazon reviews dump.
reviews = pd.read_csv(
    "/content/drive/MyDrive/amazon_text_summarizer/Reviews.csv",
    nrows=100000,
)

# Notebook leftover: in a plain script this expression has no visible effect.
reviews.head(2)

# Keep one row per distinct review text, then drop rows with any missing value.
reviews = reviews.drop_duplicates(subset=['Text'])
reviews = reviews.dropna(axis=0)

# Review bodies are the model inputs; the short summaries are the targets.
input_data = reviews.loc[:, 'Text']
target_data = reviews.loc[:, 'Summary']
# Mark empty summaries as missing (runs after dropna, so they are not removed).
target_data.replace('', np.nan, inplace=True)
42
+
43
# Accumulators filled by the preprocessing loop further down.
input_texts = []
target_texts = []
input_words = []
target_words = []

# Lookup table mapping English contractions to their expanded form.
contractions = {
    "ain't": "is not", "aren't": "are not", "can't": "cannot", "'cause": "because",
    "could've": "could have", "couldn't": "could not",
    "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not",
    "hasn't": "has not", "haven't": "have not",
    "he'd": "he would", "he'll": "he will", "he's": "he is", "how'd": "how did",
    "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
    "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have",
    "I'm": "I am", "I've": "I have", "i'd": "i would",
    "i'd've": "i would have", "i'll": "i will", "i'll've": "i will have", "i'm": "i am",
    "i've": "i have", "isn't": "is not", "it'd": "it would",
    "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have",
    "it's": "it is", "let's": "let us", "ma'am": "madam",
    "mayn't": "may not", "might've": "might have", "mightn't": "might not",
    "mightn't've": "might not have", "must've": "must have",
    "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not",
    "needn't've": "need not have", "o'clock": "of the clock",
    "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not",
    "sha'n't": "shall not", "shan't've": "shall not have",
    "she'd": "she would", "she'd've": "she would have", "she'll": "she will",
    "she'll've": "she will have", "she's": "she is",
    "should've": "should have", "shouldn't": "should not",
    "shouldn't've": "should not have", "so've": "so have", "so's": "so as",
    "this's": "this is", "that'd": "that would", "that'd've": "that would have",
    "that's": "that is", "there'd": "there would",
    "there'd've": "there would have", "there's": "there is", "here's": "here is",
    "they'd": "they would", "they'd've": "they would have",
    "they'll": "they will", "they'll've": "they will have", "they're": "they are",
    "they've": "they have", "to've": "to have",
    "wasn't": "was not", "we'd": "we would", "we'd've": "we would have",
    "we'll": "we will", "we'll've": "we will have", "we're": "we are",
    "we've": "we have", "weren't": "were not", "what'll": "what will",
    "what'll've": "what will have", "what're": "what are",
    "what's": "what is", "what've": "what have", "when's": "when is",
    "when've": "when have", "where'd": "where did", "where's": "where is",
    "where've": "where have", "who'll": "who will", "who'll've": "who will have",
    "who's": "who is", "who've": "who have",
    "why's": "why is", "why've": "why have", "will've": "will have",
    "won't": "will not", "won't've": "will not have",
    "would've": "would have", "wouldn't": "would not",
    "wouldn't've": "would not have", "y'all": "you all",
    "y'all'd": "you all would", "y'all'd've": "you all would have",
    "y'all're": "you all are", "y'all've": "you all have",
    "you'd": "you would", "you'd've": "you would have", "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are", "you've": "you have",
}

# English stop-word set and the stemmer shared by every call to clean().
stop_words = set(stopwords.words('english'))
stemm = LancasterStemmer()
95
+
96
def clean(texts,src):
    """Normalise one raw review or summary into a list of word tokens.

    Args:
        texts: raw text, possibly containing HTML markup.
        src: "inputs" to additionally stem every surviving token; any other
            value (the script passes "target") keeps the words unstemmed.

    Returns:
        A list of lower-cased, purely alphabetic tokens of at least three
        characters with English stop words removed.
    """
    # Remove the HTML tags.
    texts = BeautifulSoup(texts, "lxml").text

    # Expand contractions BEFORE tokenizing. Every key of `contractions`
    # contains an apostrophe, so a lookup performed after the isalpha()
    # filter below could never match anything.
    texts = ' '.join(contractions.get(tok, tok) for tok in texts.lower().split())

    # Tokenize the text into words.
    words = word_tokenize(texts)

    # Keep only alphabetic tokens that are at least 3 characters long.
    words = [w for w in words if w.isalpha() and len(w) >= 3]

    # Drop stop words; source-side text is additionally stemmed to its root.
    if src == "inputs":
        words = [stemm.stem(w) for w in words if w not in stop_words]
    else:
        words = [w for w in words if w not in stop_words]
    return words
112
+
113
# Clean every (review, summary) pair and collect both the joined texts and
# the individual words.
for in_txt, tr_txt in zip(input_data, target_data):
    in_words = clean(in_txt, "inputs")
    input_texts.append(' '.join(in_words))
    input_words.extend(in_words)

    # Wrap the summary in 'sos' / 'eos' markers before cleaning it.
    tr_words = clean("sos " + tr_txt + " eos", "target")
    target_texts.append(' '.join(tr_words))
    target_words.extend(tr_words)
122
+
123
# Reduce both word lists to their sorted set of unique words.
input_words = sorted(set(input_words))
target_words = sorted(set(target_words))
num_in_words = len(input_words)  # total number of distinct input words
num_tr_words = len(target_words)  # total number of distinct target words

# Most frequent length among the cleaned input and target texts.
# NOTE(review): the texts are space-joined strings, so len() counts
# characters here, while these values are later used as the padded sequence
# length in tokens - confirm this is intended.
in_lengths = [len(text) for text in input_texts]
tr_lengths = [len(text) for text in target_texts]
max_in_len = mode(in_lengths)
max_tr_len = mode(tr_lengths)

print("number of input words : ", num_in_words)
print("number of target words : ", num_tr_words)
print("maximum input length : ", max_in_len)
print("maximum target length : ", max_tr_len)
137
+
138
# Hold out 20% of the (input, target) pairs for testing.
x_train, x_test, y_train, y_test = train_test_split(
    input_texts,
    target_texts,
    test_size=0.2,
    random_state=0,
)

# Fit one tokenizer per side on the training texts only.
in_tokenizer = Tokenizer()
in_tokenizer.fit_on_texts(x_train)
tr_tokenizer = Tokenizer()
tr_tokenizer.fit_on_texts(y_train)

# Replace every word by its integer index in the fitted vocabulary.
x_train = in_tokenizer.texts_to_sequences(x_train)
y_train = tr_tokenizer.texts_to_sequences(y_train)

# Pad with trailing zeros (or cut) so every sequence has the same length.
en_in_data = pad_sequences(x_train, maxlen=max_in_len, padding='post')
dec_data = pad_sequences(y_train, maxlen=max_tr_len, padding='post')

# Decoder input: every position except the last one.
dec_in_data = dec_data[:, :-1]
# Decoder target: the same sequence shifted one step ahead (first position
# dropped), with a trailing axis of size 1 added.
dec_tr_data = np.expand_dims(dec_data[:, 1:], axis=-1)
162
+
163
K.clear_session()
latent_dim = 500

# ---- Encoder ----
# Fixed-length integer sequences, embedded into latent_dim dimensions.
en_inputs = Input(shape=(max_in_len,))
en_embedding = Embedding(num_in_words + 1, latent_dim)(en_inputs)

# Three stacked LSTM layers; each returns the full sequence plus its states.
# LSTM 1
en_lstm1 = LSTM(latent_dim, return_sequences=True, return_state=True)
en_outputs1, state_h1, state_c1 = en_lstm1(en_embedding)

# LSTM 2
en_lstm2 = LSTM(latent_dim, return_sequences=True, return_state=True)
en_outputs2, state_h2, state_c2 = en_lstm2(en_outputs1)

# LSTM 3
en_lstm3 = LSTM(latent_dim, return_sequences=True, return_state=True)
en_outputs3, state_h3, state_c3 = en_lstm3(en_outputs2)

# Final hidden and cell state of the encoder.
en_states = [state_h3, state_c3]

# ---- Decoder ----
dec_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(num_tr_words + 1, latent_dim)
dec_embedding = dec_emb_layer(dec_inputs)

# The decoder LSTM starts from the encoder's final states.
dec_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
dec_outputs, *_ = dec_lstm(dec_embedding, initial_state=en_states)

# Attention with the decoder outputs as query and the encoder outputs as value.
attention = Attention()
attn_out = attention([dec_outputs, en_outputs3])

# Concatenate the attention output with the decoder outputs.
merge = Concatenate(axis=-1, name='concat_layer1')([dec_outputs, attn_out])

# Output layer: softmax over the target vocabulary (+1 for the padding index).
dec_dense = Dense(num_tr_words + 1, activation='softmax')
dec_outputs = dec_dense(merge)

# Assemble the training model, print its summary and plot it to a file.
model = Model([en_inputs, dec_inputs], dec_outputs)
model.summary()
plot_model(
    model,
    to_file='model_plot.png',
    show_shapes=True,
    show_layer_names=True,
)

model.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
model.fit(
    [en_in_data, dec_in_data],
    dec_tr_data,
    batch_size=512,
    epochs=10,
    validation_split=0.1,
)
220
+
221
# Persist the trained model, then reload it for inference.
model.save("s2s")

# ---- Encoder for inference ----
latent_dim = 500
model = models.load_model("s2s")

# Layer index 6 is taken to be the last encoder LSTM.
# NOTE(review): the hard-coded layer indices below presumably follow the
# order printed by model.summary() - confirm after any architecture change.
en_outputs, state_h_enc, state_c_enc = model.layers[6].output
en_states = [state_h_enc, state_c_enc]
# Maps the encoder input to the encoder sequence output and both states.
en_model = Model(model.input[0], [en_outputs, state_h_enc, state_c_enc])

# ---- Decoder for inference ----
# Placeholders for the previous hidden state, the previous cell state and
# the full encoder output sequence.
dec_state_input_h = Input(shape=(latent_dim,))
dec_state_input_c = Input(shape=(latent_dim,))
dec_hidden_state_input = Input(shape=(max_in_len, latent_dim))

# Reuse the trained decoder input, embedding and LSTM layers.
dec_inputs = model.input[1]
dec_emb_layer = model.layers[5]
dec_lstm = model.layers[7]
dec_embedding = dec_emb_layer(dec_inputs)

# Run the decoder LSTM starting from the externally supplied states.
dec_outputs2, state_h2, state_c2 = dec_lstm(
    dec_embedding,
    initial_state=[dec_state_input_h, dec_state_input_c],
)

# Reuse the trained attention layer against the supplied encoder outputs.
attention = model.layers[8]
attn_out2 = attention([dec_outputs2, dec_hidden_state_input])

merge2 = Concatenate(axis=-1)([dec_outputs2, attn_out2])

# Reuse the trained output layer.
dec_dense = model.layers[10]
dec_outputs2 = dec_dense(merge2)

# Inference decoder: (token, encoder outputs, h, c) -> (probabilities, h, c).
dec_model = Model(
    [dec_inputs, dec_hidden_state_input, dec_state_input_h, dec_state_input_c],
    [dec_outputs2, state_h2, state_c2],
)

# Index <-> word lookup tables taken from the fitted tokenizers.
reverse_target_word_index = tr_tokenizer.index_word
reverse_source_word_index = in_tokenizer.index_word
target_word_index = tr_tokenizer.word_index
# Index 0 is the padding value; render it as a blank.
reverse_target_word_index[0] = ' '
271
+
272
def decode_sequence(input_seq):
    """Greedily decode a summary for one padded, tokenized review.

    Args:
        input_seq: integer array passed straight to ``en_model.predict``;
            the script calls this with shape ``(1, max_in_len)``.

    Returns:
        The predicted words as one string, each word followed by a single
        space. A terminating 'eos' word is included when it was predicted.
    """
    # Encode the input once: encoder outputs plus the initial decoder states.
    en_out, en_h, en_c = en_model.predict(input_seq)

    # Start the target sequence with the 'sos' marker.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_word_index['sos']

    decoded_words = []
    while True:
        # Predict the next-word distribution and the updated decoder states.
        output_words, dec_h, dec_c = dec_model.predict(
            [target_seq, en_out, en_h, en_c]
        )

        # Pick the most probable word and look it up by index.
        word_index = np.argmax(output_words[0, -1, :])
        text_word = reverse_target_word_index[word_index]
        decoded_words.append(text_word)

        # Stop on the end marker or once max_tr_len words were produced.
        # The limit counts generated tokens, not characters of the output
        # string, because max_tr_len is the padded target sequence length.
        if text_word == "eos" or len(decoded_words) >= max_tr_len:
            break

        # Feed the predicted word and the new states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = word_index
        en_h, en_c = dec_h, dec_c

    # Same layout as before: every word is followed by one space.
    return "".join(word + " " for word in decoded_words)
305
+
306
# Read one review from the user and echo it back.
inp_review = input("Enter : ")
print("Review :", inp_review)

# Apply the same cleaning, tokenizing and padding as for the training inputs.
inp_review = clean(inp_review, "inputs")
inp_review = ' '.join(inp_review)
inp_x = in_tokenizer.texts_to_sequences([inp_review])
inp_x = pad_sequences(inp_x, maxlen=max_in_len, padding='post')

summary = decode_sequence(inp_x.reshape(1, max_in_len))
# Drop only the standalone 'eos' marker. A plain substring replace would
# also strip "eos" from inside ordinary words such as "videos".
summary = ' '.join(word for word in summary.split() if word != 'eos')
print("\nPredicted summary:", summary)
print("\n")