devraj4522 committed
Commit 459c675
1 Parent(s): 3f47ce6

Upload 14 files

app.py ADDED
@@ -0,0 +1,62 @@
+ from flask import Flask, request, jsonify
+ import time
+ import ktrain
+
+ app = Flask(__name__)
+
+ # Load the ktrain BERT predictor saved under model/bert_model
+ predictor = ktrain.load_predictor('model/bert_model')
+
+ # Smoke-test the loaded predictor on a few sample sentences
+ print(predictor.predict('I love this product!'))
+ print(predictor.predict('I hate this product!'))
+ print(predictor.predict('I am so sad!'))
+ print(predictor.predict('I am so happy!'))
+
+ print(predictor.predict("I am looking for a job."))
+ print(predictor.predict("I like to play football."))
+ print(predictor.predict("I am going to the beach."))
+ print(predictor.predict("I am going to the hospital."))
+ print(predictor.predict("His son is very sick."))
+
+ @app.route('/')
+ def index():
+     response = {
+         'message': 'Social Media Emotion Analysis!'
+     }
+
+     return jsonify(response)
+
+ @app.route('/predict-str', methods=['POST'])
+ def predict_message():
+     data = request.json
+     message = data.get('message', '')
+     start_time = time.time()
+     prediction = predictor.predict(message)
+
+     response = {
+         'message': message,
+         'prediction': prediction,
+         'elapsed_time': time.time() - start_time
+     }
+
+     return jsonify(response)
+
+ @app.route('/predict-list', methods=['POST'])
+ def predict_list():
+     data = request.json
+     messages = data.get('messages', [])
+     start_time = time.time()
+     predictions = predictor.predict(messages)
+
+     response = {
+         'messages': messages,
+         'predictions': predictions,
+         'elapsed_time': time.time() - start_time
+     }
+
+     return jsonify(response)
+
+
+ if __name__ == '__main__':
+     app.run()
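
A minimal sketch of how a client could call the two endpoints above once the app is running; the host and port assume Flask's default local development server, and the requests library is already pinned in requirements.txt:

import requests

BASE_URL = 'http://127.0.0.1:5000'  # Flask development default; adjust for any other deployment

# /predict-str expects a JSON body of the form {"message": "..."}
r = requests.post(f'{BASE_URL}/predict-str', json={'message': 'I am so happy!'})
print(r.json())  # {'message': ..., 'prediction': ..., 'elapsed_time': ...}

# /predict-list expects {"messages": ["...", "..."]}
r = requests.post(f'{BASE_URL}/predict-list',
                  json={'messages': ['I love this product!', 'His son is very sick.']})
print(r.json())
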
create_model.py ADDED
@@ -0,0 +1,222 @@
+ from flask import Flask
+ from keras.models import load_model
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
+ import re
+ import numpy as np
+ import tensorflow as tf
+
+ app = Flask(__name__)
+
+ # Load the saved Keras model
+ new_model = load_model('m.h5')
+
+ # English stopwords (NLTK list), kept inline to avoid an nltk download at runtime
+ stopwords_lst = [
+     'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're",
+     "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his',
+     'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself',
+     'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this',
+     'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be',
+     'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
+     'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until',
+     'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
+     'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
+     'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once',
+     'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each',
+     'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only',
+     'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will',
+     'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o',
+     're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't",
+     'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't",
+     'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn',
+     "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"
+ ]
+
+ import pickle
+ import random
+ random.seed(42)
+
+ # Pickled one-hot encoder and Porter stemmer produced during preprocessing
+ with open('oh.pkl', 'rb') as f:
+     oh = pickle.load(f)
+ with open('ps.pkl', 'rb') as f:
+     ps = pickle.load(f)
+
+ def predict_emotion2(stri):
+     # Clean, lowercase, stem, and drop stopwords
+     review = re.sub('[^a-zA-Z]', ' ', stri)
+     review = review.lower()
+     review = review.split()
+     review = [ps.stem(word) for word in review if word not in stopwords_lst]
+     # Hash each stemmed word into one of 1000 integer buckets
+     review = [int(tf.strings.to_hash_bucket_fast(word, 1000)) for word in review]
+     onehot_repr = [review]
+     print(onehot_repr)
+     embed = pad_sequences(onehot_repr, padding='pre', maxlen=35)
+     predicti = new_model.predict(embed)
+     return np.argmax(predicti)
+
+ strs = ["I am surprised of my work", "I am happy of my work",
+         "I am sad of my work", "I love my country and I am happy"]
+ for s in strs:
+     print("em: ", predict_emotion2(s))
data/Emotion_final.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/preprocess_data.py ADDED
@@ -0,0 +1,104 @@
+ import numpy as np
+ import pandas as pd
+ import tensorflow as tf
+ import nltk
+ import seaborn as sns
+ import re
+ import matplotlib.pyplot as plt
+ import pickle
+ from tensorflow.keras.layers import Embedding, Dense, LSTM, Dropout, Bidirectional
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
+ from tensorflow.keras.models import Sequential
+ from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
+ from tensorflow.keras.callbacks import ModelCheckpoint
+ from nltk.corpus import stopwords
+ from nltk.stem.porter import PorterStemmer
+
+ from sklearn.model_selection import train_test_split
+ from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
+
+
+ df = pd.read_csv('model/Emotion_final.csv')   # Text data
+ EMBEDDING_FILE = 'model/glove.6B.100d.txt'    # GloVe file path
+
+
+ # Remove duplicated rows
+ index = df[df.duplicated() == True].index
+ df.drop(index, axis=0, inplace=True)
+ df.reset_index(inplace=True, drop=True)
+
+ # Remove duplicated text
+ index = df[df['Text'].duplicated() == True].index
+ df.drop(index, axis=0, inplace=True)
+ df.reset_index(inplace=True, drop=True)
+
+ df = df.dropna()                  # Drop rows with NA values
+ X = df.drop('Emotion', axis=1)    # Taking Text
+ y = df['Emotion']                 # Taking Emotion
+
+ messages = X.copy()
+ messages.reset_index(inplace=True)   # dropna may cause inconsistency in index
+
+ stopword_lst = pickle.load(open('/home/devraj4522/Desktop/ML Model/model/stopwords.pkl', 'rb'))
+
+ # nltk.download('stopwords')
+ ps = PorterStemmer()   # reduce word to root form
+ corpus = []
+ for i in range(0, len(messages)):
+     review = re.sub('[^a-zA-Z]', ' ', messages['Text'][i])   # Remove special characters
+     review = review.lower()                                  # Lower case
+     review = review.split()
+     review = [ps.stem(word) for word in review if word not in stopword_lst]   # Remove stopwords
+     review = ' '.join(review)
+     corpus.append(review)
+
+
+ with open("corpus.pkl", 'wb') as file:
+     pickle.dump(corpus, file)
+
+
+ with open("ps.pkl", 'wb') as file:
+     pickle.dump(ps, file)
+
+
+ # Create a dictionary with word as key and pretrained GloVe vector as value
+ def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')
+ embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(EMBEDDING_FILE))
+ all_embs = np.stack(embeddings_index.values())
+ emb_mean, emb_std = all_embs.mean(), all_embs.std()
+
+ voc_size = 10000   # Vocabulary size
+ embed_size = 100   # Word vector size
+
+ tokenizer = Tokenizer(num_words=voc_size)
+ tokenizer.fit_on_texts(list(corpus))
+ word_index = tokenizer.word_index         # Total words in the corpus
+ nb_words = min(voc_size, len(word_index))
+
+ # Initialize weight matrix for the embedding layer
+ embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
+
+ for word, i in word_index.items():
+     if i >= nb_words: continue                     # Skip words beyond the vocabulary size
+     embedding_vector = embeddings_index.get(word)  # Extract the pretrained values from GloVe
+     if embedding_vector is not None: embedding_matrix[i] = embedding_vector
+
+ # One-hot (hashing) representation for input
+ onehot_repr = [one_hot(words, voc_size) for words in corpus]
+
+ # Find the maximum number of words in a document
+ l = 0
+ for x in corpus:
+     l = max(l, len(x.split(' ')))
+
+ # Pad the sequences for input
+ sent_length = l
+ embedded_docs = pad_sequences(onehot_repr, padding='pre', maxlen=sent_length)
+ # print(embedded_docs)
+ with open("embedded_docs.pkl", 'wb') as file:
+     pickle.dump(embedded_docs, file)
+
+ with open("l.txt", 'w') as file:
+     file.write(str(l))
+
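
The layer imports above (Embedding, Bidirectional, LSTM, Dropout, Dense) suggest the network itself is assembled elsewhere, presumably in final_dev.ipynb. A minimal sketch of how the prepared embedding_matrix and embedded_docs would typically be wired into such a model; the LSTM width, dropout rate, split, and epoch count are illustrative assumptions, and it presumes the corpus vocabulary reaches voc_size so the one_hot indices fit inside the nb_words-row weight matrix:

# Sketch only: hyperparameters below are assumptions, not taken from this repository.
y_onehot = pd.get_dummies(y).values.astype('float32')   # one-hot encode the emotion labels
num_classes = y_onehot.shape[1]

model = Sequential([
    Embedding(nb_words, embed_size, weights=[embedding_matrix],
              input_length=sent_length, trainable=False),   # frozen GloVe vectors
    Bidirectional(LSTM(100)),
    Dropout(0.3),
    Dense(num_classes, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

X_train, X_test, y_train, y_test = train_test_split(
    embedded_docs, y_onehot, test_size=0.2, random_state=42)
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5, batch_size=64)
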
data/stopwords.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4363f21909e984f52cfe3c4df4d312aa9d8442362fe1be2830963668c62f6d26
+ size 1310
final_dev.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
m.h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:82896dd547dd7aac85766d657465e8527605db15e4046bef983d5bdd8355175a
+ size 13176168
model/bert_model/tf_model.h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:94059651edcb1be167a0e44d58e15087ecdebe35a373d6db78db7a5c40a68e54
+ size 1313085304
model/bert_model/tf_model.preproc ADDED
Binary file (871 kB).
 
oh.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e89cb1ea7281e0e10273f1ee97ec9a1fa88dbea5b9368ff673eadbc5435b8df6
+ size 51
ps.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c0d0b21442919eb6ceacb4283ef2e5b11ba64d9febaa8db0eed954d63ace0130
+ size 309
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ flask==2.2.3
+ requests==2.28.1
+ ktrain==0.37.0
test.py ADDED
@@ -0,0 +1,22 @@
+ import tensorflow as tf
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
+
+ # Set the vocabulary size and maximum sequence length
+ voc_size = 10000
+ sent_length = 20
+
+ # Define the corpus
+ corpus = ['example text 1', 'example text 2', 'example text 3']
+
+ # Generate hashed integer sequences for the corpus
+ hashed_docs = []
+ for text in corpus:
+     hashed_doc = [int(tf.strings.to_hash_bucket_fast(word, voc_size)) for word in text.split()]
+     print(*hashed_doc)
+     hashed_docs.append(hashed_doc)
+
+ # Pad the sequences to a fixed length
+ padded_docs = pad_sequences(hashed_docs, padding='pre', maxlen=sent_length)
+
+ print(padded_docs)
tokenizer.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b811e16b951279bcea922fe994e5432079dbf4c55da67a0b05ea1726d5807712
+ size 508602