devraj4522 committed
Commit 459c675
Parent(s): 3f47ce6
Upload 14 files
Browse files
- app.py +62 -0
- create_model.py +222 -0
- data/Emotion_final.csv +0 -0
- data/preprocess_data.py +104 -0
- data/stopwords.pkl +3 -0
- final_dev.ipynb +0 -0
- m.h5 +3 -0
- model/bert_model/tf_model.h5 +3 -0
- model/bert_model/tf_model.preproc +0 -0
- oh.pkl +3 -0
- ps.pkl +3 -0
- requirements.txt +3 -0
- test.py +22 -0
- tokenizer.pickle +3 -0
app.py
ADDED
@@ -0,0 +1,62 @@
from flask import Flask, request, jsonify
import time
import ktrain

app = Flask(__name__)

# Load the ktrain BERT predictor saved under model/bert_model
predictor = ktrain.load_predictor('model/bert_model')

# Smoke-test the predictor at startup
print(predictor.predict('I love this product!'))
print(predictor.predict('I hate this product!'))
print(predictor.predict('I am so sad!'))
print(predictor.predict('I am so happy!'))

print(predictor.predict("I am looking for a job."))
print(predictor.predict("I like to play football."))
print(predictor.predict("I am going to the beach."))
print(predictor.predict("I am going to the hospital."))
print(predictor.predict("His son is very sick."))


@app.route('/')
def index():
    response = {
        'message': 'Social Media Emotion Analysis!'
    }
    return jsonify(response)


@app.route('/predict-str', methods=['POST'])
def predict_message():
    data = request.json
    message = data.get('message', '')
    start_time = time.time()
    prediction = predictor.predict(message)

    response = {
        'message': message,
        'prediction': prediction,
        'elapsed_time': time.time() - start_time
    }
    return jsonify(response)


@app.route('/predict-list', methods=['POST'])
def predict_list():
    data = request.json
    messages = data.get('messages', [])
    start_time = time.time()
    predictions = predictor.predict(messages)

    response = {
        'messages': messages,
        'predictions': predictions,
        'elapsed_time': time.time() - start_time
    }
    return jsonify(response)


if __name__ == '__main__':
    app.run()
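For reference, a minimal sketch of calling these endpoints from a client, assuming the app is running locally on Flask's default port 5000 (the requests package is pinned in requirements.txt):

import requests

# Single message -> /predict-str
r = requests.post('http://127.0.0.1:5000/predict-str',
                  json={'message': 'I am so happy!'})
print(r.json())   # {'message': ..., 'prediction': ..., 'elapsed_time': ...}

# Batch of messages -> /predict-list
r = requests.post('http://127.0.0.1:5000/predict-list',
                  json={'messages': ['I am so happy!', 'His son is very sick.']})
print(r.json())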
create_model.py
ADDED
@@ -0,0 +1,222 @@
from flask import Flask
from keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
from tensorflow.keras.preprocessing.text import one_hot as oh
import numpy as np
import tensorflow as tf

app = Flask(__name__)

# Load the saved model
new_model = load_model('m.h5')


# English stopwords (NLTK's standard list), hard-coded here
stopwords_lst = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
                 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours',
                 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she',
                 "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself',
                 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
                 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am',
                 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has',
                 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the',
                 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of',
                 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
                 'through', 'during', 'before', 'after', 'above', 'below', 'to',
                 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
                 'again', 'further', 'then', 'once', 'here', 'there', 'when',
                 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few',
                 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not',
                 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't',
                 'can', 'will', 'just', 'don', "don't", 'should', "should've",
                 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren',
                 "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn',
                 "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven',
                 "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',
                 "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn',
                 "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won',
                 "won't", 'wouldn', "wouldn't"]

import pickle
import random

random.seed(42)

with open('oh.pkl', 'rb') as f:
    oh = pickle.load(f)   # shadows the one_hot import above
with open('ps.pkl', 'rb') as f:
    ps = pickle.load(f)   # the PorterStemmer pickled by data/preprocess_data.py


def predict_emotion2(stri):
    review = re.sub('[^a-zA-Z]', ' ', stri)   # keep letters only
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in stopwords_lst]
    # Hash each stemmed word into one of 1000 buckets (test.py uses 10000)
    review = [int(tf.strings.to_hash_bucket_fast(word, 1000)) for word in review]
    onehot_repr = [review]
    print(onehot_repr)
    embed = pad_sequences(onehot_repr, padding='pre', maxlen=35)
    predicti = new_model.predict(embed)
    return np.argmax(predicti)


strs = ["I am surprised of my work", "I am happy of my work",
        "I am sad of my work", "I love my country and I am happy"]
for s in strs:
    print("em:", predict_emotion2(s))
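As an aside, the hard-coded stopword list above duplicates the one shipped in this commit; a minimal sketch of loading it instead, assuming data/stopwords.pkl holds the same pickled list used by data/preprocess_data.py:

import pickle

# Hypothetical alternative to the hard-coded list above; assumes
# data/stopwords.pkl contains the pickled stopword list from preprocessing.
with open('data/stopwords.pkl', 'rb') as f:
    stopwords_lst = pickle.load(f)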
data/Emotion_final.csv
ADDED
The diff for this file is too large to render.
data/preprocess_data.py
ADDED
@@ -0,0 +1,104 @@
import numpy as np
import pandas as pd
import tensorflow as tf
import nltk
import seaborn as sns
import re
import matplotlib.pyplot as plt
import pickle
from tensorflow.keras.layers import Embedding, Dense, LSTM, Dropout, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.callbacks import ModelCheckpoint
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


df = pd.read_csv('data/Emotion_final.csv')    # text data
EMBEDDING_FILE = 'model/glove.6B.100d.txt'    # GloVe file path

# Remove duplicated rows
index = df[df.duplicated()].index
df.drop(index, axis=0, inplace=True)
df.reset_index(inplace=True, drop=True)

# Remove duplicated text
index = df[df['Text'].duplicated()].index
df.drop(index, axis=0, inplace=True)
df.reset_index(inplace=True, drop=True)

df = df.dropna()                  # drop rows with NA values
X = df.drop('Emotion', axis=1)    # text
y = df['Emotion']                 # emotion labels

messages = X.copy()
messages.reset_index(inplace=True)   # dropna can leave gaps in the index

stopword_lst = pickle.load(open('data/stopwords.pkl', 'rb'))

# nltk.download('stopwords')
ps = PorterStemmer()   # reduces each word to its root form
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['Text'][i])   # remove special characters
    review = review.lower()                                  # lower case
    review = review.split()
    review = [ps.stem(word) for word in review if word not in stopword_lst]   # remove stopwords
    review = ' '.join(review)
    corpus.append(review)


with open("corpus.pkl", 'wb') as file:
    pickle.dump(corpus, file)


with open("ps.pkl", 'wb') as file:
    pickle.dump(ps, file)


# Build the dictionary with each word as key and its pretrained GloVe vector as value
def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')

embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(EMBEDDING_FILE))
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()

voc_size = 10000   # vocabulary size
embed_size = 100   # word-vector size

tokenizer = Tokenizer(num_words=voc_size)
tokenizer.fit_on_texts(list(corpus))
word_index = tokenizer.word_index   # every word in the corpus
nb_words = min(voc_size, len(word_index))

# Initialize the embedding-layer weight matrix from the GloVe distribution,
# then overwrite the rows for words GloVe actually covers
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))

for word, i in word_index.items():
    if i >= nb_words:
        continue   # skip words beyond the matrix size
    embedding_vector = embeddings_index.get(word)   # pretrained GloVe vector, if any
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# One-hot (hashing-trick) representation of the corpus
onehot_repr = [one_hot(words, voc_size) for words in corpus]

# Find the longest document, in words
l = 0
for x in corpus:
    l = max(l, len(x.split(' ')))

# Pad the sequences for input
sent_length = l
embedded_docs = pad_sequences(onehot_repr, padding='pre', maxlen=sent_length)
# print(embedded_docs)
with open("embedded_docs.pkl", 'wb') as file:
    pickle.dump(embedded_docs, file)

with open("l.txt", 'w') as file:
    file.write(str(l))
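The Keras imports above (Embedding, Bidirectional, LSTM) suggest how embedding_matrix feeds the downstream classifier saved as m.h5, but this file never builds the model. A minimal sketch under that assumption, with layer sizes chosen purely for illustration; note that one_hot hashes words, so its indices do not correspond to tokenizer.word_index, and the fitted tokenizer is the consistent encoder for the GloVe rows:

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dropout, Dense

# Encode with the fitted tokenizer so indices line up with embedding_matrix rows,
# dropping any index outside the weight matrix.
vocab_dim = embedding_matrix.shape[0]
sequences = [[i for i in seq if i < vocab_dim]
             for seq in tokenizer.texts_to_sequences(corpus)]
padded = pad_sequences(sequences, padding='pre', maxlen=sent_length)

num_classes = y.nunique()   # labels assumed integer-encoded before training
model = Sequential([
    Embedding(vocab_dim, embed_size, weights=[embedding_matrix],
              input_length=sent_length, trainable=False),
    Bidirectional(LSTM(100)),   # illustrative width
    Dropout(0.3),
    Dense(num_classes, activation='softmax'),
])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
# model.fit(padded, y.astype('category').cat.codes, epochs=...)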
data/stopwords.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4363f21909e984f52cfe3c4df4d312aa9d8442362fe1be2830963668c62f6d26
size 1310
final_dev.ipynb
ADDED
The diff for this file is too large to render.
m.h5
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:82896dd547dd7aac85766d657465e8527605db15e4046bef983d5bdd8355175a
size 13176168
model/bert_model/tf_model.h5
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:94059651edcb1be167a0e44d58e15087ecdebe35a373d6db78db7a5c40a68e54
size 1313085304
model/bert_model/tf_model.preproc
ADDED
Binary file (871 kB)
oh.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e89cb1ea7281e0e10273f1ee97ec9a1fa88dbea5b9368ff673eadbc5435b8df6
size 51
ps.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c0d0b21442919eb6ceacb4283ef2e5b11ba64d9febaa8db0eed954d63ace0130
size 309
requirements.txt
ADDED
@@ -0,0 +1,3 @@
flask==2.2.3
requests==2.28.1
ktrain==0.37.0
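Note that these pins cover only the serving app (app.py); the training and preprocessing scripts additionally import numpy, pandas, nltk, seaborn, matplotlib, scikit-learn and tensorflow, which are not listed here.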
test.py
ADDED
@@ -0,0 +1,22 @@
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Set the vocabulary size and maximum sequence length
voc_size = 10000
sent_length = 20

# Define the corpus
corpus = ['example text 1', 'example text 2', 'example text 3']

# Generate hashed integer sequences for the corpus
hashed_docs = []
for text in corpus:
    hashed_doc = [int(tf.strings.to_hash_bucket_fast(word, voc_size))
                  for word in text.split()]
    print(*hashed_doc)
    hashed_docs.append(hashed_doc)

# Pad the sequences to a fixed length
padded_docs = pad_sequences(hashed_docs, padding='pre', maxlen=sent_length)

print(padded_docs)
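As written, padded_docs comes out with shape (3, 20): three documents, each left-padded with zeros to sent_length. The bucket size used here (10000) should match whatever the consuming model was trained with; create_model.py hashes with 1000 buckets.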
tokenizer.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b811e16b951279bcea922fe994e5432079dbf4c55da67a0b05ea1726d5807712
size 508602