devraj4522 committed
Commit 459c675
Parent(s): 3f47ce6
Upload 14 files
Browse files
- app.py +62 -0
- create_model.py +222 -0
- data/Emotion_final.csv +0 -0
- data/preprocess_data.py +104 -0
- data/stopwords.pkl +3 -0
- final_dev.ipynb +0 -0
- m.h5 +3 -0
- model/bert_model/tf_model.h5 +3 -0
- model/bert_model/tf_model.preproc +0 -0
- oh.pkl +3 -0
- ps.pkl +3 -0
- requirements.txt +3 -0
- test.py +22 -0
- tokenizer.pickle +3 -0
app.py
ADDED
@@ -0,0 +1,62 @@
from flask import Flask, request, jsonify
import time
import ktrain

app = Flask(__name__)

# Load the ktrain BERT predictor saved under model/bert_model
predictor = ktrain.load_predictor('model/bert_model')

# Smoke-test the predictor at startup
print(predictor.predict('I love this product!'))
print(predictor.predict('I hate this product!'))
print(predictor.predict('I am so sad!'))
print(predictor.predict('I am so happy!'))

print(predictor.predict("I am looking for a job."))
print(predictor.predict("I like to play football."))
print(predictor.predict("I am going to the beach."))
print(predictor.predict("I am going to the hospital."))
print(predictor.predict("His son is very sick."))


@app.route('/')
def index():
    response = {
        'message': 'Social Media Emotion Analysis!'
    }
    return jsonify(response)


@app.route('/predict-str', methods=['POST'])
def predict_message():
    data = request.json
    message = data.get('message', '')
    start_time = time.time()
    prediction = predictor.predict(message)

    response = {
        'message': message,
        'prediction': prediction,
        'elapsed_time': time.time() - start_time
    }
    return jsonify(response)


@app.route('/predict-list', methods=['POST'])
def predict_list():
    data = request.json
    messages = data.get('messages', [])
    start_time = time.time()
    predictions = predictor.predict(messages)

    response = {
        'messages': messages,
        'predictions': predictions,
        'elapsed_time': time.time() - start_time
    }
    return jsonify(response)


if __name__ == '__main__':
    app.run()
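For reference, a minimal sketch of calling these endpoints from a client, assuming the app is running locally on Flask's default port 5000 (the requests package is pinned in requirements.txt):

import requests

# Single message -> /predict-str
r = requests.post('http://127.0.0.1:5000/predict-str',
                  json={'message': 'I am so happy!'})
print(r.json())   # {'message': ..., 'prediction': ..., 'elapsed_time': ...}

# Batch of messages -> /predict-list
r = requests.post('http://127.0.0.1:5000/predict-list',
                  json={'messages': ['I am so happy!', 'His son is very sick.']})
print(r.json())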
create_model.py
ADDED
@@ -0,0 +1,222 @@
from flask import Flask
from keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
from tensorflow.keras.preprocessing.text import one_hot as oh
import numpy as np
import tensorflow as tf

app = Flask(__name__)

# Load the saved model
new_model = load_model('m.h5')


# English stopwords (NLTK's standard list), hard-coded here
stopwords_lst = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
                 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours',
                 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she',
                 "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself',
                 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
                 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am',
                 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has',
                 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the',
                 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of',
                 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
                 'through', 'during', 'before', 'after', 'above', 'below', 'to',
                 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
                 'again', 'further', 'then', 'once', 'here', 'there', 'when',
                 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few',
                 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not',
                 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't',
                 'can', 'will', 'just', 'don', "don't", 'should', "should've",
                 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren',
                 "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn',
                 "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven',
                 "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',
                 "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn',
                 "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won',
                 "won't", 'wouldn', "wouldn't"]

import pickle
import random

random.seed(42)

with open('oh.pkl', 'rb') as f:
    oh = pickle.load(f)   # shadows the one_hot import above
with open('ps.pkl', 'rb') as f:
    ps = pickle.load(f)   # the PorterStemmer pickled by data/preprocess_data.py


def predict_emotion2(stri):
    review = re.sub('[^a-zA-Z]', ' ', stri)   # keep letters only
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in stopwords_lst]
    # Hash each stemmed word into one of 1000 buckets (test.py uses 10000)
    review = [int(tf.strings.to_hash_bucket_fast(word, 1000)) for word in review]
    onehot_repr = [review]
    print(onehot_repr)
    embed = pad_sequences(onehot_repr, padding='pre', maxlen=35)
    predicti = new_model.predict(embed)
    return np.argmax(predicti)


strs = ["I am surprised of my work", "I am happy of my work",
        "I am sad of my work", "I love my country and I am happy"]
for s in strs:
    print("em:", predict_emotion2(s))
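As an aside, the hard-coded stopword list above duplicates the one shipped in this commit; a minimal sketch of loading it instead, assuming data/stopwords.pkl holds the same pickled list used by data/preprocess_data.py:

import pickle

# Hypothetical alternative to the hard-coded list above; assumes
# data/stopwords.pkl contains the pickled stopword list from preprocessing.
with open('data/stopwords.pkl', 'rb') as f:
    stopwords_lst = pickle.load(f)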
data/Emotion_final.csv
ADDED
The diff for this file is too large to render.
data/preprocess_data.py
ADDED
@@ -0,0 +1,104 @@
import numpy as np
import pandas as pd
import tensorflow as tf
import nltk
import seaborn as sns
import re
import matplotlib.pyplot as plt
import pickle
from tensorflow.keras.layers import Embedding, Dense, LSTM, Dropout, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.callbacks import ModelCheckpoint
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


df = pd.read_csv('data/Emotion_final.csv')    # text data
EMBEDDING_FILE = 'model/glove.6B.100d.txt'    # GloVe file path

# Remove duplicated rows
index = df[df.duplicated()].index
df.drop(index, axis=0, inplace=True)
df.reset_index(inplace=True, drop=True)

# Remove duplicated text
index = df[df['Text'].duplicated()].index
df.drop(index, axis=0, inplace=True)
df.reset_index(inplace=True, drop=True)

df = df.dropna()                  # drop rows with NA values
X = df.drop('Emotion', axis=1)    # text
y = df['Emotion']                 # emotion labels

messages = X.copy()
messages.reset_index(inplace=True)   # dropna can leave gaps in the index

stopword_lst = pickle.load(open('data/stopwords.pkl', 'rb'))

# nltk.download('stopwords')
ps = PorterStemmer()   # reduces each word to its root form
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['Text'][i])   # remove special characters
    review = review.lower()                                  # lower case
    review = review.split()
    review = [ps.stem(word) for word in review if word not in stopword_lst]   # remove stopwords
    review = ' '.join(review)
    corpus.append(review)


with open("corpus.pkl", 'wb') as file:
    pickle.dump(corpus, file)


with open("ps.pkl", 'wb') as file:
    pickle.dump(ps, file)


# Build the dictionary with each word as key and its pretrained GloVe vector as value
def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')

embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(EMBEDDING_FILE))
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()

voc_size = 10000   # vocabulary size
embed_size = 100   # word-vector size

tokenizer = Tokenizer(num_words=voc_size)
tokenizer.fit_on_texts(list(corpus))
word_index = tokenizer.word_index   # every word in the corpus
nb_words = min(voc_size, len(word_index))

# Initialize the embedding-layer weight matrix from the GloVe distribution,
# then overwrite the rows for words GloVe actually covers
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))

for word, i in word_index.items():
    if i >= nb_words:
        continue   # skip words beyond the matrix size
    embedding_vector = embeddings_index.get(word)   # pretrained GloVe vector, if any
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# One-hot (hashing-trick) representation of the corpus
onehot_repr = [one_hot(words, voc_size) for words in corpus]

# Find the longest document, in words
l = 0
for x in corpus:
    l = max(l, len(x.split(' ')))

# Pad the sequences for input
sent_length = l
embedded_docs = pad_sequences(onehot_repr, padding='pre', maxlen=sent_length)
# print(embedded_docs)
with open("embedded_docs.pkl", 'wb') as file:
    pickle.dump(embedded_docs, file)

with open("l.txt", 'w') as file:
    file.write(str(l))
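The Keras imports above (Embedding, Bidirectional, LSTM) suggest how embedding_matrix feeds the downstream classifier saved as m.h5, but this file never builds the model. A minimal sketch under that assumption, with layer sizes chosen purely for illustration; note that one_hot hashes words, so its indices do not correspond to tokenizer.word_index, and the fitted tokenizer is the consistent encoder for the GloVe rows:

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dropout, Dense

# Encode with the fitted tokenizer so indices line up with embedding_matrix rows,
# dropping any index outside the weight matrix.
vocab_dim = embedding_matrix.shape[0]
sequences = [[i for i in seq if i < vocab_dim]
             for seq in tokenizer.texts_to_sequences(corpus)]
padded = pad_sequences(sequences, padding='pre', maxlen=sent_length)

num_classes = y.nunique()   # labels assumed integer-encoded before training
model = Sequential([
    Embedding(vocab_dim, embed_size, weights=[embedding_matrix],
              input_length=sent_length, trainable=False),
    Bidirectional(LSTM(100)),   # illustrative width
    Dropout(0.3),
    Dense(num_classes, activation='softmax'),
])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
# model.fit(padded, y.astype('category').cat.codes, epochs=...)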
data/stopwords.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4363f21909e984f52cfe3c4df4d312aa9d8442362fe1be2830963668c62f6d26
size 1310
final_dev.ipynb
ADDED
The diff for this file is too large to render.
m.h5
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:82896dd547dd7aac85766d657465e8527605db15e4046bef983d5bdd8355175a
size 13176168
model/bert_model/tf_model.h5
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:94059651edcb1be167a0e44d58e15087ecdebe35a373d6db78db7a5c40a68e54
size 1313085304
model/bert_model/tf_model.preproc
ADDED
Binary file (871 kB)
oh.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e89cb1ea7281e0e10273f1ee97ec9a1fa88dbea5b9368ff673eadbc5435b8df6
size 51
ps.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c0d0b21442919eb6ceacb4283ef2e5b11ba64d9febaa8db0eed954d63ace0130
size 309
requirements.txt
ADDED
@@ -0,0 +1,3 @@
flask==2.2.3
requests==2.28.1
ktrain==0.37.0
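Note that these pins cover only the serving app (app.py); the training and preprocessing scripts additionally import numpy, pandas, nltk, seaborn, matplotlib, scikit-learn and tensorflow, which are not listed here.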
test.py
ADDED
@@ -0,0 +1,22 @@
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Set the vocabulary size and maximum sequence length
voc_size = 10000
sent_length = 20

# Define the corpus
corpus = ['example text 1', 'example text 2', 'example text 3']

# Generate hashed integer sequences for the corpus
hashed_docs = []
for text in corpus:
    hashed_doc = [int(tf.strings.to_hash_bucket_fast(word, voc_size))
                  for word in text.split()]
    print(*hashed_doc)
    hashed_docs.append(hashed_doc)

# Pad the sequences to a fixed length
padded_docs = pad_sequences(hashed_docs, padding='pre', maxlen=sent_length)

print(padded_docs)
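As written, padded_docs comes out with shape (3, 20): three documents, each left-padded with zeros to sent_length. The bucket size used here (10000) should match whatever the consuming model was trained with; create_model.py hashes with 1000 buckets.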
tokenizer.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b811e16b951279bcea922fe994e5432079dbf4c55da67a0b05ea1726d5807712
size 508602