import re
import pickle

import numpy as np
import pandas as pd
import tensorflow as tf
import nltk
import seaborn as sns
import matplotlib.pyplot as plt

from tensorflow.keras.layers import Embedding, Dense, LSTM, Dropout, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import ModelCheckpoint
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the labelled emotion dataset (expects 'Text' and 'Emotion' columns)
# and point at the pre-trained 100-dimensional GloVe vectors.
df = pd.read_csv('model/Emotion_final.csv')
EMBEDDING_FILE = 'model/glove.6B.100d.txt'
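
# Optional sanity check (not part of the original pipeline): confirm the
# dataset loaded as expected and inspect the label distribution.
print(df.shape)
print(df['Emotion'].value_counts())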

# Remove exact duplicate rows, then rows with a duplicated 'Text' value,
# re-indexing after each pass.
df = df.drop_duplicates().reset_index(drop=True)
df = df.drop_duplicates(subset='Text').reset_index(drop=True)

# Drop rows with missing values, then split into features and labels.
df = df.dropna()
X = df.drop('Emotion', axis=1)
y = df['Emotion']

# Work on a copy so the original feature frame is left untouched.
messages = X.copy()
messages.reset_index(inplace=True)

# Load the pre-pickled stopword list; a with-block closes the file handle.
# NOTE: this absolute path is machine-specific.
with open('/home/devraj4522/Desktop/ML Model/model/stopwords.pkl', 'rb') as file:
    stopword_lst = pickle.load(file)

# Clean each message: keep letters only, lowercase, drop stopwords, and
# reduce words to their stems before rejoining into a single string.
ps = PorterStemmer()
corpus = []
for i in range(len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['Text'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in stopword_lst]
    review = ' '.join(review)
    corpus.append(review)
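
# Optional before/after check (not in the original script): compare a raw
# message with its cleaned, stemmed counterpart.
print(messages['Text'][0])
print(corpus[0])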
with open("corpus.pkl", 'wb') as file: |
|
pickle.dump(corpus, file) |
|
|
|
|
|
with open("ps.pkl", 'wb') as file: |
|
pickle.dump(ps, file) |
|
|
|
|
|
|
|

# Parse the GloVe file into a {word: vector} dict, then record the mean and
# standard deviation of all vectors so that words without a pre-trained
# vector can be initialised from the same distribution.
def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')

with open(EMBEDDING_FILE, encoding='utf8') as f:
    embeddings_index = dict(get_coefs(*line.strip().split()) for line in f)

all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()

voc_size = 10000   # cap on the tokenizer vocabulary
embed_size = 100   # must match the 100d GloVe file

# Fit a Keras tokenizer on the cleaned corpus; word_index maps each word to
# an integer rank starting at 1 (index 0 is reserved for padding).
tokenizer = Tokenizer(num_words=voc_size)
tokenizer.fit_on_texts(corpus)
word_index = tokenizer.word_index
# +1 accounts for the reserved padding index 0, so the largest word index
# always fits inside the embedding matrix.
nb_words = min(voc_size, len(word_index) + 1)
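
# Optional diagnostic (not in the original script): vocabulary size before
# capping at voc_size.
print(f'Found {len(word_index)} unique tokens')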

# Initialise the embedding matrix from the GloVe distribution, then overwrite
# the rows of words that have a pre-trained vector. Indices are capped at
# nb_words so they stay in bounds.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))

for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
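
# Optional coverage check (not in the original script): what fraction of the
# retained vocabulary actually has a pre-trained GloVe vector?
covered = sum(1 for word, i in word_index.items()
              if i < nb_words and word in embeddings_index)
print(f'GloVe coverage: {covered}/{nb_words - 1} words')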

# Encode each document with the tokenizer's own indices. The original
# one_hot() call hashed words to arbitrary ids that would not line up with
# the rows of embedding_matrix built above, so texts_to_sequences is used
# instead (and the one_hot import was dropped).
sequences = tokenizer.texts_to_sequences(corpus)

# The longest document (in tokens) sets the padding length.
l = max(len(x.split()) for x in corpus)

sent_length = l
embedded_docs = pad_sequences(sequences, padding='pre', maxlen=sent_length)
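
# Optional shape check (not in the original script): one padded row per document.
print(embedded_docs.shape)  # (len(corpus), sent_length)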
with open("embedded_docs.pkl", 'wb') as file: |
|
pickle.dump(embedded_docs, file) |
|
|
|
with open("l.txt", 'w') as file: |
|
file.write(str(l)) |
|
|
|
|
|
|