# sentiment/data/preprocess_data.py
import numpy as np
import pandas as pd
import tensorflow as tf
import nltk
import seaborn as sns
import re
import matplotlib.pyplot as plt
import pickle
from tensorflow.keras.layers import Embedding, Dense, LSTM, Dropout, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.callbacks import ModelCheckpoint
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
df = pd.read_csv('model/Emotion_final.csv')  # Text data with 'Text' and 'Emotion' columns
EMBEDDING_FILE = 'model/glove.6B.100d.txt'  # Path to pretrained 100-dim GloVe vectors
# Remove fully duplicated rows
index = df[df.duplicated()].index
df.drop(index, axis=0, inplace=True)
df.reset_index(inplace=True, drop=True)
# Remove rows with duplicated text
index = df[df['Text'].duplicated()].index
df.drop(index, axis=0, inplace=True)
df.reset_index(inplace=True, drop=True)
df = df.dropna()  # Drop rows with NA values
X = df.drop('Emotion', axis=1)  # Features: the text
y = df['Emotion']  # Target: the emotion label
messages = X.copy()
messages.reset_index(inplace=True)  # Dropping rows leaves gaps in the index, so rebuild it
with open('/home/devraj4522/Desktop/ML Model/model/stopwords.pkl', 'rb') as f:
    stopword_lst = pickle.load(f)
# nltk.download('stopwords')
ps = PorterStemmer() # reduce word to root form
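# e.g. ps.stem('running') -> 'run', ps.stem('happiness') -> 'happi'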
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['Text'][i])  # Remove special characters
    review = review.lower()  # Lower case
    review = review.split()
    review = [ps.stem(word) for word in review if word not in stopword_lst]  # Remove stopwords, then stem
    review = ' '.join(review)
    corpus.append(review)
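# Example of the full cleaning pipeline (assuming the pickled list is the standard
# NLTK English stopword list): "I am feeling very happy today!" -> "feel happi today"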
with open("corpus.pkl", 'wb') as file:
pickle.dump(corpus, file)
with open("ps.pkl", 'wb') as file:
pickle.dump(ps, file)
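# corpus.pkl and ps.pkl are presumably reloaded later (e.g. at inference time) so the
# same cleaning and stemming can be applied to new text.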
# Create a dictionary with each word as key and its pretrained GloVe vector as value
def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')

with open(EMBEDDING_FILE, encoding='utf8') as f:
    embeddings_index = dict(get_coefs(*o.strip().split()) for o in f)
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
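# Words missing from GloVe keep a random vector drawn from a normal distribution that
# matches the mean and standard deviation of the pretrained embeddings (see below).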
voc_size = 10000  # Vocabulary size
embed_size = 100  # Word vector dimension
tokenizer = Tokenizer(num_words=voc_size)
tokenizer.fit_on_texts(list(corpus))
word_index = tokenizer.word_index  # Mapping of every word in the corpus to an integer index
nb_words = min(voc_size, len(word_index))
# Initialize the weight matrix for the embedding layer with random normal values
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue  # Skip words outside the vocabulary (word_index is 1-based)
    embedding_vector = embeddings_index.get(word)  # Pretrained GloVe vector, if available
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
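# This matrix would typically be supplied to the imported Keras Embedding layer as
# pretrained weights, e.g. Embedding(nb_words, embed_size, weights=[embedding_matrix]);
# that step presumably happens in the model-building script rather than here.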
# Integer-encode each document; Keras one_hot hashes each word to an integer in [1, voc_size)
onehot_repr = [one_hot(words, voc_size) for words in corpus]
# Find the length (in words) of the longest document
l = 0
for x in corpus:
    l = max(l, len(x.split(' ')))
# Pad the sequences to a common length for model input
sent_length = l
embedded_docs = pad_sequences(onehot_repr, padding='pre', maxlen=sent_length)
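# embedded_docs has shape (num_documents, sent_length): shorter documents are
# left-padded with zeros ('pre') so every row is exactly sent_length integers long.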
# print(embedded_docs)
with open("embedded_docs.pkl", 'wb') as file:
pickle.dump(embedded_docs, file)
with open("l.txt", 'w') as file:
file.write(str(l))
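# Rough sketch (an assumption, not part of this script) of how a training step might
# consume these artifacts together with the labels:
#     X_final = np.array(embedded_docs)
#     y_final = pd.get_dummies(y).values  # one-hot encode the emotion labels
#     X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.2, random_state=42)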