# sentiment/data/preprocess_data.py
import numpy as np
import pandas as pd
import tensorflow as tf
import nltk
import seaborn as sns
import re
import matplotlib.pyplot as plt
import pickle
from tensorflow.keras.layers import Embedding, Dense, LSTM, Dropout, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.callbacks import ModelCheckpoint
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
df = pd.read_csv('model/Emotion_final.csv')  # Text data with 'Text' and 'Emotion' columns
EMBEDDING_FILE = 'model/glove.6B.100d.txt'  # Path to pretrained 100-dim GloVe vectors
# Remove fully duplicated rows
index = df[df.duplicated()].index
df.drop(index, axis=0, inplace=True)
df.reset_index(inplace=True, drop=True)
# Remove rows with duplicated text
index = df[df['Text'].duplicated()].index
df.drop(index, axis=0, inplace=True)
df.reset_index(inplace=True, drop=True)
df = df.dropna()  # Drop rows with NA values
X = df.drop('Emotion', axis=1)  # Features: the text
y = df['Emotion']  # Target: the emotion label
messages = X.copy()
messages.reset_index(inplace=True)  # Dropping rows leaves gaps in the index, so rebuild it
with open('/home/devraj4522/Desktop/ML Model/model/stopwords.pkl', 'rb') as f:
    stopword_lst = pickle.load(f)
# nltk.download('stopwords')
ps = PorterStemmer() # reduce word to root form
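# e.g. ps.stem('running') -> 'run', ps.stem('happiness') -> 'happi'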
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['Text'][i])  # Remove special characters
    review = review.lower()  # Lower case
    review = review.split()
    review = [ps.stem(word) for word in review if word not in stopword_lst]  # Remove stopwords, then stem
    review = ' '.join(review)
    corpus.append(review)
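# Example of the full cleaning pipeline (assuming the pickled list is the standard
# NLTK English stopword list): "I am feeling very happy today!" -> "feel happi today"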
with open("corpus.pkl", 'wb') as file:
pickle.dump(corpus, file)
with open("ps.pkl", 'wb') as file:
pickle.dump(ps, file)
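# corpus.pkl and ps.pkl are presumably reloaded later (e.g. at inference time) so the
# same cleaning and stemming can be applied to new text.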
# Create a dictionary with each word as key and its pretrained GloVe vector as value
def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')

with open(EMBEDDING_FILE, encoding='utf8') as f:
    embeddings_index = dict(get_coefs(*o.strip().split()) for o in f)
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
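# Words missing from GloVe keep a random vector drawn from a normal distribution that
# matches the mean and standard deviation of the pretrained embeddings (see below).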
voc_size = 10000  # Vocabulary size
embed_size = 100  # Word vector dimension
tokenizer = Tokenizer(num_words=voc_size)
tokenizer.fit_on_texts(list(corpus))
word_index = tokenizer.word_index  # Mapping of every word in the corpus to an integer index
nb_words = min(voc_size, len(word_index))
# Initialize the weight matrix for the embedding layer with random normal values
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue  # Skip words outside the vocabulary (word_index is 1-based)
    embedding_vector = embeddings_index.get(word)  # Pretrained GloVe vector, if available
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
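# This matrix would typically be supplied to the imported Keras Embedding layer as
# pretrained weights, e.g. Embedding(nb_words, embed_size, weights=[embedding_matrix]);
# that step presumably happens in the model-building script rather than here.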
# Integer-encode each document; Keras one_hot hashes each word to an integer in [1, voc_size)
onehot_repr = [one_hot(words, voc_size) for words in corpus]
# Find the length (in words) of the longest document
l = 0
for x in corpus:
    l = max(l, len(x.split(' ')))
# Pad the sequences to a common length for model input
sent_length = l
embedded_docs = pad_sequences(onehot_repr, padding='pre', maxlen=sent_length)
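# embedded_docs has shape (num_documents, sent_length): shorter documents are
# left-padded with zeros ('pre') so every row is exactly sent_length integers long.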
# print(embedded_docs)
with open("embedded_docs.pkl", 'wb') as file:
pickle.dump(embedded_docs, file)
with open("l.txt", 'w') as file:
file.write(str(l))
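# Rough sketch (an assumption, not part of this script) of how a training step might
# consume these artifacts together with the labels:
#     X_final = np.array(embedded_docs)
#     y_final = pd.get_dummies(y).values  # one-hot encode the emotion labels
#     X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.2, random_state=42)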