import numpy as np
import pandas as pd
import tensorflow as tf
import nltk
import seaborn as sns
import re
import matplotlib.pyplot as plt
import pickle
from tensorflow.keras.layers import Embedding, Dense, LSTM, Dropout, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.callbacks import ModelCheckpoint
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

df = pd.read_csv('model/Emotion_final.csv')      # text data
EMBEDDING_FILE = 'model/glove.6B.100d.txt'       # GloVe file path

# Remove fully duplicated rows
index = df[df.duplicated()].index
df.drop(index, axis=0, inplace=True)
df.reset_index(inplace=True, drop=True)

# Remove rows whose text is duplicated
index = df[df['Text'].duplicated()].index
df.drop(index, axis=0, inplace=True)
df.reset_index(inplace=True, drop=True)

df = df.dropna()                  # drop rows with NA values
X = df.drop('Emotion', axis=1)    # text
y = df['Emotion']                 # emotion labels

messages = X.copy()
messages.reset_index(inplace=True)  # dropping NA rows can leave gaps in the index

stopword_lst = pickle.load(open('/home/devraj4522/Desktop/ML Model/model/stopwords.pkl', 'rb'))
# nltk.download('stopwords')

ps = PorterStemmer()   # reduces each word to its root form
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['Text'][i])   # remove special characters
    review = review.lower()                                  # lower case
    review = review.split()
    review = [ps.stem(word) for word in review if word not in stopword_lst]  # remove stopwords, then stem
    review = ' '.join(review)
    corpus.append(review)

with open("corpus.pkl", 'wb') as file:
    pickle.dump(corpus, file)
with open("ps.pkl", 'wb') as file:
    pickle.dump(ps, file)

# Build a dictionary mapping each word to its pretrained GloVe vector
def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')

embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(EMBEDDING_FILE))

all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()

voc_size = 10000   # vocabulary size
embed_size = 100   # word-vector size

tokenizer = Tokenizer(num_words=voc_size)
tokenizer.fit_on_texts(list(corpus))
word_index = tokenizer.word_index            # all words in the corpus
nb_words = min(voc_size, len(word_index))

# Initialise the embedding-layer weight matrix with random values drawn from the
# GloVe distribution, then overwrite the rows of words that have pretrained vectors
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue   # skip words beyond the vocabulary / matrix bounds
    embedding_vector = embeddings_index.get(word)   # pretrained GloVe vector, if available
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# One-hot (hashed index) representation of the input texts
onehot_repr = [one_hot(words, voc_size) for words in corpus]

# Find the longest sentence (in words)
l = 0
for x in corpus:
    l = max(l, len(x.split(' ')))

# Pad every sequence to the same length
sent_length = l
embedded_docs = pad_sequences(onehot_repr, padding='pre', maxlen=sent_length)
# print(embedded_docs)

with open("embedded_docs.pkl", 'wb') as file:
    pickle.dump(embedded_docs, file)
with open("l.txt", 'w') as file:
    file.write(str(l))
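
# The imports above (Sequential, Embedding, Bidirectional, LSTM, Dropout, Dense,
# ModelCheckpoint, train_test_split) suggest the padded sequences feed a
# Bidirectional LSTM classifier. A minimal sketch of that wiring follows; the
# layer sizes, dropout rate, batch size, epoch count, checkpoint path, and the
# 80/20 split are illustrative assumptions, not the author's exact settings.
labels = pd.get_dummies(y).values.astype('float32')   # one-hot encode the emotion labels
X_train, X_test, y_train, y_test = train_test_split(
    embedded_docs, labels, test_size=0.2, random_state=42)

model = Sequential()
model.add(Embedding(voc_size, embed_size))   # weights=[embedding_matrix] could be passed here
                                             # once the tokenizer indices (rather than one_hot)
                                             # are used for the inputs and the shapes match
model.add(Bidirectional(LSTM(100)))
model.add(Dropout(0.3))
model.add(Dense(labels.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

checkpoint = ModelCheckpoint('model/best_model.h5', save_best_only=True)   # hypothetical path
model.fit(X_train, y_train, validation_data=(X_test, y_test),
          epochs=10, batch_size=64, callbacks=[checkpoint])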