import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle


# Load the IMDB dataset from a CSV file.
# The file is expected to provide a 'review' column (raw text) and a
# 'sentiment' column holding 'positive' / 'negative' (any letter case).
path_to_csv = 'IMDB Dataset.csv'
df = pd.read_csv(path_to_csv)

# Fail early with a clear message: a missing review would otherwise only
# surface later as an obscure error inside the tokenizer.
if df['review'].isna().any():
    raise ValueError(f"{path_to_csv!r} contains rows with missing review text")

reviews = df['review'].values
labels = df['sentiment'].values

# Convert string labels to numerical values
label_encoder = {'positive': 1, 'negative': 0}

# Normalise case and surrounding whitespace before the lookup so that values
# such as 'Positive ' are still recognised; str() turns missing values into
# a token that is reported as unknown below instead of crashing on .lower().
normalized_labels = [str(label).strip().lower() for label in labels]
unknown_labels = sorted(set(normalized_labels) - set(label_encoder))
if unknown_labels:
    raise ValueError(
        f"Unexpected sentiment label(s) in {path_to_csv!r}: {unknown_labels}; "
        f"expected one of {sorted(label_encoder)}"
    )
y = np.array([label_encoder[label] for label in normalized_labels])

# Tokenize the text data
# Vocabulary size limit passed to the tokenizer; words outside it are
# mapped to the '<OOV>' token.
max_words = 10000
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(reviews)
sequences = tokenizer.texts_to_sequences(reviews)

# Pad sequences to a fixed length
max_review_length = 200
x = pad_sequences(sequences, maxlen=max_review_length)
# Keep the model's input length tied to the padding length instead of
# repeating the literal, so the two can never drift apart.
maxlen = max_review_length

# Model building
# Layer stack: embedding of token ids -> flatten -> 64-unit ReLU hidden
# layer -> single sigmoid unit for the binary sentiment output.
model = Sequential([
    Embedding(input_dim=max_words, output_dim=64, input_length=maxlen),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

# Training
# Hold out 20% of the data that the model never sees while fitting, so the
# figure printed as "Test Accuracy" is measured on unseen reviews rather
# than on the training data itself. The seed is fixed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

print("Training started...")
# validation_split carves the validation data out of the training portion
# only; the held-out test set stays untouched.
history = model.fit(
    x_train, y_train, epochs=3, batch_size=16, validation_split=0.2
)
loss, acc = model.evaluate(x_test, y_test)
print("Training finished.")
print(f'Test Accuracy: {round(acc*100)}%')


# Persist the fitted tokenizer so the same word index can be reused when
# the saved model is loaded for inference.
tokenizer_path = 'tokenizer_dnn.pkl'
with open(tokenizer_path, 'wb') as pickle_out:
    pickle.dump(tokenizer, pickle_out)


# Save the model
model_path = 'dnn_model_imdb.h5'
model.save(model_path)
print("Model saved as 'dnn_model_imdb.h5'")


# Example: Make a prediction on a movie review
# Decision boundary for the sigmoid output. 0.5 matches the cut-off used by
# the 'accuracy' metric the model was compiled with, so the label printed
# here is consistent with the reported accuracy.
decision_threshold = 0.5

sample_review = "I really enjoyed the movie. The plot was engaging, and the acting was superb."
# Apply the same tokenization and padding that was used for the training data.
sample_sequence = tokenizer.texts_to_sequences([sample_review])
padded_sample = pad_sequences(sample_sequence, maxlen=max_review_length)
prediction = model.predict(padded_sample)
# One sample in, one sigmoid unit out: take the single score as a float.
probability = float(prediction[0][0])
sentiment = "Positive" if probability > decision_threshold else "Negative"
print(f'Predicted Sentiment: {sentiment} (Probability: {probability:.2f})')