In [3]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from rouge import Rouge
import numpy as np

In [4]:
# Read the whole Shakespeare corpus from disk into a single string.
file_path = "/kaggle/input/shakespeare-txt/shakespeare.txt"
with open(file_path, mode="r", encoding="utf-8") as corpus_file:
    text = corpus_file.read()

In [5]:
# Tokenize the text at character level: each distinct character seen in
# `text` gets an integer id in `tokenizer.word_index`.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts([text])
# Vocabulary size used for the embedding and output layers. The +1 presumably
# reserves id 0, which the later padding step uses as its fill value.
total_chars = len(tokenizer.word_index) + 1

In [6]:
# Create the training windows: for every position i, take up to SEQ_LENGTH
# characters of context followed by the character at i (the prediction target).
SEQ_LENGTH = 50  # number of context characters per window

input_sequences = [
    # Clamp the start at 0. A negative start index would wrap around to the
    # end of the string and yield an empty slice for the first SEQ_LENGTH - 1
    # positions, which would turn into all-padding rows further down.
    # Windows near the start of the text are therefore simply shorter.
    text[max(0, i - SEQ_LENGTH):i + 1]
    for i in range(1, len(text))
]

In [7]:
# Encode every window as character ids, then left-pad the rows to a common
# length so the result is a single 2-D NumPy array.
sequences = pad_sequences(
    tokenizer.texts_to_sequences(input_sequences),
    padding='pre',
)

# The last column is the character to predict (y); everything before it is
# the context fed to the model (X).
X, y = sequences[:, :-1], sequences[:, -1]

In [8]:
# Convert y to one-hot encoding with one column per character id.
# Note: this overwrites `y` with its encoded form, so re-running this cell
# without first re-running the cell that defines `y` would encode the
# already-encoded array a second time.
y = tf.keras.utils.to_categorical(y, num_classes=total_chars)

In [9]:
# Build the RNN model: character ids -> embeddings -> two stacked SimpleRNN
# layers -> dense hidden layer -> softmax over the character vocabulary.
model = Sequential([
    # Map each character id to a 50-dimensional vector.
    Embedding(total_chars, 50, input_length=X.shape[1]),
    # Return the full sequence so the next recurrent layer receives one
    # vector per time step.
    SimpleRNN(100, return_sequences=True),
    # Only the final state is kept as the summary of the context window.
    SimpleRNN(100),
    # A non-linear activation is needed here: without it this layer and the
    # softmax layer below collapse into a single linear map and the extra
    # 256 units add parameters but no modelling capacity.
    Dense(256, activation='relu'),
    # Probability distribution over the next character id.
    Dense(total_chars, activation='softmax'),
])

In [10]:
# Configure training: Adam optimizer, categorical cross-entropy loss (the
# targets are one-hot encoded), and accuracy reported alongside the loss.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [11]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
# NOTE(review): the saved output below lists a single Dense layer, while the
# model cell adds two Dense layers; re-run this cell to refresh the output.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 50)            3300      
                                                                 
 simple_rnn (SimpleRNN)      (None, 50, 100)           15100     
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 100)               20100     
                                                                 
 dense (Dense)               (None, 66)                6666      
                                                                 
Total params: 45166 (176.43 KB)
Trainable params: 45166 (176.43 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [21]:
# Train the model.
# This call has to stay active: the generation and ROUGE cells below use the
# trained weights, so with it commented out a fresh "Restart & Run All" would
# generate text from an untrained network.
history = model.fit(X, y, epochs=5, batch_size=1024)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7c22a41ae410>

In [22]:
# Text generation example: greedily extend the seed one character at a time.
seed_text = "To be or not to be, that is the"
generated_text = seed_text
for _ in range(100):
    # Encode the running text and fit it to the model's input width
    # (left-padded while short; pad_sequences trims from the front by
    # default once it grows past maxlen, keeping the most recent characters).
    sequence = tokenizer.texts_to_sequences([seed_text])[0]
    sequence = pad_sequences([sequence], maxlen=X.shape[1], padding='pre')
    # verbose=0 suppresses the progress bar Keras would otherwise print on
    # each of the 100 single-sample predictions.
    predicted_prob = model.predict(sequence, verbose=0)[0]
    # Id 0 is the padding value and has no entry in tokenizer.index_word, so
    # choose the most likely real character (ids 1 and up). This is the same
    # choice as a plain argmax whenever the argmax is not the padding id, and
    # it avoids a KeyError when it is.
    predicted_char_index = int(np.argmax(predicted_prob[1:])) + 1
    predicted_char = tokenizer.index_word[predicted_char_index]
    # seed_text is the rolling model input; generated_text is the result.
    seed_text += predicted_char
    generated_text += predicted_char



In [23]:
# Show the seed followed by the generated continuation.
print("Generated Text:", generated_text, sep="\n")

Generated Text:
To be or not to be, that is the seems to the seal the seems to the seal the seems to the seal the seems to the seal the seems to th


In [24]:
# Evaluate ROUGE scores of the generated text against a reference string.
# NOTE(review): the reference is the same string used as the generation seed,
# and generated_text begins with that seed, so recall is 1.0 by construction.
# Confirm whether a held-out continuation was the intended reference.
reference_text = "To be or not to be, that is the"
rouge = Rouge()
rouge_scores = rouge.get_scores(generated_text, reference_text)

# Report only the ROUGE-L entry of the first (and only) hypothesis/reference pair.
rouge_l = rouge_scores[0]['rouge-l']
print("\nROUGE Scores:")
for score_type in rouge_l:
    print(f'{score_type}: {rouge_l[score_type]}')


ROUGE Scores:
r: 1.0
p: 0.75
f: 0.8571428522448981


In [None]:
# Perplexity estimate (disabled).
# NOTE(review): the model is compiled with metrics=['accuracy'], so
# model.evaluate(X, y) presumably returns [loss, accuracy] rather than a
# single loss value. Exponentiate only the loss if this is re-enabled.
# perplexity = np.exp(model.evaluate(X, y))
# print(f'Perplexity: {perplexity}')