{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Character-level Shakespeare text generation with a SimpleRNN\n",
    "\n",
    "Trains a stacked `SimpleRNN` to predict the next character of the Shakespeare corpus, ",
    "generates a sample continuation from a prompt, and reports ROUGE-L and training-set perplexity.\n",
    "\n",
    "The notebook is meant to be reproducible with **Restart Kernel -> Run All**; ",
    "training is the expensive step (lower `EPOCHS` in the configuration cell for a quick run)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import tensorflow as tf\n",
    "from rouge import Rouge\n",
    "from tensorflow.keras.layers import Embedding, SimpleRNN, Dense\n",
    "from tensorflow.keras.models import Sequential\n",
    "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
    "from tensorflow.keras.preprocessing.text import Tokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configuration: everything a reader might want to tune.\n",
    "FILE_PATH = \"/kaggle/input/shakespeare-txt/shakespeare.txt\"\n",
    "SEQ_LENGTH = 50           # characters of context fed to the model\n",
    "EMBEDDING_DIM = 50\n",
    "RNN_UNITS = 100\n",
    "DENSE_UNITS = 256\n",
    "EPOCHS = 5\n",
    "BATCH_SIZE = 1024\n",
    "N_GENERATED_CHARS = 100   # characters appended to each prompt\n",
    "EVAL_OFFSET = 100_000     # corpus position of the passage used for ROUGE\n",
    "SEED = 42\n",
    "\n",
    "# Seeds Python's random module, NumPy and TensorFlow in one call.\n",
    "tf.keras.utils.set_random_seed(SEED)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load and encode the corpus"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the Shakespeare dataset\n",
    "with open(FILE_PATH, \"r\", encoding=\"utf-8\") as file:\n",
    "    text = file.read()\n",
    "\n",
    "len(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Character-level tokenizer (Keras lower-cases the text by default).\n",
    "tokenizer = Tokenizer(char_level=True)\n",
    "tokenizer.fit_on_texts([text])\n",
    "total_chars = len(tokenizer.word_index) + 1  # +1 because index 0 is reserved for padding\n",
    "\n",
    "# Encode the corpus once; tokenizing millions of overlapping substrings\n",
    "# would redo the same character lookups about SEQ_LENGTH times over.\n",
    "encoded_text = np.asarray(tokenizer.texts_to_sequences([text])[0], dtype=np.int32)\n",
    "\n",
    "total_chars, encoded_text.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build training windows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def make_training_arrays(encoded, seq_length):\n",
    "    \"\"\"Turn a 1-D token sequence into next-token training pairs.\n",
    "\n",
    "    Row k of X holds the `seq_length` tokens that precede token k + 1 of\n",
    "    `encoded`, left-padded with 0 where fewer tokens exist, and y[k] is\n",
    "    token k + 1 itself.\n",
    "\n",
    "    Args:\n",
    "        encoded: 1-D sequence of integer token ids.\n",
    "        seq_length: number of context tokens per sample.\n",
    "\n",
    "    Returns:\n",
    "        (X, y) with X.shape == (len(encoded) - 1, seq_length) and\n",
    "        y.shape == (len(encoded) - 1,), both int32.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if `encoded` is not 1-D or has fewer than two tokens.\n",
    "    \"\"\"\n",
    "    encoded = np.asarray(encoded, dtype=np.int32)\n",
    "    if encoded.ndim != 1 or encoded.size < 2:\n",
    "        raise ValueError(\"need a 1-D sequence with at least two tokens\")\n",
    "    # Explicit left padding keeps the early windows valid. Slicing the text with\n",
    "    # a negative start (text[i - 50:i + 1] for i < 50) silently yields empty\n",
    "    # windows, i.e. all-padding samples whose target is the padding index.\n",
    "    padded = np.concatenate([np.zeros(seq_length, dtype=np.int32), encoded])\n",
    "    windows = np.lib.stride_tricks.sliding_window_view(padded, seq_length + 1)\n",
    "    windows = windows[1:]  # window 0 would have a padding-only context\n",
    "    # Copy out of the read-only strided view so downstream code gets plain arrays.\n",
    "    return np.ascontiguousarray(windows[:, :-1]), windows[:, -1].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split sequences into input (X) and target (y)\n",
    "X, y = make_training_arrays(encoded_text, SEQ_LENGTH)\n",
    "\n",
    "assert X.shape == (len(encoded_text) - 1, SEQ_LENGTH)\n",
    "assert y.min() >= 1, \"targets must be real characters, never the padding index\"\n",
    "X.shape, y.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the RNN model\n",
    "model = Sequential()\n",
    "model.add(Embedding(total_chars, EMBEDDING_DIM, input_length=X.shape[1]))\n",
    "model.add(SimpleRNN(RNN_UNITS, return_sequences=True))\n",
    "model.add(SimpleRNN(RNN_UNITS))\n",
    "# Without a non-linearity this layer would collapse into the softmax layer's linear map.\n",
    "model.add(Dense(DENSE_UNITS, activation='relu'))\n",
    "model.add(Dense(total_chars, activation='softmax'))\n",
    "\n",
    "# Integer targets + sparse loss compute the same cross-entropy as one-hot targets\n",
    "# without materialising a (n_samples, total_chars) float array.\n",
    "model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train the model\n",
    "history = model.fit(X, y, epochs=EPOCHS, batch_size=BATCH_SIZE)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Generate text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_text(model, tokenizer, seed_text, n_chars, seq_length):\n",
    "    \"\"\"Greedily extend `seed_text` by `n_chars` characters.\n",
    "\n",
    "    At each step the text so far is encoded, left-padded / truncated to\n",
    "    `seq_length` tokens, and the most probable next character is appended.\n",
    "\n",
    "    Args:\n",
    "        model: trained Keras model mapping (1, seq_length) ids to class probabilities.\n",
    "        tokenizer: fitted character-level Keras Tokenizer.\n",
    "        seed_text: prompt to continue.\n",
    "        n_chars: number of characters to append.\n",
    "        seq_length: context length the model was trained with.\n",
    "\n",
    "    Returns:\n",
    "        `seed_text` followed by the generated characters.\n",
    "    \"\"\"\n",
    "    generated = seed_text\n",
    "    for _ in range(n_chars):\n",
    "        encoded = tokenizer.texts_to_sequences([generated])[0]\n",
    "        window = pad_sequences([encoded], maxlen=seq_length, padding='pre')\n",
    "        # verbose=0: otherwise every step prints its own progress bar.\n",
    "        probs = np.array(model.predict(window, verbose=0)[0])\n",
    "        # Index 0 is padding and has no entry in tokenizer.index_word.\n",
    "        probs[0] = 0.0\n",
    "        generated += tokenizer.index_word[int(np.argmax(probs))]\n",
    "    return generated"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Text generation example\n",
    "seed_text = \"To be or not to be, that is the\"\n",
    "generated_text = generate_text(model, tokenizer, seed_text, N_GENERATED_CHARS, SEQ_LENGTH)\n",
    "\n",
    "print(\"Generated Text:\")\n",
    "print(generated_text)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluate\n",
    "\n",
    "ROUGE-L compares the model's continuation of a corpus passage with the text that really follows it. ",
    "Scoring the generated text against its own seed would give recall 1.0 by construction.\n",
    "\n",
    "**Caveat:** the passage is part of the training data, so both numbers below measure fit, not generalisation."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Evaluate ROUGE scores\n",
    "prompt_end = EVAL_OFFSET + SEQ_LENGTH\n",
    "assert prompt_end + N_GENERATED_CHARS <= len(text), \"EVAL_OFFSET is too close to the end of the corpus\"\n",
    "\n",
    "eval_prompt = text[EVAL_OFFSET:prompt_end]\n",
    "# Lower-cased because the tokenizer, and therefore the model, only emits lower-case characters.\n",
    "reference_text = text[prompt_end:prompt_end + N_GENERATED_CHARS].lower()\n",
    "continuation = generate_text(model, tokenizer, eval_prompt, N_GENERATED_CHARS, SEQ_LENGTH)[len(eval_prompt):]\n",
    "\n",
    "rouge = Rouge()\n",
    "rouge_scores = rouge.get_scores(continuation, reference_text)\n",
    "print(\"\\nROUGE Scores:\")\n",
    "for score_type, value in rouge_scores[0]['rouge-l'].items():\n",
    "    print(f'{score_type}: {value}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# evaluate() returns [loss, accuracy] because the model is compiled with a metric;\n",
    "# perplexity is exp(mean cross-entropy), i.e. exp of the loss alone.\n",
    "# batch_size=BATCH_SIZE: the default of 32 would need far more steps over the full training set.\n",
    "loss, accuracy = model.evaluate(X, y, batch_size=BATCH_SIZE, verbose=0)\n",
    "perplexity = np.exp(loss)\n",
    "print(f'Perplexity: {perplexity}')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "language": "python",
   "display_name": "Python 3",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.10.12",
   "mimetype": "text/x-python",
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "pygments_lexer": "ipython3",
   "nbconvert_exporter": "python",
   "file_extension": ".py"
  },
  "kaggle": {
   "accelerator": "nvidiaTeslaT4",
   "dataSources": [
    {
     "sourceId": 2124,
     "sourceType": "datasetVersion",
     "datasetId": 1028
    },
    {
     "sourceId": 7297075,
     "sourceType": "datasetVersion",
     "datasetId": 4232737
    }
   ],
   "dockerImageVersionId": 30628,
   "isInternetEnabled": true,
   "language": "python",
   "sourceType": "notebook",
   "isGpuEnabled": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}