{ "cells": [ { "cell_type": "markdown", "id": "173f674b-75a4-43ff-97ff-c03b5a32a74d", "metadata": {}, "source": [ "# Neural Network-Based Language Model for Next Token Prediction" ] }, { "cell_type": "markdown", "id": "a27f24d0-0eb2-4424-8bfe-f2fbf5039883", "metadata": {}, "source": [ "### Data Loading And Cleaning" ] }, { "cell_type": "code", "execution_count": 65, "id": "6ddf05f0-8586-47b1-9c0f-f4cfe74db3f2", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Training data size: 100\n", "Validation data size: 100\n" ] } ], "source": [ "import json\n", "import pandas as pd\n", "\n", "# Load English (alpaca_cleaned.json) and Assamese datasets (Assamese.json)\n", "def load_data(english_path, assamese_path):\n", " with open(english_path, 'r') as f:\n", " english_data = json.load(f)\n", " \n", " with open(assamese_path, 'r') as f:\n", " assamese_data = json.load(f)\n", " \n", " return english_data, assamese_data\n", "\n", "# Clean and extract the required data\n", "def prepare_data(english_data, assamese_data, sample_size=50):\n", " # Take the first 50 examples from each dataset for training\n", " train_english = english_data[:sample_size]\n", " train_assamese = assamese_data[:sample_size]\n", " \n", " # Take the last 50 examples from each dataset for validation\n", " val_english = english_data[-sample_size:]\n", " val_assamese = assamese_data[-sample_size:]\n", " \n", " # Merge training and validation datasets\n", " train_data = train_english + train_assamese\n", " val_data = val_english + val_assamese\n", " \n", " return train_data, val_data\n", "\n", "# Paths to data files\n", "english_path = 'alpaca_cleaned.json'\n", "assamese_path = 'Assamese.json'\n", "\n", "# Load and prepare the data\n", "english_data, assamese_data = load_data(english_path, assamese_path)\n", "train_data, val_data = prepare_data(english_data, assamese_data)\n", "\n", "# Display sample sizes\n", "print(f'Training data size: {len(train_data)}')\n", "print(f'Validation data size: {len(val_data)}')\n" ] }, { "cell_type": "markdown", "id": "e3d4d293-30ac-4c00-9aa7-019203547b80", "metadata": {}, "source": [ "# GP-2 Tokenizer" ] }, { "cell_type": "code", "execution_count": 24, "id": "4b5a09a4-458f-4b29-a600-27233ebf07bd", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Tokenized train data size: 100\n", "Tokenized validation data size: 100\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/opt/anaconda3/lib/python3.12/site-packages/transformers/tokenization_utils_base.py:1601: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n", " warnings.warn(\n" ] } ], "source": [ " from transformers import GPT2Tokenizer\n", "\n", "# Load GPT-2 tokenizer\n", "tokenizer = GPT2Tokenizer.from_pretrained(\"gpt2\")\n", "\n", "# Add padding tokens to the tokenizer\n", "tokenizer.pad_token = tokenizer.eos_token\n", "\n", "# Function to tokenize a dataset\n", "def tokenize_data(data, tokenizer, max_length=512):\n", " inputs = []\n", " outputs = []\n", " \n", " for entry in data:\n", " instruction = entry.get('instruction', '')\n", " input_text = entry.get('input', '')\n", " output_text = entry.get('output', '')\n", " \n", " # Combine instruction and input for tokenization\n", " combined_input = instruction + \" \" + input_text\n", " tokenized_input = tokenizer(combined_input, truncation=True, padding='max_length', max_length=max_length)\n", " tokenized_output = tokenizer(output_text, truncation=True, padding='max_length', max_length=max_length)\n", " \n", " inputs.append(tokenized_input['input_ids'])\n", " outputs.append(tokenized_output['input_ids'])\n", " \n", " return inputs, outputs\n", "\n", "# Tokenize training and validation data\n", "train_inputs, train_outputs = tokenize_data(train_data, tokenizer)\n", "val_inputs, val_outputs = tokenize_data(val_data, tokenizer)\n", "\n", "print(f\"Tokenized train data size: {len(train_inputs)}\")\n", "print(f\"Tokenized validation data size: {len(val_inputs)}\")\n" ] }, { "cell_type": "markdown", "id": "718ebcdd-53e9-463d-8448-eac6ab9e62a8", "metadata": {}, "source": [ "# Embedding and LSTM Model Building" ] }, { "cell_type": "code", "execution_count": 26, "id": "7f442566-4342-4840-9052-e5be41fe76d7", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "NextTokenModel(\n", " (embedding): Embedding(50257, 256)\n", " (rnn): GRU(256, 512, batch_first=True)\n", " (fc): Linear(in_features=512, out_features=50257, bias=True)\n", ")" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import torch\n", "import torch.nn as nn\n", "import torch.optim as optim\n", "from torch.utils.data import DataLoader, TensorDataset\n", "\n", "# Neural Network model for next-token prediction\n", "class NextTokenModel(nn.Module):\n", " def __init__(self, vocab_size, embedding_dim, hidden_dim):\n", " super(NextTokenModel, self).__init__()\n", " self.embedding = nn.Embedding(vocab_size, embedding_dim)\n", " self.rnn = nn.GRU(embedding_dim, hidden_dim, batch_first=True)\n", " self.fc = nn.Linear(hidden_dim, vocab_size)\n", " \n", " def forward(self, x):\n", " x = self.embedding(x)\n", " rnn_out, _ = self.rnn(x)\n", " logits = self.fc(rnn_out)\n", " return logits\n", "\n", "# Create model\n", "vocab_size = tokenizer.vocab_size\n", "embedding_dim = 256\n", "hidden_dim = 512\n", "\n", "model = NextTokenModel(vocab_size, embedding_dim, hidden_dim)\n", "\n", "# Loss and optimizer\n", "criterion = nn.CrossEntropyLoss()\n", "optimizer = optim.Adam(model.parameters(), lr=0.001)\n", "\n", "# Move model to GPU if available\n", "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", "model.to(device)\n" ] }, { "cell_type": "markdown", "id": "22af2da3-5745-4f39-93a0-e58272073e54", "metadata": {}, "source": [ "# Training and losses" ] }, { "cell_type": "code", "execution_count": 28, "id": "b79b57f2-83e2-446d-ac44-bea359c87b89", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1, Train Loss: 7.73902153968811, Val Loss: 6.2885777950286865\n", "Epoch 2, Train Loss: 3.731250762939453, Val Loss: 4.852449059486389\n", "Checkpoint saved at checkpoint_epoch_2.pth\n", "Epoch 3, Train Loss: 3.6915414333343506, Val Loss: 3.8921849131584167\n", "Epoch 4, Train Loss: 2.490002751350403, Val Loss: 3.1142460107803345\n", "Checkpoint saved at checkpoint_epoch_4.pth\n", "Epoch 5, Train Loss: 2.497803032398224, Val Loss: 2.928856372833252\n", "Epoch 6, Train Loss: 2.306287258863449, Val Loss: 2.9287983775138855\n", "Checkpoint saved at checkpoint_epoch_6.pth\n", "Epoch 7, Train Loss: 2.338519275188446, Val Loss: 3.0046048164367676\n", "Epoch 8, Train Loss: 2.1667630076408386, Val Loss: 2.9524718821048737\n", "Checkpoint saved at checkpoint_epoch_8.pth\n", "Epoch 9, Train Loss: 2.4194843769073486, Val Loss: 2.948956310749054\n", "Epoch 10, Train Loss: 2.283351480960846, Val Loss: 2.934361010789871\n", "Checkpoint saved at checkpoint_epoch_10.pth\n" ] }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ " import matplotlib.pyplot as plt\n", "\n", "# Create dataloaders\n", "train_dataset = TensorDataset(torch.tensor(train_inputs), torch.tensor(train_outputs))\n", "val_dataset = TensorDataset(torch.tensor(val_inputs), torch.tensor(val_outputs))\n", "\n", "train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)\n", "val_loader = DataLoader(val_dataset, batch_size=32)\n", "\n", "# Training function\n", "def train(model, train_loader, val_loader, criterion, optimizer, epochs=10, checkpoint_interval=2):\n", " train_losses = []\n", " val_losses = []\n", " \n", " for epoch in range(epochs):\n", " model.train()\n", " train_loss = 0\n", " \n", " for inputs, targets in train_loader:\n", " inputs, targets = inputs.to(device), targets.to(device)\n", " \n", " optimizer.zero_grad()\n", " outputs = model(inputs)\n", " \n", " # Reshape outputs for loss calculation\n", " loss = criterion(outputs.view(-1, vocab_size), targets.view(-1))\n", " loss.backward()\n", " optimizer.step()\n", " \n", " train_loss += loss.item()\n", " \n", " train_loss /= len(train_loader)\n", " train_losses.append(train_loss)\n", " \n", " # Validation loss\n", " model.eval()\n", " val_loss = 0\n", " with torch.no_grad():\n", " for inputs, targets in val_loader:\n", " inputs, targets = inputs.to(device), targets.to(device)\n", " outputs = model(inputs)\n", " loss = criterion(outputs.view(-1, vocab_size), targets.view(-1))\n", " val_loss += loss.item()\n", " \n", " val_loss /= len(val_loader)\n", " val_losses.append(val_loss)\n", " \n", " print(f\"Epoch {epoch+1}, Train Loss: {train_loss}, Val Loss: {val_loss}\")\n", " \n", " # Checkpoint saving\n", " if (epoch + 1) % checkpoint_interval == 0:\n", " checkpoint_path = f'checkpoint_epoch_{epoch+1}.pth'\n", " torch.save(model.state_dict(), checkpoint_path)\n", " print(f\"Checkpoint saved at {checkpoint_path}\")\n", " \n", " return train_losses, val_losses\n", "\n", "# Train the model\n", "train_losses, val_losses = train(model, train_loader, val_loader, criterion, optimizer, epochs=10)\n", "\n", "# Plotting the loss curves\n", "plt.plot(train_losses, label='Training Loss')\n", "plt.plot(val_losses, label='Validation Loss')\n", "plt.xlabel('Epochs')\n", "plt.ylabel('Loss')\n", "plt.legend()\n", "plt.savefig('loss_curve.png')\n", "plt.show()\n", "\n", "# Save the losses in a CSV file\n", "loss_df = pd.DataFrame({'Epoch': range(1, 11), 'Train Loss': train_losses, 'Validation Loss': val_losses})\n", "loss_df.to_csv('losses.csv', index=False)\n" ] }, { "cell_type": "code", "execution_count": 81, "id": "4f38dafa-9979-4aa7-8c4b-a65de0b8cda3", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1, Validation Perplexity: 538.3870875451362\n", "Epoch 2, Validation Perplexity: 128.05361705905648\n", "Epoch 3, Validation Perplexity: 49.01786939486888\n", "Epoch 4, Validation Perplexity: 22.516446793441215\n", "Epoch 5, Validation Perplexity: 18.706225311776535\n", "Epoch 6, Validation Perplexity: 18.705140469723624\n", "Epoch 7, Validation Perplexity: 20.178240411215985\n", "Epoch 8, Validation Perplexity: 19.15323981208396\n", "Epoch 9, Validation Perplexity: 19.086023452013098\n", "Epoch 10, Validation Perplexity: 18.809480239422275\n" ] }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import torch\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import pandas as pd\n", "\n", "# Calculate perplexity from validation losses\n", "val_perplexities = [np.exp(loss) for loss in val_losses]\n", "\n", "# Print the validation perplexities for each epoch\n", "for epoch, perplexity in enumerate(val_perplexities, 1):\n", " print(f'Epoch {epoch}, Validation Perplexity: {perplexity}')\n", "\n", "# Plot the validation perplexity curve\n", "plt.plot(val_perplexities, label='Validation Perplexity')\n", "plt.xlabel('Epochs')\n", "plt.ylabel('Perplexity')\n", "plt.legend()\n", "plt.title('Validation Perplexity Over Epochs')\n", "plt.savefig('validation_perplexity_curve.png')\n", "plt.show()\n", "\n", "# Optionally, save perplexity values to a CSV file\n", "perplexity_df = pd.DataFrame({'Epoch': range(1, len(val_perplexities) + 1), 'Validation Perplexity': val_perplexities})\n", "perplexity_df.to_csv('validation_perplexity.csv', index=False)\n" ] }, { "cell_type": "markdown", "id": "f2f48d30-a70f-4cef-bec5-69ba024e16b8", "metadata": {}, "source": [ "# Next Token Prediction" ] }, { "cell_type": "code", "execution_count": 34, "id": "fd89e131-200b-4089-a524-48d242f99b4c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Random Model Output: Hello world England ax���������������\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/fx/vbj5djls49z6lsrd_27sfk900000gn/T/ipykernel_2184/266914567.py:42: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", " model.load_state_dict(torch.load('checkpoint_epoch_10.pth')) # Load the final checkpoint\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Trained Model Output: Hello world England ax���������������\n", "Assamese Model Output: Give three tips for staying healthy. nutritious nutritious withheld shaft Fight anarchistulate alludedtype revertedpendulate��\n" ] } ], "source": [ "# Function to generate text from the model\n", "def generate_text(model, tokenizer, prompt, max_length=50):\n", " model.eval()\n", " inputs = tokenizer(prompt, return_tensors=\"pt\").input_ids.to(device)\n", " outputs = model.generate(inputs, max_length=max_length)\n", " return tokenizer.decode(outputs[0], skip_special_tokens=True)\n", "\n", "# Generate output from randomly initialized model\n", "random_model_output = generate_text(model, tokenizer, prompt=\"Hello world\", max_length=50)\n", "print(\"Random Model Output:\", random_model_output)\n", "\n", "# Generate output from trained model\n", "model.load_state_dict(torch.load('checkpoint_epoch_10.pth')) # Load the final checkpoint\n", "trained_model_output = generate_text(model, tokenizer, prompt=\"Hello world\", max_length=50)\n", "print(\"Trained Model Output:\", trained_model_output)\n", "\n", "# Generate output in Assamese and translate to English using Google Translate manually\n", "assamese_prompt = \"Give three tips for staying healthy.\"\n", "assamese_output = generate_text(model, tokenizer, prompt=assamese_prompt, max_length=50)\n", "print(\"Assamese Model Output:\", assamese_output)\n", "\n", " \n" ] }, { "cell_type": "code", "execution_count": 79, "id": "fcbff2d4-6272-4815-ab78-44fa8f80bab6", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Random Model Output: Hello world England ax���������������\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/fx/vbj5djls49z6lsrd_27sfk900000gn/T/ipykernel_2184/436038338.py:42: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", " model.load_state_dict(torch.load('checkpoint_epoch_10.pth')) # Load the final checkpoint\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Trained Model Output: Hello world England ax���������������\n", "Assamese Model Output: Give three tips for staying healthy. nutritious nutritious withheld shaft Fight anarchistulate alludedtype revertedpendulate��\n" ] } ], "source": [ "import torch.nn.functional as F\n", "\n", "# Function to generate text from the model\n", "def generate_text_custom(model, tokenizer, prompt, max_length=50):\n", " model.eval()\n", " input_ids = tokenizer(prompt, return_tensors=\"pt\").input_ids.to(device)\n", " \n", " generated_tokens = input_ids.tolist()[0] # Start with the input prompt\n", " \n", " # Manually generate tokens one by one\n", " for _ in range(max_length):\n", " # Convert current tokens to tensor\n", " input_tensor = torch.tensor([generated_tokens]).to(device)\n", " \n", " # Get the model's output (logits)\n", " with torch.no_grad():\n", " logits = model(input_tensor)\n", " \n", " # Take the logits for the last token and apply softmax to get probabilities\n", " next_token_logits = logits[0, -1, :]\n", " next_token_probs = F.softmax(next_token_logits, dim=-1)\n", " \n", " # Sample the next token (or take the argmax for greedy decoding)\n", " next_token = torch.argmax(next_token_probs).item()\n", " \n", " # Add the predicted token to the sequence\n", " generated_tokens.append(next_token)\n", " \n", " # Stop if the model generates the end-of-sequence token\n", " if next_token == tokenizer.eos_token_id:\n", " break\n", " \n", " # Decode the generated tokens back into text\n", " generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)\n", " return generated_text\n", "\n", "# Generate output from randomly initialized model\n", "random_model_output = generate_text_custom(model, tokenizer, prompt=\"Hello world\", max_length=50)\n", "print(\"Random Model Output:\", random_model_output)\n", "\n", "# Load trained model's checkpoint\n", "model.load_state_dict(torch.load('checkpoint_epoch_10.pth')) # Load the final checkpoint\n", "\n", "# Generate output from the trained model\n", "trained_model_output = generate_text_custom(model, tokenizer, prompt=\"Hello world\", max_length=50)\n", "print(\"Trained Model Output:\", trained_model_output)\n", "\n", "# Generate output in Assamese\n", "assamese_prompt = \"Give three tips for staying healthy.\"\n", "assamese_output = generate_text_custom(model, tokenizer, prompt=assamese_prompt, max_length=50)\n", "print(\"Assamese Model Output:\", assamese_output)\n", "\n", " " ] }, { "cell_type": "code", "execution_count": null, "id": "f4c97bf9-81e1-4dae-8697-60e435c6d79f", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.4" } }, "nbformat": 4, "nbformat_minor": 5 }