File size: 60,598 Bytes
6bc5fb7
1
{"metadata":{"kernelspec":{"name":"python3","display_name":"Python 3","language":"python"},"language_info":{"name":"python","version":"3.10.12","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"nvidiaTeslaT4","dataSources":[{"sourceId":11018742,"sourceType":"datasetVersion","datasetId":6860959}],"dockerImageVersionId":30919,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"# Importing HuggingFace Token\nfrom kaggle_secrets import UserSecretsClient\nuser_secrets = UserSecretsClient()\nsecret_value_0 = user_secrets.get_secret(\"HF_Token\")","metadata":{"execution":{"iopub.status.busy":"2025-03-14T06:02:34.158188Z","iopub.execute_input":"2025-03-14T06:02:34.158602Z","iopub.status.idle":"2025-03-14T06:02:34.361502Z","shell.execute_reply.started":"2025-03-14T06:02:34.158571Z","shell.execute_reply":"2025-03-14T06:02:34.360367Z"},"trusted":true},"outputs":[],"execution_count":1},{"cell_type":"code","source":"# Importing Libraries\nimport json\nimport torch\nimport os\nimport torch.nn as nn\nimport torch.optim as optim\nfrom torch.utils.data import Dataset, DataLoader\nfrom transformers import BertTokenizer, BertForSequenceClassification\nimport torch.nn.functional as F\nfrom sklearn.utils.class_weight import compute_class_weight\nimport numpy as np\nimport random\n\n# Load JSON data\nwith open(\"/kaggle/input/intents1/intents.json\", \"r\") as file:\n    intents = json.load(file)\n\n# Remove duplicate intent tags\nunique_intents = []\nseen_tags = set()\nfor intent in intents:\n    if intent[\"tag\"] not in seen_tags:\n        unique_intents.append(intent)\n        seen_tags.add(intent[\"tag\"])\n\n# Ensure unique intent tags\nintent_tags = [intent[\"tag\"] for intent in unique_intents]\nnum_labels = len(intent_tags)\n\n# Create label mapping\nlabel_map = {tag: i for i, tag in enumerate(intent_tags)}\n\n# Check for GPU availability\ndevice = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\nprint(f\"Using device: {device}\")\n\n# Load BERT tokenizer & model\ntokenizer = BertTokenizer.from_pretrained('bert-base-uncased', token=secret_value_0)\nmodel = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels, token=secret_value_0)\nmodel.to(device)\n\n# Define Dataset\nclass IntentDataset(Dataset):\n    def __init__(self, intents, tokenizer):\n        self.texts = []\n        self.labels = []\n        self.label_map = label_map\n\n        for intent in intents:\n            for pattern in intent[\"patterns\"]:\n                self.texts.append(pattern)\n                self.labels.append(self.label_map[intent[\"tag\"]])\n\n    def __len__(self):\n        return len(self.labels)\n\n    def __getitem__(self, idx):\n        text = self.texts[idx]\n        label = torch.tensor(self.labels[idx], dtype=torch.long).to(device)\n\n        encoding = tokenizer(text, truncation=True, padding=\"max_length\", max_length=32, return_tensors=\"pt\")\n        item = {key: val.squeeze(0).to(device) for key, val in encoding.items()}  # Remove batch dim\n\n        return item, label\n\n# Load dataset & dataloader\ndataset = IntentDataset(unique_intents, tokenizer)\ndataloader = DataLoader(dataset, batch_size=16, shuffle=True)  # Increased batch size\n\n# Compute class weights\nlabels = [dataset.label_map[intent[\"tag\"]] for intent in unique_intents for _ in intent[\"patterns\"]]\nclass_weights = compute_class_weight(\"balanced\", classes=np.unique(labels), y=labels)\nclass_weights = torch.tensor(class_weights, dtype=torch.float).to(device)\n\n# Define optimizer & loss function\noptimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)  # Lower learning rate\nloss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)  # Use class-weighted loss\n\n# Training loop\nepochs = 90  # Increased from 20 to 50 for better training\nmodel.train()\n\nfor epoch in range(epochs):\n    total_loss = 0\n    for batch in dataloader:\n        inputs, labels = batch\n        optimizer.zero_grad()\n        outputs = model(**inputs)\n        loss = loss_fn(outputs.logits, labels)\n        loss.backward()\n        optimizer.step()\n        total_loss += loss.item()\n    \n    avg_loss = total_loss / len(dataloader)\n    print(f\"Epoch {epoch+1}/{epochs} - Average Loss: {avg_loss:.4f}\")\n\n# Function to predict intent\ndef predict_intent(user_input):\n    model.eval()\n    inputs = tokenizer(user_input, return_tensors=\"pt\", truncation=True, padding=True, max_length=32)\n    inputs = {key: val.to(device) for key, val in inputs.items()}  # Move input to GPU\n\n    with torch.no_grad():\n        outputs = model(**inputs)\n\n    predicted_label = torch.argmax(outputs.logits).item()\n    \n    # Map predicted label to intent\n    intent_tag = list(dataset.label_map.keys())[predicted_label]\n\n    # Fetch a random response for the predicted intent\n    for intent in unique_intents:\n        if intent[\"tag\"] == intent_tag:\n            return random.choice(intent[\"responses\"])","metadata":{"_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","execution":{"iopub.status.busy":"2025-03-14T06:02:34.362718Z","iopub.execute_input":"2025-03-14T06:02:34.363000Z","iopub.status.idle":"2025-03-14T06:12:26.502668Z","shell.execute_reply.started":"2025-03-14T06:02:34.362977Z","shell.execute_reply":"2025-03-14T06:12:26.501655Z"},"trusted":true},"outputs":[{"name":"stdout","text":"Using device: cuda\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"7824a999787d4bd1b195d7d91c77b73d"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"d429c8c5a09d44acabf466ce018c6bcd"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"0a455715281a4842b2b84fb392994a9a"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"cb3a2c53b638478c9aebbcb91ba50d1a"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"e7ab9075f9a542439774f2c2fba440a1"}},"metadata":{}},{"name":"stderr","text":"Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']\nYou should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n","output_type":"stream"},{"name":"stdout","text":"Epoch 1/90 - Average Loss: 5.6662\nEpoch 2/90 - Average Loss: 5.6118\nEpoch 3/90 - Average Loss: 5.5608\nEpoch 4/90 - Average Loss: 5.4642\nEpoch 5/90 - Average Loss: 5.3513\nEpoch 6/90 - Average Loss: 5.1811\nEpoch 7/90 - Average Loss: 5.0176\nEpoch 8/90 - Average Loss: 4.8236\nEpoch 9/90 - Average Loss: 4.6535\nEpoch 10/90 - Average Loss: 4.4724\nEpoch 11/90 - Average Loss: 4.3279\nEpoch 12/90 - Average Loss: 4.1504\nEpoch 13/90 - Average Loss: 4.0176\nEpoch 14/90 - Average Loss: 3.8612\nEpoch 15/90 - Average Loss: 3.7189\nEpoch 16/90 - Average Loss: 3.5748\nEpoch 17/90 - Average Loss: 3.4296\nEpoch 18/90 - Average Loss: 3.3193\nEpoch 19/90 - Average Loss: 3.1704\nEpoch 20/90 - Average Loss: 3.0567\nEpoch 21/90 - Average Loss: 2.9273\nEpoch 22/90 - Average Loss: 2.7933\nEpoch 23/90 - Average Loss: 2.6832\nEpoch 24/90 - Average Loss: 2.5414\nEpoch 25/90 - Average Loss: 2.4337\nEpoch 26/90 - Average Loss: 2.3230\nEpoch 27/90 - Average Loss: 2.2094\nEpoch 28/90 - Average Loss: 2.0913\nEpoch 29/90 - Average Loss: 1.9798\nEpoch 30/90 - Average Loss: 1.8935\nEpoch 31/90 - Average Loss: 1.7755\nEpoch 32/90 - Average Loss: 1.6802\nEpoch 33/90 - Average Loss: 1.5814\nEpoch 34/90 - Average Loss: 1.5013\nEpoch 35/90 - Average Loss: 1.4134\nEpoch 36/90 - Average Loss: 1.3328\nEpoch 37/90 - Average Loss: 1.2458\nEpoch 38/90 - Average Loss: 1.1845\nEpoch 39/90 - Average Loss: 1.1036\nEpoch 40/90 - Average Loss: 1.0327\nEpoch 41/90 - Average Loss: 0.9679\nEpoch 42/90 - Average Loss: 0.9215\nEpoch 43/90 - Average Loss: 0.8682\nEpoch 44/90 - Average Loss: 0.8089\nEpoch 45/90 - Average Loss: 0.7654\nEpoch 46/90 - Average Loss: 0.7181\nEpoch 47/90 - Average Loss: 0.6696\nEpoch 48/90 - Average Loss: 0.6318\nEpoch 49/90 - Average Loss: 0.5918\nEpoch 50/90 - Average Loss: 0.5542\nEpoch 51/90 - Average Loss: 0.5274\nEpoch 52/90 - Average Loss: 0.4944\nEpoch 53/90 - Average Loss: 0.4631\nEpoch 54/90 - Average Loss: 0.4428\nEpoch 55/90 - Average Loss: 0.4125\nEpoch 56/90 - Average Loss: 0.3950\nEpoch 57/90 - Average Loss: 0.3698\nEpoch 58/90 - Average Loss: 0.3491\nEpoch 59/90 - Average Loss: 0.3309\nEpoch 60/90 - Average Loss: 0.3142\nEpoch 61/90 - Average Loss: 0.2992\nEpoch 62/90 - Average Loss: 0.2829\nEpoch 63/90 - Average Loss: 0.2706\nEpoch 64/90 - Average Loss: 0.2552\nEpoch 65/90 - Average Loss: 0.2451\nEpoch 66/90 - Average Loss: 0.2355\nEpoch 67/90 - Average Loss: 0.2219\nEpoch 68/90 - Average Loss: 0.2122\nEpoch 69/90 - Average Loss: 0.2063\nEpoch 70/90 - Average Loss: 0.1950\nEpoch 71/90 - Average Loss: 0.1860\nEpoch 72/90 - Average Loss: 0.1758\nEpoch 73/90 - Average Loss: 0.1675\nEpoch 74/90 - Average Loss: 0.1630\nEpoch 75/90 - Average Loss: 0.1607\nEpoch 76/90 - Average Loss: 0.1525\nEpoch 77/90 - Average Loss: 0.1441\nEpoch 78/90 - Average Loss: 0.1364\nEpoch 79/90 - Average Loss: 0.1326\nEpoch 80/90 - Average Loss: 0.1293\nEpoch 81/90 - Average Loss: 0.1269\nEpoch 82/90 - Average Loss: 0.1199\nEpoch 83/90 - Average Loss: 0.1129\nEpoch 84/90 - Average Loss: 0.1095\nEpoch 85/90 - Average Loss: 0.1076\nEpoch 86/90 - Average Loss: 0.1030\nEpoch 87/90 - Average Loss: 0.0984\nEpoch 88/90 - Average Loss: 0.0944\nEpoch 89/90 - Average Loss: 0.0919\nEpoch 90/90 - Average Loss: 0.0880\n","output_type":"stream"}],"execution_count":2},{"cell_type":"code","source":"import torch\nfrom torch.utils.data import random_split\nfrom sklearn.metrics import accuracy_score\n\n# Split dataset into training (80%) and test (20%) sets\ntrain_size = int(0.8 * len(dataset))\ntest_size = len(dataset) - train_size\ntrain_dataset, test_dataset = random_split(dataset, [train_size, test_size])\n\n# Create test dataloader\ntest_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)\n\n# Function to evaluate model accuracy\ndef evaluate_model(model, test_dataloader):\n    model.eval()\n    all_preds, all_labels = [], []\n    \n    with torch.no_grad():\n        for batch in test_dataloader:\n            inputs, labels = batch\n            outputs = model(**inputs)\n            preds = torch.argmax(outputs.logits, dim=1)\n\n            all_preds.extend(preds.cpu().numpy())\n            all_labels.extend(labels.cpu().numpy())\n\n    accuracy = accuracy_score(all_labels, all_preds)\n    return accuracy\n\n# Compute accuracy\ntest_accuracy = evaluate_model(model, test_dataloader)\nprint(f\"Test Accuracy: {test_accuracy:.4f}\")\n","metadata":{"execution":{"iopub.status.busy":"2025-03-14T06:12:26.504747Z","iopub.execute_input":"2025-03-14T06:12:26.505510Z","iopub.status.idle":"2025-03-14T06:12:26.934471Z","shell.execute_reply.started":"2025-03-14T06:12:26.505474Z","shell.execute_reply":"2025-03-14T06:12:26.933341Z"},"trusted":true},"outputs":[{"name":"stdout","text":"Test Accuracy: 1.0000\n","output_type":"stream"}],"execution_count":3},{"cell_type":"code","source":"def evaluate_train_accuracy(model, train_dataloader):\n    model.eval()\n    all_preds, all_labels = [], []\n    \n    with torch.no_grad():\n        for batch in train_dataloader:\n            inputs, labels = batch\n            outputs = model(**inputs)\n            preds = torch.argmax(outputs.logits, dim=1)\n\n            all_preds.extend(preds.cpu().numpy())\n            all_labels.extend(labels.cpu().numpy())\n\n    accuracy = accuracy_score(all_labels, all_preds)\n    return accuracy\n\n# Compute Training Accuracy\ntrain_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=False)\ntrain_accuracy = evaluate_train_accuracy(model, train_dataloader)\nprint(f\"Training Accuracy: {train_accuracy:.4f}\")\n","metadata":{"execution":{"iopub.status.busy":"2025-03-14T06:12:26.936431Z","iopub.execute_input":"2025-03-14T06:12:26.936897Z","iopub.status.idle":"2025-03-14T06:12:28.400809Z","shell.execute_reply.started":"2025-03-14T06:12:26.936850Z","shell.execute_reply":"2025-03-14T06:12:28.399868Z"},"trusted":true},"outputs":[{"name":"stdout","text":"Training Accuracy: 0.9939\n","output_type":"stream"}],"execution_count":4},{"cell_type":"code","source":"import matplotlib.pyplot as plt\n\n# Assuming you store losses during training like this:\ntrain_losses = []\ntest_losses = []\n\nfor epoch in range(epochs):\n    total_train_loss = 0\n    total_test_loss = 0\n\n    model.train()\n    for batch in train_dataloader:\n        inputs, labels = batch\n        optimizer.zero_grad()\n        outputs = model(**inputs)\n        loss = loss_fn(outputs.logits, labels)\n        loss.backward()\n        optimizer.step()\n        total_train_loss += loss.item()\n    \n    model.eval()\n    with torch.no_grad():\n        for batch in test_dataloader:\n            inputs, labels = batch\n            outputs = model(**inputs)\n            loss = loss_fn(outputs.logits, labels)\n            total_test_loss += loss.item()\n\n    train_losses.append(total_train_loss / len(train_dataloader))\n    test_losses.append(total_test_loss / len(test_dataloader))\n\n# Plot the loss curves\nplt.plot(range(1, epochs+1), train_losses, label=\"Train Loss\")\nplt.plot(range(1, epochs+1), test_losses, label=\"Test Loss\")\nplt.xlabel(\"Epochs\")\nplt.ylabel(\"Loss\")\nplt.legend()\nplt.title(\"Training vs. Test Loss\")\nplt.show()\n","metadata":{"execution":{"iopub.status.busy":"2025-03-14T06:12:28.401654Z","iopub.execute_input":"2025-03-14T06:12:28.401902Z","iopub.status.idle":"2025-03-14T06:20:24.794528Z","shell.execute_reply.started":"2025-03-14T06:12:28.401882Z","shell.execute_reply":"2025-03-14T06:20:24.793362Z"},"trusted":true},"outputs":[{"output_type":"display_data","data":{"text/plain":"<Figure size 640x480 with 1 Axes>","image/png":"\n"},"metadata":{}}],"execution_count":5},{"cell_type":"code","source":"# Testing on New Data to test model generalization\ntest_inputs = [\n    \"Tell me a joke\", \n    \"Recommend a good book\", \n    \"What's the weather like today?\", \n    \"How can I save money?\", \n    \"How do I meditate?\"\n]\n\nfor input_text in test_inputs:\n    response = predict_intent(input_text)\n    print(f\"User: {input_text}\\nBot: {response}\\n\")\n","metadata":{"execution":{"iopub.status.busy":"2025-03-14T06:20:24.795687Z","iopub.execute_input":"2025-03-14T06:20:24.796070Z","iopub.status.idle":"2025-03-14T06:20:24.907652Z","shell.execute_reply.started":"2025-03-14T06:20:24.796036Z","shell.execute_reply":"2025-03-14T06:20:24.906513Z"},"trusted":true},"outputs":[{"name":"stdout","text":"User: Tell me a joke\nBot: Did you hear about the mathematician who’s afraid of negative numbers? He’ll stop at nothing to avoid them!\n\nUser: Recommend a good book\nBot: As an AI, I don't have personal preferences, but there are countless amazing books in various genres. Some popular ones include Harry Potter, To Kill a Mockingbird, and 1984.\n\nUser: What's the weather like today?\nBot: I'm sorry, I cannot provide real-time weather information.\n\nUser: How can I save money?\nBot: Investing in stocks, mutual funds, or real estate can help grow your wealth over time.\n\nUser: How do I meditate?\nBot: Meditation can reduce stress, improve focus, and promote emotional well-being.\n\n","output_type":"stream"}],"execution_count":6},{"cell_type":"code","source":"# Define save paths\nmodel_path = \"bert_chatbot_model.pth\"\ntokenizer_path = \"bert_chatbot_tokenizer\"\n\n# Save model state dictionary\ntorch.save(model.state_dict(), model_path)\n\n# Save tokenizer\ntokenizer.save_pretrained(tokenizer_path)\n\nprint(f\"Model saved to {model_path}\")\nprint(f\"Tokenizer saved to {tokenizer_path}\")","metadata":{"execution":{"iopub.status.busy":"2025-03-14T06:20:24.908862Z","iopub.execute_input":"2025-03-14T06:20:24.909244Z","iopub.status.idle":"2025-03-14T06:20:25.664937Z","shell.execute_reply.started":"2025-03-14T06:20:24.909206Z","shell.execute_reply":"2025-03-14T06:20:25.664169Z"},"trusted":true},"outputs":[{"name":"stdout","text":"Model saved to bert_chatbot_model.pth\nTokenizer saved to bert_chatbot_tokenizer\n","output_type":"stream"}],"execution_count":7},{"cell_type":"code","source":"model.save_pretrained(\"chatbot_model\")\ntokenizer.save_pretrained(\"chatbot_model\")","metadata":{"execution":{"iopub.status.busy":"2025-03-14T06:20:25.666633Z","iopub.execute_input":"2025-03-14T06:20:25.666894Z","iopub.status.idle":"2025-03-14T06:20:26.840930Z","shell.execute_reply.started":"2025-03-14T06:20:25.666873Z","shell.execute_reply":"2025-03-14T06:20:26.839944Z"},"trusted":true},"outputs":[{"execution_count":8,"output_type":"execute_result","data":{"text/plain":"('chatbot_model/tokenizer_config.json',\n 'chatbot_model/special_tokens_map.json',\n 'chatbot_model/vocab.txt',\n 'chatbot_model/added_tokens.json')"},"metadata":{}}],"execution_count":8}]}