jbraha committed on
Commit
e1fe27b
1 Parent(s): 56c002f

'mint autosave'

.gitignore CHANGED
@@ -1,2 +1,5 @@
  results/**
+ <<<<<<< HEAD
  data/**
+ =======
+ >>>>>>> f375d50 ('mint autosave')
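
The new hunk commits unresolved merge-conflict markers (<<<<<<< HEAD / ======= / >>>>>>>) into .gitignore. A minimal sketch of the presumably intended resolution, assuming both branches only meant to ignore the results and data directories:

results/**
data/**
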
Copy of training.ipynb DELETED
@@ -1,334 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": 1,
6
- "id": "215a1aae",
7
- "metadata": {
8
- "executionInfo": {
9
- "elapsed": 128,
10
- "status": "ok",
11
- "timestamp": 1682285319377,
12
- "user": {
13
- "displayName": "",
14
- "userId": ""
15
- },
16
- "user_tz": 240
17
- },
18
- "id": "215a1aae"
19
- },
20
- "outputs": [
21
- {
22
- "name": "stderr",
23
- "output_type": "stream",
24
- "text": [
25
- "2023-04-23 18:07:24.557548: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
26
- "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
27
- "2023-04-23 18:07:25.431969: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n"
28
- ]
29
- }
30
- ],
31
- "source": [
32
- "import torch\n",
33
- "from torch.utils.data import Dataset, DataLoader\n",
34
- "\n",
35
- "import pandas as pd\n",
36
- "\n",
37
- "from transformers import BertTokenizerFast, BertForSequenceClassification\n",
38
- "from transformers import Trainer, TrainingArguments"
39
- ]
40
- },
41
- {
42
- "cell_type": "code",
43
- "execution_count": 2,
44
- "id": "J5Tlgp4tNd0U",
45
- "metadata": {
46
- "colab": {
47
- "base_uri": "https://localhost:8080/"
48
- },
49
- "executionInfo": {
50
- "elapsed": 1897,
51
- "status": "ok",
52
- "timestamp": 1682285321454,
53
- "user": {
54
- "displayName": "",
55
- "userId": ""
56
- },
57
- "user_tz": 240
58
- },
59
- "id": "J5Tlgp4tNd0U",
60
- "outputId": "3c9f0c5b-7bc3-4c15-c5ff-0a77d3b3b607"
61
- },
62
- "outputs": [
63
- {
64
- "name": "stderr",
65
- "output_type": "stream",
66
- "text": [
67
- "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']\n",
68
- "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
69
- "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
70
- "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
71
- "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
72
- ]
73
- }
74
- ],
75
- "source": [
76
- "model_name = \"bert-base-uncased\"\n",
77
- "tokenizer = BertTokenizerFast.from_pretrained(model_name)\n",
78
- "model = BertForSequenceClassification.from_pretrained(model_name, num_labels=6)\n",
79
- "max_len = 200\n",
80
- "\n",
81
- "training_args = TrainingArguments(\n",
82
- " output_dir=\"results\",\n",
83
- " num_train_epochs=1,\n",
84
- " per_device_train_batch_size=16,\n",
85
- " per_device_eval_batch_size=64,\n",
86
- " warmup_steps=500,\n",
87
- " learning_rate=5e-5,\n",
88
- " weight_decay=0.01,\n",
89
- " logging_dir=\"./logs\",\n",
90
- " logging_steps=10\n",
91
- " )\n",
92
- "\n",
93
- "# dataset class that inherits from torch.utils.data.Dataset\n",
94
- "class TweetDataset(Dataset):\n",
95
- " def __init__(self, encodings, labels):\n",
96
- " self.encodings = encodings\n",
97
- " self.labels = labels\n",
98
- " self.tok = tokenizer\n",
99
- " \n",
100
- " def __getitem__(self, idx):\n",
101
- " # encoding = self.tok(self.encodings[idx], truncation=True, padding=\"max_length\", max_length=max_len)\n",
102
- " item = { key: torch.tensor(val[idx]) for key, val in self.encoding.items() }\n",
103
- " item['labels'] = torch.tensor(self.labels[idx])\n",
104
- " return item\n",
105
- " \n",
106
- " def __len__(self):\n",
107
- " return len(self.labels)\n",
108
- " \n",
109
- "class TokenizerDataset(Dataset):\n",
110
- " def __init__(self, strings):\n",
111
- " self.strings = strings\n",
112
- " \n",
113
- " def __getitem__(self, idx):\n",
114
- " return self.strings[idx]\n",
115
- " \n",
116
- " def __len__(self):\n",
117
- " return len(self.strings)\n",
118
- " "
119
- ]
120
- },
121
- {
122
- "cell_type": "code",
123
- "execution_count": 3,
124
- "id": "9969c58c",
125
- "metadata": {
126
- "executionInfo": {
127
- "elapsed": 5145,
128
- "status": "ok",
129
- "timestamp": 1682285326593,
130
- "user": {
131
- "displayName": "",
132
- "userId": ""
133
- },
134
- "user_tz": 240
135
- },
136
- "id": "9969c58c",
137
- "scrolled": false
138
- },
139
- "outputs": [],
140
- "source": [
141
- "train_data = pd.read_csv(\"data/train.csv\")\n",
142
- "train_text = train_data[\"comment_text\"]\n",
143
- "train_labels = train_data[[\"toxic\", \"severe_toxic\", \n",
144
- " \"obscene\", \"threat\", \n",
145
- " \"insult\", \"identity_hate\"]]\n",
146
- "\n",
147
- "test_text = pd.read_csv(\"data/test.csv\")[\"comment_text\"]\n",
148
- "test_labels = pd.read_csv(\"data/test_labels.csv\")[[\n",
149
- " \"toxic\", \"severe_toxic\", \n",
150
- " \"obscene\", \"threat\", \n",
151
- " \"insult\", \"identity_hate\"]]\n",
152
- "\n",
153
- "# data preprocessing\n",
154
- "\n",
155
- "\n",
156
- "\n",
157
- "train_text = train_text.values.tolist()\n",
158
- "train_labels = train_labels.values.tolist()\n",
159
- "test_text = test_text.values.tolist()\n",
160
- "test_labels = test_labels.values.tolist()\n"
161
- ]
162
- },
163
- {
164
- "cell_type": "code",
165
- "execution_count": null,
166
- "id": "1n56TME9Njde",
167
- "metadata": {
168
- "executionInfo": {
169
- "elapsed": 12,
170
- "status": "ok",
171
- "timestamp": 1682285326594,
172
- "user": {
173
- "displayName": "",
174
- "userId": ""
175
- },
176
- "user_tz": 240
177
- },
178
- "id": "1n56TME9Njde"
179
- },
180
- "outputs": [],
181
- "source": [
182
- "# prepare tokenizer and dataset\n",
183
- "\n",
184
- "train_strings = TokenizerDataset(train_text)\n",
185
- "test_strings = TokenizerDataset(test_text)\n",
186
- "\n",
187
- "train_dataloader = DataLoader(train_strings, batch_size=16, shuffle=True)\n",
188
- "test_dataloader = DataLoader(test_strings, batch_size=16, shuffle=True)\n",
189
- "\n",
190
- "\n",
191
- "\n",
192
- "\n",
193
- "# train_encodings = tokenizer.batch_encode_plus(train_text, \\\n",
194
- "# max_length=200, pad_to_max_length=True, \\\n",
195
- "# truncation=True, return_token_type_ids=False \\\n",
196
- "# )\n",
197
- "# test_encodings = tokenizer.batch_encode_plus(test_text, \\\n",
198
- "# max_length=200, pad_to_max_length=True, \\\n",
199
- "# truncation=True, return_token_type_ids=False \\\n",
200
- "# )\n",
201
- "\n",
202
- "\n",
203
- "train_encodings = tokenizer(train_text, truncation=True, padding=True)\n",
204
- "test_encodings = tokenizer(test_text, truncation=True, padding=True)"
205
- ]
206
- },
207
- {
208
- "cell_type": "code",
209
- "execution_count": null,
210
- "id": "a5c7a657",
211
- "metadata": {},
212
- "outputs": [],
213
- "source": [
214
- "f = open(\"traintokens.txt\", 'a')\n",
215
- "f.write(train_encodings)\n",
216
- "f.write('\\n\\n\\n\\n\\n')\n",
217
- "f.close()\n",
218
- "\n",
219
- "g = open(\"testtokens.txt\", 'a')\n",
220
- "g.write(test_encodings)\n",
221
- "g.write('\\n\\n\\n\\n\\n')\n",
222
- "\n",
223
- "g.close()"
224
- ]
225
- },
226
- {
227
- "cell_type": "code",
228
- "execution_count": null,
229
- "id": "4kwydz67qjW9",
230
- "metadata": {
231
- "executionInfo": {
232
- "elapsed": 10,
233
- "status": "ok",
234
- "timestamp": 1682285326595,
235
- "user": {
236
- "displayName": "",
237
- "userId": ""
238
- },
239
- "user_tz": 240
240
- },
241
- "id": "4kwydz67qjW9"
242
- },
243
- "outputs": [],
244
- "source": [
245
- "train_dataset = TweetDataset(train_ecnodings, train_labels)\n",
246
- "test_dataset = TweetDataset(test_encodings, test_labels)"
247
- ]
248
- },
249
- {
250
- "cell_type": "code",
251
- "execution_count": null,
252
- "id": "krZKjDVwNnWI",
253
- "metadata": {
254
- "executionInfo": {
255
- "elapsed": 10,
256
- "status": "ok",
257
- "timestamp": 1682285326596,
258
- "user": {
259
- "displayName": "",
260
- "userId": ""
261
- },
262
- "user_tz": 240
263
- },
264
- "id": "krZKjDVwNnWI"
265
- },
266
- "outputs": [],
267
- "source": [
268
- "# training\n",
269
- "trainer = Trainer(\n",
270
- " model=model, \n",
271
- " args=training_args, \n",
272
- " train_dataset=train_dataset, \n",
273
- " eval_dataset=test_dataset\n",
274
- " )"
275
- ]
276
- },
277
- {
278
- "cell_type": "code",
279
- "execution_count": null,
280
- "id": "VwsyMZg_tgTg",
281
- "metadata": {
282
- "colab": {
283
- "base_uri": "https://localhost:8080/",
284
- "height": 416
285
- },
286
- "executionInfo": {
287
- "elapsed": 27193,
288
- "status": "error",
289
- "timestamp": 1682285353779,
290
- "user": {
291
- "displayName": "",
292
- "userId": ""
293
- },
294
- "user_tz": 240
295
- },
296
- "id": "VwsyMZg_tgTg",
297
- "outputId": "49c3f5c8-0342-45c5-8d0f-5cd5d2d1f9e9"
298
- },
299
- "outputs": [],
300
- "source": [
301
- "trainer.train()"
302
- ]
303
- }
304
- ],
305
- "metadata": {
306
- "colab": {
307
- "provenance": [
308
- {
309
- "file_id": "https://github.com/joebraha/aiproject/blob/milestone-3/training.ipynb",
310
- "timestamp": 1682285843150
311
- }
312
- ]
313
- },
314
- "kernelspec": {
315
- "display_name": "Python 3 (ipykernel)",
316
- "language": "python",
317
- "name": "python3"
318
- },
319
- "language_info": {
320
- "codemirror_mode": {
321
- "name": "ipython",
322
- "version": 3
323
- },
324
- "file_extension": ".py",
325
- "mimetype": "text/x-python",
326
- "name": "python",
327
- "nbconvert_exporter": "python",
328
- "pygments_lexer": "ipython3",
329
- "version": "3.10.6"
330
- }
331
- },
332
- "nbformat": 4,
333
- "nbformat_minor": 5
334
- }
 
Copy_of_Copy_of_training.ipynb DELETED
@@ -1,345 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": 2,
6
- "id": "215a1aae",
7
- "metadata": {
8
- "id": "215a1aae"
9
- },
10
- "outputs": [
11
- {
12
- "name": "stderr",
13
- "output_type": "stream",
14
- "text": [
15
- "2023-04-23 21:39:14.489766: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
16
- "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
17
- "2023-04-23 21:39:15.104927: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n"
18
- ]
19
- }
20
- ],
21
- "source": [
22
- "import torch\n",
23
- "from torch.utils.data import Dataset, DataLoader\n",
24
- "\n",
25
- "import pandas as pd\n",
26
- "\n",
27
- "from transformers import BertTokenizerFast, BertForSequenceClassification\n",
28
- "from transformers import Trainer, TrainingArguments"
29
- ]
30
- },
31
- {
32
- "cell_type": "code",
33
- "execution_count": 3,
34
- "id": "J5Tlgp4tNd0U",
35
- "metadata": {
36
- "colab": {
37
- "base_uri": "https://localhost:8080/"
38
- },
39
- "id": "J5Tlgp4tNd0U",
40
- "outputId": "f2eef2ee-7d9d-4f5b-e35c-e6015e68f59e"
41
- },
42
- "outputs": [
43
- {
44
- "name": "stderr",
45
- "output_type": "stream",
46
- "text": [
47
- "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']\n",
48
- "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
49
- "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
50
- "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
51
- "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
52
- ]
53
- }
54
- ],
55
- "source": [
56
- "model_name = \"bert-base-uncased\"\n",
57
- "tokenizer = BertTokenizerFast.from_pretrained(model_name)\n",
58
- "model = BertForSequenceClassification.from_pretrained(model_name, num_labels=6)\n",
59
- "model = model.to(\"cuda:0\")\n",
60
- "max_len = 200\n",
61
- "\n",
62
- "training_args = TrainingArguments(\n",
63
- " output_dir=\"results\",\n",
64
- " num_train_epochs=1,\n",
65
- " per_device_train_batch_size=16,\n",
66
- " per_device_eval_batch_size=64,\n",
67
- " warmup_steps=500,\n",
68
- " learning_rate=5e-5,\n",
69
- " weight_decay=0.01,\n",
70
- " logging_dir=\"./logs\",\n",
71
- " logging_steps=10\n",
72
- " )\n",
73
- "\n",
74
- "# dataset class that inherits from torch.utils.data.Dataset\n",
75
- "\n",
76
- " \n",
77
- "class TokenizerDataset(Dataset):\n",
78
- " def __init__(self, strings):\n",
79
- " self.strings = strings\n",
80
- " \n",
81
- " def __getitem__(self, idx):\n",
82
- " return self.strings[idx]\n",
83
- " \n",
84
- " def __len__(self):\n",
85
- " return len(self.strings)\n",
86
- " "
87
- ]
88
- },
89
- {
90
- "cell_type": "code",
91
- "execution_count": 4,
92
- "id": "9969c58c",
93
- "metadata": {
94
- "colab": {
95
- "base_uri": "https://localhost:8080/"
96
- },
97
- "id": "9969c58c",
98
- "outputId": "5933b10b-9ddb-4b67-b66b-589207bef2d3",
99
- "scrolled": false
100
- },
101
- "outputs": [
102
- {
103
- "name": "stdout",
104
- "output_type": "stream",
105
- "text": [
106
- " id comment_text \\\n",
107
- "0 0000997932d777bf Explanation\\nWhy the edits made under my usern... \n",
108
- "1 000103f0d9cfb60f D'aww! He matches this background colour I'm s... \n",
109
- "2 000113f07ec002fd Hey man, I'm really not trying to edit war. It... \n",
110
- "3 0001b41b1c6bb37e \"\\nMore\\nI can't make any real suggestions on ... \n",
111
- "4 0001d958c54c6e35 You, sir, are my hero. Any chance you remember... \n",
112
- "... ... ... \n",
113
- "159566 ffe987279560d7ff \":::::And for the second time of asking, when ... \n",
114
- "159567 ffea4adeee384e90 You should be ashamed of yourself \\n\\nThat is ... \n",
115
- "159568 ffee36eab5c267c9 Spitzer \\n\\nUmm, theres no actual article for ... \n",
116
- "159569 fff125370e4aaaf3 And it looks like it was actually you who put ... \n",
117
- "159570 fff46fc426af1f9a \"\\nAnd ... I really don't think you understand... \n",
118
- "\n",
119
- " toxic severe_toxic obscene threat insult identity_hate \n",
120
- "0 0 0 0 0 0 0 \n",
121
- "1 0 0 0 0 0 0 \n",
122
- "2 0 0 0 0 0 0 \n",
123
- "3 0 0 0 0 0 0 \n",
124
- "4 0 0 0 0 0 0 \n",
125
- "... ... ... ... ... ... ... \n",
126
- "159566 0 0 0 0 0 0 \n",
127
- "159567 0 0 0 0 0 0 \n",
128
- "159568 0 0 0 0 0 0 \n",
129
- "159569 0 0 0 0 0 0 \n",
130
- "159570 0 0 0 0 0 0 \n",
131
- "\n",
132
- "[159571 rows x 8 columns]\n"
133
- ]
134
- }
135
- ],
136
- "source": [
137
- "train_data = pd.read_csv(\"data/train.csv\")\n",
138
- "print(train_data)\n",
139
- "train_text = train_data[\"comment_text\"]\n",
140
- "train_labels = train_data[[\"toxic\", \"severe_toxic\", \n",
141
- " \"obscene\", \"threat\", \n",
142
- " \"insult\", \"identity_hate\"]]\n",
143
- "\n",
144
- "test_text = pd.read_csv(\"data/test.csv\")[\"comment_text\"]\n",
145
- "test_labels = pd.read_csv(\"data/test_labels.csv\")[[\n",
146
- " \"toxic\", \"severe_toxic\", \n",
147
- " \"obscene\", \"threat\", \n",
148
- " \"insult\", \"identity_hate\"]]\n",
149
- "\n",
150
- "# data preprocessing\n",
151
- "\n",
152
- "\n",
153
- "\n",
154
- "train_text = train_text.values.tolist()\n",
155
- "train_labels = train_labels.values.tolist()\n",
156
- "test_text = test_text.values.tolist()\n",
157
- "test_labels = test_labels.values.tolist()\n"
158
- ]
159
- },
160
- {
161
- "cell_type": "code",
162
- "execution_count": 10,
163
- "id": "1n56TME9Njde",
164
- "metadata": {
165
- "id": "1n56TME9Njde"
166
- },
167
- "outputs": [],
168
- "source": [
169
- "# prepare tokenizer and dataset\n",
170
- "\n",
171
- "class TweetDataset(Dataset):\n",
172
- " def __init__(self, encodings, labels):\n",
173
- " self.encodings = encodings\n",
174
- " self.labels = labels\n",
175
- " self.tok = tokenizer\n",
176
- " \n",
177
- " def __getitem__(self, idx):\n",
178
- "# print(idx)\n",
179
- " print(len(self.labels))\n",
180
- " encoding = self.tok(self.encodings.strings[idx], truncation=True, padding=\"max_length\", max_length=max_len).to(\"cuda:0\")\n",
181
- " print(encoding.items())\n",
182
- " item = { key: torch.tensor(val) for key, val in encoding.items() }\n",
183
- " item['labels'] = torch.tensor(self.labels[idx])\n",
184
- "# print(item)\n",
185
- " return item\n",
186
- " \n",
187
- " def __len__(self):\n",
188
- " return len(self.labels)\n",
189
- "\n",
190
- "# no tokenizer\n",
191
- "class TweetDataset2(Dataset):\n",
192
- " def __init__(self, encodings, labels):\n",
193
- " self.encodings = encodings\n",
194
- " self.labels = labels\n",
195
- " self.tok = tokenizer\n",
196
- " \n",
197
- " def __getitem__(self, idx):\n",
198
- "# print(idx)\n",
199
- " print(len(self.labels))\n",
200
- " encoding = self.tok(self.encodings.strings[idx], truncation=True, padding=\"max_length\", max_length=max_len).to(\"cuda:0\")\n",
201
- " print(encoding.items())\n",
202
- " item = { key: torch.tensor(val) for key, val in encoding.items() }\n",
203
- " item['labels'] = torch.tensor(self.labels[idx])\n",
204
- "# print(item)\n",
205
- " return item\n",
206
- " \n",
207
- " def __len__(self):\n",
208
- " return len(self.labels)\n",
209
- "\n",
210
- "\n",
211
- "\n",
212
- "\n",
213
- "train_strings = TokenizerDataset(train_text)\n",
214
- "test_strings = TokenizerDataset(test_text)\n",
215
- "\n",
216
- "train_dataloader = DataLoader(train_strings, batch_size=16, shuffle=True)\n",
217
- "test_dataloader = DataLoader(test_strings, batch_size=16, shuffle=True)\n",
218
- "\n",
219
- "\n",
220
- "\n",
221
- "\n",
222
- "train_encodings = tokenizer.batch_encode_plus(train_text, \\\n",
223
- " max_length=200, pad_to_max_length=True, \\\n",
224
- " truncation=True, return_token_type_ids=False, return_tensors='pt' \\\n",
225
- " ).to(\"cuda:0\")\n",
226
- "test_encodings = tokenizer.batch_encode_plus(test_text, \\\n",
227
- " max_length=200, pad_to_max_length=True, \\\n",
228
- " truncation=True, return_token_type_ids=False, return_tensors='pt' \\\n",
229
- " ).to(\"cuda:0\")\n",
230
- "\n",
231
- "# train_encodings = tokenizer(train_text, truncation=True, padding=True)\n",
232
- "# test_encodings = tokenizer(test_text, truncation=True, padding=True)"
233
- ]
234
- },
235
- {
236
- "cell_type": "code",
237
- "execution_count": 15,
238
- "id": "4kwydz67qjW9",
239
- "metadata": {
240
- "colab": {
241
- "base_uri": "https://localhost:8080/"
242
- },
243
- "id": "4kwydz67qjW9",
244
- "outputId": "1653744e-69cf-46f8-a2d1-ffc3a3a4d58a"
245
- },
246
- "outputs": [
247
- {
248
- "name": "stdout",
249
- "output_type": "stream",
250
- "text": [
251
- "159571\n",
252
- "159571\n"
253
- ]
254
- }
255
- ],
256
- "source": [
257
- "# no tokenizer\n",
258
- "class TweetDataset3(Dataset):\n",
259
- " def __init__(self, encodings, labels):\n",
260
- " self.encodings = encodings\n",
261
- " self.labels = labels\n",
262
- " self.tok = tokenizer\n",
263
- " \n",
264
- " def __getitem__(self, idx):\n",
265
- " print(idx)\n",
266
- " item = { key: torch.tensor(val) for key, val in self.encodings.items() }\n",
267
- " item['labels'] = torch.tensor(self.labels[idx])\n",
268
- "# print(item)\n",
269
- " return item\n",
270
- " \n",
271
- " def __len__(self):\n",
272
- " return len(self.labels)\n",
273
- "\n",
274
- "\n",
275
- "\n",
276
- "train_dataset = TweetDataset3(train_encodings, train_labels)\n",
277
- "test_dataset = TweetDataset3(test_encodings, test_labels)\n",
278
- "\n",
279
- "print(len(train_dataset.labels))\n",
280
- "print(len(train_strings))\n",
281
- "\n",
282
- "\n",
283
- "class MultilabelTrainer(Trainer):\n",
284
- " def compute_loss(self, model, inputs, return_outputs=False):\n",
285
- " labels = inputs.pop(\"labels\")\n",
286
- " outputs = model(**inputs)\n",
287
- " logits = outputs.logits\n",
288
- " loss_fct = torch.nn.BCEWithLogitsLoss()\n",
289
- " loss = loss_fct(logits.view(-1, self.model.config.num_labels), \n",
290
- " labels.float().view(-1, self.model.config.num_labels))\n",
291
- " return (loss, outputs) if return_outputs else loss\n",
292
- "\n",
293
- "\n",
294
- "# training\n",
295
- "trainer = MultilabelTrainer(\n",
296
- " model=model, \n",
297
- " args=training_args, \n",
298
- " train_dataset=train_dataset, \n",
299
- " eval_dataset=test_dataset\n",
300
- " )"
301
- ]
302
- },
303
- {
304
- "cell_type": "code",
305
- "execution_count": null,
306
- "id": "VwsyMZg_tgTg",
307
- "metadata": {
308
- "colab": {
309
- "base_uri": "https://localhost:8080/",
310
- "height": 1000
311
- },
312
- "id": "VwsyMZg_tgTg",
313
- "outputId": "6cf8f3aa-629e-4650-9bbd-dfeb11071ef7"
314
- },
315
- "outputs": [],
316
- "source": [
317
- "trainer.train()"
318
- ]
319
- }
320
- ],
321
- "metadata": {
322
- "colab": {
323
- "provenance": []
324
- },
325
- "kernelspec": {
326
- "display_name": "Python 3 (ipykernel)",
327
- "language": "python",
328
- "name": "python3"
329
- },
330
- "language_info": {
331
- "codemirror_mode": {
332
- "name": "ipython",
333
- "version": 3
334
- },
335
- "file_extension": ".py",
336
- "mimetype": "text/x-python",
337
- "name": "python",
338
- "nbconvert_exporter": "python",
339
- "pygments_lexer": "ipython3",
340
- "version": "3.10.6"
341
- }
342
- },
343
- "nbformat": 4,
344
- "nbformat_minor": 5
345
- }
 
logs/1682300361.4426298/events.out.tfevents.1682300361.mint.371280.1 DELETED
Binary file (5.8 kB)
 
logs/1682300884.6095285/events.out.tfevents.1682300884.mint.371280.3 DELETED
Binary file (5.8 kB)
 
logs/1682300938.1223385/events.out.tfevents.1682300938.mint.371280.5 DELETED
Binary file (5.8 kB)
 
logs/1682301013.2686887/events.out.tfevents.1682301013.mint.371280.7 DELETED
Binary file (5.8 kB)
 
logs/events.out.tfevents.1682300361.mint.371280.0 DELETED
Binary file (4.19 kB)
 
logs/events.out.tfevents.1682300884.mint.371280.2 DELETED
Binary file (4.19 kB)
 
logs/events.out.tfevents.1682300938.mint.371280.4 DELETED
Binary file (4.19 kB)
 
logs/events.out.tfevents.1682301013.mint.371280.6 DELETED
Binary file (4.19 kB)
 
train.py DELETED
@@ -1,138 +0,0 @@
1
- import torch
2
- from torch.utils.data import Dataset, DataLoader
3
-
4
- import pandas as pd
5
-
6
- from transformers import BertTokenizerFast, BertForSequenceClassification
7
- from transformers import Trainer, TrainingArguments
8
-
9
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
10
-
11
- model_name = "bert-base-uncased"
12
- tokenizer = BertTokenizerFast.from_pretrained(model_name)
13
- model = BertForSequenceClassification.from_pretrained(model_name, num_labels=6).to(device)
14
- max_len = 200
15
-
16
- training_args = TrainingArguments(
17
- output_dir="results",
18
- num_train_epochs=1,
19
- per_device_train_batch_size=16,
20
- per_device_eval_batch_size=64,
21
- warmup_steps=500,
22
- learning_rate=5e-5,
23
- weight_decay=0.01,
24
- logging_dir="./logs",
25
- logging_steps=10
26
- )
27
-
28
- # dataset class that inherits from torch.utils.data.Dataset
29
-
30
-
31
- class TokenizerDataset(Dataset):
32
- def __init__(self, strings):
33
- self.strings = strings
34
-
35
- def __getitem__(self, idx):
36
- return self.strings[idx]
37
-
38
- def __len__(self):
39
- return len(self.strings)
40
-
41
-
42
- train_data = pd.read_csv("data/train.csv")
43
- print(train_data)
44
- train_text = train_data["comment_text"]
45
- train_labels = train_data[["toxic", "severe_toxic",
46
- "obscene", "threat",
47
- "insult", "identity_hate"]]
48
-
49
- test_text = pd.read_csv("data/test.csv")["comment_text"]
50
- test_labels = pd.read_csv("data/test_labels.csv")[[
51
- "toxic", "severe_toxic",
52
- "obscene", "threat",
53
- "insult", "identity_hate"]]
54
-
55
- # data preprocessing
56
-
57
-
58
-
59
- train_text = train_text.values.tolist()
60
- train_labels = train_labels.values.tolist()
61
- test_text = test_text.values.tolist()
62
- test_labels = test_labels.values.tolist()
63
-
64
-
65
- # prepare tokenizer and dataset
66
-
67
- class TweetDataset(Dataset):
68
- def __init__(self, encodings, labels):
69
- self.encodings = encodings
70
- self.labels = labels
71
- self.tok = tokenizer
72
-
73
- def __getitem__(self, idx):
74
- print(idx)
75
- # print(len(self.labels))
76
- encoding = self.tok(self.encodings.strings[idx], truncation=True,
77
- padding="max_length", max_length=max_len)
78
- # print(encoding.items())
79
- item = { key: torch.tensor(val) for key, val in encoding.items() }
80
- item['labels'] = torch.tensor(self.labels[idx])
81
- # print(item)
82
- return item
83
-
84
- def __len__(self):
85
- return len(self.labels)
86
-
87
-
88
-
89
-
90
-
91
- train_strings = TokenizerDataset(train_text)
92
- test_strings = TokenizerDataset(test_text)
93
-
94
- train_dataloader = DataLoader(train_strings, batch_size=16, shuffle=True)
95
- test_dataloader = DataLoader(test_strings, batch_size=16, shuffle=True)
96
-
97
-
98
-
99
-
100
- # train_encodings = tokenizer.batch_encode_plus(train_text, \
101
- # max_length=200, pad_to_max_length=True, \
102
- # truncation=True, return_token_type_ids=False \
103
- # )
104
- # test_encodings = tokenizer.batch_encode_plus(test_text, \
105
- # max_length=200, pad_to_max_length=True, \
106
- # truncation=True, return_token_type_ids=False \
107
- # )
108
-
109
- # train_encodings = tokenizer(train_text, truncation=True, padding=True)
110
- # test_encodings = tokenizer(test_text, truncation=True, padding=True)
111
-
112
- train_dataset = TweetDataset(train_strings, train_labels)
113
- test_dataset = TweetDataset(test_strings, test_labels)
114
-
115
- print(len(train_dataset.labels))
116
- print(len(train_strings))
117
-
118
-
119
- class MultilabelTrainer(Trainer):
120
- def compute_loss(self, model, inputs, return_outputs=False):
121
- labels = inputs.pop("labels")
122
- outputs = model(**inputs)
123
- logits = outputs.logits
124
- loss_fct = torch.nn.BCEWithLogitsLoss()
125
- loss = loss_fct(logits.view(-1, self.model.config.num_labels),
126
- labels.float().view(-1, self.model.config.num_labels))
127
- return (loss, outputs) if return_outputs else loss
128
-
129
-
130
- # training
131
- trainer = MultilabelTrainer(
132
- model=model,
133
- args=training_args,
134
- train_dataset=train_dataset,
135
- eval_dataset=test_dataset
136
- )
137
-
138
- trainer.train()
 
training.ipynb DELETED
@@ -1,164 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": 1,
6
- "id": "215a1aae",
7
- "metadata": {},
8
- "outputs": [
9
- {
10
- "name": "stderr",
11
- "output_type": "stream",
12
- "text": [
13
- "2023-04-23 12:34:45.188102: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
14
- "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
15
- "2023-04-23 12:34:45.742757: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n"
16
- ]
17
- }
18
- ],
19
- "source": [
20
- "import torch\n",
21
- "from torch.utils.data import Dataset\n",
22
- "\n",
23
- "import pandas as pd\n",
24
- "# import numpy as np\n",
25
- "\n",
26
- "from transformers import BertTokenizer, BertForSequenceClassification\n",
27
- "from transformers import Trainer, TrainingArguments"
28
- ]
29
- },
30
- {
31
- "cell_type": "code",
32
- "execution_count": 10,
33
- "id": "9969c58c",
34
- "metadata": {
35
- "scrolled": false
36
- },
37
- "outputs": [
38
- {
39
- "name": "stderr",
40
- "output_type": "stream",
41
- "text": [
42
- "IOPub data rate exceeded.\n",
43
- "The notebook server will temporarily stop sending output\n",
44
- "to the client in order to avoid crashing it.\n",
45
- "To change this limit, set the config variable\n",
46
- "`--NotebookApp.iopub_data_rate_limit`.\n",
47
- "\n",
48
- "Current values:\n",
49
- "NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)\n",
50
- "NotebookApp.rate_limit_window=3.0 (secs)\n",
51
- "\n",
52
- "Token indices sequence length is longer than the specified maximum sequence length for this model (631 > 512). Running this sequence through the model will result in indexing errors\n"
53
- ]
54
- },
55
- {
56
- "ename": "ValueError",
57
- "evalue": "text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).",
58
- "output_type": "error",
59
- "traceback": [
60
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
61
- "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
62
- "\u001b[0;32m/tmp/ipykernel_325077/677523904.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 38\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 39\u001b[0m \u001b[0mtrain_encodings\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtokenizer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain_text\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 40\u001b[0;31m \u001b[0mtest_encodings\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtokenizer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtest_text\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 41\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 42\u001b[0m \u001b[0mtrain_dataset\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mTweetDataset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain_encodings\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtrain_labels\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
63
- "\u001b[0;32m~/.local/lib/python3.10/site-packages/transformers/tokenization_utils_base.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, text, text_pair, text_target, text_pair_target, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)\u001b[0m\n\u001b[1;32m 2536\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_in_target_context_manager\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2537\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_switch_to_input_mode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2538\u001b[0;31m \u001b[0mencodings\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_call_one\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtext_pair\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtext_pair\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mall_kwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2539\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mtext_target\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2540\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_switch_to_target_mode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
64
- "\u001b[0;32m~/.local/lib/python3.10/site-packages/transformers/tokenization_utils_base.py\u001b[0m in \u001b[0;36m_call_one\u001b[0;34m(self, text, text_pair, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)\u001b[0m\n\u001b[1;32m 2594\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2595\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0m_is_valid_text_input\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2596\u001b[0;31m raise ValueError(\n\u001b[0m\u001b[1;32m 2597\u001b[0m \u001b[0;34m\"text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2598\u001b[0m \u001b[0;34m\"or `List[List[str]]` (batch of pretokenized examples).\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
65
- "\u001b[0;31mValueError\u001b[0m: text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples)."
66
- ]
67
- }
68
- ],
69
- "source": [
70
- "model_name = \"bert-base-uncased\"\n",
71
- "\n",
72
- "# dataset class that inherits from torch.utils.data.Dataset\n",
73
- "class TweetDataset(Dataset):\n",
74
- " def __init__(self, encodings, labels):\n",
75
- " self.encodings = encodings\n",
76
- " self.labels = labels\n",
77
- " \n",
78
- " def __getitem__(self, idx):\n",
79
- " item = { key: torch.tensor(val[idx]) for key, val in self.encodings.items() }\n",
80
- " item['labels'] = torch.tensor(self.labels[idx])\n",
81
- " return item\n",
82
- " \n",
83
- " def __len__(self):\n",
84
- " return len(self.labels)\n",
85
- " \n",
86
- "\n",
87
- "\n",
88
- "train_data = pd.read_csv(\"data/train.csv\")\n",
89
- "train_text = train_data[\"comment_text\"].values.tolist()\n",
90
- "train_labels = train_data[[\"toxic\", \"severe_toxic\", \n",
91
- " \"obscene\", \"threat\", \n",
92
- " \"insult\", \"identity_hate\"]].values.tolist()\n",
93
- "\n",
94
- "test_text = pd.read_csv(\"data/test.csv\")[\"comment_text\"].values.tolist()\n",
95
- "test_labels = pd.read_csv(\"data/test_labels.csv\")[[\n",
96
- " \"toxic\", \"severe_toxic\", \n",
97
- " \"obscene\", \"threat\", \n",
98
- " \"insult\", \"identity_hate\"]].values.tolist()\n",
99
- "\n",
100
- "\n",
101
- "# prepare tokenizer and dataset\n",
102
- "\n",
103
- "tokenizer = BertTokenizer.from_pretrained(model_name)\n",
104
- "\n",
105
- "print(train_text)\n",
106
- "\n",
107
- "\n",
108
- "train_encodings = tokenizer(train_text)\n",
109
- "test_encodings = tokenizer(test_text)\n",
110
- "\n",
111
- "train_dataset = TweetDataset(train_encodings, train_labels)\n",
112
- "test_dataset = TweetDataset(test_encodings, test_labels)\n",
113
- "\n",
114
- "\n",
115
- "# training\n",
116
- "\n",
117
- "\n",
118
- "training_args = TrainingArguments(\n",
119
- " output_dir=\"results\",\n",
120
- " num_train_epochs=2,\n",
121
- " per_device_train_batch_size=16,\n",
122
- " per_device_eval_barch_size=64,\n",
123
- " warmup_steps=500,\n",
124
- " learning_rate=5e-5,\n",
125
- " weight_decay=0.01,\n",
126
- " logging_dir=\"./logs\",\n",
127
- " logging_steps=10\n",
128
- " )\n",
129
- "\n",
130
- "model = BertForSequenceClassification.from_pretrained(model_name, num_labels=6)\n",
131
- "\n",
132
- "\n",
133
- "trainer = Trainer(\n",
134
- " model=model, \n",
135
- " args=args, \n",
136
- " train_dataset=train_dataset, \n",
137
- " val_dataset=test_dataset)\n",
138
- "\n",
139
- "trainer.train()\n"
140
- ]
141
- }
142
- ],
143
- "metadata": {
144
- "kernelspec": {
145
- "display_name": "Python 3 (ipykernel)",
146
- "language": "python",
147
- "name": "python3"
148
- },
149
- "language_info": {
150
- "codemirror_mode": {
151
- "name": "ipython",
152
- "version": 3
153
- },
154
- "file_extension": ".py",
155
- "mimetype": "text/x-python",
156
- "name": "python",
157
- "nbconvert_exporter": "python",
158
- "pygments_lexer": "ipython3",
159
- "version": "3.10.6"
160
- }
161
- },
162
- "nbformat": 4,
163
- "nbformat_minor": 5
164
- }
 
traintokens.txt DELETED
File without changes
working_training.ipynb CHANGED
The diff for this file is too large to render. See raw diff