JKJanosko committed
Commit 27b305f
1 Parent(s): 469e6cd

Add files via upload

milestone-3/app.py ADDED
@@ -0,0 +1,192 @@
import pandas as pd
import numpy as np
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
import streamlit as st
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, AdamW, get_cosine_schedule_with_warmup, pipeline


class toxicity_dataset(Dataset):
    def __init__(self, data_path, tokenizer, attributes, max_token_len=128, sample=1000):
        self.data_path = data_path
        self.tokenizer = tokenizer
        self.attributes = attributes
        self.max_token_len = max_token_len
        self.sample = sample
        self._prepare_data()

    def _prepare_data(self):
        data = pd.read_csv(self.data_path)
        if self.sample is not None:
            self.data = data.sample(self.sample, random_state=7)
        else:
            self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        item = self.data.iloc[index]
        comment = str(item.comment_text)
        attributes = torch.FloatTensor(item[self.attributes])
        tokens = self.tokenizer.encode_plus(
            comment,
            add_special_tokens=True,
            return_tensors="pt",
            truncation=True,
            max_length=self.max_token_len,
            padding="max_length",
            return_attention_mask=True,
        )
        return {"input_ids": tokens.input_ids.flatten(), "attention_mask": tokens.attention_mask.flatten(), "labels": attributes}


class Toxicity_Data_Module(pl.LightningDataModule):
    def __init__(self, train_path, test_path, attributes, batch_size=16, max_token_len=128, model_name="roberta-base"):
        super().__init__()
        self.train_path = train_path
        self.test_path = test_path
        self.attributes = attributes
        self.batch_size = batch_size
        self.max_token_len = max_token_len
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def setup(self, stage=None):
        if stage in (None, "fit"):
            self.train_dataset = toxicity_dataset(self.train_path, self.tokenizer, self.attributes)
            self.test_dataset = toxicity_dataset(self.test_path, self.tokenizer, self.attributes, sample=None)
        if stage == "predict":
            self.val_dataset = toxicity_dataset(self.test_path, self.tokenizer, self.attributes)

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        # validate on the held-out test split
        return DataLoader(self.test_dataset, batch_size=self.batch_size, shuffle=False)

    def predict_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batch_size, shuffle=False)


class Toxic_Comment_Classifier(pl.LightningModule):
    def __init__(self, config: dict):
        super().__init__()
        self.config = config
        self.pretrained_model = AutoModel.from_pretrained(config["model_name"], return_dict=True)
        self.hidden = nn.Linear(self.pretrained_model.config.hidden_size, self.pretrained_model.config.hidden_size)
        self.classifier = nn.Linear(self.pretrained_model.config.hidden_size, self.config["n_labels"])
        torch.nn.init.xavier_uniform_(self.classifier.weight)
        self.loss_func = nn.BCEWithLogitsLoss(reduction="mean")
        self.dropout = nn.Dropout()

    def forward(self, input_ids, attention_mask=None, labels=None):
        # roberta encoder: mean-pool the final hidden states into one vector per comment
        output = self.pretrained_model(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = torch.mean(output.last_hidden_state, 1)
        # classification head: dropout -> hidden -> relu -> dropout -> logits
        pooled_output = self.dropout(pooled_output)
        pooled_output = self.hidden(pooled_output)
        pooled_output = F.relu(pooled_output)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        # multi-label loss: an independent BCE-with-logits term per attribute
        loss = 0
        if labels is not None:
            loss = self.loss_func(logits.view(-1, self.config["n_labels"]), labels.view(-1, self.config["n_labels"]))
        return loss, logits

    def training_step(self, batch, batch_index):
        loss, outputs = self(**batch)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return {"loss": loss, "predictions": outputs, "labels": batch["labels"]}

    def validation_step(self, batch, batch_index):
        loss, outputs = self(**batch)
        self.log("validation_loss", loss, prog_bar=True, logger=True)
        return {"val_loss": loss, "predictions": outputs, "labels": batch["labels"]}

    def predict_step(self, batch, batch_index):
        loss, outputs = self(**batch)
        return outputs

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=self.config["lr"], weight_decay=self.config["w_decay"])
        # train_size counts batches per epoch, so the schedule runs batches * epochs steps
        total_steps = self.config["train_size"] * self.config["n_epochs"]
        warmup_steps = math.floor(total_steps * self.config["warmup"])
        scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps)
        return [optimizer], [scheduler]


def predict_raw_comments(model, dm, trainer):
    # returns an (n_comments, n_labels) array of per-attribute sigmoid probabilities
    predictions = trainer.predict(model, datamodule=dm)
    flattened_predictions = np.stack([torch.sigmoid(torch.Tensor(p)) for batch in predictions for p in batch])
    return flattened_predictions


def main():
    # -- Creates Variables for Use of Model --
    attributes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
    tokenizer = AutoTokenizer.from_pretrained("roberta-base")
    toxic_comments_dataset = toxicity_dataset("data/train.csv", tokenizer, attributes)

    toxicity_data_module = Toxicity_Data_Module("data/train.csv", "data/test.csv", attributes)
    toxicity_data_module.setup()
    dataloader = toxicity_data_module.train_dataloader()

    config = {
        "model_name": "distilroberta-base",
        "n_labels": len(attributes),
        "bs": 128,
        "lr": 1.5e-6,
        "warmup": 0.2,
        "train_size": len(toxicity_data_module.train_dataloader()),
        "w_decay": 0.001,
        "n_epochs": 1,
    }

    # rebuild the data module against the reduced test set with the configured batch size
    toxicity_data_module = Toxicity_Data_Module("data/train.csv", "data/reduced_test.csv", attributes, batch_size=config["bs"])
    toxicity_data_module.setup()

    trainer = pl.Trainer(max_epochs=config["n_epochs"], num_sanity_val_steps=50)

    # -- Creates Streamlit App --
    st.title("Tweet Toxicity Classifier")
    st.header("Fine-tuned model from roberta-base using PyTorch")
    st.header("Jozef Janosko - CS 482, Milestone 3")

    model_name = st.selectbox("Select Model...", ["Toxicity Classification Model"])

    if st.button("Click to Load Data"):
        if model_name == "Toxicity Classification Model":
            model = torch.load("ToxicityClassificationModel.pt")
            with st.spinner("Analyzing Text..."):
                # predict_raw_comments already applies a sigmoid, so these are
                # per-attribute probabilities rather than raw logits
                probabilities = predict_raw_comments(model, toxicity_data_module, trainer=trainer)
                inputs = pd.read_csv("data/reduced_test.csv")
                data = []
                for i in range(len(probabilities)):
                    # keep the attribute with the highest predicted probability
                    max_prob = 0
                    max_cat = 0
                    for j in range(len(attributes)):
                        if probabilities[i][j] >= max_prob:
                            max_prob = probabilities[i][j]
                            max_cat = j
                    data.append([inputs["comment_text"][i][0:16], attributes[max_cat], max_prob])
                results_df = pd.DataFrame(data, columns=["Comment Text", "Most Likely Classification", "Classification Probability"])
                st.table(data=results_df)
        else:
            model = pipeline("sentiment-analysis", model_name)


if __name__ == "__main__":
    main()
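The app expects data/train.csv, data/test.csv, data/reduced_test.csv, and the ToxicityClassificationModel.pt checkpoint relative to the working directory, and is launched with `streamlit run milestone-3/app.py`. For a quick check outside Streamlit, the same tokenize, forward, sigmoid path the app runs in batch can be exercised on a single comment. This is a minimal sketch, assuming the pickled checkpoint exists and Toxic_Comment_Classifier is defined in the session (a plain torch.load needs the class definition to unpickle it); the input string is a hypothetical example:

import torch
from transformers import AutoTokenizer

attributes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = torch.load("ToxicityClassificationModel.pt")  # assumed checkpoint from the notebook below
model.eval()

tokens = tokenizer.encode_plus(
    "example comment to score",  # hypothetical input
    add_special_tokens=True, return_tensors="pt", truncation=True,
    max_length=128, padding="max_length", return_attention_mask=True,
)
with torch.no_grad():
    _, logits = model(tokens.input_ids, tokens.attention_mask)  # forward returns (loss, logits)
for name, p in zip(attributes, torch.sigmoid(logits).squeeze(0).tolist()):
    print(f"{name}: {p:.3f}")  # one independent probability per attribute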
milestone-3/requirements.txt ADDED
Binary file (112 Bytes).
 
milestone-3/toxicityclassification.ipynb ADDED
@@ -0,0 +1,367 @@
# In[1]:
import pandas as pd
import numpy as np

train_data = pd.read_csv("data/train.csv", engine="python")

# In[2]:
attributes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
# train_data[attributes].sum().plot.bar()

# In[3]:
from torch.utils.data import Dataset
import torch

class toxicity_dataset(Dataset):
    def __init__(self, data_path, tokenizer, attributes, max_token_len=128, sample=5000):
        self.data_path = data_path
        self.tokenizer = tokenizer
        self.attributes = attributes
        self.max_token_len = max_token_len
        self.sample = sample
        self._prepare_data()

    def _prepare_data(self):
        data = pd.read_csv(self.data_path)
        if self.sample is not None:
            self.data = data.sample(self.sample, random_state=7)
        else:
            self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        item = self.data.iloc[index]
        comment = str(item.comment_text)
        attributes = torch.FloatTensor(item[self.attributes])
        tokens = self.tokenizer.encode_plus(
            comment,
            add_special_tokens=True,
            return_tensors="pt",
            truncation=True,
            max_length=self.max_token_len,
            padding="max_length",
            return_attention_mask=True,
        )
        return {"input_ids": tokens.input_ids.flatten(), "attention_mask": tokens.attention_mask.flatten(), "labels": attributes}

# In[4]:
from transformers import AutoTokenizer

model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
toxic_comments_dataset = toxicity_dataset("data/train.csv", tokenizer, attributes)
toxic_comments_dataset.__getitem__(0)
# Out[4]: {'input_ids': tensor of length 128 (token ids, right-padded with 1),
#          'attention_mask': tensor of length 128 (1s, then 0s over the padding),
#          'labels': tensor([0., 0., 0., 0., 0., 0.])}   (full tensor dump elided)

# In[5]:
import pytorch_lightning as pl
from torch.utils.data import DataLoader

# In[6]:
class Toxicity_Data_Module(pl.LightningDataModule):
    def __init__(self, train_path, test_path, attributes, batch_size=16, max_token_len=128, model_name="roberta-base"):
        super().__init__()
        self.train_path = train_path
        self.test_path = test_path
        self.attributes = attributes
        self.batch_size = batch_size
        self.max_token_len = max_token_len
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def setup(self, stage=None):
        if stage in (None, "fit"):
            self.train_dataset = toxicity_dataset(self.train_path, self.tokenizer, self.attributes)
            self.test_dataset = toxicity_dataset(self.test_path, self.tokenizer, self.attributes, sample=None)
        if stage == "predict":
            self.val_dataset = toxicity_dataset(self.test_path, self.tokenizer, self.attributes)

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, num_workers=4, shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batch_size, num_workers=4, shuffle=False)

    def predict_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batch_size, num_workers=4, shuffle=False)

# In[7]:
toxicity_data_module = Toxicity_Data_Module("data/train.csv", "data/test.csv", attributes)

toxicity_data_module.setup()
dataloader = toxicity_data_module.train_dataloader()
dataloader
# Out[7]: <torch.utils.data.dataloader.DataLoader at 0x2241d16e5d0>

# In[8]:
from transformers import AutoModel, AdamW, get_cosine_schedule_with_warmup
import torch.nn as nn
import math
from torchmetrics.functional.classification import auroc
import torch.nn.functional as F

# In[9]:
class Toxic_Comment_Classifier(pl.LightningModule):
    def __init__(self, config: dict):
        super().__init__()
        self.config = config
        self.pretrained_model = AutoModel.from_pretrained(config["model_name"], return_dict=True)
        self.hidden = nn.Linear(self.pretrained_model.config.hidden_size, self.pretrained_model.config.hidden_size)
        self.classifier = nn.Linear(self.pretrained_model.config.hidden_size, self.config["n_labels"])
        torch.nn.init.xavier_uniform_(self.hidden.weight)
        torch.nn.init.xavier_uniform_(self.classifier.weight)
        self.loss_func = nn.BCEWithLogitsLoss(reduction="mean")
        self.dropout = nn.Dropout()

    def forward(self, input_ids, attention_mask, labels=None):
        output = self.pretrained_model(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = torch.mean(output.last_hidden_state, 1)
        # nn classification head
        pooled_output = self.hidden(pooled_output)
        pooled_output = self.dropout(pooled_output)
        pooled_output = F.relu(pooled_output)
        logits = self.classifier(pooled_output)
        # loss
        loss = 0
        if labels is not None:
            loss = self.loss_func(logits.view(-1, self.config["n_labels"]), labels.view(-1, self.config["n_labels"]))
        return loss, logits

    def training_step(self, batch, batch_index):
        loss, logits = self(**batch)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return {"loss": loss, "predictions": logits, "labels": batch["labels"]}

    def validation_step(self, batch, batch_index):
        loss, logits = self(**batch)
        self.log("validation_loss", loss, prog_bar=True, logger=True)
        return {"val_loss": loss, "predictions": logits, "labels": batch["labels"]}

    def predict_step(self, batch, batch_index):
        loss, logits = self(**batch)
        return logits

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=self.config["lr"], weight_decay=self.config["w_decay"])
        # train_size counts batches per epoch, so the schedule runs batches * epochs steps
        total_steps = self.config["train_size"] * self.config["n_epochs"]
        warmup_steps = math.floor(total_steps * self.config["warmup"])
        scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps)
        return [optimizer], [scheduler]

# In[10]:
config = {
    "model_name": "distilroberta-base",
    "n_labels": len(attributes),
    "bs": 128,
    "lr": 1.5e-6,
    "warmup": 0.2,
    "train_size": len(toxicity_data_module.train_dataloader()),
    "w_decay": 0.001,
    "n_epochs": 1,
}

model = Toxic_Comment_Classifier(config)
# stderr: "Some weights of the model checkpoint at distilroberta-base were not used
# when initializing RobertaModel: ['lm_head.*']" -- expected when loading a
# masked-LM checkpoint into a bare encoder

# In[11]:
idx = 0
input_ids = toxic_comments_dataset.__getitem__(idx)["input_ids"]
attention_mask = toxic_comments_dataset.__getitem__(idx)["attention_mask"]
labels = toxic_comments_dataset.__getitem__(idx)["labels"]
loss, output = model(input_ids.unsqueeze(dim=0), attention_mask.unsqueeze(dim=0), labels.unsqueeze(dim=0))

# In[12]:
toxicity_data_module = Toxicity_Data_Module("data/train.csv", "data/test.csv", attributes, batch_size=config["bs"])
toxicity_data_module.setup()
model = Toxic_Comment_Classifier(config)

trainer = pl.Trainer(max_epochs=config["n_epochs"], num_sanity_val_steps=50)
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(torch.cuda.get_device_name())
# trainer.fit(model, toxicity_data_module)
# stderr: GPU/TPU/IPU/HPU available: False; CSVLogger used as the default logger

# In[13]:
model = torch.load("ToxicityClassificationModel.pt")

# In[14]:
def predict_raw_comments(model, dm):
    predictions = trainer.predict(model, datamodule=dm)
    flattened_predictions = np.stack([torch.sigmoid(torch.Tensor(p)) for batch in predictions for p in batch])
    return flattened_predictions

# In[15]:
predictions = predict_raw_comments(model=model, dm=toxicity_data_module)

# Notebook metadata: Python 3 kernel, Python 3.11.3, nbformat 4
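Cell 13 loads ToxicityClassificationModel.pt, but the notebook never shows the checkpoint being written; presumably an earlier run executed the trainer.fit call that is commented out in cell 12 and then pickled the module. A minimal sketch of that missing step, assuming the whole LightningModule is saved with torch.save (which is what the bare torch.load in cell 13 implies):

trainer.fit(model, toxicity_data_module)             # fine-tune distilroberta-base for config['n_epochs'] epochs
torch.save(model, "ToxicityClassificationModel.pt")  # pickle the full module so torch.load() restores it directly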