JKJanosko committed
Commit 27b305f
1 Parent(s): 469e6cd

Add files via upload

milestone-3/app.py ADDED
@@ -0,0 +1,192 @@
import pandas as pd
import numpy as np
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
import streamlit as st
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, AdamW, get_cosine_schedule_with_warmup, pipeline


class toxicity_dataset(Dataset):
    def __init__(self, data_path, tokenizer, attributes, max_token_len=128, sample=1000):
        self.data_path = data_path
        self.tokenizer = tokenizer
        self.attributes = attributes
        self.max_token_len = max_token_len
        self.sample = sample
        self._prepare_data()

    def _prepare_data(self):
        data = pd.read_csv(self.data_path)
        if self.sample is not None:
            self.data = data.sample(self.sample, random_state=7)
        else:
            self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        item = self.data.iloc[index]
        comment = str(item.comment_text)
        attributes = torch.FloatTensor(item[self.attributes])
        tokens = self.tokenizer.encode_plus(
            comment,
            add_special_tokens=True,
            return_tensors="pt",
            truncation=True,
            max_length=self.max_token_len,
            padding="max_length",
            return_attention_mask=True,
        )
        return {"input_ids": tokens.input_ids.flatten(), "attention_mask": tokens.attention_mask.flatten(), "labels": attributes}


class Toxicity_Data_Module(pl.LightningDataModule):
    def __init__(self, train_path, test_path, attributes, batch_size=16, max_token_len=128, model_name="roberta-base"):
        super().__init__()
        self.train_path = train_path
        self.test_path = test_path
        self.attributes = attributes
        self.batch_size = batch_size
        self.max_token_len = max_token_len
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def setup(self, stage=None):
        if stage in (None, "fit"):
            self.train_dataset = toxicity_dataset(self.train_path, self.tokenizer, self.attributes)
            self.test_dataset = toxicity_dataset(self.test_path, self.tokenizer, self.attributes, sample=None)
        if stage == "predict":
            self.val_dataset = toxicity_dataset(self.test_path, self.tokenizer, self.attributes)

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        # validate on the held-out test split
        return DataLoader(self.test_dataset, batch_size=self.batch_size, shuffle=False)

    def predict_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batch_size, shuffle=False)


class Toxic_Comment_Classifier(pl.LightningModule):
    def __init__(self, config: dict):
        super().__init__()
        self.config = config
        self.pretrained_model = AutoModel.from_pretrained(config["model_name"], return_dict=True)
        self.hidden = nn.Linear(self.pretrained_model.config.hidden_size, self.pretrained_model.config.hidden_size)
        self.classifier = nn.Linear(self.pretrained_model.config.hidden_size, self.config["n_labels"])
        torch.nn.init.xavier_uniform_(self.classifier.weight)
        self.loss_func = nn.BCEWithLogitsLoss(reduction="mean")
        self.dropout = nn.Dropout()

    def forward(self, input_ids, attention_mask=None, labels=None):
        # roberta encoder: mean-pool the final hidden states into one vector per comment
        output = self.pretrained_model(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = torch.mean(output.last_hidden_state, 1)
        # classification head: dropout -> hidden -> relu -> dropout -> logits
        pooled_output = self.dropout(pooled_output)
        pooled_output = self.hidden(pooled_output)
        pooled_output = F.relu(pooled_output)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        # multi-label loss: an independent BCE-with-logits term per attribute
        loss = 0
        if labels is not None:
            loss = self.loss_func(logits.view(-1, self.config["n_labels"]), labels.view(-1, self.config["n_labels"]))
        return loss, logits

    def training_step(self, batch, batch_index):
        loss, outputs = self(**batch)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return {"loss": loss, "predictions": outputs, "labels": batch["labels"]}

    def validation_step(self, batch, batch_index):
        loss, outputs = self(**batch)
        self.log("validation_loss", loss, prog_bar=True, logger=True)
        return {"val_loss": loss, "predictions": outputs, "labels": batch["labels"]}

    def predict_step(self, batch, batch_index):
        loss, outputs = self(**batch)
        return outputs

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=self.config["lr"], weight_decay=self.config["w_decay"])
        # train_size counts batches per epoch, so the schedule runs batches * epochs steps
        total_steps = self.config["train_size"] * self.config["n_epochs"]
        warmup_steps = math.floor(total_steps * self.config["warmup"])
        scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps)
        return [optimizer], [scheduler]


def predict_raw_comments(model, dm, trainer):
    # returns an (n_comments, n_labels) array of per-attribute sigmoid probabilities
    predictions = trainer.predict(model, datamodule=dm)
    flattened_predictions = np.stack([torch.sigmoid(torch.Tensor(p)) for batch in predictions for p in batch])
    return flattened_predictions


def main():
    # -- Creates Variables for Use of Model --
    attributes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
    tokenizer = AutoTokenizer.from_pretrained("roberta-base")
    toxic_comments_dataset = toxicity_dataset("data/train.csv", tokenizer, attributes)

    toxicity_data_module = Toxicity_Data_Module("data/train.csv", "data/test.csv", attributes)
    toxicity_data_module.setup()
    dataloader = toxicity_data_module.train_dataloader()

    config = {
        "model_name": "distilroberta-base",
        "n_labels": len(attributes),
        "bs": 128,
        "lr": 1.5e-6,
        "warmup": 0.2,
        "train_size": len(toxicity_data_module.train_dataloader()),
        "w_decay": 0.001,
        "n_epochs": 1,
    }

    # rebuild the data module against the reduced test set with the configured batch size
    toxicity_data_module = Toxicity_Data_Module("data/train.csv", "data/reduced_test.csv", attributes, batch_size=config["bs"])
    toxicity_data_module.setup()

    trainer = pl.Trainer(max_epochs=config["n_epochs"], num_sanity_val_steps=50)

    # -- Creates Streamlit App --
    st.title("Tweet Toxicity Classifier")
    st.header("Fine-tuned model from roberta-base using PyTorch")
    st.header("Jozef Janosko - CS 482, Milestone 3")

    model_name = st.selectbox("Select Model...", ["Toxicity Classification Model"])

    if st.button("Click to Load Data"):
        if model_name == "Toxicity Classification Model":
            model = torch.load("ToxicityClassificationModel.pt")
            with st.spinner("Analyzing Text..."):
                # predict_raw_comments already applies a sigmoid, so these are
                # per-attribute probabilities rather than raw logits
                probabilities = predict_raw_comments(model, toxicity_data_module, trainer=trainer)
                inputs = pd.read_csv("data/reduced_test.csv")
                data = []
                for i in range(len(probabilities)):
                    # keep the attribute with the highest predicted probability
                    max_prob = 0
                    max_cat = 0
                    for j in range(len(attributes)):
                        if probabilities[i][j] >= max_prob:
                            max_prob = probabilities[i][j]
                            max_cat = j
                    data.append([inputs["comment_text"][i][0:16], attributes[max_cat], max_prob])
                results_df = pd.DataFrame(data, columns=["Comment Text", "Most Likely Classification", "Classification Probability"])
                st.table(data=results_df)
        else:
            model = pipeline("sentiment-analysis", model_name)


if __name__ == "__main__":
    main()
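The app expects data/train.csv, data/test.csv, data/reduced_test.csv, and the ToxicityClassificationModel.pt checkpoint relative to the working directory, and is launched with `streamlit run milestone-3/app.py`. For a quick check outside Streamlit, the same tokenize, forward, sigmoid path the app runs in batch can be exercised on a single comment. This is a minimal sketch, assuming the pickled checkpoint exists and Toxic_Comment_Classifier is defined in the session (a plain torch.load needs the class definition to unpickle it); the input string is a hypothetical example:

import torch
from transformers import AutoTokenizer

attributes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = torch.load("ToxicityClassificationModel.pt")  # assumed checkpoint from the notebook below
model.eval()

tokens = tokenizer.encode_plus(
    "example comment to score",  # hypothetical input
    add_special_tokens=True, return_tensors="pt", truncation=True,
    max_length=128, padding="max_length", return_attention_mask=True,
)
with torch.no_grad():
    _, logits = model(tokens.input_ids, tokens.attention_mask)  # forward returns (loss, logits)
for name, p in zip(attributes, torch.sigmoid(logits).squeeze(0).tolist()):
    print(f"{name}: {p:.3f}")  # one independent probability per attribute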
milestone-3/requirements.txt ADDED
Binary file (112 Bytes).
 
milestone-3/toxicityclassification.ipynb ADDED
@@ -0,0 +1,367 @@
# In[1]:
import pandas as pd
import numpy as np

train_data = pd.read_csv("data/train.csv", engine="python")

# In[2]:
attributes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
# train_data[attributes].sum().plot.bar()

# In[3]:
from torch.utils.data import Dataset
import torch

class toxicity_dataset(Dataset):
    def __init__(self, data_path, tokenizer, attributes, max_token_len=128, sample=5000):
        self.data_path = data_path
        self.tokenizer = tokenizer
        self.attributes = attributes
        self.max_token_len = max_token_len
        self.sample = sample
        self._prepare_data()

    def _prepare_data(self):
        data = pd.read_csv(self.data_path)
        if self.sample is not None:
            self.data = data.sample(self.sample, random_state=7)
        else:
            self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        item = self.data.iloc[index]
        comment = str(item.comment_text)
        attributes = torch.FloatTensor(item[self.attributes])
        tokens = self.tokenizer.encode_plus(
            comment,
            add_special_tokens=True,
            return_tensors="pt",
            truncation=True,
            max_length=self.max_token_len,
            padding="max_length",
            return_attention_mask=True,
        )
        return {"input_ids": tokens.input_ids.flatten(), "attention_mask": tokens.attention_mask.flatten(), "labels": attributes}

# In[4]:
from transformers import AutoTokenizer

model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
toxic_comments_dataset = toxicity_dataset("data/train.csv", tokenizer, attributes)
toxic_comments_dataset.__getitem__(0)
# Out[4]: {'input_ids': tensor of length 128 (token ids, right-padded with 1),
#          'attention_mask': tensor of length 128 (1s, then 0s over the padding),
#          'labels': tensor([0., 0., 0., 0., 0., 0.])}   (full tensor dump elided)

# In[5]:
import pytorch_lightning as pl
from torch.utils.data import DataLoader

# In[6]:
class Toxicity_Data_Module(pl.LightningDataModule):
    def __init__(self, train_path, test_path, attributes, batch_size=16, max_token_len=128, model_name="roberta-base"):
        super().__init__()
        self.train_path = train_path
        self.test_path = test_path
        self.attributes = attributes
        self.batch_size = batch_size
        self.max_token_len = max_token_len
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def setup(self, stage=None):
        if stage in (None, "fit"):
            self.train_dataset = toxicity_dataset(self.train_path, self.tokenizer, self.attributes)
            self.test_dataset = toxicity_dataset(self.test_path, self.tokenizer, self.attributes, sample=None)
        if stage == "predict":
            self.val_dataset = toxicity_dataset(self.test_path, self.tokenizer, self.attributes)

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, num_workers=4, shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batch_size, num_workers=4, shuffle=False)

    def predict_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batch_size, num_workers=4, shuffle=False)

# In[7]:
toxicity_data_module = Toxicity_Data_Module("data/train.csv", "data/test.csv", attributes)

toxicity_data_module.setup()
dataloader = toxicity_data_module.train_dataloader()
dataloader
# Out[7]: <torch.utils.data.dataloader.DataLoader at 0x2241d16e5d0>

# In[8]:
from transformers import AutoModel, AdamW, get_cosine_schedule_with_warmup
import torch.nn as nn
import math
from torchmetrics.functional.classification import auroc
import torch.nn.functional as F

# In[9]:
class Toxic_Comment_Classifier(pl.LightningModule):
    def __init__(self, config: dict):
        super().__init__()
        self.config = config
        self.pretrained_model = AutoModel.from_pretrained(config["model_name"], return_dict=True)
        self.hidden = nn.Linear(self.pretrained_model.config.hidden_size, self.pretrained_model.config.hidden_size)
        self.classifier = nn.Linear(self.pretrained_model.config.hidden_size, self.config["n_labels"])
        torch.nn.init.xavier_uniform_(self.hidden.weight)
        torch.nn.init.xavier_uniform_(self.classifier.weight)
        self.loss_func = nn.BCEWithLogitsLoss(reduction="mean")
        self.dropout = nn.Dropout()

    def forward(self, input_ids, attention_mask, labels=None):
        output = self.pretrained_model(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = torch.mean(output.last_hidden_state, 1)
        # nn classification head
        pooled_output = self.hidden(pooled_output)
        pooled_output = self.dropout(pooled_output)
        pooled_output = F.relu(pooled_output)
        logits = self.classifier(pooled_output)
        # loss
        loss = 0
        if labels is not None:
            loss = self.loss_func(logits.view(-1, self.config["n_labels"]), labels.view(-1, self.config["n_labels"]))
        return loss, logits

    def training_step(self, batch, batch_index):
        loss, logits = self(**batch)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return {"loss": loss, "predictions": logits, "labels": batch["labels"]}

    def validation_step(self, batch, batch_index):
        loss, logits = self(**batch)
        self.log("validation_loss", loss, prog_bar=True, logger=True)
        return {"val_loss": loss, "predictions": logits, "labels": batch["labels"]}

    def predict_step(self, batch, batch_index):
        loss, logits = self(**batch)
        return logits

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=self.config["lr"], weight_decay=self.config["w_decay"])
        # train_size counts batches per epoch, so the schedule runs batches * epochs steps
        total_steps = self.config["train_size"] * self.config["n_epochs"]
        warmup_steps = math.floor(total_steps * self.config["warmup"])
        scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps)
        return [optimizer], [scheduler]

# In[10]:
config = {
    "model_name": "distilroberta-base",
    "n_labels": len(attributes),
    "bs": 128,
    "lr": 1.5e-6,
    "warmup": 0.2,
    "train_size": len(toxicity_data_module.train_dataloader()),
    "w_decay": 0.001,
    "n_epochs": 1,
}

model = Toxic_Comment_Classifier(config)
# stderr: "Some weights of the model checkpoint at distilroberta-base were not used
# when initializing RobertaModel: ['lm_head.*']" -- expected when loading a
# masked-LM checkpoint into a bare encoder

# In[11]:
idx = 0
input_ids = toxic_comments_dataset.__getitem__(idx)["input_ids"]
attention_mask = toxic_comments_dataset.__getitem__(idx)["attention_mask"]
labels = toxic_comments_dataset.__getitem__(idx)["labels"]
loss, output = model(input_ids.unsqueeze(dim=0), attention_mask.unsqueeze(dim=0), labels.unsqueeze(dim=0))

# In[12]:
toxicity_data_module = Toxicity_Data_Module("data/train.csv", "data/test.csv", attributes, batch_size=config["bs"])
toxicity_data_module.setup()
model = Toxic_Comment_Classifier(config)

trainer = pl.Trainer(max_epochs=config["n_epochs"], num_sanity_val_steps=50)
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(torch.cuda.get_device_name())
# trainer.fit(model, toxicity_data_module)
# stderr: GPU/TPU/IPU/HPU available: False; CSVLogger used as the default logger

# In[13]:
model = torch.load("ToxicityClassificationModel.pt")

# In[14]:
def predict_raw_comments(model, dm):
    predictions = trainer.predict(model, datamodule=dm)
    flattened_predictions = np.stack([torch.sigmoid(torch.Tensor(p)) for batch in predictions for p in batch])
    return flattened_predictions

# In[15]:
predictions = predict_raw_comments(model=model, dm=toxicity_data_module)

# Notebook metadata: Python 3 kernel, Python 3.11.3, nbformat 4
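Cell 13 loads ToxicityClassificationModel.pt, but the notebook never shows the checkpoint being written; presumably an earlier run executed the trainer.fit call that is commented out in cell 12 and then pickled the module. A minimal sketch of that missing step, assuming the whole LightningModule is saved with torch.save (which is what the bare torch.load in cell 13 implies):

trainer.fit(model, toxicity_data_module)             # fine-tune distilroberta-base for config['n_epochs'] epochs
torch.save(model, "ToxicityClassificationModel.pt")  # pickle the full module so torch.load() restores it directly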