cgr28 committed on
Commit 01769d2
1 Parent(s): cf5d81e

milestone-3

Files changed (2)
  1. .gitignore +1 -1
  2. milestone_3.py +69 -65
.gitignore CHANGED
@@ -127,4 +127,4 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
-./data
+data/
milestone_3.py CHANGED
@@ -1,96 +1,100 @@
-from transformers import DistilBertTokenizerFast, DistilBertModel, AdamW
+from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
 import torch
-from torch.utils.data import Dataset, DataLoader
+from torch.utils.data import Dataset
+# from torch.optim import AdamW
 import pandas as pd
+from sklearn.model_selection import train_test_split
 
 
 # assignment 3
 model_name = "distilbert-base-uncased"
-tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
+
+class ToxicDataset(Dataset):
+
+    def __init__(self, encodings, labels):
+        self.encodings = encodings
+        self.labels = labels
+
+    def __getitem__(self, idx):
+        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
+        item["labels"] = torch.tensor(self.labels[idx])
+        print(item)
+        return item
+
+    def __len__(self):
+        return len(self.labels)
 
 print("Reading data...")
 data = pd.read_csv("./data/train.csv")
 toxic_data = pd.DataFrame()
 toxic_data["text"] = data["comment_text"]
 toxic_data["labels"] = data.iloc[:, 2:].values.tolist()
-print(toxic_data.head())
-
-class ToxicDataset(Dataset):
 
-    def __init__(self, dataframe, tokenizer):
-        self.tokenizer = tokenizer
-        self.data = dataframe
-        self.text = dataframe.text
-        self.labels = self.data.labels
+print("Data read. Splitting data...")
+train_texts, val_texts, train_labels, val_labels = train_test_split(toxic_data.text.to_list(), toxic_data.labels.to_list(), test_size=.2)
 
-    def __len__(self):
-        return len(self.text)
-
-    def __getitem__(self, idx):
-        text = str(self.text[idx])
-        if len(text) > 12:
-            text = text[:12]
-
-        inputs = self.tokenizer.encode_plus(
-            text,
-            None,
-            max_length=12,
-            add_special_tokens=True,
-            pad_to_max_length=True,
-            return_token_type_ids=True
-        )
-
-        ids = inputs["input_ids"]
-        mask = inputs["attention_mask"]
-        token_type_ids = inputs["token_type_ids"]
-
-        return {
-            "ids": torch.tensor(ids, dtype=torch.long),
-            "mask": torch.tensor(mask, dtype=torch.long),
-            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
-            "targets": torch.tensor(self.labels[idx], dtype=torch.float)
-        }
 
+print("Data split. Tokenizing data...")
+tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
 
-print("Data read. Splitting data...")
-train_data = toxic_data.sample(frac=.8)
-test_data = toxic_data.drop(train_data.index).reset_index(drop=True)
-train_data = train_data.reset_index(drop=True)
+train_encodings = tokenizer.batch_encode_plus(train_texts, truncation=True, padding=True, return_tensors='pt')
+val_encodings = tokenizer.batch_encode_plus(val_texts, truncation=True, padding=True, return_tensors='pt')
 
-print("Data split. Tokenizing data...")
-train_set = ToxicDataset(train_data, tokenizer)
-test_set = ToxicDataset(test_data, tokenizer)
 
-train_loader = DataLoader(train_set, batch_size=8, shuffle=True, num_workers=0)
-test_loader = DataLoader(test_set, batch_size=8, shuffle=True, num_workers=0)
+train_dataset = ToxicDataset(train_encodings, train_labels)
+val_dataset = ToxicDataset(val_encodings, val_labels)
 
 print("Data tokenized. Beginning training...")
 
-device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+training_args = TrainingArguments(
+    output_dir="./results",
+    num_train_epochs=2,
+    per_device_train_batch_size=4,
+    per_device_eval_batch_size=16,
+    warmup_steps=500,
+    weight_decay=0.01,
+    logging_dir="./logs",
+    logging_steps=10,
+)
+
+# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+
+model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=6)
+
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=train_dataset,
+    eval_dataset=val_dataset,
+)
+
+trainer.train()
+
+# model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=6)
 
-model = DistilBertModel.from_pretrained(model_name)
-model.to(device)
-model.train()
+# model.to(device)
+# model.train()
 
-optim = AdamW(model.parameters(), lr=5e-5)
+# train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
 
-num_train_epochs = 2
+# optim = AdamW(model.parameters(), lr=5e-5)
 
-for epoch in range(num_train_epochs):
-    for batch in train_loader:
-        optim.zero_grad()
-        input_ids = batch["ids"].to(device)
-        attention_mask = batch["mask"].to(device)
-        token_type_ids = batch["token_type_ids"].to(device, dtype = torch.long)
-        targets = batch["targets"].to(device)
+# num_train_epochs = 2
 
-        outputs = model(input_ids, attention_mask, token_type_ids)
+# for epoch in range(num_train_epochs):
+#     for batch in train_loader:
+#         optim.zero_grad()
+#         input_ids = batch["input_ids"].to(device)
+#         attention_mask = batch["attention_mask"].to(device)
+#         labels = batch["labels"].to(device)
 
-        loss = torch.nn.BCEWithLogitsLoss()(outputs, targets)
-        loss.backward()
-        optim.step()
+#         outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
 
-model.eval()
+#         loss = outputs[0]
+#         loss.backward()
+#         optim.step()
 
+# model.eval()
 
 
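The rewritten script hands training to the Trainer API, but DistilBertForSequenceClassification with num_labels=6 defaults to single-label cross-entropy, while the labels built from the toxic-comments CSV are six independent 0/1 flags per row. Below is a minimal sketch (not part of this commit) of one way to keep the multi-label objective the old explicit BCEWithLogitsLoss loop used; it assumes a transformers release that supports the problem_type config option.

# Sketch only: multi-label configuration for the same model and Trainer setup.
# Assumes transformers supports the `problem_type` config option.
import torch
from transformers import DistilBertForSequenceClassification

model_name = "distilbert-base-uncased"
model = DistilBertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=6,
    problem_type="multi_label_classification",  # Trainer then applies BCEWithLogitsLoss, as the old loop did
)

# Multi-label targets must be float tensors, e.g. in ToxicDataset.__getitem__:
#     item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)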