andyqin18 committed on
Commit
c59cf35
1 Parent(s): 90ae92b

Structure Change

Browse files
milestone3/comp/test.csv CHANGED
Binary files a/milestone3/comp/test.csv and b/milestone3/comp/test.csv differ
 
milestone3/comp/test_comment.csv ADDED
Binary file (60.4 MB). View file
 
milestone3/milestone3.py CHANGED
@@ -1,4 +1,82 @@
1
- from datasets import load_dataset
2
 
3
- dataset = load_dataset("comp")
4
- dataset["train"][100]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
2
 
3
+ # import torch
4
+ # import torch.nn.functional as F
5
+
6
+ # model_name = "andyqin18/test-finetuned"
7
+
8
+ # model = AutoModelForSequenceClassification.from_pretrained(model_name)
9
+ # tokenizer = AutoTokenizer.from_pretrained(model_name)
10
+
11
+ # classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
12
+
13
+ # res = classifier(["Fuck your mom",
14
+ # "Hope you don't hate it"])
15
+
16
+ # for result in res:
17
+ # print(result)
18
+ import pandas as pd
19
+ from sklearn.model_selection import train_test_split
20
+ import torch
21
+ from torch.utils.data import Dataset
22
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
23
+ device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
24
+ import numpy as np
25
+
26
+ df = pd.read_csv("comp/train.csv")
27
+
28
+ train_texts = df["comment_text"].values
29
+ train_labels = df[df.columns[2:]].values
30
+ # print(train_labels[0])
31
+
32
+ # np.random.seed(123)
33
+ # small_train_texts = np.random.choice(train_texts, size=1000, replace=False)
34
+ # small_train_labels_idx = np.random.choice(train_labels.shape[0], size=1000, replace=False)
35
+ # small_train_labels = train_labels[small_train_labels_idx, :]
36
+
37
+
38
+ # train_texts, val_texts, train_labels, val_labels = train_test_split(small_train_texts, small_train_labels, test_size=.2)
39
+ train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2)
40
+
41
+ class TextDataset(Dataset):
42
+ def __init__(self,texts,labels):
43
+ self.texts = texts
44
+ self.labels = labels
45
+
46
+ def __getitem__(self,idx):
47
+ encodings = tokenizer(self.texts[idx], truncation=True, padding="max_length")
48
+ item = {key: torch.tensor(val) for key, val in encodings.items()}
49
+ item['labels'] = torch.tensor(self.labels[idx],dtype=torch.float32)
50
+ del encodings
51
+ return item
52
+
53
+ def __len__(self):
54
+ return len(self.labels)
55
+
56
+ tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
57
+ train_dataset = TextDataset(train_texts,train_labels)
58
+ val_dataset = TextDataset(val_texts, val_labels)
59
+ # small_train_dataset = train_dataset.shuffle(seed=42).select(range(1000))
60
+ # small_val_dataset = val_dataset.shuffle(seed=42).select(range(1000))
61
+
62
+
63
+
64
+ model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=6, problem_type="multi_label_classification")
65
+ model.to(device)
66
+ training_args = TrainingArguments(
67
+ output_dir="finetuned-bert-uncased",
68
+ per_device_train_batch_size=16,
69
+ per_device_eval_batch_size=64,
70
+ learning_rate=5e-4,
71
+ weight_decay=0.01,
72
+ evaluation_strategy="epoch",
73
+ push_to_hub=True)
74
+
75
+ trainer = Trainer(
76
+ model=model,
77
+ args=training_args,
78
+ train_dataset=train_dataset,
79
+ eval_dataset=val_dataset,
80
+ )
81
+
82
+ trainer.train()