Hugging Face Spaces commit view — commit "added notes" (finetune.py, +9 -0).
Space build status: Runtime error.
Changed file: finetune.py
@@ -10,11 +10,13 @@ from transformers import DistilBertForSequenceClassification, AdamW
|
|
10 |
|
11 |
model_name = "distilbert-base-uncased"
|
12 |
|
|
|
13 |
df = pd.read_csv('train.csv')
|
14 |
train_texts = df["comment_text"].values
|
15 |
train_labels = df[df.columns[2:]].values
|
16 |
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2)
|
17 |
|
|
|
18 |
class TextDataset(Dataset):
|
19 |
def __init__(self,texts,labels):
|
20 |
self.texts = texts
|
@@ -30,21 +32,26 @@ class TextDataset(Dataset):
|
|
30 |
def __len__(self):
|
31 |
return len(self.labels)
|
32 |
|
|
|
33 |
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
|
34 |
|
|
|
35 |
train_dataset = TextDataset(train_texts,train_labels)
|
36 |
val_dataset = TextDataset(val_texts, val_labels)
|
37 |
|
38 |
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
|
39 |
|
|
|
40 |
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=6, problem_type="multi_label_classification")
|
41 |
model.to(device)
|
42 |
model.train()
|
43 |
|
|
|
44 |
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
|
45 |
|
46 |
optim = AdamW(model.parameters(), lr=5e-5)
|
47 |
|
|
|
48 |
for epoch in range(1):
|
49 |
for batch in train_loader:
|
50 |
optim.zero_grad()
|
@@ -59,6 +66,8 @@ for epoch in range(1):
|
|
59 |
|
60 |
model.eval()
|
61 |
|
|
|
62 |
model.save_pretrained("sentiment_custom_model")
|
63 |
|
|
|
64 |
tokenizer.save_pretrained("sentiment_tokenizer")
|
|
|
# Base checkpoint to fine-tune; also used for the matching tokenizer below.
model_name = "distilbert-base-uncased"

# Reading text: train.csv column "comment_text" holds the raw comments and
# every column from index 2 onward holds one binary label per class
# (assumes the Jigsaw toxic-comment layout — TODO confirm against the CSV).
df = pd.read_csv('train.csv')
train_texts = df["comment_text"].values
train_labels = df[df.columns[2:]].values

# Hold out 20% of the rows for validation.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.2
)
|
18 |
|
19 |
+
#Dataset class to create the labels and encode them
|
20 |
class TextDataset(Dataset):
|
21 |
def __init__(self,texts,labels):
|
22 |
self.texts = texts
|
|
|
32 |
def __len__(self):
|
33 |
return len(self.labels)
|
# This is the tokenizer for the current model — loaded from the same
# checkpoint name as the model so vocabularies always match.
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
# Set up the dataset: wrap the raw text/label arrays for the training
# and validation splits in the project's Dataset class.
train_dataset = TextDataset(train_texts, train_labels)
val_dataset = TextDataset(val_texts, val_labels)
# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Use a multi-label classification head because each comment can carry any
# subset of the 6 toxicity labels (problem_type selects BCE-style loss).
# Fix: reuse `model_name` instead of repeating the checkpoint string, so the
# model can never silently diverge from the tokenizer loaded above.
model = DistilBertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=6,
    problem_type="multi_label_classification",
)
model.to(device)
model.train()
# Use these parameters: mini-batches of 16, reshuffled every epoch.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# NOTE(review): `AdamW` comes from the `transformers` import at the top of
# the file; that export is deprecated/removed in recent transformers
# releases — consider torch.optim.AdamW. Verify against the pinned version.
optim = AdamW(model.parameters(), lr=5e-5)
+
#Finetune process
|
55 |
for epoch in range(1):
|
56 |
for batch in train_loader:
|
57 |
optim.zero_grad()
|
# Leave training mode before exporting the weights.
model.eval()

# Upload trained model to a file (directory in save_pretrained format).
model.save_pretrained("sentiment_custom_model")

# Upload tokenizer to a file so inference can reload the exact vocabulary.
tokenizer.save_pretrained("sentiment_tokenizer")