dlwlgus53 committed on
Commit
02218b1
1 Parent(s): d954ed5

First model version

Browse files
Files changed (1) hide show
  1. main.py +83 -0
main.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset
2
+ from transformers import AutoTokenizer
3
+ from transformers import DataCollatorWithPadding
4
+ import numpy as np
5
+ import evaluate
6
+
7
# Accuracy metric object, loaded once when the module is imported.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score an evaluation pass with the module-level accuracy metric.

    ``eval_pred`` is unpacked into a ``(predictions, labels)`` pair. The
    predicted class of each row is the arg-max of ``predictions`` along
    axis 1, and the result of ``accuracy.compute`` is returned unchanged.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)
14
+
15
+
16
def load_data():
    """Return the dataset obtained from ``load_dataset("imdb")``."""
    return load_dataset("imdb")
20
+
21
+
22
# Module-level tokenizer for the "distilbert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def preprocess_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with truncation enabled."""
    texts = examples["text"]
    return tokenizer(texts, truncation=True)
27
+
28
+
29
def main():
    """Set up a two-label DistilBERT classifier on IMDB and push it to the Hub.

    Steps performed, in order:
      1. Load the dataset with ``load_data()``.
      2. Tokenize it with the module-level ``preprocess_function``.
      3. Create a sequence-classification model from
         "distilbert-base-uncased" with NEGATIVE/POSITIVE labels.
      4. Build ``TrainingArguments`` and a ``Trainer``.
      5. Call ``trainer.push_to_hub()``.

    Returns nothing; the function is run for its side effects.
    """
    # Kept function-local, as in the original layout of this file.
    from transformers import (
        AutoModelForSequenceClassification,
        Trainer,
        TrainingArguments,
    )

    imdb = load_data()

    # Reuse the module-level ``tokenizer`` and ``preprocess_function`` rather
    # than re-creating identical copies here (the previous version loaded the
    # same tokenizer a second time and shadowed both names).
    tokenized_imdb = imdb.map(preprocess_function, batched=True)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    id2label = {0: "NEGATIVE", 1: "POSITIVE"}
    # Derive the inverse mapping so the two dictionaries cannot drift apart.
    label2id = {label: idx for idx, label in id2label.items()}

    model = AutoModelForSequenceClassification.from_pretrained(
        "distilbert-base-uncased",
        num_labels=len(id2label),
        id2label=id2label,
        label2id=label2id,
    )

    # TODO: FP16, multi-GPU, accelerator.
    # TODO: is it possible to continue training (resume from a checkpoint)?

    training_args = TrainingArguments(
        output_dir="./",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=2,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_imdb["train"],
        eval_dataset=tokenized_imdb["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # NOTE(review): the ``trainer.train()`` call is intentionally left
    # disabled, exactly as before, so this run pushes without training first.
    # Confirm that is still the desired behaviour before re-enabling it.
    trainer.push_to_hub()
80
+
81
+
82
# Script entry point: run the pipeline only when executed directly.
if __name__ == "__main__":
    main()