nicholasKluge committed on
Commit 184eab2
1 Parent(s): bcc4ec8

Update README.md

Files changed (1)
  1. README.md +95 -1
README.md CHANGED
@@ -12,4 +12,98 @@ tags:
  - sentiment-analysis
  ---

- ##
+ ```python
+ # IMDB
+ ! pip install transformers datasets evaluate accelerate -q
+
+ import evaluate
+ import numpy as np
+ from huggingface_hub import login
+ from datasets import load_dataset, Dataset, DatasetDict
+ from transformers import AutoTokenizer, DataCollatorWithPadding
+ from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
+
+ # Basic fine-tuning arguments
+ token="your_token"
+ task="christykoh/imdb_pt"
+ model_name="neuralmind/bert-base-portuguese-cased"
+ output_dir="checkpoint"
+ learning_rate=4e-5
+ per_device_train_batch_size=32
+ per_device_eval_batch_size=32
+ num_train_epochs=3
+ weight_decay=0.01
+ evaluation_strategy="epoch"
+ save_strategy="epoch"
+ hub_model_id="nicholasKluge/Teeny-tiny-llama-162m-imdb"
+
+ # Log in to the Hub to load and push
+ login(token=token)
+
+ # Load the task
+ dataset = load_dataset(task)
+
+ # Create a `ModelForSequenceClassification`
+ model = AutoModelForSequenceClassification.from_pretrained(
+     model_name,
+     num_labels=2,
+     id2label={0: "NEGATIVE", 1: "POSITIVE"},
+     label2id={"NEGATIVE": 0, "POSITIVE": 1}
+ )
+
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+ # If the model does not have a pad_token, we need to add it
+ # tokenizer.pad_token = tokenizer.eos_token
+ # model.config.pad_token_id = model.config.eos_token_id
+
+ # Preprocess the dataset
+ def preprocess_function(examples):
+     return tokenizer(examples["text"], truncation=True, max_length=256)
+
+ dataset_tokenized = dataset.map(preprocess_function, batched=True)
+
+ # Create a simple data collator
+ data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+
+ # Use accuracy as the evaluation metric
+ accuracy = evaluate.load("accuracy")
+
+ # Function to compute accuracy
+ def compute_metrics(eval_pred):
+     predictions, labels = eval_pred
+     predictions = np.argmax(predictions, axis=1)
+     return accuracy.compute(predictions=predictions, references=labels)
+
+ # Define training arguments
+ training_args = TrainingArguments(
+     output_dir=output_dir,
+     learning_rate=learning_rate,
+     per_device_train_batch_size=per_device_train_batch_size,
+     per_device_eval_batch_size=per_device_eval_batch_size,
+     num_train_epochs=num_train_epochs,
+     weight_decay=weight_decay,
+     evaluation_strategy=evaluation_strategy,
+     save_strategy=save_strategy,
+     load_best_model_at_end=True,
+     push_to_hub=False,
+     hub_token=token,
+     hub_private_repo=True,
+     hub_model_id=hub_model_id,
+     tf32=False,
+ )
+
+ # Define the Trainer
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=dataset_tokenized["train"],
+     eval_dataset=dataset_tokenized["test"],
+     tokenizer=tokenizer,
+     data_collator=data_collator,
+     compute_metrics=compute_metrics,
+ )
+
+ # Train!
+ trainer.train()
+ ```
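A quick sanity check after training can be useful. The sketch below is not part of the commit: it assumes the objects defined in the script above (`trainer`, `tokenizer`, and the `id2label` mapping) are still in scope after `trainer.train()` finishes, and the Portuguese review string is purely illustrative.

```python
# Minimal post-training sketch (not from the commit above): run the fine-tuned
# classifier on a sample review and, optionally, push it to the Hub.
from transformers import pipeline

# Wrap the fine-tuned model and tokenizer from the script above in a pipeline.
classifier = pipeline(
    "text-classification",
    model=trainer.model,
    tokenizer=tokenizer,
)

# Illustrative example; labels follow the id2label mapping defined earlier.
print(classifier("Este filme é maravilhoso!"))
# -> e.g. [{'label': 'POSITIVE', 'score': ...}]

# The TrainingArguments above set push_to_hub=False, so uploading the model to
# the configured hub_model_id stays a separate, explicit step if desired:
# trainer.push_to_hub()
```

Passing the in-memory `trainer.model` avoids reloading a saved checkpoint from disk; loading the best checkpoint from `output_dir` with `AutoModelForSequenceClassification.from_pretrained` would work as well.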