from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)
import numpy as np
import evaluate

# Accuracy metric reported during evaluation.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)


def load_data():
    # Load the IMDB sentiment dataset (train/test splits).
    return load_dataset("imdb")


tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def preprocess_function(examples):
    # Tokenize the review text; padding is handled later by the data collator.
    return tokenizer(examples["text"], truncation=True)


def main():
    imdb = load_data()
    tokenized_imdb = imdb.map(preprocess_function, batched=True)

    # Dynamically pad each batch to the longest sequence in that batch.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    id2label = {0: "NEGATIVE", 1: "POSITIVE"}
    label2id = {"NEGATIVE": 0, "POSITIVE": 1}

    model = AutoModelForSequenceClassification.from_pretrained(
        "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
    )

    # TODO: FP16, multi-GPU, accelerate; is it possible to continue training?
    training_args = TrainingArguments(
        output_dir="./",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=2,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_imdb["train"],
        eval_dataset=tokenized_imdb["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Fine-tune, then push the best checkpoint to the Hugging Face Hub.
    trainer.train()
    trainer.push_to_hub()


if __name__ == "__main__":
    main()
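
# Regarding the TODO above (FP16 / multi-GPU / resuming training) — a minimal
# sketch, assuming a reasonably recent transformers install; treat the exact
# flags below as an assumption, they are not part of the original script:
#
#   training_args = TrainingArguments(
#       output_dir="./",
#       fp16=True,  # mixed-precision training on CUDA GPUs
#       ...,        # remaining arguments as above
#   )
#   # Trainer uses all visible GPUs on its own; for multi-process (DDP) training,
#   # start the script with `torchrun` or `accelerate launch` instead of `python`.
#   trainer.train(resume_from_checkpoint=True)  # resume from the latest
#                                               # checkpoint found in output_dir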