# ==============================
# Train ALBERT: fine-tune `ckiplab/albert-tiny-chinese` for sequence
# classification and push the resulting model to the Hugging Face Hub.
# ==============================
import os

import numpy as np
from datasets import load_dataset
from huggingface_hub import HfApi, HfFolder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# Log in to Hugging Face.
# Prefer the conventional HF_TOKEN variable; fall back to TOGETHER_API_KEY,
# which is where earlier versions of this script read the token from.
# A KeyError is raised if neither variable is set.
hf_token = os.environ.get("HF_TOKEN") or os.environ["TOGETHER_API_KEY"]
HfFolder.save_token(hf_token)

# Hub repository the fine-tuned model is pushed to.
# Alternative used previously: "picard47at/tuned-albert-tiny"
push_to_hub_model_id = "picard47at/tunned_albert_model2"

# 1. Load the dataset
# Alternative used previously: "picard47at/dataset2"
dataset_name = "Luigi/dinercall-intent"
try:
    dataset = load_dataset(dataset_name)
    print(f"Dataset '{dataset_name}' loaded successfully.")
    print(dataset)
except Exception as e:
    print(f"Error loading dataset '{dataset_name}': {e}")
    # Exit with a non-zero status so callers/CI can detect the failure.
    raise SystemExit(1)

# Ensure the dataset has 'train' and optionally 'validation' splits
if 'train' not in dataset:
    print("Error: The dataset must contain a 'train' split.")
    raise SystemExit(1)

# If a validation split doesn't exist, create one
if 'validation' not in dataset:
    print("Warning: The dataset does not have a 'validation' split. Creating one from the training data.")
    # Fixed seed so the held-out 10% is the same on every run.
    dataset = dataset['train'].train_test_split(test_size=0.1, seed=42)
    dataset['validation'] = dataset.pop('test')
    print(dataset)

# Assuming your dataset has a 'text' column for the input and a 'label'
# column for the target
text_column = "text"  # Adjust if your text column has a different name
label_column = "label"  # Adjust if your label column has a different name

# The classification head size and the label mappings are derived from the
# label column's class names, so the column must expose `.names`.
label_feature = dataset['train'].features.get(label_column)
label_names = getattr(label_feature, "names", None)
if not label_names:
    print(f"Error: column '{label_column}' must be a ClassLabel feature with named classes.")
    raise SystemExit(1)
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

# 2. Load the tokenizer and model
checkpoint = "ckiplab/albert-tiny-chinese"
try:
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint,
        num_labels=len(label_names),
        # Store readable class names in the model config so the pushed model
        # reports them instead of generic LABEL_<n> names.
        id2label=id2label,
        label2id=label2id,
    )
    print(f"Tokenizer and model '{checkpoint}' loaded successfully.")
except Exception as e:
    print(f"Error loading tokenizer or model '{checkpoint}': {e}")
    raise SystemExit(1)


# 3. Preprocess the dataset
def tokenize_function(examples):
    """Tokenize a batch of examples, truncating over-long inputs.

    Args:
        examples: A batch from the dataset; only ``examples[text_column]``
            is read.

    Returns:
        The tokenizer output for the batch. No padding is applied here.
    """
    return tokenizer(examples[text_column], truncation=True)


tokenized_datasets = dataset.map(tokenize_function, batched=True)

# 4. Define training arguments
output_dir = "./albert-tiny-chinese-finetuned2"
batch_size = 16
num_epochs = 100

# Log / evaluate / save roughly five times per epoch (assuming one device and
# no gradient accumulation). Clamp to at least 1: with fewer than
# 5 * batch_size training examples the integer division would yield 0, which
# is not a usable step interval.
logging_steps = max(1, len(tokenized_datasets["train"]) // (5 * batch_size))
save_steps = logging_steps  # Save at every logging step
eval_steps = logging_steps

# `load_best_model_at_end` requires the evaluation and save strategies to
# match, so both are "steps" and `save_steps` equals `eval_steps`; every
# evaluation therefore has a corresponding checkpoint.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` (and Trainer's `tokenizer` to `processing_class`) -- confirm
# against the installed version.
training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="steps",
    save_strategy="steps",
    logging_steps=logging_steps,
    save_steps=save_steps,
    eval_steps=eval_steps,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    push_to_hub=True,
    hub_model_id=push_to_hub_model_id,
    save_total_limit=1,  # Limit how many checkpoints are kept on disk
)


# 5. Define a function to compute metrics
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged precision/recall/F1.

    Args:
        eval_pred: Evaluation output exposing ``predictions`` (per-class
            scores; argmax is taken over the last axis) and ``label_ids``.

    Returns:
        A dict with the keys ``accuracy``, ``precision``, ``recall`` and
        ``f1``.
    """
    predictions = np.argmax(eval_pred.predictions, axis=-1)
    labels = eval_pred.label_ids
    accuracy = accuracy_score(labels, predictions)
    # zero_division=0 keeps the score at 0 for classes with no predicted or
    # true samples, without emitting a warning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='macro', zero_division=0
    )
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }


# 6. Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# 7. Train the model
print("Starting training...")
trainer.train()
print("Training finished!")

# 8. Evaluate the model
print("Evaluating the model...")
evaluation_results = trainer.evaluate()
print(evaluation_results)

# 9. Save the fine-tuned model
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Fine-tuned model and tokenizer saved to '{output_dir}'.")

# 10. Push to Hub
trainer.push_to_hub()
print(f"Model pushed to Hugging Face Hub: {push_to_hub_model_id}")