# coding=utf-8
# Copyright 2022 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Fine-tuning the library models for sequence classification."""

import argparse
import dataclasses
import json
import logging
import math
import os
import random
import shutil
from typing import List, Optional

import datasets
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset, load_metric
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

from transformers import (
    AdamW,
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    default_data_collator,
    get_scheduler,
    set_seed,
)
from transformers.file_utils import ExplicitEnum
from transformers.trainer_utils import IntervalStrategy


logger = logging.getLogger(__name__)

class Split(ExplicitEnum):
    TRAIN = "train"
    EVAL = "eval"
    TEST = "test"
    INFER = "infer"

@dataclasses.dataclass
class FTModelArguments:
    """Arguments pertaining to which config/tokenizer/model we are going to fine-tune from."""

    model_name_or_path: str = dataclasses.field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models."}
    )
    use_fast_tokenizer: Optional[bool] = dataclasses.field(
        default=True,
        metadata={"help": "Whether to use one of the fast tokenizers (backed by the tokenizers library) or not."},
    )
    cache_dir: Optional[str] = dataclasses.field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co."},
    )

@dataclasses.dataclass
class FTDataArguments:
    """Arguments pertaining to what data we are going to input our model for training and evaluation."""

    train_file: str = dataclasses.field(
        default=None, metadata={"help": "A csv or a json file containing the training data."}
    )
    eval_file: Optional[str] = dataclasses.field(
        default=None, metadata={"help": "A csv or a json file containing the validation data."}
    )
    test_file: Optional[str] = dataclasses.field(
        default=None, metadata={"help": "A csv or a json file containing the test data."}
    )
    infer_file: Optional[str] = dataclasses.field(
        default=None, metadata={"help": "A csv or a json file containing the data to predict on."}
    )
    task_name: Optional[str] = dataclasses.field(
        default=None,
        metadata={"help": "The name of the task to train on."},
    )
    label_list: Optional[List[str]] = dataclasses.field(
        default=None, metadata={"help": "The list of labels for the task."}
    )
    max_length: Optional[int] = dataclasses.field(
        default=128,
        metadata={
            "help": (
                "The maximum total input sequence length after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded."
            )
        },
    )
    pad_to_max_length: Optional[bool] = dataclasses.field(
        default=False,
        metadata={
            "help": (
                "Whether to pad all samples to `max_length`. "
                "If False, will pad the samples dynamically when batching to the maximum length in the batch."
            )
        },
    )

@dataclasses.dataclass
class FTTrainingArguments:
    """Training arguments pertaining to the training loop itself."""

    output_dir: str = dataclasses.field(
        metadata={"help": "The output directory where the model predictions and checkpoints will be written."}
    )
    do_train: Optional[bool] = dataclasses.field(
        default=False,
        metadata={"help": "Whether to run training or not."},
    )
    do_eval: Optional[bool] = dataclasses.field(
        default=False,
        metadata={"help": "Whether to run evaluation on the validation set or not."},
    )
    do_predict: Optional[bool] = dataclasses.field(
        default=False,
        metadata={"help": "Whether to run inference on the inference set or not."},
    )
    seed: Optional[int] = dataclasses.field(
        default=42,
        metadata={"help": "Random seed that will be set at the beginning of training."},
    )
    per_device_train_batch_size: Optional[int] = dataclasses.field(
        default=8,
        metadata={"help": "The batch size per GPU/TPU core/CPU for training."},
    )
    per_device_eval_batch_size: Optional[int] = dataclasses.field(
        default=8,
        metadata={"help": "The batch size per GPU/TPU core/CPU for evaluation."},
    )
    weight_decay: Optional[float] = dataclasses.field(
        default=0.0,
        metadata={
            "help": (
                "The weight decay to apply (if not zero) to all layers except all bias and LayerNorm weights in"
                " [`AdamW`] optimizer."
            )
        },
    )
    learning_rate: Optional[float] = dataclasses.field(
        default=5e-5,
        metadata={"help": "The initial learning rate for [`AdamW`] optimizer."},
    )
    gradient_accumulation_steps: Optional[int] = dataclasses.field(
        default=1,
        metadata={
            "help": (
                "Number of update steps to accumulate the gradients for, before performing a backward/update pass."
            )
        },
    )
    max_steps: Optional[int] = dataclasses.field(
        default=-1,
        metadata={
            "help": (
                "If set to a positive number, the total number of training steps to perform. Overrides"
                " `num_train_epochs`."
            )
        },
    )
    lr_scheduler_type: Optional[str] = dataclasses.field(
        default="linear", metadata={"help": "The scheduler type to use."}
    )
    warmup_steps: Optional[int] = dataclasses.field(
        default=1,
        metadata={
            "help": (
                "Number of steps used for a linear warmup from 0 to `learning_rate`. Overrides any effect of"
                " `warmup_ratio`."
            )
        },
    )
    evaluation_strategy: Optional[str] = dataclasses.field(
        default="no",
        metadata={
            "help": 'The evaluation strategy to adopt during training. Possible values are: ["no", "steps", "epoch"].'
        },
    )
    eval_steps: Optional[int] = dataclasses.field(
        default=1,
        metadata={"help": 'Number of update steps between two evaluations if `evaluation_strategy="steps"`.'},
    )
    eval_metric: Optional[str] = dataclasses.field(
        default="accuracy", metadata={"help": "The evaluation metric used for the task."}
    )
    keep_checkpoint_max: Optional[int] = dataclasses.field(
        default=1,
        metadata={"help": "The maximum number of best checkpoint files to keep."},
    )
    early_stopping_patience: Optional[int] = dataclasses.field(
        default=10,
        metadata={"help": "Number of evaluation calls with no improvement after which training will be stopped."},
    )
    early_stopping_threshold: Optional[float] = dataclasses.field(
        default=0.0,
        metadata={
            "help": "How much the specified evaluation metric must improve to satisfy early stopping conditions."
        },
    )

def train(args, accelerator, model, tokenizer, train_dataloader, optimizer, lr_scheduler, eval_dataloader=None):
    """Train a model on the given training data."""
    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
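    # `total_batch_size` is the number of examples consumed per optimizer update:
    # per-device batch size * number of processes * gradient accumulation steps.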
| logger.info("***** Running training *****") | |
| logger.info(" Num examples = %d", args.num_examples[Split.TRAIN.value]) | |
| logger.info(" Instantaneous batch size per device = %d", args.per_device_train_batch_size) | |
| logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d", total_batch_size) | |
| logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) | |
| logger.info(" Total optimization steps = %d", args.max_steps) | |
| # Only show the progress bar once on each machine. | |
| progress_bar = tqdm(range(args.max_steps), disable=not accelerator.is_local_main_process) | |
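    # Bookkeeping for checkpoint selection and early stopping: `checkpoints` and
    # `eval_results` track the saved checkpoints and their metric values, while the
    # patience counter stops training after `early_stopping_patience` evaluations
    # without improvement over `best_eval_result`.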
    checkpoints = None
    eval_results = None
    best_checkpoint = None
    best_eval_result = None
    early_stopping_patience_counter = 0
    should_training_stop = False
    epoch = 0
    completed_steps = 0
    train_loss = 0.0
    model.zero_grad()

    for _ in range(args.num_train_epochs):
        epoch += 1
        model.train()
        for step, batch in enumerate(train_dataloader):
            outputs = model(**batch)
            loss = outputs.loss
            loss = loss / args.gradient_accumulation_steps
            accelerator.backward(loss)
            train_loss += loss.item()

            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
                completed_steps += 1

                # Evaluate during training
                if (
                    eval_dataloader is not None
                    and args.evaluation_strategy == IntervalStrategy.STEPS.value
                    and args.eval_steps > 0
                    and completed_steps % args.eval_steps == 0
                ):
                    accelerator.wait_for_everyone()
                    new_checkpoint = f"checkpoint-{IntervalStrategy.STEPS.value}-{completed_steps}"
                    new_eval_result = evaluate(args, accelerator, eval_dataloader, "eval", model, new_checkpoint)[
                        args.eval_metric
                    ]
                    logger.info(
                        "Evaluation result at step %d: %s = %f", completed_steps, args.eval_metric, new_eval_result
                    )
                    if checkpoints is None:
                        checkpoints = np.array([new_checkpoint])
                        eval_results = np.array([new_eval_result])
                        best_checkpoint = new_checkpoint
                        best_eval_result = new_eval_result
                    else:
                        if new_eval_result - best_eval_result > args.early_stopping_threshold:
                            best_checkpoint = new_checkpoint
                            best_eval_result = new_eval_result
                            early_stopping_patience_counter = 0
                        else:
                            if new_eval_result == best_eval_result:
                                best_checkpoint = new_checkpoint
                                best_eval_result = new_eval_result
                            early_stopping_patience_counter += 1

                        if early_stopping_patience_counter >= args.early_stopping_patience:
                            should_training_stop = True

                        checkpoints = np.append(checkpoints, [new_checkpoint], axis=0)
                        eval_results = np.append(eval_results, [new_eval_result], axis=0)
                        sorted_ids = np.argsort(eval_results)
                        eval_results = eval_results[sorted_ids]
                        checkpoints = checkpoints[sorted_ids]
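                    # `checkpoints`/`eval_results` are kept sorted by metric in ascending
                    # order, so the first entry is always the worst-scoring checkpoint and
                    # is the one pruned when `keep_checkpoint_max` is exceeded.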
                    if len(checkpoints) > args.keep_checkpoint_max:
                        # Delete the current worst checkpoint
                        checkpoint_to_remove, *checkpoints = checkpoints
                        eval_results = eval_results[1:]
                        if checkpoint_to_remove != new_checkpoint:
                            if accelerator.is_main_process:
                                shutil.rmtree(os.path.join(args.output_dir, checkpoint_to_remove), ignore_errors=True)
                            accelerator.wait_for_everyone()

                    if new_checkpoint in checkpoints:
                        # Save model checkpoint
                        checkpoint_output_dir = os.path.join(args.output_dir, new_checkpoint)
                        if accelerator.is_main_process:
                            if not os.path.exists(checkpoint_output_dir):
                                os.makedirs(checkpoint_output_dir)
                        accelerator.wait_for_everyone()
                        unwrapped_model = accelerator.unwrap_model(model)
                        unwrapped_model.save_pretrained(checkpoint_output_dir, save_function=accelerator.save)
                        if accelerator.is_main_process:
                            tokenizer.save_pretrained(checkpoint_output_dir)
                            logger.info("Saving model checkpoint to %s", checkpoint_output_dir)

            if completed_steps >= args.max_steps:
                break

            if should_training_stop:
                break

        # Evaluate during training
        if eval_dataloader is not None and args.evaluation_strategy == IntervalStrategy.EPOCH.value:
            accelerator.wait_for_everyone()
            new_checkpoint = f"checkpoint-{IntervalStrategy.EPOCH.value}-{epoch}"
            new_eval_result = evaluate(args, accelerator, eval_dataloader, "eval", model, new_checkpoint)[
                args.eval_metric
            ]
            logger.info("Evaluation result at epoch %d: %s = %f", epoch, args.eval_metric, new_eval_result)

            if checkpoints is None:
                checkpoints = np.array([new_checkpoint])
                eval_results = np.array([new_eval_result])
                best_checkpoint = new_checkpoint
                best_eval_result = new_eval_result
            else:
                if new_eval_result - best_eval_result > args.early_stopping_threshold:
                    best_checkpoint = new_checkpoint
                    best_eval_result = new_eval_result
                    early_stopping_patience_counter = 0
                else:
                    if new_eval_result == best_eval_result:
                        best_checkpoint = new_checkpoint
                        best_eval_result = new_eval_result
                    early_stopping_patience_counter += 1

                if early_stopping_patience_counter >= args.early_stopping_patience:
                    should_training_stop = True

                checkpoints = np.append(checkpoints, [new_checkpoint], axis=0)
                eval_results = np.append(eval_results, [new_eval_result], axis=0)
                sorted_ids = np.argsort(eval_results)
                eval_results = eval_results[sorted_ids]
                checkpoints = checkpoints[sorted_ids]

            if len(checkpoints) > args.keep_checkpoint_max:
                # Delete the current worst checkpoint
                checkpoint_to_remove, *checkpoints = checkpoints
                eval_results = eval_results[1:]
                if checkpoint_to_remove != new_checkpoint:
                    if accelerator.is_main_process:
                        shutil.rmtree(os.path.join(args.output_dir, checkpoint_to_remove), ignore_errors=True)
                    accelerator.wait_for_everyone()

            if new_checkpoint in checkpoints:
                # Save model checkpoint
                checkpoint_output_dir = os.path.join(args.output_dir, new_checkpoint)
                if accelerator.is_main_process:
                    if not os.path.exists(checkpoint_output_dir):
                        os.makedirs(checkpoint_output_dir)
                accelerator.wait_for_everyone()
                unwrapped_model = accelerator.unwrap_model(model)
                unwrapped_model.save_pretrained(checkpoint_output_dir, save_function=accelerator.save)
                if accelerator.is_main_process:
                    tokenizer.save_pretrained(checkpoint_output_dir)
                    logger.info("Saving model checkpoint to %s", checkpoint_output_dir)

        if completed_steps >= args.max_steps:
            break

        if should_training_stop:
            break

    if best_checkpoint is not None:
        # Save the best checkpoint
        logger.info("Best checkpoint: %s", best_checkpoint)
        logger.info("Best evaluation result: %s = %f", args.eval_metric, best_eval_result)
        best_checkpoint_output_dir = os.path.join(args.output_dir, best_checkpoint)
        if accelerator.is_main_process:
            shutil.move(best_checkpoint_output_dir, os.path.join(args.output_dir, "best-checkpoint"))
            shutil.rmtree(best_checkpoint_output_dir, ignore_errors=True)
        accelerator.wait_for_everyone()
    else:
        # Assume that the last checkpoint is the best checkpoint and save it
        checkpoint_output_dir = os.path.join(args.output_dir, "best-checkpoint")
        if not os.path.exists(checkpoint_output_dir):
            os.makedirs(checkpoint_output_dir)

        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(checkpoint_output_dir, save_function=accelerator.save)
        if accelerator.is_main_process:
            tokenizer.save_pretrained(checkpoint_output_dir)
            logger.info("Saving model checkpoint to %s", checkpoint_output_dir)
    return completed_steps, train_loss / completed_steps

def evaluate(args, accelerator, dataloader, eval_set, model, checkpoint, has_labels=True, write_to_file=True):
    """Evaluate a model checkpoint on the given evaluation data."""
    num_examples = args.num_examples[eval_set]
    eval_metric = None
    completed_steps = 0
    eval_loss = 0.0
    all_predictions = None
    all_references = None
    all_probabilities = None

    if has_labels:
        # Get the metric function
        eval_metric = load_metric(args.eval_metric)

    eval_results = {}
    model.eval()
    for _, batch in enumerate(dataloader):
        with torch.no_grad():
            outputs = model(**batch)

        eval_loss += outputs.loss.item()
        logits = outputs.logits
        predictions = logits.argmax(dim=-1) if not args.is_regression else logits.squeeze()
        predictions = accelerator.gather(predictions)
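        # `accelerator.gather` concatenates the per-process tensors in distributed
        # evaluation; because the prepared dataloader may pad the last batch (e.g. by
        # repeating samples), the gathered outputs are truncated back to
        # `num_examples` before being written to disk below.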
        if all_predictions is None:
            all_predictions = predictions.detach().cpu().numpy()
        else:
            all_predictions = np.append(all_predictions, predictions.detach().cpu().numpy(), axis=0)

        if not args.is_regression:
            probabilities = logits.softmax(dim=-1).max(dim=-1).values
            probabilities = accelerator.gather(probabilities)
            if all_probabilities is None:
                all_probabilities = probabilities.detach().cpu().numpy()
            else:
                all_probabilities = np.append(all_probabilities, probabilities.detach().cpu().numpy(), axis=0)

        if has_labels:
            references = batch["labels"]
            references = accelerator.gather(references)
            if all_references is None:
                all_references = references.detach().cpu().numpy()
            else:
                all_references = np.append(all_references, references.detach().cpu().numpy(), axis=0)

            eval_metric.add_batch(
                predictions=predictions,
                references=references,
            )
        completed_steps += 1

    if has_labels:
        eval_results.update(eval_metric.compute())
        eval_results["completed_steps"] = completed_steps
        eval_results["avg_eval_loss"] = eval_loss / completed_steps

        if write_to_file:
            accelerator.wait_for_everyone()
            if accelerator.is_main_process:
                results_file = os.path.join(args.output_dir, f"{eval_set}_results_{checkpoint}.json")
                with open(results_file, "w") as f:
                    json.dump(eval_results, f, indent=4, sort_keys=True)

    if write_to_file:
        accelerator.wait_for_everyone()
        if accelerator.is_main_process:
            output_file = os.path.join(args.output_dir, f"{eval_set}_output_{checkpoint}.csv")
            if not args.is_regression:
                assert len(all_predictions) == len(all_probabilities)
                df = pd.DataFrame(list(zip(all_predictions, all_probabilities)), columns=["prediction", "probability"])
            else:
                df = pd.DataFrame(all_predictions, columns=["prediction"])
            df = df.head(num_examples)
            df.to_csv(output_file, header=True, index=False)
    return eval_results

def load_from_pretrained(args, pretrained_model_name_or_path):
    """Load the pretrained model and tokenizer."""

    # In distributed training, the .from_pretrained methods guarantee that only
    # one local process can concurrently perform this procedure.

    config = AutoConfig.from_pretrained(
        pretrained_model_name_or_path,
        num_labels=args.num_labels if hasattr(args, "num_labels") else None,
        finetuning_task=args.task_name.lower(),
        cache_dir=args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        pretrained_model_name_or_path, use_fast=args.use_fast_tokenizer, cache_dir=args.cache_dir
    )
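    # `ignore_mismatched_sizes=True` below lets the sequence-classification head be
    # re-initialized when the loaded checkpoint was saved with a different number of labels.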
    model = AutoModelForSequenceClassification.from_pretrained(
        pretrained_model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
        ignore_mismatched_sizes=True,
        cache_dir=args.cache_dir,
    )
    return config, tokenizer, model

def finetune(accelerator, model_name_or_path, train_file, output_dir, **kwargs):
    """Fine-tuning a pre-trained model on a downstream task.

    Args:
      accelerator: An instance of an accelerator for distributed training (on
        multi-GPU, TPU) or mixed precision training.
      model_name_or_path: Path to pretrained model or model identifier from
        huggingface.co/models.
      train_file: A csv or a json file containing the training data.
      output_dir: The output directory where the model predictions and checkpoints
        will be written.
      **kwargs: Dictionary of key/value pairs with which to update the
        configuration object after loading. The values in kwargs of any keys which
        are configuration attributes will be used to override the loaded values.
    """
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state)

    # Set up logging; we only want one process per machine to log things on the
    # screen. accelerator.is_local_main_process is only True for one process per
    # machine.
    logger.setLevel(logging.INFO if accelerator.is_local_main_process else logging.ERROR)

    model_args = FTModelArguments(model_name_or_path=model_name_or_path)
    data_args = FTDataArguments(train_file=train_file)
    training_args = FTTrainingArguments(output_dir=output_dir)
    args = argparse.Namespace()

    for arg_class in (model_args, data_args, training_args):
        for key, value in vars(arg_class).items():
            setattr(args, key, value)

    for key, value in kwargs.items():
        if hasattr(args, key):
            setattr(args, key, value)

    # Sanity checks
    data_files = {}
    args.data_file_extension = None

    # You need to provide the training data as we always run training
    args.do_train = True
    assert args.train_file is not None
    data_files[Split.TRAIN.value] = args.train_file

    if args.do_eval or args.evaluation_strategy != IntervalStrategy.NO.value:
        assert args.eval_file is not None
        data_files[Split.EVAL.value] = args.eval_file

    if args.do_eval and args.test_file is not None:
        data_files[Split.TEST.value] = args.test_file

    if args.do_predict:
        assert args.infer_file is not None
        data_files[Split.INFER.value] = args.infer_file

    for key in data_files:
        extension = data_files[key].split(".")[-1]
        assert extension in ["csv", "json"], f"`{key}_file` should be a csv or a json file."
        if args.data_file_extension is None:
            args.data_file_extension = extension
        else:
            assert extension == args.data_file_extension, f"`{key}_file` should be a {args.data_file_extension} file."

    assert (
        args.eval_metric in datasets.list_metrics()
    ), f"{args.eval_metric} not in the list of supported metrics {datasets.list_metrics()}."

    # Handle the output directory creation
    if accelerator.is_main_process:
        if args.output_dir is not None:
            os.makedirs(args.output_dir, exist_ok=True)
    accelerator.wait_for_everyone()

    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)

    # You need to provide your CSV/JSON data files.
    #
    # For CSV/JSON files, this script will use as labels the column called 'label'
    # and as pair of sentences the sentences in columns called 'sentence1' and
    # 'sentence2' if these columns exist or the first two columns not named
    # 'label' if at least two columns are provided.
    #
    # If the CSVs/JSONs contain only one non-label column, the script does single
    # sentence classification on this single column.
    #
    # In distributed training, the load_dataset function guarantees that only one
    # local process can download the dataset.

    # Loading the dataset from local csv or json files.
    raw_datasets = load_dataset(args.data_file_extension, data_files=data_files)

    # Labels
    is_regression = raw_datasets[Split.TRAIN.value].features["label"].dtype in ["float32", "float64"]
    args.is_regression = is_regression

    if args.is_regression:
        label_list = None
        num_labels = 1
    else:
        label_list = args.label_list
        assert label_list is not None
        label_list.sort()  # Let's sort it for determinism
        num_labels = len(label_list)
    args.num_labels = num_labels

    # Load pre-trained model
    config, tokenizer, model = load_from_pretrained(args, args.model_name_or_path)

    # Preprocessing the datasets
    non_label_column_names = [name for name in raw_datasets[Split.TRAIN.value].column_names if name != "label"]
    if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names:
        sentence1_key, sentence2_key = "sentence1", "sentence2"
    else:
        if len(non_label_column_names) >= 2:
            sentence1_key, sentence2_key = non_label_column_names[:2]
        else:
            sentence1_key, sentence2_key = non_label_column_names[0], None

    label_to_id = {v: i for i, v in enumerate(label_list)}
    config.label2id = label_to_id
    config.id2label = {id: label for label, id in config.label2id.items()}
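    # For classification, `label_list` was sorted above, so the label-to-id mapping is
    # deterministic across runs and processes.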
    padding = "max_length" if args.pad_to_max_length else False

    def preprocess_function(examples):
        # Tokenize the texts
        texts = (
            (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])
        )
        result = tokenizer(*texts, padding=padding, max_length=args.max_length, truncation=True)

        if "label" in examples:
            if label_to_id is not None:
                # Map labels to IDs (not necessary for GLUE tasks)
                result["labels"] = [label_to_id[l] for l in examples["label"]]
            else:
                # In all cases, rename the column to labels because the model will
                # expect that.
                result["labels"] = examples["label"]
        return result

    with accelerator.main_process_first():
        processed_datasets = raw_datasets.map(
            preprocess_function,
            batched=True,
            remove_columns=raw_datasets[Split.TRAIN.value].column_names,
            desc="Running tokenizer on dataset",
        )

    num_examples = {}
    splits = [s.value for s in Split]
    for split in splits:
        if split in processed_datasets:
            num_examples[split] = len(processed_datasets[split])
    args.num_examples = num_examples

    train_dataset = processed_datasets[Split.TRAIN.value]
    eval_dataset = processed_datasets[Split.EVAL.value] if Split.EVAL.value in processed_datasets else None
    test_dataset = processed_datasets[Split.TEST.value] if Split.TEST.value in processed_datasets else None
    infer_dataset = processed_datasets[Split.INFER.value] if Split.INFER.value in processed_datasets else None

    # Log a few random samples from the training set:
    for index in random.sample(range(len(train_dataset)), 3):
        logger.info("Sample %d of the training set: %s.", index, train_dataset[index])

    # DataLoaders creation:
    if args.pad_to_max_length:
        # If padding was already done to max length, we use the default data
        # collator that will just convert everything to tensors.
        data_collator = default_data_collator
    else:
        # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by
        # padding to the maximum length of the samples passed). When using mixed
        # precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple of
        # 8s, which will enable the use of Tensor Cores on NVIDIA hardware with
        # compute capability >= 7.5 (Volta).
        data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None))

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=args.per_device_train_batch_size,
        shuffle=True,
        collate_fn=data_collator,
    )
    eval_dataloader, test_dataloader, infer_dataloader = None, None, None

    if eval_dataset is not None:
        eval_dataloader = DataLoader(
            eval_dataset, batch_size=args.per_device_eval_batch_size, collate_fn=data_collator
        )

    if test_dataset is not None:
        test_dataloader = DataLoader(
            test_dataset, batch_size=args.per_device_eval_batch_size, collate_fn=data_collator
        )

    if infer_dataset is not None:
        infer_dataloader = DataLoader(
            infer_dataset, batch_size=args.per_device_eval_batch_size, collate_fn=data_collator
        )

    # Optimizer
    # Split weights in two groups, one with weight decay and the other not.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    # Prepare everything with our `accelerator`.
    model, optimizer, train_dataloader, eval_dataloader, test_dataloader, infer_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader, test_dataloader, infer_dataloader
    )

    # Note -> the training dataloader needs to be prepared before we grab its
    # length below (because its length will be shorter in a multi-process setup).

    # Scheduler and math around the number of training steps.
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_steps == -1:
        args.max_steps = args.num_train_epochs * num_update_steps_per_epoch
    else:
        args.num_train_epochs = math.ceil(args.max_steps / num_update_steps_per_epoch)
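    # For example, with 1,000 training batches per process and
    # gradient_accumulation_steps=4, there are ceil(1000 / 4) = 250 update steps per
    # epoch; max_steps=1000 then implies num_train_epochs = ceil(1000 / 250) = 4.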

    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=args.max_steps,
    )

    # Train
    completed_steps, avg_train_loss = train(
        args, accelerator, model, tokenizer, train_dataloader, optimizer, lr_scheduler, eval_dataloader
    )
    accelerator.wait_for_everyone()
    logger.info("Training job completed: completed_steps = %d, avg_train_loss = %f", completed_steps, avg_train_loss)

    args.model_name_or_path = os.path.join(args.output_dir, "best-checkpoint")
    logger.info("Loading the best checkpoint: %s", args.model_name_or_path)
    config, tokenizer, model = load_from_pretrained(args, args.model_name_or_path)
    model = accelerator.prepare(model)

    if args.do_eval:
        # Evaluate
        if eval_dataloader is not None:
            logger.info("***** Running evaluation on the eval data using the best checkpoint *****")
            eval_results = evaluate(args, accelerator, eval_dataloader, Split.EVAL.value, model, "best-checkpoint")
            avg_eval_loss = eval_results["avg_eval_loss"]
            eval_metric = eval_results[args.eval_metric]
            logger.info("Evaluation job completed: avg_eval_loss = %f", avg_eval_loss)
            logger.info("Evaluation result for the best checkpoint: %s = %f", args.eval_metric, eval_metric)

        if test_dataloader is not None:
            logger.info("***** Running evaluation on the test data using the best checkpoint *****")
            eval_results = evaluate(args, accelerator, test_dataloader, Split.TEST.value, model, "best-checkpoint")
            avg_eval_loss = eval_results["avg_eval_loss"]
            eval_metric = eval_results[args.eval_metric]
            logger.info("Test job completed: avg_test_loss = %f", avg_eval_loss)
            logger.info("Test result for the best checkpoint: %s = %f", args.eval_metric, eval_metric)

    if args.do_predict:
        # Predict
        if infer_dataloader is not None:
            logger.info("***** Running inference using the best checkpoint *****")
            evaluate(
                args, accelerator, infer_dataloader, Split.INFER.value, model, "best-checkpoint", has_labels=False
            )
            logger.info("Inference job completed.")

    # Release all references to the internal objects stored and call the garbage
    # collector. You should call this method between two trainings with different
    # models/optimizers.
    accelerator.free_memory()
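

# A minimal usage sketch (illustrative only; the model identifier, file names, task
# name, and label values below are placeholders, and `finetune` is normally driven by
# the self-training loop that accompanies this script):
#
#   from accelerate import Accelerator
#   from finetuning import finetune
#
#   accelerator = Accelerator()
#   finetune(
#       accelerator,
#       model_name_or_path="bert-base-uncased",
#       train_file="train.csv",          # must contain a "label" column
#       output_dir="./finetune_output",
#       task_name="my_task",
#       label_list=["0", "1"],
#       do_eval=True,
#       eval_file="eval.csv",
#       evaluation_strategy="steps",
#       eval_steps=100,
#       max_steps=1000,
#   )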