fgaim committed
Commit f3ab687
1 Parent(s): c4f527f

Add script

Files changed (1)
  1. run_flax_glue.py +526 -0
run_flax_glue.py ADDED
@@ -0,0 +1,526 @@
+ #!/usr/bin/env python
+ # coding=utf-8
+ # Copyright 2021 The HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Finetuning a 🤗 Flax Transformers model for sequence classification on GLUE."""
+ import argparse
+ import logging
+ import os
+ import random
+ import time
+ from itertools import chain
+ from typing import Any, Callable, Dict, Tuple
+
+ import datasets
+ from datasets import load_dataset, load_metric
+
+ import jax
+ import jax.numpy as jnp
+ import optax
+ import transformers
+ from flax import struct, traverse_util
+ from flax.jax_utils import replicate, unreplicate
+ from flax.metrics import tensorboard
+ from flax.training import train_state
+ from flax.training.common_utils import get_metrics, onehot, shard
+ from transformers import AutoConfig, AutoTokenizer, FlaxAutoModelForSequenceClassification, PretrainedConfig
+
+
+ logger = logging.getLogger(__name__)
+
+ Array = Any
+ Dataset = datasets.arrow_dataset.Dataset
+ PRNGKey = Any
+
+
+ task_to_keys = {
+     "cola": ("sentence", None),
+     "mnli": ("premise", "hypothesis"),
+     "mrpc": ("sentence1", "sentence2"),
+     "qnli": ("question", "sentence"),
+     "qqp": ("question1", "question2"),
+     "rte": ("sentence1", "sentence2"),
+     "sst2": ("sentence", None),
+     "swahili_news": ("text", None),
+     "stsb": ("sentence1", "sentence2"),
+     "wnli": ("sentence1", "sentence2"),
+ }
+
+
+ def parse_args():
+     parser = argparse.ArgumentParser(description="Finetune a transformers model on a text classification task")
+     parser.add_argument(
+         "--task_name",
+         type=str,
+         default=None,
+         help="The name of the glue task to train on.",
+         choices=list(task_to_keys.keys()),
+     )
+     parser.add_argument(
+         "--train_file", type=str, default=None, help="A csv or a json file containing the training data."
+     )
+     parser.add_argument(
+         "--validation_file", type=str, default=None, help="A csv or a json file containing the validation data."
+     )
+     parser.add_argument(
+         "--max_length",
+         type=int,
+         default=128,
+         help=(
+             "The maximum total input sequence length after tokenization. Sequences longer than this will be truncated,"
+             " sequences shorter will be padded."
+         ),
+     )
+     parser.add_argument(
+         "--model_name_or_path",
+         type=str,
+         help="Path to pretrained model or model identifier from huggingface.co/models.",
+         required=True,
+     )
+     parser.add_argument(
+         "--use_slow_tokenizer",
+         action="store_true",
+         help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
+     )
+     parser.add_argument(
+         "--per_device_train_batch_size",
+         type=int,
+         default=8,
+         help="Batch size (per device) for the training dataloader.",
+     )
+     parser.add_argument(
+         "--per_device_eval_batch_size",
+         type=int,
+         default=8,
+         help="Batch size (per device) for the evaluation dataloader.",
+     )
+     parser.add_argument(
+         "--learning_rate",
+         type=float,
+         default=5e-5,
+         help="Initial learning rate (after the potential warmup period) to use.",
+     )
+     parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.")
+     parser.add_argument("--num_train_epochs", type=int, default=3, help="Total number of training epochs to perform.")
+     parser.add_argument(
+         "--max_train_steps",
+         type=int,
+         default=None,
+         help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
+     )
+     parser.add_argument(
+         "--num_warmup_steps", type=int, default=0, help="Number of steps for the warmup in the lr scheduler."
+     )
+     parser.add_argument("--output_dir", type=str, default=None, help="Where to store the final model.")
+     parser.add_argument("--seed", type=int, default=3, help="A seed for reproducible training.")
+     parser.add_argument(
+         "--push_to_hub",
+         action="store_true",
+         help="If passed, model checkpoints and tensorboard logs will be pushed to the hub",
+     )
+     args = parser.parse_args()
+
+     # Sanity checks
+     if args.task_name is None and args.train_file is None and args.validation_file is None:
+         raise ValueError("Need either a task name or a training/validation file.")
+     else:
+         if args.train_file is not None:
+             extension = args.train_file.split(".")[-1]
+             assert extension in ["csv", "json"], "`train_file` should be a csv or a json file."
+         if args.validation_file is not None:
+             extension = args.validation_file.split(".")[-1]
+             assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."
+
+     if args.output_dir is not None:
+         os.makedirs(args.output_dir, exist_ok=True)
+
+     return args
+
+
+ def create_train_state(
+     model: FlaxAutoModelForSequenceClassification,
+     learning_rate_fn: Callable[[int], float],
+     is_regression: bool,
+     num_labels: int,
+     weight_decay: float,
+ ) -> train_state.TrainState:
+     """Create initial training state."""
+
+     class TrainState(train_state.TrainState):
+         """Train state with an Optax optimizer.
+
+         The two functions below differ depending on whether the task is classification
+         or regression.
+
+         Args:
+             logits_fn: Applied to last layer to obtain the logits.
+             loss_fn: Function to compute the loss.
+         """
+
+         logits_fn: Callable = struct.field(pytree_node=False)
+         loss_fn: Callable = struct.field(pytree_node=False)
+
+     # We use Optax's "masking" functionality to not apply weight decay
+     # to bias and LayerNorm scale parameters. decay_mask_fn returns a
+     # mask boolean with the same structure as the parameters.
+     # The mask is True for parameters that should be decayed.
+     def decay_mask_fn(params):
+         flat_params = traverse_util.flatten_dict(params)
+         flat_mask = {path: (path[-1] != "bias" and path[-2:] != ("LayerNorm", "scale")) for path in flat_params}
+         return traverse_util.unflatten_dict(flat_mask)
+
+     tx = optax.adamw(
+         learning_rate=learning_rate_fn, b1=0.9, b2=0.999, eps=1e-6, weight_decay=weight_decay, mask=decay_mask_fn
+     )
+
+     if is_regression:
+
+         def mse_loss(logits, labels):
+             return jnp.mean((logits[..., 0] - labels) ** 2)
+
+         return TrainState.create(
+             apply_fn=model.__call__,
+             params=model.params,
+             tx=tx,
+             logits_fn=lambda logits: logits[..., 0],
+             loss_fn=mse_loss,
+         )
+     else:  # Classification.
+
+         def cross_entropy_loss(logits, labels):
+             xentropy = optax.softmax_cross_entropy(logits, onehot(labels, num_classes=num_labels))
+             return jnp.mean(xentropy)
+
+         return TrainState.create(
+             apply_fn=model.__call__,
+             params=model.params,
+             tx=tx,
+             logits_fn=lambda logits: logits.argmax(-1),
+             loss_fn=cross_entropy_loss,
+         )
+
+
+ def create_learning_rate_fn(
+     train_ds_size: int, train_batch_size: int, num_train_epochs: int, num_warmup_steps: int, learning_rate: float
+ ) -> Callable[[int], jnp.array]:
+     """Returns a linear warmup, linear_decay learning rate function."""
+     steps_per_epoch = train_ds_size // train_batch_size
+     num_train_steps = steps_per_epoch * num_train_epochs
+     warmup_fn = optax.linear_schedule(init_value=0.0, end_value=learning_rate, transition_steps=num_warmup_steps)
+     decay_fn = optax.linear_schedule(
+         init_value=learning_rate, end_value=0, transition_steps=num_train_steps - num_warmup_steps
+     )
+     schedule_fn = optax.join_schedules(schedules=[warmup_fn, decay_fn], boundaries=[num_warmup_steps])
+     return schedule_fn
+
+
+
+ def glue_train_data_collator(rng: PRNGKey, dataset: Dataset, batch_size: int):
+     """Returns shuffled batches of size `batch_size` from truncated `train dataset`, sharded over all local devices."""
+     steps_per_epoch = len(dataset) // batch_size
+     perms = jax.random.permutation(rng, len(dataset))
+     perms = perms[: steps_per_epoch * batch_size]  # Skip incomplete batch.
+     perms = perms.reshape((steps_per_epoch, batch_size))
+
+     for perm in perms:
+         batch = dataset[perm]
+         batch = {k: jnp.array(v) for k, v in batch.items()}
+         batch = shard(batch)
+
+         yield batch
+
+
+ def glue_eval_data_collator(dataset: Dataset, batch_size: int):
+     """Returns batches of size `batch_size` from `eval dataset`, sharded over all local devices."""
+     for i in range(len(dataset) // batch_size):
+         batch = dataset[i * batch_size : (i + 1) * batch_size]
+         batch = {k: jnp.array(v) for k, v in batch.items()}
+         batch = shard(batch)
+
+         yield batch
+
+ def main():
+     args = parse_args()
+
+     # Make one log on every process with the configuration for debugging.
+     logging.basicConfig(
+         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+         datefmt="%m/%d/%Y %H:%M:%S",
+         level=logging.INFO,
+     )
+     # Setup logging, we only want one process per machine to log things on the screen.
+     logger.setLevel(logging.INFO if jax.process_index() == 0 else logging.ERROR)
+     if jax.process_index() == 0:
+         datasets.utils.logging.set_verbosity_warning()
+         transformers.utils.logging.set_verbosity_info()
+     else:
+         datasets.utils.logging.set_verbosity_error()
+         transformers.utils.logging.set_verbosity_error()
+
+     # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below)
+     # or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub).
+
+     # For CSV/JSON files, this script will use as labels the column called 'label' and as pair of sentences the
+     # sentences in columns called 'sentence1' and 'sentence2' if such columns exist or the first two columns not named
+     # label if at least two columns are provided.
+
+     # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this
+     # single column. You can easily tweak this behavior (see below)
+
+     # In distributed training, the load_dataset function guarantees that only one local process can concurrently
+     # download the dataset.
+     if args.task_name == "swahili_news":
+         raw_datasets = load_dataset("swahili_news")
+         valid_test_split = 10
+         raw_datasets["validation"] = load_dataset(
+             "swahili_news",
+             split=f"train[:{valid_test_split}%]"
+         )
+         raw_datasets["train"] = load_dataset(
+             "swahili_news",
+             split=f"train[{valid_test_split}%:]"
+         )
+         logger.info(f"train: {len(raw_datasets['train'])}, validation: {len(raw_datasets['validation'])}")
+     elif args.task_name is not None:
+         # Downloading and loading a dataset from the hub.
+         raw_datasets = load_dataset("glue", args.task_name)
+     else:
+         # Loading the dataset from local csv or json file.
+         data_files = {}
+         if args.train_file is not None:
+             data_files["train"] = args.train_file
+         if args.validation_file is not None:
+             data_files["validation"] = args.validation_file
+         extension = (args.train_file if args.train_file is not None else args.validation_file).split(".")[-1]
+         raw_datasets = load_dataset(extension, data_files=data_files)
+     # See more about loading any type of standard or custom dataset at
+     # https://huggingface.co/docs/datasets/loading_datasets.html.
+
+     # Labels
+     if args.task_name is not None:
+         is_regression = args.task_name == "stsb"
+         if not is_regression:
+             label_list = raw_datasets["train"].features["label"].names
+             num_labels = len(label_list)
+         else:
+             num_labels = 1
+     else:
+         # Trying to have good defaults here, don't hesitate to tweak to your needs.
+         is_regression = raw_datasets["train"].features["label"].dtype in ["float32", "float64"]
+         if is_regression:
+             num_labels = 1
+         else:
+             # A useful fast method:
+             # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique
+             label_list = raw_datasets["train"].unique("label")
+             label_list.sort()  # Let's sort it for determinism
+             num_labels = len(label_list)
+
+     # Load pretrained model and tokenizer
+     config = AutoConfig.from_pretrained(args.model_name_or_path, num_labels=num_labels, finetuning_task=args.task_name)
+     tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=not args.use_slow_tokenizer)
+     model = FlaxAutoModelForSequenceClassification.from_pretrained(args.model_name_or_path, config=config)
+
+     # Preprocessing the datasets
+     if args.task_name is not None:
+         sentence1_key, sentence2_key = task_to_keys[args.task_name]
+     else:
+         # Again, we try to have some nice defaults but don't hesitate to tweak to your use case.
+         non_label_column_names = [name for name in raw_datasets["train"].column_names if name != "label"]
+         if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names:
+             sentence1_key, sentence2_key = "sentence1", "sentence2"
+         else:
+             if len(non_label_column_names) >= 2:
+                 sentence1_key, sentence2_key = non_label_column_names[:2]
+             else:
+                 sentence1_key, sentence2_key = non_label_column_names[0], None
+
+     # Some models have set the order of the labels to use, so let's make sure we do use it.
+     label_to_id = None
+     if (
+         model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id
+         and args.task_name is not None
+         and not is_regression
+     ):
+         # Some have all caps in their config, some don't.
+         label_name_to_id = {k.lower(): v for k, v in model.config.label2id.items()}
+         if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)):
+             logger.info(
+                 f"The configuration of the model provided the following label correspondence: {label_name_to_id}. "
+                 "Using it!"
+             )
+             label_to_id = {i: label_name_to_id[label_list[i]] for i in range(num_labels)}
+         else:
+             logger.warning(
+                 "Your model seems to have been trained with labels, but they don't match the dataset: "
+                 f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}."
+                 "\nIgnoring the model labels as a result.",
+             )
+     elif args.task_name is None:
+         label_to_id = {v: i for i, v in enumerate(label_list)}
+
+     def preprocess_function(examples):
+         # Tokenize the texts
+         texts = (
+             (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])
+         )
+         result = tokenizer(*texts, padding="max_length", max_length=args.max_length, truncation=True)
+
+         if "label" in examples:
+             if label_to_id is not None:
+                 # Map labels to IDs (not necessary for GLUE tasks)
+                 result["labels"] = [label_to_id[l] for l in examples["label"]]
+             else:
+                 # In all cases, rename the column to labels because the model will expect that.
+                 result["labels"] = examples["label"]
+         return result
+
+     processed_datasets = raw_datasets.map(
+         preprocess_function, batched=True, remove_columns=raw_datasets["train"].column_names
+     )
+
+     train_dataset = processed_datasets["train"]
+     eval_dataset = processed_datasets["validation_matched" if args.task_name == "mnli" else "validation"]
+
+     # Log a few random samples from the training set:
+     for index in random.sample(range(len(train_dataset)), 3):
+         logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
+
+     # Define a summary writer
+     summary_writer = tensorboard.SummaryWriter(args.output_dir)
+     summary_writer.hparams(vars(args))
+
+     def write_metric(train_metrics, eval_metrics, train_time, step):
+         summary_writer.scalar("train_time", train_time, step)
+
+         train_metrics = get_metrics(train_metrics)
+         for key, vals in train_metrics.items():
+             tag = f"train_{key}"
+             for i, val in enumerate(vals):
+                 summary_writer.scalar(tag, val, step - len(vals) + i + 1)
+
+         for metric_name, value in eval_metrics.items():
+             summary_writer.scalar(f"eval_{metric_name}", value, step)
+
+     num_epochs = int(args.num_train_epochs)
+     rng = jax.random.PRNGKey(args.seed)
+     dropout_rngs = jax.random.split(rng, jax.local_device_count())
+
+     train_batch_size = args.per_device_train_batch_size * jax.local_device_count()
+     eval_batch_size = args.per_device_eval_batch_size * jax.local_device_count()
+
+     learning_rate_fn = create_learning_rate_fn(
+         len(train_dataset), train_batch_size, args.num_train_epochs, args.num_warmup_steps, args.learning_rate
+     )
+
+     state = create_train_state(
+         model, learning_rate_fn, is_regression, num_labels=num_labels, weight_decay=args.weight_decay
+     )
+
+     # define step functions
+     def train_step(
+         state: train_state.TrainState, batch: Dict[str, Array], dropout_rng: PRNGKey
+     ) -> Tuple[train_state.TrainState, Dict[str, Array], PRNGKey]:
+         """Trains model with an optimizer (both in `state`) on `batch`, returning `(new_state, metrics, new_dropout_rng)`."""
+         dropout_rng, new_dropout_rng = jax.random.split(dropout_rng)
+         targets = batch.pop("labels")
+
+         def loss_fn(params):
+             logits = state.apply_fn(**batch, params=params, dropout_rng=dropout_rng, train=True)[0]
+             loss = state.loss_fn(logits, targets)
+             return loss
+
+         grad_fn = jax.value_and_grad(loss_fn)
+         loss, grad = grad_fn(state.params)
+         grad = jax.lax.pmean(grad, "batch")
+         new_state = state.apply_gradients(grads=grad)
+         metrics = jax.lax.pmean({"loss": loss, "learning_rate": learning_rate_fn(state.step)}, axis_name="batch")
+         return new_state, metrics, new_dropout_rng
+
+     p_train_step = jax.pmap(train_step, axis_name="batch", donate_argnums=(0,))
+
+     def eval_step(state, batch):
+         logits = state.apply_fn(**batch, params=state.params, train=False)[0]
+         return state.logits_fn(logits)
+
+     p_eval_step = jax.pmap(eval_step, axis_name="batch")
+
+     if args.task_name == "swahili_news":
+         metric = load_metric("glue", "sst2")
+     elif args.task_name is not None:
+         metric = load_metric("glue", args.task_name)
+     else:
+         metric = load_metric("accuracy")
+
+     logger.info(f"===== Starting training ({num_epochs} epochs) =====")
+     train_time = 0
+
+     # make sure weights are replicated on each device
+     state = replicate(state)
+
+     for epoch in range(1, num_epochs + 1):
+         logger.info(f"Epoch {epoch}")
+         logger.info(" Training...")
+
+         train_start = time.time()
+         train_metrics = []
+         rng, input_rng = jax.random.split(rng)
+
+         # train
+         for batch in glue_train_data_collator(input_rng, train_dataset, train_batch_size):
+             state, metrics, dropout_rngs = p_train_step(state, batch, dropout_rngs)
+             train_metrics.append(metrics)
+         train_time += time.time() - train_start
+         logger.info(f" Done! Training metrics: {unreplicate(metrics)}")
+
+         logger.info(" Evaluating...")
+
+         # evaluate
+         for batch in glue_eval_data_collator(eval_dataset, eval_batch_size):
+             labels = batch.pop("labels")
+             predictions = p_eval_step(state, batch)
+             metric.add_batch(predictions=chain(*predictions), references=chain(*labels))
+
+         # evaluate also on leftover examples (not divisible by batch_size)
+         num_leftover_samples = len(eval_dataset) % eval_batch_size
+
+         # make sure leftover batch is evaluated on one device
+         if num_leftover_samples > 0 and jax.process_index() == 0:
+             # take leftover samples
+             batch = eval_dataset[-num_leftover_samples:]
+             batch = {k: jnp.array(v) for k, v in batch.items()}
+
+             labels = batch.pop("labels")
+             predictions = eval_step(unreplicate(state), batch)
+             metric.add_batch(predictions=predictions, references=labels)
+
+         eval_metric = metric.compute()
+         logger.info(f" Done! Eval metrics: {eval_metric}")
+
+         cur_step = epoch * (len(train_dataset) // train_batch_size)
+         write_metric(train_metrics, eval_metric, train_time, cur_step)
+
+         # save checkpoint after each epoch and push checkpoint to the hub
+         if jax.process_index() == 0:
+             params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
+             model.save_pretrained(
+                 args.output_dir,
+                 params=params,
+                 push_to_hub=args.push_to_hub,
+                 commit_message=f"Saving weights and logs of epoch {epoch}",
+             )
+
+
+ if __name__ == "__main__":
+     main()