# Preamble

In [5]:
from pathlib import Path
from tqdm.notebook import tqdm

import math
import numpy as np

import jax
import jax.numpy as jnp
import optax
import flax
from flax.training import train_state
from flax.training.common_utils import get_metrics, onehot, shard
from flax import jax_utils, traverse_util

from datasets import load_dataset
from transformers import AutoTokenizer, AutoConfig, GPT2Tokenizer

# Set up model

In [2]:
# Name of the pretrained checkpoint to fine-tune; the output directory name
# is derived from it so the two cannot drift apart.
model_config = "gpt2-large"
model_dir = f"{model_config}-finetuned"

# Make sure the output directory exists, then store the base model's
# configuration inside it.
Path(model_dir).mkdir(parents=True, exist_ok=True)
config = AutoConfig.from_pretrained(model_config)
config.save_pretrained(model_dir)

In [3]:
from transformers import FlaxGPT2LMHeadModel

# Load the pretrained weights for the checkpoint named by `model_config`,
# using the library's default dtype.
# NOTE(review): the original cell carried a commented-out
# `dtype=jnp.dtype("bfloat16")` argument; the recorded run below ends in a
# "Resource exhausted" error, so reduced precision may be worth revisiting.
model = FlaxGPT2LMHeadModel.from_pretrained(model_config)

INFO:absl:Starting the local TPU driver.
INFO:absl:Unable to initialize backend 'tpu_driver': Not found: Unable to find driver in registry given worker: local://
INFO:absl:Unable to initialize backend 'gpu': Not found: Could not find registered platform with name: "cuda". Available platform names are: Interpreter TPU Host
tcmalloc: large alloc 3096141824 bytes == 0x8c128000 @  0x7f216d775680 0x7f216d796824 0x5f7b11 0x648631 0x5c38e6 0x4f30e6 0x64ee88 0x505653 0x56acb6 0x568d9a 0x50b868 0x56fb87 0x568d9a 0x68cdc7 0x5ff5d4 0x5c3cb0 0x56aadf 0x501148 0x56c422 0x501148 0x56c422 0x501148 0x504d56 0x56acb6 0x5f5956 0x56aadf 0x5f5956 0x56acb6 0x568d9a 0x5f5b33 0x50b7f8


# Load preprocessed data

In [6]:
# Raw, line-per-example text files for the two splits.
raw_text_files = {
    'train': "project-data/raw_data/layout_prompts_train.txt",
    'test': "project-data/raw_data/layout_prompts_valid.txt",
}
dataset = load_dataset('text', data_files=raw_text_files)

# NOTE(review): `use_fast` looks like an AutoTokenizer option; confirm it has
# any effect when passed to GPT2Tokenizer directly.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-large', use_fast=True)

# The already tokenized and grouped dataset is read back from disk.
# NOTE(review): `load_from_disk` is reached through the raw-text `dataset`
# object here; presumably the result depends only on the path - confirm.
lm_dataset = dataset.load_from_disk('project-data/gpt2_processed/grouped_256')



# Training options

In [13]:
# --- Values chosen by hand ---
per_device_batch_size = 1
num_epochs = 3
training_seed = 42
learning_rate = 5e-5

# --- Values derived from the above ---
# One batch per device, across every device JAX reports.
total_batch_size = jax.device_count() * per_device_batch_size
# Incomplete trailing batches are not counted (integer division per epoch).
train_steps_per_epoch = len(lm_dataset["train"]) // total_batch_size
num_train_steps = train_steps_per_epoch * num_epochs
# Step at which the learning-rate schedule starts to change (10% of training).
transition = int(num_train_steps * 0.1)

In [14]:
def decay_mask_fn(params):
    """Build a boolean tree mirroring `params` for masking weight decay.

    A leaf is marked False when its path ends in "bias" or when its last two
    path components are the "scale" of `ln_1`, `ln_2` or `ln_f`; every other
    leaf is marked True.

    Args:
        params: nested dict of parameters.

    Returns:
        A nested dict with the same structure as `params` whose leaves are
        booleans.
    """
    layer_norm_scales = (("ln_1", "scale"), ("ln_2", "scale"), ("ln_f", "scale"))

    def _is_decayed(path):
        # Biases are excluded first, then the listed layer-norm scales.
        if path[-1] == "bias":
            return False
        return path[-2:] not in layer_norm_scales

    mask_by_path = {}
    for path in traverse_util.flatten_dict(params):
        mask_by_path[path] = _is_decayed(path)
    return traverse_util.unflatten_dict(mask_by_path)

# Learning-rate schedule: a linear ramp from `learning_rate` to 5e-06 that
# begins at step `transition` and spans the remaining training steps.
linear_decay_lr_schedule_fn = optax.linear_schedule(
    init_value=learning_rate,
    end_value=5e-06,
    transition_steps=num_train_steps - transition,
    transition_begin=transition,
)

# AdamW with weight decay applied only where `decay_mask_fn` marks True.
# NOTE(review): the recorded run of this notebook fails with
# "Resource exhausted"; presumably the optimizer's per-parameter statistics
# contribute to that - confirm and consider a lighter optimizer or dtype.
adamw = optax.adamw(
    learning_rate=linear_decay_lr_schedule_fn,
    b1=0.9,
    b2=0.98,
    eps=1e-8,
    weight_decay=0.1,
    mask=decay_mask_fn,
)

# Bundle the model's forward function, its parameters and the optimizer.
state = train_state.TrainState.create(
    apply_fn=model.__call__,
    params=model.params,
    tx=adamw,
)

# Train

In [15]:
def data_loader(rng, dataset, batch_size, shuffle=False):
    """Yield sharded batches covering `dataset` once.

    The trailing examples that do not fill a complete batch are skipped.

    Args:
        rng: JAX PRNG key; only used when `shuffle` is True.
        dataset: indexable dataset; indexing with an array of row indices is
            expected to return a dict of columns (as the original code relied
            on).
        batch_size: number of examples per yielded batch, across all devices.
        shuffle: when True, visit the examples in a random order drawn
            from `rng`.

    Yields:
        A dict of arrays passed through `shard`.
    """
    num_examples = len(dataset)
    steps_per_epoch = num_examples // batch_size

    # Keep the index matrix in host memory (NumPy). Iterating over a JAX
    # array and indexing the dataset with its rows forces a device slice and
    # a device-to-host read for every batch; the permutation itself is still
    # drawn from `rng`, so the visiting order is unchanged.
    if shuffle:
        batch_idx = np.asarray(jax.random.permutation(rng, num_examples))
    else:
        batch_idx = np.arange(num_examples)

    batch_idx = batch_idx[: steps_per_epoch * batch_size]  # Skip incomplete batch.
    batch_idx = batch_idx.reshape((steps_per_epoch, batch_size))

    for idx in batch_idx:
        batch = dataset[idx]
        batch = {k: jnp.array(v) for k, v in batch.items()}

        yield shard(batch)
        
def train_step(state, batch, dropout_rng):
    """Run one optimisation step and return the updated state.

    Intended to run under a mapped axis named "batch" (gradients and metrics
    are averaged with `jax.lax.pmean` over that axis).

    Args:
        state: train state providing `apply_fn`, `params`, `step` and
            `apply_gradients`.
        batch: dict of model inputs that also contains a "labels" entry.
            It is not modified.
        dropout_rng: PRNG key; split into the key used for this step and the
            key returned for the next one.

    Returns:
        `(new_state, metrics, new_dropout_rng)` where `metrics` holds the
        averaged "loss" and the "learning_rate" evaluated at `state.step`.
    """
    dropout_rng, new_dropout_rng = jax.random.split(dropout_rng)

    # Separate labels from model inputs without mutating `batch`: popping
    # inside `loss_fn` altered the caller's dict and would raise KeyError if
    # `loss_fn` were ever evaluated a second time.
    model_inputs = {name: value for name, value in batch.items() if name != "labels"}
    labels = batch["labels"]

    def loss_fn(params):
        logits = state.apply_fn(**model_inputs, params=params, dropout_rng=dropout_rng, train=True)[0]

        # Logits at position t are scored against the label at position t + 1.
        loss = optax.softmax_cross_entropy(logits[..., :-1, :], onehot(labels[..., 1:], logits.shape[-1])).mean()
        return loss

    grad_fn = jax.value_and_grad(loss_fn)
    loss, grad = grad_fn(state.params)
    grad = jax.lax.pmean(grad, "batch")
    new_state = state.apply_gradients(grads=grad)

    metrics = jax.lax.pmean(
        {"loss": loss, "learning_rate": linear_decay_lr_schedule_fn(state.step)}, axis_name="batch"
    )

    return new_state, metrics, new_dropout_rng

def eval_step(params, batch):
    """Compute evaluation loss and perplexity for one batch.

    Intended to run under a mapped axis named "batch" (metrics are averaged
    with `jax.lax.pmean` over that axis). Uses the notebook-level `model`.

    Args:
        params: model parameters to evaluate.
        batch: dict of model inputs that also contains a "labels" entry.
            It is not modified.

    Returns:
        Dict with the averaged "loss" and "perplexity" (`exp(loss)` of this
        batch, averaged over the mapped axis).
    """
    # Split labels from inputs without mutating the caller's dict.
    model_inputs = {name: value for name, value in batch.items() if name != "labels"}
    labels = batch["labels"]

    logits = model(**model_inputs, params=params, train=False)[0]

    # Logits at position t are scored against the label at position t + 1.
    loss = optax.softmax_cross_entropy(logits[..., :-1, :], onehot(labels[..., 1:], logits.shape[-1])).mean()

    # summarize metrics
    metrics = {"loss": loss, "perplexity": jnp.exp(loss)}
    metrics = jax.lax.pmean(metrics, axis_name="batch")
    return metrics

In [16]:
# Map both step functions over devices along an axis named "batch".
# The train state (argument 0) is donated: the training loop immediately
# rebinds `state` to the returned value, so the old buffers are never read
# again and can be reused for the new state instead of keeping two full
# copies of parameters and optimizer state alive on every device.
parallel_train_step = jax.pmap(train_step, "batch", donate_argnums=(0,))
# Evaluation parameters are reused across batches, so nothing is donated.
parallel_eval_step = jax.pmap(eval_step, "batch")

In [17]:
# Root PRNG key for the run, plus one dropout key per local device.
rng = jax.random.PRNGKey(training_seed)
dropout_rngs = jax.random.split(rng, jax.local_device_count())

# Replicate the train state for use with the pmapped step functions.
# NOTE(review): this rebinds `state`, so re-running the cell without first
# re-creating `state` would replicate an already replicated value.
state = jax_utils.replicate(state)



In [18]:
# Epoch loop: one full pass over the training split, then one over the test
# split. NOTE(review): nothing in this cell writes the trained parameters to
# `model_dir`; confirm that a later cell saves them.
for epoch in tqdm(range(1, num_epochs + 1), desc="Epoch ...", position=0, leave=True):
    rng, input_rng = jax.random.split(rng)

    # -- Train --
    train_loader = data_loader(input_rng, lm_dataset["train"], total_batch_size, shuffle=True)
    train_metric = None  # stays None if the loader yields no batch
    with tqdm(total=len(lm_dataset["train"]) // total_batch_size, desc="Training...", leave=False) as progress_bar_train:
        for model_inputs in train_loader:
            # Model forward
            state, train_metric, dropout_rngs = parallel_train_step(state, model_inputs, dropout_rngs)

            progress_bar_train.update(1)

        # Report the metrics of the last step, if any step ran.
        if train_metric is not None:
            progress_bar_train.write(
                f"Train... ({epoch}/{num_epochs} | Loss: {round(train_metric['loss'].mean(), 3)}, Learning Rate: {round(train_metric['learning_rate'].mean(), 6)})"
            )

    # -- Eval --
    eval_loader = data_loader(input_rng, lm_dataset["test"], total_batch_size)
    eval_metrics = []

    with tqdm(total=len(lm_dataset["test"]) // total_batch_size, desc="Evaluation...", leave=False) as progress_bar_eval:
        for model_inputs in eval_loader:
            # Model forward
            eval_metric = parallel_eval_step(state.params, model_inputs)
            eval_metrics.append(eval_metric)

            progress_bar_eval.update(1)

        # Skip the summary when no evaluation batch was produced.
        if eval_metrics:
            eval_metrics = get_metrics(eval_metrics)
            eval_metrics = jax.tree_map(jnp.mean, eval_metrics)
            # Perplexity is exp(mean loss). Averaging per-batch exp(loss)
            # values overstates it, because exp is convex.
            eval_metrics["perplexity"] = jnp.exp(eval_metrics["loss"])
            progress_bar_eval.write(
                f"Eval... ({epoch}/{num_epochs} | Loss: {eval_metrics['loss']} | Perplexity: {eval_metrics['perplexity']})"
            )

Epoch ...:   0%|          | 0/3 [00:00<?, ?it/s]

Training...:   0%|          | 0/396670 [00:00<?, ?it/s]

2021-07-06 10:40:07.010317: E external/org_tensorflow/tensorflow/compiler/xla/pjrt/pjrt_stream_executor_client.cc:2036] Execution of replica 0 failed: Resource exhausted: Failed to allocate request for 18.75MiB (19660800B) on device ordinal 0
2021-07-06 10:40:07.019948: E external/org_tensorflow/tensorflow/compiler/xla/pjrt/pjrt_stream_executor_client.cc:2036] Execution of replica 4 failed: Resource exhausted: Failed to allocate request for 6.25MiB (6553600B) on device ordinal 6
2021-07-06 10:40:07.020278: E external/org_tensorflow/tensorflow/compiler/xla/pjrt/pjrt_stream_executor_client.cc:2036] Execution of replica 1 failed: Resource exhausted: Failed to allocate request for 6.25MiB (6553600B) on device ordinal 1
2021-07-06 10:40:07.020375: E external/org_tensorflow/tensorflow/compiler/xla/pjrt/pjrt_stream_executor_client.cc:2036] Execution of replica 3 failed: Resource exhausted: Failed to allocate request for 6.25MiB (6553600B) on device ordinal 3
2021-07-06 10:40:07.020546: E exte

RuntimeError: Resource exhausted: Failed to allocate request for 18.75MiB (19660800B) on device ordinal 0: while running replica 0 and partition 0 of a replicated computation (other replicas may have failed as well).