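"""Fine-tune BigBird on Natural Questions with JAX/Flax.

Hyper-parameters come from `Args` and may be overridden by a wandb sweep;
metrics are logged to the `bigbird-natural-questions` wandb project.
"""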
import os
from dataclasses import replace

import jax
import wandb
from bigbird_flax import Args, DataCollator, FlaxBigBirdForNaturalQuestions, Trainer, build_tx, train_step, val_step
from datasets import load_dataset
from flax import jax_utils

from transformers import BigBirdTokenizerFast


if __name__ == "__main__":
    print("#################### AVAILABLE DEVICES ####################")
    print(jax.devices())
    print("###########################################################")

    # set up wandb; a sweep config can override the defaults in Args
    args = Args()
    logger = wandb.init(project="bigbird-natural-questions", config=args.__dict__)
    wandb_args = dict(logger.config)
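    # keep the batch size defined in Args; don't let the sweep config override it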
    del wandb_args["batch_size"]
    args = replace(args, **wandb_args)
    base_dir = args.base_dir + "-" + wandb.run.id
    args = replace(args, base_dir=base_dir)
    print(args)

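    # load the preprocessed train/validation splits from local json files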
    tr_dataset = load_dataset("json", data_files=args.tr_data_path)["train"]
    val_dataset = load_dataset("json", data_files=args.val_data_path)["train"]

    # shuffle and truncate each split to a multiple of the batch size, dropping the last partial batch
    indices = range(len(tr_dataset) - len(tr_dataset) % args.batch_size)
    tr_dataset = tr_dataset.shuffle().select(indices)
    indices = range(len(val_dataset) - len(val_dataset) % args.batch_size)
    val_dataset = val_dataset.shuffle().select(indices)

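    # optionally restrict both splits to a smaller subset (TRAIN_ON_SMALL env var) for quick runs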
    if os.environ.get("TRAIN_ON_SMALL", "false") == "true":
        tr_dataset = tr_dataset.shuffle().select(range(80000))
        val_dataset = val_dataset.shuffle().select(range(8000))

    print(tr_dataset)
    print(val_dataset)

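    # instantiate the BigBird QA model with the sparse-attention settings from args, the matching
    # tokenizer, and a data collator configured with the tokenizer's pad id and a 4096-token max length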
    model = FlaxBigBirdForNaturalQuestions.from_pretrained(
        args.model_id, block_size=args.block_size, num_random_blocks=args.num_random_blocks
    )
    tokenizer = BigBirdTokenizerFast.from_pretrained(args.model_id)
    data_collator = DataCollator(pad_id=tokenizer.pad_token_id, max_length=4096)

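    # optimizer / schedule hyper-parameters; the total step count is epochs * steps-per-epoch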
    tx_args = {
        "lr": args.lr,
        "init_lr": args.init_lr,
        "warmup_steps": args.warmup_steps,
        "num_train_steps": args.max_epochs * (len(tr_dataset) // args.batch_size),
        "weight_decay": args.weight_decay,
    }
    tx, lr = build_tx(**tx_args)

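    # Trainer wires together the collator, train/val step functions, wandb logger and LR schedule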
    trainer = Trainer(
        args=args,
        data_collator=data_collator,
        model_save_fn=model.save_pretrained,
        train_step_fn=train_step,
        val_step_fn=val_step,
        logger=logger,
        scheduler_fn=lr,
    )

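    # ckpt_dir is None here; presumably it can point at an existing checkpoint to resume from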
    ckpt_dir = None
    state = trainer.create_state(model, tx, num_train_steps=tx_args["num_train_steps"], ckpt_dir=ckpt_dir)
    try:
        trainer.train(state, tr_dataset, val_dataset)
    except KeyboardInterrupt:
        print("Oooops; TRAINING STOPPED UNFORTUNATELY")

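    # unreplicate the device-replicated params so a single copy of the weights is saved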
    print("SAVING WEIGHTS IN `final-weights`")
    params = jax_utils.unreplicate(state.params)
    model.save_pretrained(os.path.join(args.base_dir, "final-weights"), params=params)