Jacob Bayless committed
Commit 31d1292
1 parent: 644965b

Added sorting model and modified nanoGPT files

nanoGPT/data/config/train_sort.py ADDED
@@ -0,0 +1,43 @@
+ """
+ Train a small and simple sorting network
+ Requires at least 6 GB of memory
+
+ Based on nanoGPT's shakespeare example: https://github.com/karpathy/nanoGPT
+ """
+
+ out_dir = 'out-sort-lists'
+ eval_interval = 2000
+ eval_iters = 200
+ log_interval = 200
+ verbose_log_interval = 2000
+ always_save_checkpoint = False
+
+ init_from = 'scratch' # 'scratch' or 'resume'
+
+ wandb_log = False
+ wandb_project = 'sort_lists'
+ wandb_run_name = 'mini-gpt'
+
+ dataset = 'sort_lists'
+ gradient_accumulation_steps = 1
+ batch_size = 12
+ block_size = 256 # context window, keep synchronized with training data
+
+ # Transformer network parameters
+ n_layer = 64
+ n_head = 4
+ n_embd = 256
+ dropout = 0.01
+
+ # Training parameters
+ learning_rate = 1e-4
+ max_iters = 900000
+ weight_decay = 1e-1
+ beta1 = 0.95
+ beta2 = 0.99
+ grad_clip = 1.0
+ # learning rate decay settings
+ decay_lr = True
+ warmup_iters = 2000
+ lr_decay_iters = max_iters
+ min_lr = 0.1*learning_rate
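
For a rough sense of scale, the configuration above works out to roughly a 50M-parameter model. This is a back-of-the-envelope sketch using the usual 12 * n_layer * n_embd^2 approximation for transformer blocks plus embeddings, not an exact count from model.py:

    # approximate parameter count for the settings above (illustrative only;
    # vocab_size = 15 comes from the sort_lists tokenizer built in prepare.py)
    n_layer, n_embd, block_size, vocab_size = 64, 256, 256, 15
    approx_params = 12 * n_layer * n_embd**2 + (vocab_size + block_size) * n_embd
    print(f"~{approx_params / 1e6:.1f}M parameters")  # ~50M; with optimizer state and
    # activations this is consistent with the "at least 6 GB of memory" note above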
nanoGPT/data/sort_lists/meta.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:749a5d6b08620256f53b531a210014219cea2b542dea0175d5493f6390052f1b
+ size 417
nanoGPT/data/sort_lists/prepare.py ADDED
@@ -0,0 +1,172 @@
+ """
+ Prepare the list-sorting dataset for language modeling.
+ Saves train.bin and val.bin containing the token ids, and meta.pkl containing the
+ encoder, decoder, and some other related info.
+
+ Based on nanoGPT: https://github.com/karpathy/nanoGPT
+ """
+ import os
+ import pickle
+ import numpy as np
+ import json
+
+ random = np.random.default_rng()
+
+ context_window_length = 256 # Keep this synchronized with the model definition
+
+ num_training_lists = 2e6 # Output file is about 2 GB
+ num_val_lists = int(0.1*num_training_lists)
+ distribution = "uniform"
+
+ def str_array_to_sorted_array(output_string):
+     template = '{{"values": {}}}'
+     array = np.array(json.loads(template.format(output_string))["values"], dtype = np.uint16)
+     array.sort()
+     return array
+
+
+ def generate_list_pairs(max_list_length_chars = None,
+                         max_int = 65536,
+                         distribution = "uniform",
+                         num_lists = 1,
+                         data_file = None,
+                         fill_blanks = True):
+     if max_list_length_chars is None:
+         max_list_length_chars = int(np.floor(0.5*context_window_length - 2))
+
+     if data_file is None:
+         data_file = os.path.join(os.path.dirname(__file__), "train.txt")
+
+     with open(data_file, 'a') as f:
+         for n_list in range(int(num_lists)):
+
+             max_random_length = int(np.floor(0.5*max_list_length_chars))
+
+             list_length_ints = random.integers(0, max_random_length)
+             if distribution.casefold() == "uniform":
+                 random_shuffled = random.integers(0, max_int, size = list_length_ints, endpoint = False)
+             elif distribution.casefold() == "gaussian":
+                 random_floats = random.normal(loc = 0.5*max_int,
+                                               scale = max_int,
+                                               size = list_length_ints)
+                 random_ints = np.around(random_floats, decimals = 0).astype(np.int64, casting = "unsafe")
+                 invalid_integers = np.logical_or(random_ints < 0, random_ints >= max_int)
+                 # resample any out-of-range draws uniformly, then use the result as the shuffled list
+                 random_ints[invalid_integers]\
+                     = random.integers(0, max_int, size = np.count_nonzero(invalid_integers), endpoint = False)
+                 random_shuffled = random_ints
+             else:
+                 raise NotImplementedError("No distribution called '{}'".format(distribution))
+             # Crop based on string length
+             shuffled_str = np.array2string(random_shuffled,
+                                            max_line_width = max_list_length_chars*2,
+                                            separator = ',').replace(" ","")
+             shuffled_str = shuffled_str[:np.min([len(shuffled_str), max_list_length_chars - 1])]
+             shuffled_str = shuffled_str[:-1].strip()
+
+             if shuffled_str[-1] in [",", "\n"]:
+                 shuffled_str = shuffled_str[:-1]
+
+             shuffled_str += "]"
+             sorted_str = np.array2string(str_array_to_sorted_array(shuffled_str),
+                                          max_line_width = max_list_length_chars*2,
+                                          separator = ',').replace(" ","")
+
+             data_line = "(" + shuffled_str[1:-1] + "): [" + sorted_str[1:-1] + "];\n"
+             if fill_blanks:
+                 len_align = context_window_length - ((len(data_line) - 5) % context_window_length)
+                 filler = "_"*(context_window_length + len_align)
+                 f.write(filler)
+             f.write(data_line)
+             if n_list%100 == 0:
+                 print("{:.2f}% ({}/{}) -- {}".format(100.*(n_list + 1)/num_lists, n_list, num_lists, len(data_line)))
+     print("Data written to file: {}".format(data_file))
+     return data_file
+
+
+ train_file_path = os.path.join(os.path.dirname(__file__), 'train.txt')
+ val_file_path = os.path.join(os.path.dirname(__file__), 'val.txt')
+
+ print("Generating training data")
+ train_data_file = generate_list_pairs(num_lists = num_training_lists,
+                                       fill_blanks = True,
+                                       max_int = 100,
+                                       distribution = distribution,
+                                       data_file = train_file_path)
+ print("Generating validation data")
+ val_data_file = generate_list_pairs(num_lists = num_val_lists,
+                                     fill_blanks = True,
+                                     max_int = 100,
+                                     distribution = distribution,
+                                     data_file = val_file_path)
+
+
+ tokens = ['0','1','2','3','4','5','6','7','8','9',',','(','): [','];\n','_']
+ vocab_size = len(tokens)
+ print("all the unique tokens:", ''.join(tokens))
+ print(f"vocab size: {vocab_size:,}")
+ print("Still working...")
+
+ # create a mapping from tokens to integers; multi-character tokens are matched
+ # greedily by their first character, and the remaining characters are skipped
+ stoi = { ch:i for i,ch in enumerate(tokens) }
+ itos = { i:ch for i,ch in enumerate(tokens) }
+ char_to_token = {token[0]:token for token in tokens}
+ chars_to_skip = {token[0]:len(token)-1 for token in tokens}
+
+
+ def encode(s):
+     encoded = []
+     skip = 0
+     for char in s:
+         if skip:
+             skip -= 1
+             continue
+         else:
+             skip = chars_to_skip[char]
+             encoded.append(stoi[char_to_token[char]])
+     return encoded
+
+ def decode(l):
+     return ''.join([itos[i] for i in l])
+
+
+ # save the meta information as well, to help us encode/decode later
+ meta = {
+     'vocab_size': vocab_size,
+     'tokens': tokens,
+     'itos': itos,
+     'stoi': stoi,
+     "char_to_token": char_to_token,
+     "chars_to_skip": chars_to_skip
+ }
+ with open(os.path.join(os.path.dirname(__file__), 'meta.pkl'), 'wb') as f:
+     pickle.dump(meta, f)
+ print("Saved metadata file")
+
+ if False: # NOTE: the binary export below is disabled; set this to True to write train.bin and val.bin
+     print("Still working...")
+
+     # encode both to integers
+     with open(train_data_file, 'r') as f:
+         train_data = f.read()
+     train_ids = encode(train_data)
+     del train_data
+
+     print("Still working...")
+
+     with open(val_data_file, 'r') as f:
+         val_data = f.read()
+     val_ids = encode(val_data)
+     del val_data
+     print(f"train has {len(train_ids):,} tokens")
+     print(f"val has {len(val_ids):,} tokens")
+
+     print("Still working...")
+     # export to bin files
+     train_ids = np.array(train_ids, dtype=np.uint16)
+     print("Still working...")
+     val_ids = np.array(val_ids, dtype=np.uint16)
+     print("Still working...")
+     train_ids.tofile(os.path.join(os.path.dirname(__file__), 'train.bin'))
+     print("Still working...")
+     val_ids.tofile(os.path.join(os.path.dirname(__file__), 'val.bin'))
+
+ print("Export complete.")
nanoGPT/out-sort-lists/ckpt.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7037aba6fa5003481a8dd53b71a0621d284ad167351d65033cbe62f574df7904
+ size 605632927
nanoGPT/sample.py ADDED
@@ -0,0 +1,99 @@
+ """
+ Sample from a trained model
+ """
+ import os
+ import pickle
+ from contextlib import nullcontext
+ import torch
+ import tiktoken
+ from model import GPTConfig, GPT
+
+ # -----------------------------------------------------------------------------
+ init_from = 'resume' # either 'resume' (from an out_dir) or a gpt2 variant (e.g. 'gpt2-xl')
+ out_dir = 'out' # ignored if init_from is not 'resume'
+ start = "\n" # or "<|endoftext|>" or etc. Can also specify a file, use as: "FILE:prompt.txt"
+ num_samples = 10 # number of samples to draw
+ max_new_tokens = 500 # number of tokens generated in each sample
+ temperature = 0.0001 # near-greedy sampling; keep > 0, since generate() divides the logits by temperature
+ top_k = 200 # retain only the top_k most likely tokens, clamp others to have 0 probability
+ seed = 1337
+ device = 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.
+ dtype = 'float16' # 'float32' or 'bfloat16' or 'float16'
+ compile = False # use PyTorch 2.0 to compile the model to be faster
+ exec(open('configurator.py').read()) # overrides from command line or config file
+ # -----------------------------------------------------------------------------
+
+ torch.manual_seed(seed)
+ torch.cuda.manual_seed(seed)
+ torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
+ torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
+ device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast
+ ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
+ ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
+
+ # model
+ if init_from == 'resume':
+     # init from a model saved in a specific directory
+     ckpt_path = os.path.join(out_dir, 'ckpt.pt')
+     checkpoint = torch.load(ckpt_path, map_location=device)
+     gptconf = GPTConfig(**checkpoint['model_args'])
+     model = GPT(gptconf)
+     state_dict = checkpoint['model']
+     unwanted_prefix = '_orig_mod.'
+     for k,v in list(state_dict.items()):
+         if k.startswith(unwanted_prefix):
+             state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
+     model.load_state_dict(state_dict)
+ elif init_from.startswith('gpt2'):
+     # init from a given GPT-2 model
+     model = GPT.from_pretrained(init_from, dict(dropout=0.0))
+
+ model.eval()
+ model.to(device)
+ if compile:
+     model = torch.compile(model) # requires PyTorch 2.0 (optional)
+
+ # look for the meta pickle in case it is available in the dataset folder
+ load_meta = False
+ if init_from == 'resume' and 'config' in checkpoint and 'dataset' in checkpoint['config']: # older checkpoints might not have these...
+     meta_path = os.path.join('data', checkpoint['config']['dataset'], 'meta.pkl')
+     load_meta = os.path.exists(meta_path)
+ if load_meta:
+     print(f"Loading meta from {meta_path}...")
+     with open(meta_path, 'rb') as f:
+         meta = pickle.load(f)
+     stoi, itos = meta['stoi'], meta['itos']
+     char_to_token = meta["char_to_token"]
+     chars_to_skip = meta["chars_to_skip"]
+
+     def encode(s):
+         encoded = []
+         skip = 0
+         for char in s:
+             if skip:
+                 skip -= 1
+                 continue
+             else:
+                 skip = chars_to_skip[char]
+                 encoded.append(stoi[char_to_token[char]])
+         return encoded
+
+     def decode(l):
+         return ''.join([itos[i] for i in l])
+ else:
+     raise RuntimeError("No meta.pkl found for sorting! Cannot find token encoder or decoder.")
+
+ # encode the beginning of the prompt
+ if start.startswith('FILE:'):
+     with open(start[5:], 'r', encoding='utf-8') as f:
+         start = f.read()
+ start_ids = encode(start)
+ x = (torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...])
+
+ # run generation
+ with torch.no_grad():
+     with ctx:
+         for k in range(num_samples):
+             y = model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k)
+             print(decode(y[0].tolist()))
+             print('---------------')
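
A note on prompting this model (an illustrative sketch, not part of sample.py): the sorting vocabulary only contains the tokens '0'-'9', ',', '(', '): [', '];\n' and '_', so the default start = "\n" above cannot be encoded (encode() would raise a KeyError); prompts need to follow the training record format.

    prompt = "(5,3,9): ["   # the unsorted list, ending with the token that opens the answer
    ids = encode(prompt)    # -> [11, 5, 10, 3, 10, 9, 12] under the meta.pkl vocabulary
    # the trained model should then complete with the sorted list, "3,5,9];" plus filler '_' tokens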
nanoGPT/train.py ADDED
@@ -0,0 +1,520 @@
+ """
+ This training script can be run both on a single gpu in debug mode,
+ and also in a larger training run with distributed data parallel (ddp).
+
+ To run on a single GPU, example:
+ $ python train.py config/train_sort.py
+
+ Based on nanoGPT by Andrej Karpathy: https://github.com/karpathy/nanoGPT
+ Modified for a learn-to-sort experiment by Jacob Bayless
+ """
+
+ import os
+ import time
+ import math
+ import pickle
+ from contextlib import nullcontext
+
+ import numpy as np
+ import torch
+ from torch.nn.parallel import DistributedDataParallel as DDP
+ from torch.distributed import init_process_group, destroy_process_group
+
+ from model import GPTConfig, GPT
+ import json
+
+ # -----------------------------------------------------------------------------
+ # default config values designed to train a gpt2 (124M) on OpenWebText
+ # I/O
+ out_dir = 'out'
+ eval_interval = 2000
+ verbose_log_interval = 250
+ log_interval = 1
+ eval_iters = 200
+ eval_only = False # if True, script exits right after the first eval
+ always_save_checkpoint = True # if True, always save a checkpoint after each eval
+ init_from = 'scratch' # 'scratch' or 'resume' or 'gpt2*'
+ # wandb logging
+ wandb_log = False # disabled by default
+ wandb_project = 'owt'
+ wandb_run_name = 'gpt2' # 'run' + str(time.time())
+ # data
+ dataset = 'openwebtext'
+ gradient_accumulation_steps = 5 * 8 # used to simulate larger batch sizes
+ batch_size = 12 # if gradient_accumulation_steps > 1, this is the micro-batch size
+ block_size = 1024
+ # model
+ n_layer = 12
+ n_head = 12
+ n_embd = 768
+ dropout = 0.0 # for pretraining 0 is good, for finetuning try 0.1+
+ bias = False # do we use bias inside LayerNorm and Linear layers?
+ # adamw optimizer
+ learning_rate = 6e-4 # max learning rate
+ max_iters = 600000 # total number of training iterations
+ weight_decay = 1e-1
+ beta1 = 0.9
+ beta2 = 0.95
+ grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0
+ # learning rate decay settings
+ decay_lr = True # whether to decay the learning rate
+ warmup_iters = 2000 # how many steps to warm up for
+ lr_decay_iters = 600000 # should be ~= max_iters per Chinchilla
+ min_lr = 6e-5 # minimum learning rate, should be ~= learning_rate/10 per Chinchilla
+ # DDP settings
+ backend = 'nccl' # 'nccl', 'gloo', etc.
+ # system
+ device = 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks
+ dtype = 'float16' # 'float32', 'bfloat16', or 'float16', the latter will auto implement a GradScaler
+ compile = False # use PyTorch 2.0 to compile the model to be faster
+ # -----------------------------------------------------------------------------
+ config_keys = [k for k,v in globals().items() if not k.startswith('_') and isinstance(v, (int, float, bool, str))]
+ exec(open('configurator.py').read()) # overrides from command line or config file
+ config = {k: globals()[k] for k in config_keys} # will be useful for logging
+ # -----------------------------------------------------------------------------
+
+
+ verbose_logfile = os.path.join(out_dir, "verbose_log.txt")
+ performance_file = os.path.join(out_dir, "perf_log.txt")
+
+ random = np.random.default_rng()
+
+ meta_path = os.path.join('data', dataset, 'meta.pkl')
+ print(f"Loading meta from {meta_path}...")
+ with open(meta_path, 'rb') as f:
+     meta = pickle.load(f)
+ stoi, itos = meta['stoi'], meta['itos']
+ char_to_token = meta["char_to_token"]
+ chars_to_skip = meta["chars_to_skip"]
+
+ def encode(s):
+     encoded = []
+     skip = 0
+     for char in s:
+         if skip:
+             skip -= 1
+             continue
+         else:
+             skip = chars_to_skip[char]
+             encoded.append(stoi[char_to_token[char]])
+     return encoded
+
+ def decode(l):
+     return ''.join([itos[i] for i in l])
+
+ def str_array_to_sorted_array(output_string):
+     template = '{{"values": {}}}'
+     array = np.array(json.loads(template.format(output_string))["values"], dtype = np.uint16)
+     array.sort()
+     return array
+
+ def generate_validation_list(max_int = 99,
+                              force_list_length = None,
+                              max_list_length_chars = None):
+     if max_list_length_chars is None:
+         max_list_length_chars = int(np.floor(0.5*block_size - 2))
+     max_random_length = int(np.floor(0.5*max_list_length_chars))
+     if force_list_length is None:
+         list_length_ints = random.integers(0, max_random_length)
+     else:
+         list_length_ints = np.min([force_list_length, max_random_length])
+     random_shuffled = random.integers(0, max_int,
+                                       size = list_length_ints,
+                                       endpoint = False)
+     # Crop based on string length
+     shuffled_str = np.array2string(random_shuffled,
+                                    max_line_width = max_list_length_chars*2,
+                                    separator = ',').replace(" ","")
+     shuffled_str = shuffled_str[:np.min([len(shuffled_str), max_list_length_chars - 1])]
+     shuffled_str = shuffled_str[:-1].strip()
+
+     if shuffled_str[-1] in [",", "\n"]:
+         shuffled_str = shuffled_str[:-1]
+     shuffled_str += "]"
+     sorted_str = np.array2string(str_array_to_sorted_array(shuffled_str),
+                                  max_line_width = max_list_length_chars*2,
+                                  separator = ',').replace(" ","")
+     input_line = "(" + shuffled_str[1:-1] + "): ["
+     correct_output = sorted_str[1:-1] + "];\n"
+     return (input_line, correct_output)
+
+
+ def score_performance(model_output,
+                       correct_output,
+                       output_terminator = "];\n",
+                       blank_separator = "_",
+                       list_separator = ","):
+     errors = 0
+
+     model_output, _, _ = model_output.partition(blank_separator)
+
+     if output_terminator in correct_output and output_terminator not in model_output:
+         errors += 2
+
+     correct_output, _, _ = correct_output.partition(output_terminator)
+     model_output, _, _ = model_output.partition(output_terminator)
+
+     correct_output = correct_output.split(list_separator)
+     model_output = model_output.split(list_separator)
+     min_length = np.min([len(correct_output), len(model_output)])
+
+     length_error = np.abs(len(correct_output) - len(model_output))
+     errors += length_error
+
+     for entry in range(min_length):
+         if model_output[entry] != correct_output[entry]:
+             errors += 1
+
+     return errors, errors/float(len(correct_output) + 1), len(correct_output)
+
+
+ def evaluate_performance(model,
+                          force_list_length = None,
+                          max_list_length_chars = None,
+                          output_separator = "): [",
+                          output_terminator = "];\n",
+                          list_separator = ","):
+
+     input_line, correct_output = generate_validation_list(force_list_length = force_list_length,
+                                                           max_list_length_chars = max_list_length_chars)
+
+     max_new_tokens = len(correct_output) + 10
+
+     temperature = 0.0001
+
+     start_ids = encode(input_line)
+     x = (torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...])
+
+     y = model.generate(x, max_new_tokens, temperature=temperature, top_k=15)
+     model_output = str(decode(y[0].tolist())).partition(output_separator)[-1]
+
+     error_abs, error_rel, correct_length = score_performance(model_output, correct_output,
+                                                               output_terminator = output_terminator,
+                                                               list_separator = list_separator)
+
+     return (input_line, correct_output, model_output, error_abs, error_rel, correct_length)
+
+ def log_performance_verbose(model, iter_number,
+                             n = 10,
+                             verbose_log_file = None,
+                             performance_log_file = None):
+
+     max_list_length_chars = int(np.floor(0.5*block_size - 2))
+     max_random_length = int(np.floor(0.5*max_list_length_chars))
+     force_list_lengths = np.linspace(int(np.floor(0.2*max_random_length)),
+                                      max_random_length,
+                                      n, dtype = np.int64)
+
+
+     errors_abs = []
+     errors_rel = []
+     with open(verbose_log_file, 'a') as log_file:
+         log_file.write("\n\n_____ {} ______\n".format(iter_number))
+
+     with open(performance_log_file, 'a') as perf_file:
+         perf_file.write("{}:".format(iter_number))
+
+     list_length_total = 0
+     best_rel_error = np.inf
+     worst_rel_error = -np.inf
+     worst_abs_error = -1
+     worst_list_length = -1
+     best_abs_error = -1
+     best_list_length = -1
+     for n_ind, force_list_length in enumerate(force_list_lengths):
+         input_line, correct_output, model_output,\
+             error_abs, error_rel, correct_length = evaluate_performance(model, force_list_length = force_list_length)
+         errors_abs.append(error_abs)
+         errors_rel.append(error_rel)
+         list_length_total += correct_length
+
+         if(error_rel > worst_rel_error):
+             worst_rel_error = error_rel
+             worst_abs_error = error_abs
+             worst_list_length = correct_length
+         if(error_rel < best_rel_error):
+             best_rel_error = error_rel
+             best_abs_error = error_abs
+             best_list_length = correct_length
+
+         with open(verbose_log_file, 'a') as log_file:
+             log_file.write("\n\tINPUT: {}\n\tEXAMPLE: {}\n\t OUTPUT: {}\n\tERRORS:{} / {} ({:.2f}%)\n".format(input_line,
+                                                                                                               correct_output,
+                                                                                                               model_output,
+                                                                                                               error_abs,
+                                                                                                               correct_length,
+                                                                                                               error_rel*100.0))
+         with open(performance_log_file, 'a') as perf_file:
+             if n_ind > 0:
+                 perf_file.write(",")
+             perf_file.write(" {} / {} ({:.2f}%)".format(error_abs, correct_length, error_rel*100.0))
+     with open(performance_log_file, 'a') as perf_file:
+         perf_file.write("\n")
+     print("ITER {}: total score is {} errors / {} ({:.2f}%)".format(iter_number,
+                                                                     np.sum(errors_abs),
+                                                                     np.sum(list_length_total),
+                                                                     100.0*np.mean(errors_rel)))
+     print("\t Best: {} errors / {} ({:.2f}%)".format(best_abs_error,
+                                                      best_list_length,
+                                                      100.0*best_rel_error))
+     print("\t Worst: {} errors / {} ({:.2f}%)".format(worst_abs_error,
+                                                       worst_list_length,
+                                                       100.0*worst_rel_error))
+
+ # various inits, derived attributes, I/O setup
+ ddp = int(os.environ.get('RANK', -1)) != -1 # is this a ddp run?
+ if ddp:
+     init_process_group(backend=backend)
+     ddp_rank = int(os.environ['RANK'])
+     ddp_local_rank = int(os.environ['LOCAL_RANK'])
+     ddp_world_size = int(os.environ['WORLD_SIZE'])
+     device = f'cuda:{ddp_local_rank}'
+     torch.cuda.set_device(device)
+     master_process = ddp_rank == 0 # this process will do logging, checkpointing etc.
+     seed_offset = ddp_rank # each process gets a different seed
+     assert gradient_accumulation_steps % torch.cuda.device_count() == 0
+     gradient_accumulation_steps //= torch.cuda.device_count()
+ else:
+     # if not ddp, we are running on a single gpu, and one process
+     master_process = True
+     seed_offset = 0
+     ddp_world_size = 1
+ tokens_per_iter = gradient_accumulation_steps * ddp_world_size * batch_size * block_size
+ print(f"tokens per iteration will be: {tokens_per_iter:,}")
+
+ if master_process:
+     os.makedirs(out_dir, exist_ok=True)
+ torch.manual_seed(1337 + seed_offset)
+ torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
+ torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
+ device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast
+ # note: float16 data type will automatically use a GradScaler
+ ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
+ ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
+
+ # poor man's data loader
+ data_dir = os.path.join('data', dataset)
+ train_data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
+ val_data = np.memmap(os.path.join(data_dir, 'val.bin'), dtype=np.uint16, mode='r')
+ def get_batch(split):
+     data = train_data if split == 'train' else val_data
+     ix = torch.randint(len(data) - block_size, (batch_size,))
+     x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in ix])
+     y = torch.stack([torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in ix])
+     if device_type == 'cuda':
+         # pin arrays x,y, which allows us to move them to GPU asynchronously (non_blocking=True)
+         x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
+     else:
+         x, y = x.to(device), y.to(device)
+     return x, y
+
+ # init these up here, can override if init_from='resume' (i.e. from a checkpoint)
+ iter_num = 0
+ best_val_loss = 1e9
+
+ # attempt to derive vocab_size from the dataset
+ meta_path = os.path.join(data_dir, 'meta.pkl')
+ meta_vocab_size = None
+ if os.path.exists(meta_path):
+     with open(meta_path, 'rb') as f:
+         meta = pickle.load(f)
+     meta_vocab_size = meta['vocab_size']
+     print(f"found vocab_size = {meta_vocab_size} (inside {meta_path})")
+
+ # model init
+ model_args = dict(n_layer=n_layer, n_head=n_head, n_embd=n_embd, block_size=block_size,
+                   bias=bias, vocab_size=None, dropout=dropout) # start with model_args from command line
+ if init_from == 'scratch':
+     # init a new model from scratch
+     print("Initializing a new model from scratch")
+     # determine the vocab size we'll use for from-scratch training
+     if meta_vocab_size is None:
+         print("defaulting to vocab_size of GPT-2 to 50304 (50257 rounded up for efficiency)")
+     model_args['vocab_size'] = meta_vocab_size if meta_vocab_size is not None else 50304
+     gptconf = GPTConfig(**model_args)
+     model = GPT(gptconf)
+ elif init_from == 'resume':
+     print(f"Resuming training from {out_dir}")
+     # resume training from a checkpoint.
+     ckpt_path = os.path.join(out_dir, 'ckpt.pt')
+     checkpoint = torch.load(ckpt_path, map_location=device)
+     checkpoint_model_args = checkpoint['model_args']
+     # force these config attributes to be equal otherwise we can't even resume training
+     # the rest of the attributes (e.g. dropout) can stay as desired from command line
+     for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']:
+         model_args[k] = checkpoint_model_args[k]
+     # create the model
+     gptconf = GPTConfig(**model_args)
+     model = GPT(gptconf)
+     state_dict = checkpoint['model']
+     # fix the keys of the state dictionary :(
+     # honestly no idea how checkpoints sometimes get this prefix, have to debug more
+     unwanted_prefix = '_orig_mod.'
+     for k,v in list(state_dict.items()):
+         if k.startswith(unwanted_prefix):
+             state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
+     model.load_state_dict(state_dict)
+     iter_num = checkpoint['iter_num']
+     best_val_loss = checkpoint['best_val_loss']
+ elif init_from.startswith('gpt2'):
+     print(f"Initializing from OpenAI GPT-2 weights: {init_from}")
+     # initialize from OpenAI GPT-2 weights
+     override_args = dict(dropout=dropout)
+     model = GPT.from_pretrained(init_from, override_args)
+     # read off the created config params, so we can store them into checkpoint correctly
+     for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']:
+         model_args[k] = getattr(model.config, k)
+ # crop down the model block size if desired, using model surgery
+ if block_size < model.config.block_size:
+     model.crop_block_size(block_size)
+     model_args['block_size'] = block_size # so that the checkpoint will have the right value
+ model.to(device)
+
+ # initialize a GradScaler. If enabled=False scaler is a no-op
+ scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))
+
+ # optimizer
+ optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device_type)
+ if init_from == 'resume':
+     optimizer.load_state_dict(checkpoint['optimizer'])
+ checkpoint = None # free up memory
+
+ # compile the model
+ if compile:
+     print("compiling the model... (takes a ~minute)")
+     unoptimized_model = model
+     model = torch.compile(model) # requires PyTorch 2.0
+
+ # wrap model into DDP container
+ if ddp:
+     model = DDP(model, device_ids=[ddp_local_rank])
+
+ # helps estimate an arbitrarily accurate loss over either split using many batches
+ @torch.no_grad()
+ def estimate_loss():
+     out = {}
+     model.eval()
+     for split in ['train', 'val']:
+         losses = torch.zeros(eval_iters)
+         for k in range(eval_iters):
+             X, Y = get_batch(split)
+             with ctx:
+                 logits, loss = model(X, Y)
+             losses[k] = loss.item()
+         out[split] = losses.mean()
+     model.train()
+     return out
+
+ # learning rate decay scheduler (cosine with warmup)
+ def get_lr(it):
+     # 1) linear warmup for warmup_iters steps
+     if it < warmup_iters:
+         return learning_rate * it / warmup_iters
+     # 2) if it > lr_decay_iters, return min learning rate
+     if it > lr_decay_iters:
+         return min_lr
+     # 3) in between, use cosine decay down to min learning rate
+     decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
+     assert 0 <= decay_ratio <= 1
+     coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) # coeff ranges 0..1
+     return min_lr + coeff * (learning_rate - min_lr)
+
+ # logging
+ if wandb_log and master_process:
+     import wandb
+     wandb.init(project=wandb_project, name=wandb_run_name, config=config)
+
+ # training loop
+ X, Y = get_batch('train') # fetch the very first batch
+ t0 = time.time()
+ local_iter_num = 0 # number of iterations in the lifetime of this process
+ raw_model = model.module if ddp else model # unwrap DDP container if needed
+ running_mfu = -1.0
+ while True:
+
+     # determine and set the learning rate for this iteration
+     lr = get_lr(iter_num) if decay_lr else learning_rate
+     for param_group in optimizer.param_groups:
+         param_group['lr'] = lr
+
+     # evaluate the loss on train/val sets and write checkpoints
+     if iter_num % eval_interval == 0 and master_process:
+         losses = estimate_loss()
+         print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
+         if wandb_log:
+             wandb.log({
+                 "iter": iter_num,
+                 "train/loss": losses['train'],
+                 "val/loss": losses['val'],
+                 "lr": lr,
+                 "mfu": running_mfu*100, # convert to percentage
+             })
+         if losses['val'] < best_val_loss or always_save_checkpoint:
+             best_val_loss = losses['val']
+             if iter_num > 0:
+                 checkpoint = {
+                     'model': raw_model.state_dict(),
+                     'optimizer': optimizer.state_dict(),
+                     'model_args': model_args,
+                     'iter_num': iter_num,
+                     'best_val_loss': best_val_loss,
+                     'config': config,
+                 }
+                 print(f"saving checkpoint to {out_dir}")
+                 torch.save(checkpoint, os.path.join(out_dir, 'ckpt.pt'))
+     if iter_num == 0 and eval_only:
+         break
+
+     # forward backward update, with optional gradient accumulation to simulate larger batch size
+     # and using the GradScaler if data type is float16
+     for micro_step in range(gradient_accumulation_steps):
+         if ddp:
+             # in DDP training we only need to sync gradients at the last micro step.
+             # the official way to do this is with model.no_sync() context manager, but
+             # I really dislike that this bloats the code and forces us to repeat code
+             # looking at the source of that context manager, it just toggles this variable
+             model.require_backward_grad_sync = (micro_step == gradient_accumulation_steps - 1)
+         with ctx:
+             logits, loss = model(X, Y)
+             loss = loss / gradient_accumulation_steps # scale the loss to account for gradient accumulation
+         # immediately async prefetch next batch while model is doing the forward pass on the GPU
+         X, Y = get_batch('train')
+         # backward pass, with gradient scaling if training in fp16
+         scaler.scale(loss).backward()
+     # clip the gradient
+     if grad_clip != 0.0:
+         scaler.unscale_(optimizer)
+         torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
+     # step the optimizer and scaler if training in fp16
+     scaler.step(optimizer)
+     scaler.update()
+     # flush the gradients as soon as we can, no need for this memory anymore
+     optimizer.zero_grad(set_to_none=True)
+
+     # timing and logging
+     t1 = time.time()
+     dt = t1 - t0
+     t0 = t1
+     if iter_num % log_interval == 0 and master_process:
+         # get loss as float. note: this is a CPU-GPU sync point
+         # scale up to undo the division above, approximating the true total loss (exact would have been a sum)
+         lossf = loss.item() * gradient_accumulation_steps
+         if local_iter_num >= 5: # let the training loop settle a bit
+             mfu = raw_model.estimate_mfu(batch_size * gradient_accumulation_steps, dt)
+             running_mfu = mfu if running_mfu == -1.0 else 0.9*running_mfu + 0.1*mfu
+         print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms, mfu {running_mfu*100:.2f}%")
+
+     if (iter_num % verbose_log_interval == 0) and master_process and (local_iter_num > 0):
+         log_performance_verbose(raw_model, iter_num,
+                                 verbose_log_file = verbose_logfile,
+                                 performance_log_file = performance_file)
+
+     iter_num += 1
+     local_iter_num += 1
+
+     # termination conditions
+     if iter_num > max_iters:
+         break
+
+ if ddp:
+     destroy_process_group()
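
For reference, log_performance_verbose() above writes perf_log.txt as one line per evaluation, e.g. "2000: 3 / 25 (12.00%), 0 / 40 (0.00%), ...". A small sketch for reading it back (not part of the training script; the path assumes the out_dir from train_sort.py):

    import re

    def read_perf_log(path = "out-sort-lists/perf_log.txt"):
        # returns {iteration: [(errors, list_length, percent_error), ...]}
        results = {}
        with open(path) as f:
            for line in f:
                iter_str, sep, entries = line.partition(":")
                if not sep:
                    continue
                scores = re.findall(r"(\d+) / (\d+) \(([\d.]+)%\)", entries)
                results[int(iter_str)] = [(int(e), int(t), float(p)) for e, t, p in scores]
        return results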