db committed on
Commit
0d40b6e
1 Parent(s): 9ac7846
Files changed (1)
  1. app.py +60 -381
app.py CHANGED
@@ -1,410 +1,89 @@
- # saves the openwebtext dataset to a binary file for training. following was helpful:
- # https://github.com/HazyResearch/flash-attention/blob/main/training/src/datamodules/language_modeling_hf.py
-
- import os
- from tqdm import tqdm
- import numpy as np
- import tiktoken
- from datasets import load_dataset # huggingface datasets
-
- # number of workers in .map() call
- # good number to use is ~order number of cpu cores // 2
- num_proc = 8
-
- # takes 54GB in huggingface .cache dir, about 8M documents (8,013,769)
- dataset = load_dataset("openwebtext")
-
- # owt by default only contains the 'train' split, so create a test split
- split_dataset = dataset["train"].train_test_split(test_size=0.0005, seed=2357, shuffle=True)
- split_dataset['val'] = split_dataset.pop('test') # rename the test split to val
-
- # this results in:
- # >>> split_dataset
- # DatasetDict({
- #     train: Dataset({
- #         features: ['text'],
- #         num_rows: 8009762
- #     })
- #     val: Dataset({
- #         features: ['text'],
- #         num_rows: 4007
- #     })
- # })
-
- # we now want to tokenize the dataset. first define the encoding function (gpt2 bpe)
- enc = tiktoken.get_encoding("gpt2")
- def process(example):
-     ids = enc.encode_ordinary(example['text']) # encode_ordinary ignores any special tokens
-     ids.append(enc.eot_token) # add the end of text token, e.g. 50256 for gpt2 bpe
-     # note: I think eot should be prepended not appended... hmm. it's called "eot" though...
-     out = {'ids': ids, 'len': len(ids)}
-     return out
-
- # tokenize the dataset
- tokenized = split_dataset.map(
-     process,
-     remove_columns=['text'],
-     desc="tokenizing the splits",
-     num_proc=num_proc,
- )
-
- # concatenate all the ids in each dataset into one large file we can use for training
- for split, dset in tokenized.items():
-     arr_len = np.sum(dset['len'])
-     filename = os.path.join(os.path.dirname(__file__), f'{split}.bin')
-     dtype = np.uint16 # (can do since enc.max_token_value == 50256 is < 2**16)
-     arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,))
-     total_batches = 1024
-
-     idx = 0
-     for batch_idx in tqdm(range(total_batches), desc=f'writing {filename}'):
-         # Batch together samples for faster write
-         batch = dset.shard(num_shards=total_batches, index=batch_idx, contiguous=True).with_format('numpy')
-         arr_batch = np.concatenate(batch['ids'])
-         # Write into mmap
-         arr[idx : idx + len(arr_batch)] = arr_batch
-         idx += len(arr_batch)
-     arr.flush()
-
- # train.bin is ~17GB, val.bin ~8.5MB
- # train has ~9B tokens (9,035,582,198)
- # val has ~4M tokens (4,434,897)
-
- # to read the bin files later, e.g. with numpy:
- # m = np.memmap('train.bin', dtype=np.uint16, mode='r')
-
-
- ##########################################################################################
-
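For context on the format the removed block above writes out: a minimal sketch (not part of the diff) of how such a .bin file can be read back and sliced into an input/target pair, assuming the uint16 token layout above; the block_size and offset here are illustrative.

import numpy as np
import torch

# read the memmapped token stream written by the prepare step (uint16 token ids)
data = np.memmap('train.bin', dtype=np.uint16, mode='r')

block_size = 1024  # illustrative context length
i = 0              # start offset of one training example
# inputs are tokens [i, i+block_size), targets are the same window shifted by one
x = torch.from_numpy(data[i:i + block_size].astype(np.int64))
y = torch.from_numpy(data[i + 1:i + 1 + block_size].astype(np.int64))
print(x.shape, y.shape)  # torch.Size([1024]) torch.Size([1024])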
  """
- This training script can be run both on a single gpu in debug mode,
- and also in a larger training run with distributed data parallel (ddp).
-
- To run on a single GPU, example:
- $ python train.py --batch_size=32 --compile=False
-
- To run with DDP on 4 gpus on 1 node, example:
- $ torchrun --standalone --nproc_per_node=4 train.py
-
- To run with DDP on 4 gpus across 2 nodes, example:
- - Run on the first (master) node with example IP 123.456.123.456:
- $ torchrun --nproc_per_node=8 --nnodes=2 --node_rank=0 --master_addr=123.456.123.456 --master_port=1234 train.py
- - Run on the worker node:
- $ torchrun --nproc_per_node=8 --nnodes=2 --node_rank=1 --master_addr=123.456.123.456 --master_port=1234 train.py
- (If your cluster does not have Infiniband interconnect prepend NCCL_IB_DISABLE=1)
  """
-
  import os
- import time
- import math
  import pickle
  from contextlib import nullcontext
-
- import numpy as np
  import torch
- from torch.nn.parallel import DistributedDataParallel as DDP
- from torch.distributed import init_process_group, destroy_process_group
-
  from model import GPTConfig, GPT
 
  # -----------------------------------------------------------------------------
- # default config values designed to train a gpt2 (124M) on OpenWebText
- # I/O
- out_dir = 'out'
- eval_interval = 2000
- log_interval = 1
- eval_iters = 200
- eval_only = False # if True, script exits right after the first eval
- always_save_checkpoint = True # if True, always save a checkpoint after each eval
- init_from = 'scratch' # 'scratch' or 'resume' or 'gpt2*'
- # wandb logging
- wandb_log = False # disabled by default
- wandb_project = 'owt'
- wandb_run_name = 'gpt2' # 'run' + str(time.time())
- # data
- dataset = 'openwebtext'
- gradient_accumulation_steps = 5 * 8 # used to simulate larger batch sizes
- batch_size = 12 # if gradient_accumulation_steps > 1, this is the micro-batch size
- block_size = 1024
- # model
- n_layer = 12
- n_head = 12
- n_embd = 768
- dropout = 0.0 # for pretraining 0 is good, for finetuning try 0.1+
- bias = False # do we use bias inside LayerNorm and Linear layers?
- # adamw optimizer
- learning_rate = 6e-4 # max learning rate
- max_iters = 600000 # total number of training iterations
- weight_decay = 1e-1
- beta1 = 0.9
- beta2 = 0.95
- grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0
- # learning rate decay settings
- decay_lr = True # whether to decay the learning rate
- warmup_iters = 2000 # how many steps to warm up for
- lr_decay_iters = 600000 # should be ~= max_iters per Chinchilla
- min_lr = 6e-5 # minimum learning rate, should be ~= learning_rate/10 per Chinchilla
- # DDP settings
- backend = 'nccl' # 'nccl', 'gloo', etc.
- # system
- device = 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks
- dtype = 'bfloat16' # 'float32', 'bfloat16', or 'float16', the latter will auto implement a GradScaler
- compile = True # use PyTorch 2.0 to compile the model to be faster
- # -----------------------------------------------------------------------------
- config_keys = [k for k,v in globals().items() if not k.startswith('_') and isinstance(v, (int, float, bool, str))]
  exec(open('configurator.py').read()) # overrides from command line or config file
- config = {k: globals()[k] for k in config_keys} # will be useful for logging
  # -----------------------------------------------------------------------------
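The exec(open('configurator.py').read()) line pulls command-line overrides into the globals defined above. configurator.py itself is not part of this diff; a minimal sketch of the kind of --key=value override logic such a file might implement (hypothetical, for illustration only, not the actual configurator.py):

# hypothetical stand-in for configurator.py: apply --key=value overrides to globals()
import sys
from ast import literal_eval

for arg in sys.argv[1:]:
    if arg.startswith('--') and '=' in arg:
        key, val = arg[2:].split('=', 1)
        if key in globals():
            try:
                val = literal_eval(val)  # e.g. "32" -> 32, "False" -> False
            except (SyntaxError, ValueError):
                pass  # keep the raw string if it doesn't parse as a literal
            globals()[key] = val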
 
- # various inits, derived attributes, I/O setup
- ddp = int(os.environ.get('RANK', -1)) != -1 # is this a ddp run?
- if ddp:
-     init_process_group(backend=backend)
-     ddp_rank = int(os.environ['RANK'])
-     ddp_local_rank = int(os.environ['LOCAL_RANK'])
-     ddp_world_size = int(os.environ['WORLD_SIZE'])
-     device = f'cuda:{ddp_local_rank}'
-     torch.cuda.set_device(device)
-     master_process = ddp_rank == 0 # this process will do logging, checkpointing etc.
-     seed_offset = ddp_rank # each process gets a different seed
-     assert gradient_accumulation_steps % torch.cuda.device_count() == 0
-     gradient_accumulation_steps //= torch.cuda.device_count()
- else:
-     # if not ddp, we are running on a single gpu, and one process
-     master_process = True
-     seed_offset = 0
-     ddp_world_size = 1
- tokens_per_iter = gradient_accumulation_steps * ddp_world_size * batch_size * block_size
- print(f"tokens per iteration will be: {tokens_per_iter:,}")
-
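Worked out with the defaults above on a single GPU (ddp_world_size = 1): tokens_per_iter = (5 * 8) * 1 * 12 * 1024 = 491,520 tokens per optimizer step.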
- if master_process:
-     os.makedirs(out_dir, exist_ok=True)
- torch.manual_seed(1337 + seed_offset)
  torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
  torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
  device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast
- # note: float16 data type will automatically use a GradScaler
  ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
- ctx = nullcontext() if device_type == 'cpu' else torch.cuda.amp.autocast(dtype=torch.float16)
-
- # poor man's data loader
- data_dir = os.path.join('data', dataset)
- train_data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
- val_data = np.memmap(os.path.join(data_dir, 'val.bin'), dtype=np.uint16, mode='r')
- def get_batch(split):
-     data = train_data if split == 'train' else val_data
-     ix = torch.randint(len(data) - block_size, (batch_size,))
-     x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in ix])
-     y = torch.stack([torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in ix])
-     if device_type == 'cuda':
-         # pin arrays x,y, which allows us to move them to GPU asynchronously (non_blocking=True)
-         x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
-     else:
-         x, y = x.to(device), y.to(device)
-     return x, y
 
- # init these up here, can override if init_from='resume' (i.e. from a checkpoint)
- iter_num = 0
- best_val_loss = 1e9
-
- # attempt to derive vocab_size from the dataset
- meta_path = os.path.join(data_dir, 'meta.pkl')
- meta_vocab_size = None
- if os.path.exists(meta_path):
-     with open(meta_path, 'rb') as f:
-         meta = pickle.load(f)
-     meta_vocab_size = meta['vocab_size']
-     print(f"found vocab_size = {meta_vocab_size} (inside {meta_path})")
-
- # model init
- model_args = dict(n_layer=n_layer, n_head=n_head, n_embd=n_embd, block_size=block_size,
-                   bias=bias, vocab_size=None, dropout=dropout) # start with model_args from command line
- if init_from == 'scratch':
-     # init a new model from scratch
-     print("Initializing a new model from scratch")
-     # determine the vocab size we'll use for from-scratch training
-     if meta_vocab_size is None:
-         print("defaulting to vocab_size of GPT-2 to 50304 (50257 rounded up for efficiency)")
-     model_args['vocab_size'] = meta_vocab_size if meta_vocab_size is not None else 50304
-     gptconf = GPTConfig(**model_args)
-     model = GPT(gptconf)
- elif init_from == 'resume':
-     print(f"Resuming training from {out_dir}")
-     # resume training from a checkpoint.
      ckpt_path = os.path.join(out_dir, 'ckpt.pt')
      checkpoint = torch.load(ckpt_path, map_location=device)
-     checkpoint_model_args = checkpoint['model_args']
-     # force these config attributes to be equal otherwise we can't even resume training
-     # the rest of the attributes (e.g. dropout) can stay as desired from command line
-     for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']:
-         model_args[k] = checkpoint_model_args[k]
-     # create the model
-     gptconf = GPTConfig(**model_args)
      model = GPT(gptconf)
      state_dict = checkpoint['model']
-     # fix the keys of the state dictionary :(
-     # honestly no idea how checkpoints sometimes get this prefix, have to debug more
      unwanted_prefix = '_orig_mod.'
      for k,v in list(state_dict.items()):
          if k.startswith(unwanted_prefix):
              state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
      model.load_state_dict(state_dict)
-     iter_num = checkpoint['iter_num']
-     best_val_loss = checkpoint['best_val_loss']
  elif init_from.startswith('gpt2'):
-     print(f"Initializing from OpenAI GPT-2 weights: {init_from}")
-     # initialize from OpenAI GPT-2 weights
-     override_args = dict(dropout=dropout)
-     model = GPT.from_pretrained(init_from, override_args)
-     # read off the created config params, so we can store them into checkpoint correctly
-     for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']:
-         model_args[k] = getattr(model.config, k)
- # crop down the model block size if desired, using model surgery
- if block_size < model.config.block_size:
-     model.crop_block_size(block_size)
-     model_args['block_size'] = block_size # so that the checkpoint will have the right value
- model.to(device)
-
- # initialize a GradScaler. If enabled=False scaler is a no-op
- scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))
-
- # optimizer
- optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device_type)
- if init_from == 'resume':
-     optimizer.load_state_dict(checkpoint['optimizer'])
- checkpoint = None # free up memory
 
- # compile the model
  if compile:
-     print("compiling the model... (takes a ~minute)")
-     unoptimized_model = model
-     model = torch.compile(model) # requires PyTorch 2.0
-
- # wrap model into DDP container
- if ddp:
-     model = DDP(model, device_ids=[ddp_local_rank])
-
- # helps estimate an arbitrarily accurate loss over either split using many batches
- @torch.no_grad()
- def estimate_loss():
-     out = {}
-     model.eval()
-     for split in ['train', 'val']:
-         losses = torch.zeros(eval_iters)
-         for k in range(eval_iters):
-             X, Y = get_batch(split)
-             with ctx:
-                 logits, loss = model(X, Y)
-             losses[k] = loss.item()
-         out[split] = losses.mean()
-     model.train()
-     return out
-
- # learning rate decay scheduler (cosine with warmup)
- def get_lr(it):
-     # 1) linear warmup for warmup_iters steps
-     if it < warmup_iters:
-         return learning_rate * it / warmup_iters
-     # 2) if it > lr_decay_iters, return min learning rate
-     if it > lr_decay_iters:
-         return min_lr
-     # 3) in between, use cosine decay down to min learning rate
-     decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
-     assert 0 <= decay_ratio <= 1
-     coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) # coeff ranges 0..1
-     return min_lr + coeff * (learning_rate - min_lr)
-
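For reference, with the removed defaults above (warmup_iters = 2000, lr_decay_iters = 600000, learning_rate = 6e-4, min_lr = 6e-5): the schedule warms up linearly from 0 to 6e-4 over the first 2000 iterations, so get_lr(2000) = 6e-4; at the decay midpoint, get_lr(301000) = 6e-5 + 0.5 * (6e-4 - 6e-5) = 3.3e-4; and from iteration 600000 onward it returns min_lr = 6e-5.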
- # logging
- if wandb_log and master_process:
-     import wandb
-     wandb.init(project=wandb_project, name=wandb_run_name, config=config)
-
- # training loop
- X, Y = get_batch('train') # fetch the very first batch
- t0 = time.time()
- local_iter_num = 0 # number of iterations in the lifetime of this process
- raw_model = model.module if ddp else model # unwrap DDP container if needed
- running_mfu = -1.0
- while True:
-
-     # determine and set the learning rate for this iteration
-     lr = get_lr(iter_num) if decay_lr else learning_rate
-     for param_group in optimizer.param_groups:
-         param_group['lr'] = lr
-
-     # evaluate the loss on train/val sets and write checkpoints
-     if iter_num % eval_interval == 0 and master_process:
-         losses = estimate_loss()
-         print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
-         if wandb_log:
-             wandb.log({
-                 "iter": iter_num,
-                 "train/loss": losses['train'],
-                 "val/loss": losses['val'],
-                 "lr": lr,
-                 "mfu": running_mfu*100, # convert to percentage
-             })
-         if losses['val'] < best_val_loss or always_save_checkpoint:
-             best_val_loss = losses['val']
-             if iter_num > 0:
-                 checkpoint = {
-                     'model': raw_model.state_dict(),
-                     'optimizer': optimizer.state_dict(),
-                     'model_args': model_args,
-                     'iter_num': iter_num,
-                     'best_val_loss': best_val_loss,
-                     'config': config,
-                 }
-                 print(f"saving checkpoint to {out_dir}")
-                 torch.save(checkpoint, os.path.join(out_dir, 'ckpt.pt'))
-     if iter_num == 0 and eval_only:
-         break
-
-     # forward backward update, with optional gradient accumulation to simulate larger batch size
-     # and using the GradScaler if data type is float16
-     for micro_step in range(gradient_accumulation_steps):
-         if ddp:
-             # in DDP training we only need to sync gradients at the last micro step.
-             # the official way to do this is with model.no_sync() context manager, but
-             # I really dislike that this bloats the code and forces us to repeat code
-             # looking at the source of that context manager, it just toggles this variable
-             model.require_backward_grad_sync = (micro_step == gradient_accumulation_steps - 1)
-         with ctx:
-             logits, loss = model(X, Y)
-             loss = loss / gradient_accumulation_steps # scale the loss to account for gradient accumulation
-         # immediately async prefetch next batch while model is doing the forward pass on the GPU
-         X, Y = get_batch('train')
-         # backward pass, with gradient scaling if training in fp16
-         scaler.scale(loss).backward()
-     # clip the gradient
-     if grad_clip != 0.0:
-         scaler.unscale_(optimizer)
-         torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
-     # step the optimizer and scaler if training in fp16
-     scaler.step(optimizer)
-     scaler.update()
-     # flush the gradients as soon as we can, no need for this memory anymore
-     optimizer.zero_grad(set_to_none=True)
-
-     # timing and logging
-     t1 = time.time()
-     dt = t1 - t0
-     t0 = t1
-     if iter_num % log_interval == 0 and master_process:
-         # get loss as float. note: this is a CPU-GPU sync point
-         # scale up to undo the division above, approximating the true total loss (exact would have been a sum)
-         lossf = loss.item() * gradient_accumulation_steps
-         if local_iter_num >= 5: # let the training loop settle a bit
-             mfu = raw_model.estimate_mfu(batch_size * gradient_accumulation_steps, dt)
-             running_mfu = mfu if running_mfu == -1.0 else 0.9*running_mfu + 0.1*mfu
-         print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms, mfu {running_mfu*100:.2f}%")
-     iter_num += 1
-     local_iter_num += 1
-
-     # termination conditions
-     if iter_num > max_iters:
-         break
-
- if ddp:
-     destroy_process_group()
-
  """
+ Sample from a trained model
  """
  import os
  import pickle
  from contextlib import nullcontext
  import torch
+ import tiktoken
  from model import GPTConfig, GPT
 
  # -----------------------------------------------------------------------------
+ init_from = 'gpt2-xl' # either 'resume' (from an out_dir) or a gpt2 variant (e.g. 'gpt2-xl')
+ out_dir = 'out' # ignored if init_from is not 'resume'
+ start = "Hi how are you?\n" # or "<|endoftext|>" or etc. Can also specify a file, use as: "FILE:prompt.txt"
+ num_samples = 10 # number of samples to draw
+ max_new_tokens = 500 # number of tokens generated in each sample
+ temperature = 0.8 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
+ top_k = 200 # retain only the top_k most likely tokens, clamp others to have 0 probability
+ seed = 1337
+ device = 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.
+ dtype = 'bfloat16' # 'float32' or 'bfloat16' or 'float16'
+ compile = False # use PyTorch 2.0 to compile the model to be faster
  exec(open('configurator.py').read()) # overrides from command line or config file
  # -----------------------------------------------------------------------------
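Because the same configurator.py override mechanism is reused here, any of the settings above can be changed from the command line when launching the script. A hypothetical invocation (flag values are illustrative):

$ python app.py --init_from=gpt2 --num_samples=2 --max_new_tokens=100 --device=cpu --dtype=float32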
 
+ torch.manual_seed(seed)
+ torch.cuda.manual_seed(seed)
  torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
  torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
  device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast
  ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
+ ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
 
+ # model
+ if init_from == 'resume':
+     # init from a model saved in a specific directory
      ckpt_path = os.path.join(out_dir, 'ckpt.pt')
      checkpoint = torch.load(ckpt_path, map_location=device)
+     gptconf = GPTConfig(**checkpoint['model_args'])
      model = GPT(gptconf)
      state_dict = checkpoint['model']
      unwanted_prefix = '_orig_mod.'
      for k,v in list(state_dict.items()):
          if k.startswith(unwanted_prefix):
              state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
      model.load_state_dict(state_dict)
  elif init_from.startswith('gpt2'):
+     # init from a given GPT-2 model
+     model = GPT.from_pretrained(init_from, dict(dropout=0.0))
 
+ model.eval()
+ model.to(device)
  if compile:
+     model = torch.compile(model) # requires PyTorch 2.0 (optional)
+
+ # look for the meta pickle in case it is available in the dataset folder
+ load_meta = False
+ if init_from == 'resume' and 'config' in checkpoint and 'dataset' in checkpoint['config']: # older checkpoints might not have these...
+     meta_path = os.path.join('data', checkpoint['config']['dataset'], 'meta.pkl')
+     load_meta = os.path.exists(meta_path)
+ if load_meta:
+     print(f"Loading meta from {meta_path}...")
+     with open(meta_path, 'rb') as f:
+         meta = pickle.load(f)
+     # TODO want to make this more general to arbitrary encoder/decoder schemes
+     stoi, itos = meta['stoi'], meta['itos']
+     encode = lambda s: [stoi[c] for c in s]
+     decode = lambda l: ''.join([itos[i] for i in l])
+ else:
+     # ok let's assume gpt-2 encodings by default
+     print("No meta.pkl found, assuming GPT-2 encodings...")
+     enc = tiktoken.get_encoding("gpt2")
+     encode = lambda s: enc.encode(s, allowed_special={"<|endoftext|>"})
+     decode = lambda l: enc.decode(l)
+
+ # encode the beginning of the prompt
+ if start.startswith('FILE:'):
+     with open(start[5:], 'r', encoding='utf-8') as f:
+         start = f.read()
+ start_ids = encode(start)
+ x = (torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...])
+
+ # run generation
+ with torch.no_grad():
+     with ctx:
+         for k in range(num_samples):
+             y = model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k)
+             print(decode(y[0].tolist()))
+             print('---------------')
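model.generate is defined in model.py and is not part of this diff. As a rough illustration of what the temperature and top_k settings above do at each decoding step, here is a generic sketch of temperature scaling plus top-k filtering (a common pattern, not necessarily the exact code in model.py):

import torch
import torch.nn.functional as F

def sample_next_token(logits, temperature=0.8, top_k=200):
    # logits: (batch, vocab_size) scores for the next token
    logits = logits / temperature                      # < 1.0 sharpens, > 1.0 flattens the distribution
    v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
    logits[logits < v[:, [-1]]] = -float('inf')        # keep only the top_k most likely tokens
    probs = F.softmax(logits, dim=-1)
    return torch.multinomial(probs, num_samples=1)     # (batch, 1) sampled token ids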