In [2]:
!pip install --upgrade pip
!pip install torch torchaudio torchvision --upgrade --index-url https://download.pytorch.org/whl/cu118
!pip install huggingface_hub tokenizers sentencepiece -r requirements.txt

[0mLooking in indexes: https://download.pytorch.org/whl/cu118
[0mCollecting lightning@ git+https://github.com/Lightning-AI/lightning@532c723c8584903dc719458d0ad52861d51bc395 (from -r requirements.txt (line 1))
 Using cached lightning-2.2.0.dev0-py3-none-any.whl
[0m

In [3]:
import torch
# Reports whether PyTorch can see a CUDA device; the boolean is shown as the cell output.
torch.cuda.is_available()

True

In [4]:
import glob
import math
import sys
import time
from pathlib import Path
from typing import Optional, Tuple, Union

import lightning as L
import torch
from lightning.fabric.loggers import CSVLogger
from lightning.fabric.strategies import FSDPStrategy
from torch.utils.data import DataLoader

# # support running without installing as a package
# wd = Path(__file__).parent.parent.resolve()
# sys.path.append(str(wd))

from tsai_gpt.model import GPT, Block, Config
from tsai_gpt.packed_dataset import CombinedDataset, PackedDataset
from tsai_gpt.speed_monitor import SpeedMonitorBase, estimate_flops, measure_flops
from tsai_gpt.speed_monitor import SpeedMonitorFabric as SpeedMonitor
from tsai_gpt.utils import chunked_cross_entropy, get_default_supported_precision, num_parameters, load_checkpoint

In [5]:
# Run configuration. These names are read as globals by the functions below and
# are also captured into `hparams`, so keep them as plain int/float/str values.
# Name passed to `Config.from_name` to select the model architecture.
model_name = "pythia-160m"
# Run name: used for the output sub-directory and for the CSV logger.
name = "redpajama"
# Checkpoints are written to out/<name>.
out_dir = Path("out") / name
# Checkpoint cadence, compared against the optimizer step count in `train`.
save_interval = 100
# Validation cadence, compared against the optimizer step count in `train`.
eval_interval = 1000
# Size of the loss buffer allocated in `validate`.
eval_iters = 100
# Console logging cadence (in iterations); also the CSV logger flush interval.
log_interval = 100

In [6]:
# Hyperparameters
# Peak learning rate: `get_lr` warms up to this value and then decays from it.
learning_rate = 6e-3
# Samples per optimizer step and samples per forward/backward pass, respectively.
batch_size = 32
micro_batch_size = 4
# Number of micro-batches whose gradients are accumulated before each optimizer step.
gradient_accumulation_steps = batch_size // micro_batch_size
# Guards against micro_batch_size > batch_size, which would make the quotient 0.
assert gradient_accumulation_steps > 0
#max_iters = 600000 # num_epochs * (epoch_size // micro_batch_size) // devices
# The training loop saves a checkpoint and stops once iter_num reaches this value.
max_iters = 25000
# AdamW settings passed to torch.optim.AdamW.
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
# Max gradient norm passed to `fabric.clip_gradients`.
grad_clip = 1.0
# When False the learning rate stays at `learning_rate` instead of following `get_lr`.
decay_lr = True
# Schedule shape used by `get_lr`: linear warmup, then cosine decay down to `min_lr`.
warmup_iters = 6000
lr_decay_iters = max_iters
min_lr = 6e-4

In [7]:
# Sampling proportions per data subset, taken from Table 1 of
# https://arxiv.org/pdf/2302.13971.pdf.
# Each entry is (filename prefix, weight); the weights are normalised to sum to 1
# when the combined dataset is built.
data_config = list(
    {
        "arxiv": 2.5,
        "book": 4.5,
        "c4": 15.0,
        "cc": 67.0,
        "github": 4.5,
        "stackexchange": 2.0,
        "wikipedia": 4.5,
    }.items()
)

In [8]:
# Snapshot of every int/float/str variable (bools included, since bool is an int)
# currently in the notebook namespace whose name does not start with "_".
# It is printed at launch and stored in checkpoints under the "hparams" key.
hparams = {k: v for k, v in locals().items() if isinstance(v, (int, float, str)) and not k.startswith("_")}
# CSV logger rooted at "out" with the run name as sub-directory, flushed every `log_interval` steps.
logger = CSVLogger("out", name, flush_logs_every_n_steps=log_interval)


def setup(
    devices: int = 4,
    train_data_dir: Path = Path("data/redpajama_sample"),
    val_data_dir: Optional[Path] = None,
    precision: Optional[str] = None,
    resume: Union[bool, Path] = False,
) -> None:
    """Create the Fabric object and launch `main` with it.

    Args:
        devices: Number of devices to train on. More than one selects an FSDP
            strategy that wraps and activation-checkpoints `Block` modules;
            otherwise the strategy is left to Fabric ("auto").
        train_data_dir: Directory holding the training data.
        val_data_dir: Directory holding the validation data, or None to skip validation.
        precision: Precision string handed to Fabric. When falsy, the value from
            `get_default_supported_precision(training=True)` is used.
        resume: Forwarded unchanged to `main` (True, False, or a checkpoint path).
    """
    if not precision:
        precision = get_default_supported_precision(training=True)

    multi_device = devices > 1
    strategy = (
        FSDPStrategy(
            auto_wrap_policy={Block},
            activation_checkpointing_policy={Block},
            state_dict_type="full",
            limit_all_gathers=True,
            cpu_offload=False,
        )
        if multi_device
        else "auto"
    )

    fabric = L.Fabric(devices=devices, strategy=strategy, precision=precision, loggers=logger)
    # Record the hyperparameter snapshot in the run output before training starts.
    fabric.print(hparams)
    fabric.launch(main, train_data_dir, val_data_dir, resume)

In [9]:
# Debugging hook: `main` declares this name `global` so the model could be exposed
# to the notebook; it stays None unless an assignment to it is enabled there.
model_copy = None

In [10]:
def main(fabric: L.Fabric, train_data_dir: Path, val_data_dir: Path, resume: Union[bool, Path]) -> None:
    """Build dataloaders, model and optimizer, optionally resume, then run training.

    Args:
        fabric: The launched Fabric instance used for device placement, logging and checkpoint I/O.
        train_data_dir: Directory holding the training data.
        val_data_dir: Directory holding the validation data; a falsy value disables validation.
        resume: ``True`` resumes from the checkpoint in ``out_dir`` with the highest
            iteration number in its file name (training starts from scratch if there is
            none), a path resumes from that checkpoint, and ``False`` starts from scratch.
    """
    global model_copy  # debugging hook, see the commented-out assignment below
    speed_monitor = SpeedMonitor(fabric, window_size=50, time_unit="seconds")

    # Only one process needs to create the output directory.
    if fabric.global_rank == 0:
        out_dir.mkdir(parents=True, exist_ok=True)

    config = Config.from_name(model_name)

    # Each rank gets its own data seed; the model seed below is shared.
    train_dataloader, val_dataloader = create_dataloaders(
        batch_size=micro_batch_size,
        block_size=config.block_size,
        fabric=fabric,
        train_data_dir=train_data_dir,
        val_data_dir=val_data_dir,
        seed=(1337 + fabric.global_rank),
    )
    if val_dataloader is None:
        train_dataloader = fabric.setup_dataloaders(train_dataloader)
    else:
        train_dataloader, val_dataloader = fabric.setup_dataloaders(train_dataloader, val_dataloader)

    fabric.seed_everything(1337)  # same seed for every process to init model (FSDP)

    fabric.print(f"Loading model with {config.__dict__}")
    t0 = time.perf_counter()

    def _init_weights(module: torch.nn.Module) -> None:
        """Meant to be used with `gpt.apply(_init_weights)`."""
        if isinstance(module, torch.nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, torch.nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    with fabric.init_module(empty_init=True):
        model = GPT(config)
    # `empty_init=True` skips the default initialisers, so initialise explicitly.
    # One pass is sufficient; the original applied the same initialiser twice.
    model.apply(_init_weights)

    fabric.print(f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.")
    fabric.print(f"Total parameters {num_parameters(model):,}")

    model = fabric.setup(model)
    optimizer = torch.optim.AdamW(
        model.parameters(), lr=learning_rate, weight_decay=weight_decay, betas=(beta1, beta2), foreach=False
    )

    # model_copy = model  # uncomment to inspect the wrapped model from the notebook

    optimizer = fabric.setup_optimizers(optimizer)

    state = {"model": model, "optimizer": optimizer, "hparams": hparams, "iter_num": 0, "step_count": 0}

    if resume is True:
        # Checkpoints are named "iter-<N>-ckpt.pth"; pick the one with the largest N.
        # `default=None` avoids a ValueError when no checkpoint has been written yet.
        resume = max(out_dir.glob("*.pth"), key=lambda p: int(p.name.split("-")[1]), default=None)
        if resume is None:
            fabric.print(f"No checkpoint found in {str(out_dir)!r}; starting training from scratch")
    if resume:
        fabric.print(f"Resuming training from {resume}")
        fabric.load(resume, state)

    train_time = time.perf_counter()
    train(fabric, state, train_dataloader, val_dataloader, speed_monitor)
    fabric.print(f"Training time: {(time.perf_counter()-train_time):.2f}s")
    if fabric.device.type == "cuda":
        fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB")



In [11]:
def _save_checkpoint(fabric: L.Fabric, state: dict) -> None:
    """Write `state` to `out_dir` as ``iter-<iter_num>-ckpt.pth`` and announce the path."""
    checkpoint_path = out_dir / f"iter-{state['iter_num']:06d}-ckpt.pth"
    fabric.print(f"Saving checkpoint to {str(checkpoint_path)!r}")
    fabric.save(checkpoint_path, state)


def train(
    fabric: L.Fabric,
    state: dict,
    train_dataloader: DataLoader,
    val_dataloader: DataLoader,
    speed_monitor: SpeedMonitorBase,
) -> None:
    """Run the training loop until `max_iters` is reached or the dataloader is exhausted.

    Args:
        fabric: Fabric instance used for backward, gradient clipping, logging and checkpointing.
        state: Mutable training state. Reads "model" and "optimizer"; updates "iter_num"
            (micro-batch counter) and "step_count" (optimizer-step counter) in place.
        train_dataloader: Yields token batches with at least `model.max_seq_length + 1` columns.
        val_dataloader: Validation loader, or None to skip validation entirely.
        speed_monitor: Receives per-batch and per-evaluation timing information.
    """
    model = state["model"]
    optimizer = state["optimizer"]

    if val_dataloader is not None:
        validate(fabric, model, val_dataloader)  # sanity check

    with torch.device("meta"):
        meta_model = GPT(model.config)
        # "estimated" is not as precise as "measured". Estimated is optimistic but widely used in the wild.
        # When comparing MFU or FLOP numbers with other projects that use estimated FLOPs,
        # consider passing `SpeedMonitor(flops_per_batch=estimated_flops)` instead
        estimated_flops = estimate_flops(meta_model) * micro_batch_size
        fabric.print(f"Estimated TFLOPs: {estimated_flops * fabric.world_size / 1e12:.2f}")
        x = torch.randint(0, 1, (micro_batch_size, model.max_seq_length))
        measured_flops = measure_flops(meta_model, x)
        fabric.print(f"Measured TFLOPs: {measured_flops * fabric.world_size / 1e12:.2f}")
        del meta_model, x

    total_lengths = 0
    total_t0 = time.perf_counter()

    # `state["iter_num"]` is the loop variable, so the counter inside `state` is always current
    # (and continues from the restored value after a resume).
    for state["iter_num"], train_data in enumerate(train_dataloader, state["iter_num"]):
        if state["iter_num"] >= max_iters:
            # Final checkpoint before leaving the loop.
            _save_checkpoint(fabric, state)
            break

        # determine and set the learning rate for this iteration
        lr = get_lr(state["iter_num"]) if decay_lr else learning_rate
        for param_group in optimizer.param_groups:
            param_group["lr"] = lr

        iter_t0 = time.perf_counter()

        # Inputs and next-token targets are the same sample shifted by one position.
        input_ids = train_data[:, 0 : model.max_seq_length].contiguous()
        targets = train_data[:, 1 : model.max_seq_length + 1].contiguous()

        # Only every `gradient_accumulation_steps`-th micro-batch triggers an optimizer step.
        is_accumulating = (state["iter_num"] + 1) % gradient_accumulation_steps != 0
        with fabric.no_backward_sync(model, enabled=is_accumulating):
            logits = model(input_ids)
            loss = chunked_cross_entropy(logits, targets, chunk_size=0)
            # Scale so the accumulated gradient is the mean over the micro-batches.
            fabric.backward(loss / gradient_accumulation_steps)

        if not is_accumulating:
            fabric.clip_gradients(model, optimizer, max_norm=grad_clip)
            optimizer.step()
            optimizer.zero_grad()
            state["step_count"] += 1

        t1 = time.perf_counter()
        total_lengths += input_ids.size(1)
        speed_monitor.on_train_batch_end(
            (state["iter_num"] + 1) * micro_batch_size,
            t1 - total_t0,
            # this assumes that device FLOPs are the same and that all devices have the same batch size
            fabric.world_size,
            flops_per_batch=measured_flops,
            lengths=total_lengths,
        )
        if state["iter_num"] % log_interval == 0:
            fabric.print(
                f"iter {state['iter_num']} step {state['step_count']}: loss {loss.item():.4f}, LR: {lr:.6f}, iter time:"
                f" {(t1 - iter_t0) * 1000:.2f}ms{' (optimizer.step)' if not is_accumulating else ''}"
            )

        if val_dataloader is not None and not is_accumulating and state["step_count"] % eval_interval == 0:
            val_t0 = time.perf_counter()
            val_loss = validate(fabric, model, val_dataloader)
            val_elapsed = time.perf_counter() - val_t0
            speed_monitor.eval_end(val_elapsed)
            # Report both counters: the original printed iter_num under the label "step".
            fabric.print(
                f"iter {state['iter_num']} step {state['step_count']}: val loss {val_loss.item():.4f},"
                f" val time: {val_elapsed * 1000:.2f}ms"
            )
            fabric.barrier()
        # NOTE(review): because of the "+1" this fires one optimizer step *before* each multiple of
        # `save_interval` (steps 99, 199, ...), unlike the eval check above. Kept as-is; confirm intent.
        if not is_accumulating and (state["step_count"] + 1) % save_interval == 0:
            _save_checkpoint(fabric, state)

In [12]:
@torch.inference_mode()
def validate(fabric: L.Fabric, model: torch.nn.Module, val_dataloader: DataLoader) -> torch.Tensor:
    """Return the mean loss over at most `eval_iters` validation batches.

    The model is switched to eval mode for the duration of the call and back to
    train mode before returning.

    Args:
        fabric: Fabric instance; used for printing and to place the loss buffer on its device.
        model: Model to evaluate; must expose `max_seq_length`.
        val_dataloader: Yields token batches with at least `model.max_seq_length + 1` columns.

    Returns:
        A scalar tensor holding the mean loss of the evaluated batches (0 if the
        dataloader yields no batches).
    """
    fabric.print("Validating ...")
    model.eval()

    losses = torch.zeros(eval_iters, device=fabric.device)
    num_batches = 0
    for k, val_data in enumerate(val_dataloader):
        # The buffer holds only `eval_iters` entries; without this guard a longer
        # dataloader would raise IndexError on `losses[k]`.
        if k >= eval_iters:
            break
        input_ids = val_data[:, 0 : model.max_seq_length].contiguous()
        targets = val_data[:, 1 : model.max_seq_length + 1].contiguous()
        logits = model(input_ids)
        losses[k] = chunked_cross_entropy(logits, targets, chunk_size=0)
        num_batches = k + 1
    # Average only the entries that were filled so a short dataloader is not
    # diluted by the zero padding of the buffer.
    out = losses[:num_batches].mean() if num_batches else losses.mean()

    model.train()
    return out

In [13]:
def create_dataloader(
    batch_size: int, block_size: int, data_dir: Path, fabric: L.Fabric, shuffle: bool = True, seed: int = 12345
) -> DataLoader:
    """Build a DataLoader that mixes the subsets listed in `data_config`.

    One `PackedDataset` is created per `data_config` prefix from the files in
    `data_dir` whose names start with that prefix. Prefixes with no matching files
    are skipped, and the remaining weights are renormalised to sum to 1.

    Args:
        batch_size: Batch size of the returned DataLoader.
        block_size: Block size passed to each `PackedDataset`.
        data_dir: Directory that is searched for ``<prefix>*`` files.
        fabric: Supplies the world size and global rank passed to each `PackedDataset`.
        shuffle: Passed to each `PackedDataset`.
        seed: Seed for the individual datasets and for the combined sampling.

    Raises:
        RuntimeError: If no file in `data_dir` matches any configured prefix.
    """
    datasets = []
    weights = []
    for prefix, weight in data_config:
        filenames = glob.glob(str(data_dir / f"{prefix}*"))
        if not filenames:
            # Appending a dataset with no files would make the emptiness check
            # below unreachable, so drop the subset (and its weight) instead.
            fabric.print(f"No files matching {prefix!r}* in {data_dir}; skipping this subset.")
            continue
        datasets.append(
            PackedDataset(
                filenames,
                n_chunks=4,
                block_size=block_size,
                shuffle=shuffle,
                seed=seed,
                num_processes=fabric.world_size,
                process_rank=fabric.global_rank,
            )
        )
        weights.append(weight)

    if not datasets:
        raise RuntimeError(
            f"No data found at {data_dir}. Make sure you ran prepare_redpajama.py to create the dataset."
        )

    # Normalise the weights of the subsets that are actually present.
    sum_weights = sum(weights)
    weights = [el / sum_weights for el in weights]

    combined_dataset = CombinedDataset(datasets=datasets, seed=seed, weights=weights)

    # Shuffling is handled by the datasets themselves, so the DataLoader must not shuffle.
    return DataLoader(combined_dataset, batch_size=batch_size, shuffle=False, pin_memory=True)


In [14]:
def create_dataloaders(
    batch_size: int,
    block_size: int,
    fabric: L.Fabric,
    train_data_dir: Path = Path("data/redpajama_sample"),
    val_data_dir: Optional[Path] = None,
    seed: int = 12345,
) -> Tuple[DataLoader, DataLoader]:
    """Create the training dataloader and, if requested, the validation dataloader.

    Args:
        batch_size: Batch size for both loaders.
        block_size: Model block size; the loaders are built with one extra token per block.
        fabric: Forwarded to `create_dataloader`.
        train_data_dir: Directory holding the training data.
        val_data_dir: Directory holding the validation data; when falsy no validation loader is built.
        seed: Seed forwarded to `create_dataloader` for both loaders.

    Returns:
        ``(train_dataloader, val_dataloader)``; the second element is None when
        `val_data_dir` is falsy.
    """
    # One extra token per block so inputs and next-token targets come from the same sample.
    shared_kwargs = dict(
        batch_size=batch_size,
        block_size=block_size + 1,
        fabric=fabric,
        seed=seed,
    )

    train_dataloader = create_dataloader(data_dir=train_data_dir, shuffle=True, **shared_kwargs)

    if val_data_dir:
        val_dataloader = create_dataloader(data_dir=val_data_dir, shuffle=False, **shared_kwargs)
    else:
        val_dataloader = None

    return train_dataloader, val_dataloader

In [15]:
def get_lr(it: int) -> float:
    """Return the learning rate for iteration `it`.

    The schedule is a linear warmup from 0 to `learning_rate` over `warmup_iters`
    iterations, followed by a cosine decay to `min_lr` that ends at
    `lr_decay_iters`; from there on the rate stays at `min_lr`.

    Args:
        it: Zero-based iteration number.

    Returns:
        The learning rate to use for that iteration.
    """
    # 1) linear warmup for warmup_iters steps
    if it < warmup_iters:
        return learning_rate * it / warmup_iters
    # 2) from lr_decay_iters on, return min learning rate.
    #    `>=` rather than `>`: the cosine branch already evaluates to exactly
    #    min_lr at it == lr_decay_iters, and handling that point here avoids a
    #    division by zero when warmup_iters == lr_decay_iters.
    if it >= lr_decay_iters:
        return min_lr
    # 3) in between, use cosine decay down to min learning rate
    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))  # coeff ranges 0..1
    return min_lr + coeff * (learning_rate - min_lr)

In [16]:
# Select PyTorch's "medium" float32 matmul precision mode before launching.
torch.set_float32_matmul_precision("medium")
# Single-device run on the local RedPajama sample. `resume=True` makes `main`
# look for the checkpoint in `out_dir` with the highest iteration number.
setup(
 devices=1,
 train_data_dir=Path("data/lit-redpajama-sample"),
 resume=True
)

Using 16-bit Automatic Mixed Precision (AMP)
Seed set to 1337


{'model_name': 'pythia-160m', 'name': 'redpajama', 'save_interval': 100, 'eval_interval': 1000, 'eval_iters': 100, 'log_interval': 100, 'learning_rate': 0.006, 'batch_size': 32, 'micro_batch_size': 4, 'gradient_accumulation_steps': 8, 'max_iters': 25000, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.95, 'grad_clip': 1.0, 'decay_lr': True, 'warmup_iters': 6000, 'lr_decay_iters': 25000, 'min_lr': 0.0006}
Loading model with {'name': 'pythia-160m', 'hf_config': {'org': 'EleutherAI', 'name': 'pythia-160m-deduped'}, 'block_size': 2048, 'vocab_size': 50254, 'padding_multiple': 128, 'padded_vocab_size': 50304, 'n_layer': 12, 'n_head': 12, 'n_embd': 768, 'rotary_percentage': 0.25, 'parallel_residual': True, 'bias': True, 'lm_head_bias': False, 'n_query_groups': 12, 'shared_attention_norm': False, '_norm_class': 'LayerNorm', 'norm_eps': 1e-05, '_mlp_class': 'GptNeoxMLP', 'gelu_approximate': 'none', 'intermediate_size': 3072, 'rope_condense_ratio': 1, 'rope_base': 10000, 'head_size': 64, 'rope_n_

KeyboardInterrupt: 

In [20]:
# List the contents of the local `checkpoints` directory (tokenizer files are loaded from it below).
!ls checkpoints

generation_config.json	meta-llama	tokenizer.model
lit_config.json		tokenizer.json	tokenizer_config.json


In [17]:
import torch.nn as nn
from tsai_gpt.tokenizer import Tokenizer
# Inference-time setup: a fresh single-device Fabric, independent of the training run above.
# The positional False is presumably the `training` flag (it is passed as training=True in `setup`) - TODO confirm.
precision = get_default_supported_precision(False)
# Rebinds the module-level `logger` with an identically configured CSV logger.
logger = CSVLogger("out", name, flush_logs_every_n_steps=log_interval)
fabric = L.Fabric(devices=1, strategy="auto", precision=precision, loggers=logger)

# Same architecture as used for training.
config = Config.from_name(model_name)

def _init_weights(module: nn.Module) -> None:
    """Initialise one module in place; intended for `model.apply(_init_weights)`.

    Linear and Embedding weights are drawn from a normal distribution with mean 0
    and standard deviation 0.02, and Linear biases (when present) are zeroed.
    Every other module type is left untouched.
    """
    is_linear = isinstance(module, nn.Linear)
    if not (is_linear or isinstance(module, nn.Embedding)):
        return

    torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
    if is_linear and module.bias is not None:
        torch.nn.init.zeros_(module.bias)
 
# Build the model with `empty_init=True` (default initialisers skipped) and
# initialise it explicitly. One pass is enough; the original applied the same
# initialiser twice, and the weights are replaced by the checkpoint right after.
with fabric.init_module(empty_init=True):
    model = GPT(config)
model.apply(_init_weights)

# Checkpoint written by the training loop when it reached `max_iters`.
checkpoint_path = Path("out/redpajama/iter-025000-ckpt.pth")

load_checkpoint(fabric, model, checkpoint_path)
 
#print(model.transformer.h[0].mlp.fc.weight)

#fabric.print(f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.")
#fabric.print(f"Total parameters {num_parameters(model):,}")

# Optimizer settings needed to rebuild the AdamW instance below; the values
# repeat the ones assigned in the hyperparameter cell.
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
learning_rate = 6e-3
# Re-take the snapshot of all public int/float/str variables in the namespace;
# it is placed in `state` under the "hparams" key.
hparams = {k: v for k, v in locals().items() if isinstance(v, (int, float, str)) and not k.startswith("_")}

# Wrap the model with Fabric and rebuild the optimizer so that `state` has the
# same keys as the one used during training ("model", "optimizer", "hparams",
# "iter_num", "step_count").
model = fabric.setup(model)
optimizer = torch.optim.AdamW(
 model.parameters(), lr=learning_rate, weight_decay=weight_decay, betas=(beta1, beta2), foreach=False
)

# model_copy = model

optimizer = fabric.setup_optimizers(optimizer)

state = {"model": model, "optimizer": optimizer, "hparams": hparams, "iter_num": 0, "step_count": 0}

# Pick the checkpoint in `out_dir` with the highest iteration number in its
# "iter-<N>-ckpt.pth" file name and load it into `state`.
# NOTE(review): `max` raises ValueError when the glob is empty, so `resume` is
# always a path when the `if` below is reached.
resume = max(out_dir.glob("*.pth"), key=lambda p: int(p.name.split("-")[1]))
if resume:
 fabric.print(f"Loading model from {resume}")
 fabric.load(resume, state)

# Device used by `generate_predictions` for the prompt tensor; `m` is the model on that device.
deviceType = 'cuda' if torch.cuda.is_available() else 'cpu'
m = model.to(deviceType)
# Tokenizer files are read from the local Llama-2 checkpoint directory.
tokenizer_gpt = Tokenizer(checkpoint_dir=Path("checkpoints/meta-llama/Llama-2-7b-chat-hf")) 
 
def generate_predictions(prompt, max_new_tokens=200, temperature=0.8, top_k=50):
    """Generate a continuation of `prompt` with the module-level model `m`.

    Args:
        prompt: Text to continue.
        max_new_tokens: Forwarded to `m.generate`.
        temperature: Forwarded to `m.generate`.
        top_k: Forwarded to `m.generate`.

    Returns:
        The decoded text of the first sequence returned by `m.generate`.
    """
    m.eval()
    try:
        encoded_text = tokenizer_gpt.encode(prompt)
        # Add a leading batch dimension and move the prompt to the model's device.
        reshaped_tensor = torch.unsqueeze(encoded_text, 0).to(deviceType)
        # Forward the caller's sampling settings; the original hard-coded
        # temperature=0.8 and top_k=50 here, silently ignoring the arguments.
        generated = m.generate(
            reshaped_tensor, max_new_tokens=max_new_tokens, temperature=temperature, top_k=top_k
        )
        out_text = tokenizer_gpt.decode(generated[0])
    finally:
        # Put the model back into train mode even if generation fails.
        m.train()
    return out_text


# Print one sampled continuation per prompt, each followed by a separator line.
# The loop variable starts with "_" so it is not picked up by the `hparams` snapshot filter.
for _prompt in (
    "The main reason for the financial ",
    "Covid19 pandemic gave the world new ",
    "Biofuels can be used ",
    "You believe it or not but the fact is",
):
    print(generate_predictions(_prompt))
    print("--------------\n")

Loading model from out/redpajama/iter-025000-ckpt.pth
The main reason for the financial 16 taxpayer tax increase, the number of individuals working in the first quarter of 2018 to take interest rates and it is very likely that an employee might receive that amount and the amount paid back income of his assets in the third quarter of 2018.
It is not easy to give credit to the first quarter of 2018 to give the same amount that they can sell, as the amount paid back, and the amount paid back income of the shares of the ear pay will not be paid until the retirement is in 2019.
The more than the more the amount paid back in the quarter of 2018 to be paying a year and a pay back income of the year the tax return of the dividend will be paid back. RSS 600 605 50th Anniversary
The G200 605 603 
--------------

Covid19 pandemic gave the world new 1965 results to the National Public Health Protection Agency (SCO), the largest provider of scientific medical care program, the latest one of the sci

In [46]:
import gc
# Run Python's garbage collector first, then ask PyTorch to empty its CUDA cache.
gc.collect()
torch.cuda.empty_cache()

In [45]:
import os
# Sets the PyTorch CUDA allocator configuration for this process.
# NOTE(review): this presumably has to be set before CUDA is first used in the
# process to take effect; this cell sits at the end of the notebook, after the
# model has already run on the GPU - confirm and move to the top if needed.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:2048"