{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import torch\n",
    "\n",
    "torch.cuda.is_available()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import glob\n",
    "import math\n",
    "import sys\n",
    "import time\n",
    "from pathlib import Path\n",
    "from typing import Optional, Tuple, Union\n",
    "\n",
    "import lightning as L\n",
    "import torch\n",
    "from lightning.fabric.loggers import CSVLogger\n",
    "from lightning.fabric.strategies import FSDPStrategy\n",
    "from torch.utils.data import DataLoader\n",
    "\n",
    "# # support running without installing as a package\n",
    "# wd = Path(__file__).parent.parent.resolve()\n",
    "# sys.path.append(str(wd))\n",
    "\n",
    "from tsai_gpt.model import GPT, Block, Config\n",
    "from tsai_gpt.packed_dataset import CombinedDataset, PackedDataset\n",
    "from tsai_gpt.speed_monitor import SpeedMonitorBase, estimate_flops, measure_flops\n",
    "from tsai_gpt.speed_monitor import SpeedMonitorFabric as SpeedMonitor\n",
    "from tsai_gpt.utils import (\n",
    "    chunked_cross_entropy,\n",
    "    get_default_supported_precision,\n",
    "    num_parameters,\n",
    "    load_checkpoint,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "model_name = \"pythia-160m\"\n",
    "name = \"redpajama\"\n",
    "out_dir = Path(\"out\") / name\n",
    "save_interval = 1000\n",
    "eval_interval = 1000\n",
    "eval_iters = 100\n",
    "log_interval = 100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hyperparameters\n",
    "learning_rate = 6e-3\n",
    "batch_size = 32\n",
    "micro_batch_size = 8\n",
    "gradient_accumulation_steps = batch_size // micro_batch_size\n",
    "assert gradient_accumulation_steps > 0\n",
    "# max_iters = 600000  # num_epochs * (epoch_size // micro_batch_size) // devices\n",
    "max_iters = 15000\n",
    "weight_decay = 1e-1\n",
    "beta1 = 0.9\n",
    "beta2 = 0.95\n",
    "grad_clip = 1.0\n",
    "decay_lr = True\n",
    "warmup_iters = 2000\n",
    "lr_decay_iters = max_iters\n",
    "min_lr = 6e-6"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Data proportions from https://arxiv.org/pdf/2302.13971.pdf Table 1\n",
    "data_config = [\n",
    "    (\"arxiv\", 2.5),\n",
    "    (\"book\", 4.5),\n",
    "    (\"c4\", 15.0),\n",
    "    (\"cc\", 67.0),\n",
    "    (\"github\", 4.5),\n",
    "    (\"stackexchange\", 2.0),\n",
    "    (\"wikipedia\", 4.5),\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "hparams = {\n",
    "    k: v\n",
    "    for k, v in locals().items()\n",
    "    if isinstance(v, (int, float, str)) and not k.startswith(\"_\")\n",
    "}\n",
    "logger = CSVLogger(\"out\", name, flush_logs_every_n_steps=log_interval)\n",
    "\n",
    "\n",
    "def setup(\n",
    "    devices: int = 4,\n",
    "    train_data_dir: Path = Path(\"data/redpajama_sample\"),\n",
    "    val_data_dir: Optional[Path] = None,\n",
    "    precision: Optional[str] = None,\n",
    "    resume: Union[bool, Path] = False,\n",
    ") -> None:\n",
    "    precision = precision or get_default_supported_precision(training=True)\n",
    "\n",
    "    if devices > 1:\n",
    "        strategy = FSDPStrategy(\n",
    "            auto_wrap_policy={Block},\n",
    "            activation_checkpointing_policy={Block},\n",
    "            state_dict_type=\"full\",\n",
    "            limit_all_gathers=True,\n",
    "            cpu_offload=False,\n",
    "        )\n",
    "    else:\n",
    "        strategy = \"auto\"\n",
    "\n",
    "    fabric = L.Fabric(\n",
    "        devices=devices, strategy=strategy, precision=precision, loggers=logger\n",
    "    )\n",
    "    fabric.print(hparams)\n",
    "    fabric.launch(main, train_data_dir, val_data_dir, resume)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "model_copy = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def main(\n",
    "    fabric: L.Fabric,\n",
    "    train_data_dir: Path,\n",
    "    val_data_dir: Path,\n",
    "    resume: Union[bool, Path],\n",
    ") -> None:\n",
    "    global model_copy\n",
    "    speed_monitor = SpeedMonitor(fabric, window_size=50, time_unit=\"seconds\")\n",
    "\n",
    "    if fabric.global_rank == 0:\n",
    "        out_dir.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "    config = Config.from_name(model_name)\n",
    "\n",
    "    train_dataloader, val_dataloader = create_dataloaders(\n",
    "        batch_size=micro_batch_size,\n",
    "        block_size=config.block_size,\n",
    "        fabric=fabric,\n",
    "        train_data_dir=train_data_dir,\n",
    "        val_data_dir=val_data_dir,\n",
    "        seed=(1337 + fabric.global_rank),\n",
    "    )\n",
    "    if val_dataloader is None:\n",
    "        train_dataloader = fabric.setup_dataloaders(train_dataloader)\n",
    "    else:\n",
    "        train_dataloader, val_dataloader = fabric.setup_dataloaders(\n",
    "            train_dataloader, val_dataloader\n",
    "        )\n",
    "\n",
    "    fabric.seed_everything(1337)  # same seed for every process to init model (FSDP)\n",
    "\n",
    "    fabric.print(f\"Loading model with {config.__dict__}\")\n",
    "    t0 = time.perf_counter()\n",
    "    import torch\n",
    "    import torch.nn as nn\n",
    "\n",
    "    def _init_weights(module: nn.Module) -> None:\n",
    "        \"\"\"Meant to be used with `gpt.apply(gpt._init_weights)`.\"\"\"\n",
    "        if isinstance(module, nn.Linear):\n",
    "            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)\n",
    "            if module.bias is not None:\n",
    "                torch.nn.init.zeros_(module.bias)\n",
    "        elif isinstance(module, nn.Embedding):\n",
    "            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)\n",
    "\n",
    "    with fabric.init_module(empty_init=True):\n",
    "        model = GPT(config)\n",
    "        model.apply(_init_weights)\n",
    "    model.apply(_init_weights)\n",
    "\n",
    "    # checkpoint_path = Path(\"out/redpajama/iter-000999-ckpt.pth\")\n",
    "\n",
    "    # load_checkpoint(fabric, model, checkpoint_path)\n",
    "\n",
    "    # print(model.transformer.h[0].mlp.fc.weight)\n",
    "\n",
    "    fabric.print(f\"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.\")\n",
    "    fabric.print(f\"Total parameters {num_parameters(model):,}\")\n",
    "\n",
    "    model = fabric.setup(model)\n",
    "    optimizer = torch.optim.AdamW(\n",
    "        model.parameters(),\n",
    "        lr=learning_rate,\n",
    "        weight_decay=weight_decay,\n",
    "        betas=(beta1, beta2),\n",
    "        foreach=False,\n",
    "    )\n",
    "\n",
    "    # model_copy = model\n",
    "\n",
    "    optimizer = fabric.setup_optimizers(optimizer)\n",
    "\n",
    "    state = {\n",
    "        \"model\": model,\n",
    "        \"optimizer\": optimizer,\n",
    "        \"hparams\": hparams,\n",
    "        \"iter_num\": 0,\n",
    "        \"step_count\": 0,\n",
    "    }\n",
    "\n",
    "    if resume is True:\n",
    "        resume = max(out_dir.glob(\"*.pth\"), key=lambda p: int(p.name.split(\"-\")[1]))\n",
    "    if resume:\n",
    "        fabric.print(f\"Resuming training from {resume}\")\n",
    "        fabric.load(resume, state)\n",
    "\n",
    "    train_time = time.perf_counter()\n",
    "    train(fabric, state, train_dataloader, val_dataloader, speed_monitor)\n",
    "    fabric.print(f\"Training time: {(time.perf_counter()-train_time):.2f}s\")\n",
    "    if fabric.device.type == \"cuda\":\n",
    "        fabric.print(f\"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def train(\n",
    "    fabric: L.Fabric,\n",
    "    state: dict,\n",
    "    train_dataloader: DataLoader,\n",
    "    val_dataloader: DataLoader,\n",
    "    speed_monitor: SpeedMonitorBase,\n",
    ") -> None:\n",
    "    model = state[\"model\"]\n",
    "    optimizer = state[\"optimizer\"]\n",
    "\n",
    "    if val_dataloader is not None:\n",
    "        validate(fabric, model, val_dataloader)  # sanity check\n",
    "\n",
    "    with torch.device(\"meta\"):\n",
    "        meta_model = GPT(model.config)\n",
    "        # \"estimated\" is not as precise as \"measured\". Estimated is optimistic but widely used in the wild.\n",
    "        # When comparing MFU or FLOP numbers with other projects that use estimated FLOPs,\n",
    "        # consider passing `SpeedMonitor(flops_per_batch=estimated_flops)` instead\n",
    "        estimated_flops = estimate_flops(meta_model) * micro_batch_size\n",
    "        fabric.print(\n",
    "            f\"Estimated TFLOPs: {estimated_flops * fabric.world_size / 1e12:.2f}\"\n",
    "        )\n",
    "        x = torch.randint(0, 1, (micro_batch_size, model.max_seq_length))\n",
    "        measured_flops = measure_flops(meta_model, x)\n",
    "        fabric.print(\n",
    "            f\"Measured TFLOPs: {measured_flops * fabric.world_size / 1e12:.2f}\"\n",
    "        )\n",
    "        del meta_model, x\n",
    "\n",
    "    total_lengths = 0\n",
    "    total_t0 = time.perf_counter()\n",
    "\n",
    "    for state[\"iter_num\"], train_data in enumerate(train_dataloader, state[\"iter_num\"]):\n",
    "        if state[\"iter_num\"] >= max_iters:\n",
    "            checkpoint_path = out_dir / f\"iter-{state['iter_num']:06d}-ckpt.pth\"\n",
    "            fabric.print(f\"Saving checkpoint to {str(checkpoint_path)!r}\")\n",
    "            fabric.save(checkpoint_path, state)\n",
    "            break\n",
    "\n",
    "        # determine and set the learning rate for this iteration\n",
    "        lr = get_lr(state[\"iter_num\"]) if decay_lr else learning_rate\n",
    "        for param_group in optimizer.param_groups:\n",
    "            param_group[\"lr\"] = lr\n",
    "\n",
    "        iter_t0 = time.perf_counter()\n",
    "\n",
    "        input_ids = train_data[:, 0 : model.max_seq_length].contiguous()\n",
    "        targets = train_data[:, 1 : model.max_seq_length + 1].contiguous()\n",
    "\n",
    "        is_accumulating = (state[\"iter_num\"] + 1) % gradient_accumulation_steps != 0\n",
    "        with fabric.no_backward_sync(model, enabled=is_accumulating):\n",
    "            logits = model(input_ids)\n",
    "            loss = chunked_cross_entropy(logits, targets, chunk_size=0)\n",
    "            fabric.backward(loss / gradient_accumulation_steps)\n",
    "\n",
    "        # return\n",
    "\n",
    "        if not is_accumulating:\n",
    "            fabric.clip_gradients(model, optimizer, max_norm=grad_clip)\n",
    "            optimizer.step()\n",
    "            optimizer.zero_grad()\n",
    "            state[\"step_count\"] += 1\n",
    "\n",
    "        t1 = time.perf_counter()\n",
    "        total_lengths += input_ids.size(1)\n",
    "        speed_monitor.on_train_batch_end(\n",
    "            (state[\"iter_num\"] + 1) * micro_batch_size,\n",
    "            t1 - total_t0,\n",
    "            # this assumes that device FLOPs are the same and that all devices have the same batch size\n",
    "            fabric.world_size,\n",
    "            flops_per_batch=measured_flops,\n",
    "            lengths=total_lengths,\n",
    "        )\n",
    "        if state[\"iter_num\"] % log_interval == 0:\n",
    "            fabric.print(\n",
    "                f\"iter {state['iter_num']} step {state['step_count']}: loss {loss.item():.4f}, LR: {lr:.6f}, iter time:\"\n",
    "                f\" {(t1 - iter_t0) * 1000:.2f}ms{' (optimizer.step)' if not is_accumulating else ''}\"\n",
    "            )\n",
    "\n",
    "        if (\n",
    "            val_dataloader is not None\n",
    "            and not is_accumulating\n",
    "            and state[\"step_count\"] % eval_interval == 0\n",
    "        ):\n",
    "            t0 = time.perf_counter()\n",
    "            val_loss = validate(fabric, model, val_dataloader)\n",
    "            t1 = time.perf_counter() - t0\n",
    "            speed_monitor.eval_end(t1)\n",
    "            fabric.print(\n",
    "                f\"step {state['iter_num']}: val loss {val_loss.item():.4f}, val time: {t1 * 1000:.2f}ms\"\n",
    "            )\n",
    "            fabric.barrier()\n",
    "        if not is_accumulating and state[\"step_count\"] % save_interval == 0:\n",
    "            checkpoint_path = out_dir / f\"iter-{state['iter_num']:06d}-ckpt.pth\"\n",
    "            fabric.print(f\"Saving checkpoint to {str(checkpoint_path)!r}\")\n",
    "            fabric.save(checkpoint_path, state)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "@torch.inference_mode()\n",
    "def validate(\n",
    "    fabric: L.Fabric, model: torch.nn.Module, val_dataloader: DataLoader\n",
    ") -> torch.Tensor:\n",
    "    fabric.print(\"Validating ...\")\n",
    "    model.eval()\n",
    "\n",
    "    losses = torch.zeros(eval_iters, device=fabric.device)\n",
    "    for k, val_data in enumerate(val_dataloader):\n",
    "        input_ids = val_data[:, 0 : model.max_seq_length].contiguous()\n",
    "        targets = val_data[:, 1 : model.max_seq_length + 1].contiguous()\n",
    "        logits = model(input_ids)\n",
    "        losses[k] = chunked_cross_entropy(logits, targets, chunk_size=0)\n",
    "    out = losses.mean()\n",
    "\n",
    "    model.train()\n",
    "    return out"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_dataloader(\n",
    "    batch_size: int,\n",
    "    block_size: int,\n",
    "    data_dir: Path,\n",
    "    fabric: L.Fabric,\n",
    "    shuffle: bool = True,\n",
    "    seed: int = 12345,\n",
    ") -> DataLoader:\n",
    "    datasets = []\n",
    "    for prefix, _ in data_config:\n",
    "        filenames = glob.glob(str(data_dir / f\"{prefix}*\"))\n",
    "        dataset = PackedDataset(\n",
    "            filenames,\n",
    "            n_chunks=4,\n",
    "            block_size=block_size,\n",
    "            shuffle=shuffle,\n",
    "            seed=seed,\n",
    "            num_processes=fabric.world_size,\n",
    "            process_rank=fabric.global_rank,\n",
    "        )\n",
    "        datasets.append(dataset)\n",
    "\n",
    "    if not datasets:\n",
    "        raise RuntimeError(\n",
    "            f\"No data found at {data_dir}. Make sure you ran prepare_redpajama.py to create the dataset.\"\n",
    "        )\n",
    "\n",
    "    weights = [weight for _, weight in data_config]\n",
    "    sum_weights = sum(weights)\n",
    "    weights = [el / sum_weights for el in weights]\n",
    "\n",
    "    combined_dataset = CombinedDataset(datasets=datasets, seed=seed, weights=weights)\n",
    "\n",
    "    return DataLoader(\n",
    "        combined_dataset, batch_size=batch_size, shuffle=False, pin_memory=True\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_dataloaders(\n",
    "    batch_size: int,\n",
    "    block_size: int,\n",
    "    fabric: L.Fabric,\n",
    "    train_data_dir: Path = Path(\"data/redpajama_sample\"),\n",
    "    val_data_dir: Optional[Path] = None,\n",
    "    seed: int = 12345,\n",
    ") -> Tuple[DataLoader, DataLoader]:\n",
    "    # Increase by one because we need the next word as well\n",
    "    effective_block_size = block_size + 1\n",
    "    train_dataloader = create_dataloader(\n",
    "        batch_size=batch_size,\n",
    "        block_size=effective_block_size,\n",
    "        fabric=fabric,\n",
    "        data_dir=train_data_dir,\n",
    "        shuffle=True,\n",
    "        seed=seed,\n",
    "    )\n",
    "    val_dataloader = (\n",
    "        create_dataloader(\n",
    "            batch_size=batch_size,\n",
    "            block_size=effective_block_size,\n",
    "            fabric=fabric,\n",
    "            data_dir=val_data_dir,\n",
    "            shuffle=False,\n",
    "            seed=seed,\n",
    "        )\n",
    "        if val_data_dir\n",
    "        else None\n",
    "    )\n",
    "    return train_dataloader, val_dataloader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_lr(it: int) -> float:\n",
    "    # 1) linear warmup for warmup_iters steps\n",
    "    if it < warmup_iters:\n",
    "        return learning_rate * it / warmup_iters\n",
    "    # 2) if it > lr_decay_iters, return min learning rate\n",
    "    if it > lr_decay_iters:\n",
    "        return min_lr\n",
    "    # 3) in between, use cosine decay down to min learning rate\n",
    "    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)\n",
    "    assert 0 <= decay_ratio <= 1\n",
    "    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))  # coeff ranges 0..1\n",
    "    return min_lr + coeff * (learning_rate - min_lr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# torch.set_float32_matmul_precision(\"medium\")\n",
    "# setup(devices=1, train_data_dir=Path(\"data/lit-redpajama-sample\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from generate.base import main\n",
    "from pathlib import Path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading model 'out/redpajama/lit_model.pth' with {'name': 'pythia-160m', 'hf_config': {'org': 'EleutherAI', 'name': 'pythia-160m'}, 'block_size': 2048, 'vocab_size': 50254, 'padding_multiple': 128, 'padded_vocab_size': 50304, 'n_layer': 12, 'n_head': 12, 'n_embd': 768, 'rotary_percentage': 0.25, 'parallel_residual': True, 'bias': True, 'lm_head_bias': False, 'n_query_groups': 12, 'shared_attention_norm': False, '_norm_class': 'LayerNorm', 'norm_eps': 1e-05, '_mlp_class': 'GptNeoxMLP', 'gelu_approximate': 'none', 'intermediate_size': 3072, 'rope_condense_ratio': 1, 'rope_base': 10000, 'head_size': 64, 'rope_n_elem': 16}\n",
      "Time to instantiate model: 0.17 seconds.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Time to load the model weights: 0.50 seconds.\n",
      "Seed set to 1234\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Earth is a planet with rocky core and 100,000 hectares of natural Earth. Our planet is a planet with rocky core and 100,000 hectares of natural Earth. The sun has a warm, warm surface and the sun has a\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Time for inference 1: 0.71 sec total, 70.90 tokens/sec\n",
      "Memory used: 0.35 GB\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "\n",
    "torch.set_float32_matmul_precision(\"high\")\n",
    "main(\n",
    "    prompt=\"Earth is a planet with rocky core and \",\n",
    "    checkpoint_dir=Path(\"out/redpajama\"),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading model 'out/redpajama/lit_model.pth' with {'name': 'pythia-160m', 'hf_config': {'org': 'EleutherAI', 'name': 'pythia-160m'}, 'block_size': 2048, 'vocab_size': 50254, 'padding_multiple': 128, 'padded_vocab_size': 50304, 'n_layer': 12, 'n_head': 12, 'n_embd': 768, 'rotary_percentage': 0.25, 'parallel_residual': True, 'bias': True, 'lm_head_bias': False, 'n_query_groups': 12, 'shared_attention_norm': False, '_norm_class': 'LayerNorm', 'norm_eps': 1e-05, '_mlp_class': 'GptNeoxMLP', 'gelu_approximate': 'none', 'intermediate_size': 3072, 'rope_condense_ratio': 1, 'rope_base': 10000, 'head_size': 64, 'rope_n_elem': 16}\n",
      "Time to instantiate model: 0.02 seconds.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Time to load the model weights: 0.49 seconds.\n",
      "Seed set to 1234\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "I like to drive when it is raining outside and 100% of the time. The next day, I think you will see the right movement.\n",
      "We already know that if you don't go to the center, you can be a hug, or a bit more vigor.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Time for inference 1: 0.69 sec total, 72.80 tokens/sec\n",
      "Memory used: 0.35 GB\n"
     ]
    }
   ],
   "source": [
    "main(\n",
    "    prompt=\"I like to drive when it is raining outside and \",\n",
    "    checkpoint_dir=Path(\"out/redpajama\"),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading model 'out/redpajama/lit_model.pth' with {'name': 'pythia-160m', 'hf_config': {'org': 'EleutherAI', 'name': 'pythia-160m'}, 'block_size': 2048, 'vocab_size': 50254, 'padding_multiple': 128, 'padded_vocab_size': 50304, 'n_layer': 12, 'n_head': 12, 'n_embd': 768, 'rotary_percentage': 0.25, 'parallel_residual': True, 'bias': True, 'lm_head_bias': False, 'n_query_groups': 12, 'shared_attention_norm': False, '_norm_class': 'LayerNorm', 'norm_eps': 1e-05, '_mlp_class': 'GptNeoxMLP', 'gelu_approximate': 'none', 'intermediate_size': 3072, 'rope_condense_ratio': 1, 'rope_base': 10000, 'head_size': 64, 'rope_n_elem': 16}\n",
      "Time to instantiate model: 0.02 seconds.\n",
      "Time to load the model weights: 0.51 seconds.\n",
      "Seed set to 1234\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "I like to drive when it is raining outside and 100% of the time. The next day, I think you will see the right movement.\n",
      "We already know that if you don't go to the center, you can be a hug, or a bit more vigor.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Time for inference 1: 0.65 sec total, 76.96 tokens/sec\n",
      "Memory used: 0.35 GB\n"
     ]
    }
   ],
   "source": [
    "main(\n",
    "    prompt=\"I like to drive when it is raining outside and \",\n",
    "    checkpoint_dir=Path(\"out/redpajama\"),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading model 'out/redpajama/lit_model.pth' with {'name': 'pythia-160m', 'hf_config': {'org': 'EleutherAI', 'name': 'pythia-160m'}, 'block_size': 2048, 'vocab_size': 50254, 'padding_multiple': 128, 'padded_vocab_size': 50304, 'n_layer': 12, 'n_head': 12, 'n_embd': 768, 'rotary_percentage': 0.25, 'parallel_residual': True, 'bias': True, 'lm_head_bias': False, 'n_query_groups': 12, 'shared_attention_norm': False, '_norm_class': 'LayerNorm', 'norm_eps': 1e-05, '_mlp_class': 'GptNeoxMLP', 'gelu_approximate': 'none', 'intermediate_size': 3072, 'rope_condense_ratio': 1, 'rope_base': 10000, 'head_size': 64, 'rope_n_elem': 16}\n",
      "Time to instantiate model: 0.02 seconds.\n",
      "Time to load the model weights: 0.49 seconds.\n",
      "Seed set to 1234\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "What a beautiful day it was, never imagined I would be able to 100,000 times a month. It was the beginning of a carpet, and was about 15 minutes to drain from the carpet. We were so overwhelmed, ready to do the kits,\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Time for inference 1: 0.68 sec total, 73.18 tokens/sec\n",
      "Memory used: 0.35 GB\n"
     ]
    }
   ],
   "source": [
    "main(\n",
    "    prompt=\"What a beautiful day it was, never imagined I would be able to \",\n",
    "    checkpoint_dir=Path(\"out/redpajama\"),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading model 'out/redpajama/lit_model.pth' with {'name': 'pythia-160m', 'hf_config': {'org': 'EleutherAI', 'name': 'pythia-160m'}, 'block_size': 2048, 'vocab_size': 50254, 'padding_multiple': 128, 'padded_vocab_size': 50304, 'n_layer': 12, 'n_head': 12, 'n_embd': 768, 'rotary_percentage': 0.25, 'parallel_residual': True, 'bias': True, 'lm_head_bias': False, 'n_query_groups': 12, 'shared_attention_norm': False, '_norm_class': 'LayerNorm', 'norm_eps': 1e-05, '_mlp_class': 'GptNeoxMLP', 'gelu_approximate': 'none', 'intermediate_size': 3072, 'rope_condense_ratio': 1, 'rope_base': 10000, 'head_size': 64, 'rope_n_elem': 16}\n",
      "Time to instantiate model: 0.02 seconds.\n",
      "Time to load the model weights: 0.49 seconds.\n",
      "Seed set to 1234\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Do you think Einstein was the greatest ever physicist ever lived? I think 1 of the 1980s wrote a very deep, poetic narration of my life. I know all of you and your life is beautiful, especially in the sense of storytelling. You are. I know all of you\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Time for inference 1: 0.68 sec total, 74.07 tokens/sec\n",
      "Memory used: 0.35 GB\n"
     ]
    }
   ],
   "source": [
    "main(\n",
    "    prompt=\"Do you think Einstein was the greatest ever physicist ever lived? I think \",\n",
    "    checkpoint_dir=Path(\"out/redpajama\"),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}