uploading preprocessing script, model code, and training script
- model.py +365 -0
- preprocessing.py +113 -0
- train.py +260 -0
model.py
ADDED
@@ -0,0 +1,365 @@
#./experiments/experiment1/model.py
import inspect
import logging

logging.basicConfig(level=logging.DEBUG)

import math
from dataclasses import dataclass
import torch
import torch.nn as nn
import torch.nn.functional as F

from prereqs.nanoGPT.model import GPTConfig, GPT, MLP

# set up logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

def new_rielu(x):
    return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))

@dataclass
class RotationallyInvariantGPTConfig:
    block_size: int = 512
    vocab_size: int = 50304  # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
    n_layer: int = 6
    n_head: int = 8
    n_embd: int = 768
    dropout: float = 0.0
    bias: bool = True  # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
    rotational_invariance: bool = True  # Set to True to enable the rotationally invariant gate layers

# Models
class RotationInvariantLayerNorm(nn.Module):
    """ LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False """
    def __init__(self, ndim, bias):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(ndim))
        self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None
        self.rotation_gate = nn.Linear(ndim, ndim, bias=False)  # no bias needed for rotation
        self.rotation_gate.weight.data = torch.eye(ndim)

    def forward(self, input, rotation_matrix=None):
        # apply rotation
        if rotation_matrix is not None:
            input = torch.matmul(input, self.rotation_gate(rotation_matrix))

        # normalize
        return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)

class RotationallyInvariantAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.dropout = config.dropout
        self.gate_q = nn.Linear(config.n_embd // config.n_head, 1, bias=config.bias)
        self.gate_k = nn.Linear(config.n_embd // config.n_head, 1, bias=config.bias)

    def forward(self, x, rotation_matrix=None):
        logging.debug(f'x.size(): {x.size()}')

        B, T, C = x.size()

        logging.debug(f'B: {B}, T: {T}, C: {C}')

        q, k, v = self.c_attn(x).chunk(3, dim=-1)

        logging.debug('Pre-Reshape Q, K, and V')
        logging.debug(f'q.size(): {q.size()}, k.size(): {k.size()}, v.size(): {v.size()}')

        # Reshape q, k, and v to (B, n_head, T, head_dim)
        q = q.view(B, T, self.n_head, C // self.n_head).permute(0, 2, 1, 3)
        k = k.view(B, T, self.n_head, C // self.n_head).permute(0, 2, 1, 3)
        v = v.view(B, T, self.n_head, C // self.n_head).permute(0, 2, 1, 3)

        logging.debug('Post-Reshape Q, K, and V')
        logging.debug(f'q.size(): {q.size()}, k.size(): {k.size()}, v.size(): {v.size()}')

        # Compute per-position gates in (0, 1); gate_q and gate_k have shape
        # (B, n_head, T, 1) and broadcast over the attention score matrices
        gate_q = torch.sigmoid(self.gate_q(q.view(B, self.n_head, T, -1)))
        gate_k = torch.sigmoid(self.gate_k(k.view(B, self.n_head, T, -1)))

        # Traditional dot-product attention
        qk_dot = q @ k.transpose(-2, -1)
        att_dotproduct = qk_dot / math.sqrt(self.n_embd)

        # Rotation invariant attention: negative pairwise Euclidean distance between q and k
        q_norm = torch.sum(q * q, dim=-1, keepdim=True)
        k_norm = torch.sum(k * k, dim=-1, keepdim=True)
        # clamp: squared distances can dip slightly below zero from floating-point error,
        # which would make sqrt produce NaNs
        distances = torch.clamp(q_norm + k_norm.transpose(-2, -1) - 2 * qk_dot, min=0.0)
        att_rotation = -torch.sqrt(distances)
        att_rotation = att_rotation / math.sqrt(self.n_embd)

        # Apply gating to attention scores
        mixed_att = att_dotproduct * gate_q + att_rotation * (torch.ones_like(gate_q) - gate_q)
        att_scores = mixed_att / gate_k

        if rotation_matrix is not None:
            att_scores = att_scores + rotation_matrix

        att_weights = F.softmax(att_scores, dim=-1)
        att_weights = self.attn_dropout(att_weights)  # apply attention dropout
        y = att_weights @ v
        y = y.permute(0, 2, 1, 3).contiguous().view(B, T, C)

        y = self.resid_dropout(self.c_proj(y))
        return y

class RotationallyInvariantMLP(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
        self.dropout = nn.Dropout(config.dropout)
        self.rotation_gate = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)  # Added rotational gate layer
        self.rotation_gate.weight.data = torch.eye(config.n_embd)  # Initial rotation matrix is the identity

    def forward(self, x, rotation_matrix=None):
        x = self.c_fc(x)
        x = F.gelu(x)
        x = self.c_proj(x)
        x = self.dropout(x)

        # Rotational Invariance Part
        if rotation_matrix is not None:
            x = torch.matmul(x, self.rotation_gate(rotation_matrix))

        return x

class RotationallyInvariantBlock(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.ln_1 = RotationInvariantLayerNorm(config.n_embd, bias=config.bias)
        self.attn = RotationallyInvariantAttention(config)
        self.ln_2 = RotationInvariantLayerNorm(config.n_embd, bias=config.bias)
        self.mlp = RotationallyInvariantMLP(config)

    def forward(self, x, rotation_matrix=None):
        x = x + self.attn(self.ln_1(x), rotation_matrix)
        x = x + self.mlp(self.ln_2(x), rotation_matrix)
        return x

class RotationallyInvariantGPT(nn.Module):

    def __init__(self, config):
        super().__init__()
        assert config.vocab_size is not None
        assert config.block_size is not None
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            drop = nn.Dropout(config.dropout),
            h = nn.ModuleList([RotationallyInvariantBlock(config) for _ in range(config.n_layer)]),
            ln_f = RotationInvariantLayerNorm(config.n_embd, bias=config.bias),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        # with weight tying when using torch.compile() some warnings get generated:
        # "UserWarning: functional_call was passed multiple values for tied weights.
        # This behavior is deprecated and will be an error in future versions"
        # not 100% sure what this is, so far seems to be harmless. TODO investigate
        self.transformer.wte.weight = self.lm_head.weight  # https://paperswithcode.com/method/weight-tying

        # init all weights
        self.apply(self._init_weights)
        # apply special scaled init to the residual projections, per GPT-2 paper
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))

        # report number of parameters
        print("number of parameters: %.2fM" % (self.get_num_params()/1e6,))

    def get_num_params(self, non_embedding=True):
        """
        Return the number of parameters in the model.
        For non-embedding count (default), the position embeddings get subtracted.
        The token embeddings would too, except due to the parameter sharing these
        params are actually used as weights in the final layer, so we include them.
        """
        n_params = sum(p.numel() for p in self.parameters())
        if non_embedding:
            n_params -= self.transformer.wpe.weight.numel()
        return n_params

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        device = idx.device
        b, t = idx.size()
        assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
        pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0)  # shape (1, t)

        # forward the GPT model itself
        tok_emb = self.transformer.wte(idx)  # token embeddings of shape (b, t, n_embd)
        pos_emb = self.transformer.wpe(pos)  # position embeddings of shape (1, t, n_embd)
        x = self.transformer.drop(tok_emb + pos_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)

        if targets is not None:
            # if we are given some desired targets also calculate the loss
            logits = self.lm_head(x)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        else:
            # inference-time mini-optimization: only forward the lm_head on the very last position
            logits = self.lm_head(x[:, [-1], :])  # note: using list [-1] to preserve the time dim
            loss = None

        return logits, loss

    def crop_block_size(self, block_size):
        # model surgery to decrease the block size if necessary
        # e.g. we may load the GPT2 pretrained model checkpoint (block size 1024)
        # but want to use a smaller block size for some smaller, simpler model
        assert block_size <= self.config.block_size
        self.config.block_size = block_size
        self.transformer.wpe.weight = nn.Parameter(self.transformer.wpe.weight[:block_size])
        for block in self.transformer.h:
            if hasattr(block.attn, 'bias'):
                block.attn.bias = block.attn.bias[:,:,:block_size,:block_size]

    @classmethod
    def from_pretrained(cls, model_type, override_args=None):
        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
        override_args = override_args or {}  # default to empty dict
        # only dropout can be overridden see more notes below
        assert all(k == 'dropout' for k in override_args)
        from transformers import GPT2LMHeadModel
        print("loading weights from pretrained gpt: %s" % model_type)

        # n_layer, n_head and n_embd are determined from model_type
        config_args = {
            'gpt2':         dict(n_layer=12, n_head=12, n_embd=768),   # 124M params
            'gpt2-medium':  dict(n_layer=24, n_head=16, n_embd=1024),  # 350M params
            'gpt2-large':   dict(n_layer=36, n_head=20, n_embd=1280),  # 774M params
            'gpt2-xl':      dict(n_layer=48, n_head=25, n_embd=1600),  # 1558M params
        }[model_type]
        print("forcing vocab_size=50257, block_size=1024, bias=True")
        config_args['vocab_size'] = 50257  # always 50257 for GPT model checkpoints
        config_args['block_size'] = 1024  # always 1024 for GPT model checkpoints
        config_args['bias'] = True  # always True for GPT model checkpoints
        # we can override the dropout rate, if desired
        if 'dropout' in override_args:
            print(f"overriding dropout rate to {override_args['dropout']}")
            config_args['dropout'] = override_args['dropout']
        # create a from-scratch initialized minGPT model
        config = GPTConfig(**config_args)
        model = GPT(config)
        sd = model.state_dict()
        sd_keys = sd.keys()
        sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')]  # discard this mask / buffer, not a param

        # init a huggingface/transformers model
        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()

        # copy while ensuring all of the parameters are aligned and match in names and shapes
        sd_keys_hf = sd_hf.keys()
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')]  # ignore these, just a buffer
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')]  # same, just the mask (buffer)
        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
        # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
        # this means that we have to transpose these weights when we import them
        assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
        for k in sd_keys_hf:
            if any(k.endswith(w) for w in transposed):
                # special treatment for the Conv1D weights we need to transpose
                assert sd_hf[k].shape[::-1] == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k].t())
            else:
                # vanilla copy over the other parameters
                assert sd_hf[k].shape == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k])

        return model

    def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
        # start with all of the candidate parameters
        param_dict = {pn: p for pn, p in self.named_parameters()}
        # filter out those that do not require grad
        param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
        # create optim groups. Any parameter that is 2D will be weight decayed, otherwise no.
        # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
        decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
        nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]
        num_decay_params = sum(p.numel() for p in decay_params)
        num_nodecay_params = sum(p.numel() for p in nodecay_params)
        print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
        print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
        # Create AdamW optimizer and use the fused version if it is available
        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and device_type == 'cuda'
        extra_args = dict(fused=True) if use_fused else dict()
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)
        print(f"using fused AdamW: {use_fused}")

        return optimizer

    def estimate_mfu(self, fwdbwd_per_iter, dt):
        """ estimate model flops utilization (MFU) in units of A100 bfloat16 peak FLOPS """
        # first estimate the number of flops we do per iteration.
        # see PaLM paper Appendix B as ref: https://arxiv.org/abs/2204.02311
        N = self.get_num_params()
        cfg = self.config
        L, H, Q, T = cfg.n_layer, cfg.n_head, cfg.n_embd//cfg.n_head, cfg.block_size
        flops_per_token = 6*N + 12*L*H*Q*T
        flops_per_fwdbwd = flops_per_token * T
        flops_per_iter = flops_per_fwdbwd * fwdbwd_per_iter
        # express our flops throughput as ratio of A100 bfloat16 peak flops
        flops_achieved = flops_per_iter * (1.0/dt)  # per second
        flops_promised = 312e12  # A100 GPU bfloat16 peak flops is 312 TFLOPS
        mfu = flops_achieved / flops_promised
        return mfu

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """
        Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
        the sequence max_new_tokens times, feeding the predictions back into the model each time.
        Most likely you'll want to make sure to be in model.eval() mode of operation for this.
        """
        for _ in range(max_new_tokens):
            # if the sequence context is growing too long we must crop it at block_size
            idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
            # forward the model to get the logits for the index in the sequence
            logits, _ = self(idx_cond)
            # pluck the logits at the final step and scale by desired temperature
            logits = logits[:, -1, :] / temperature
            # optionally crop the logits to only the top k options
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')
            # apply softmax to convert logits to (normalized) probabilities
            probs = F.softmax(logits, dim=-1)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)
            # append sampled index to the running sequence and continue
            idx = torch.cat((idx, idx_next), dim=1)

        return idx
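
A minimal usage sketch for the model above, assuming model.py and its prereqs.nanoGPT dependency are importable from the working directory; the small config values are illustrative only:

# usage_sketch_model.py (hypothetical helper, not part of the commit)
import torch
from model import RotationallyInvariantGPT, RotationallyInvariantGPTConfig

# tiny illustrative config (assumption: any values satisfying n_embd % n_head == 0 work)
config = RotationallyInvariantGPTConfig(block_size=64, vocab_size=128,
                                        n_layer=2, n_head=4, n_embd=64, dropout=0.0)
model = RotationallyInvariantGPT(config)
model.eval()

idx = torch.randint(0, config.vocab_size, (1, 8))      # batch of 1, 8 token ids
logits, loss = model(idx)                              # no targets -> logits for last position only, loss is None
out = model.generate(idx, max_new_tokens=4, top_k=16)  # autoregressively sample 4 more tokens
print(logits.shape, out.shape)                         # (1, 1, 128) and (1, 12)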
preprocessing.py
ADDED
@@ -0,0 +1,113 @@
#./experiments/experiment1/preprocessing.py
import logging
import os
import sqlite3
from transformers import GPT2TokenizerFast
from datasets import load_dataset

class DatabaseInterface(object):
    def __init__(self, db_file):
        self.db_file = db_file

    def create_table(self, table_name=None):
        conn = sqlite3.connect(self.db_file)
        c = conn.cursor()
        c.execute(
            '''
            CREATE TABLE IF NOT EXISTS plain_text (
                text TEXT,
                split TEXT
            )
            '''
        )
        conn.commit()
        conn.close()

    def write_plain_text(self, example, split):
        conn = sqlite3.connect(self.db_file)
        c = conn.cursor()
        c.execute("INSERT INTO plain_text (text, split) VALUES (?, ?)",
                  (example, split))
        conn.commit()
        conn.close()


def process_and_write(example, writer, split):
    writer.write_plain_text(example, split)


def prepare_data(start_index, end_index, **kwargs):
    data_writer = kwargs['data_writer']
    train_dataset = kwargs['train_dataset']
    val_dataset = kwargs['val_dataset']

    for split, dataset in {'val': val_dataset, 'train': train_dataset}.items():
        subset = dataset[start_index:end_index]  # Select the subset based on start and end indices

        if isinstance(subset, dict):
            subset = subset["text"]  # Extract the "text" part from the subset dictionary

        for example in subset:
            process_and_write(example, data_writer, split)


if __name__ == '__main__':
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO
    )

    # Configs
    batch_size = 32
    num_processes = 4  # number of jobs to run simultaneously

    logging.info("Creating Database Interface")
    db_file_path = os.path.join('data', 'experiment1.db')

    _delete_db = True

    # Check to see if the database file already exists
    if os.path.exists(db_file_path):
        if _delete_db:
            logging.info(f"Database file {db_file_path} already exists. Deleting it.")
            os.remove(db_file_path)
            data_writer = DatabaseInterface(db_file_path)
            data_writer.create_table()
            logging.info("Database table `plain_text` created")
        else:
            logging.info(f"Database file {db_file_path} already exists. Connecting to it.")
            data_writer = DatabaseInterface(db_file_path)
    else:
        data_writer = DatabaseInterface(db_file_path)
        data_writer.create_table()
        logging.info("Database table `plain_text` created")

    # Optional local cache for the openwebtext dataset; None falls back to the default HF cache.
    cache_dir = None
    #cache_dir = os.path.join(
    #    'C:/Users/User/.cache/huggingface/datasets/openwebtext/plain_text',
    #    '1.0.0',
    #    '6f68e85c16ccc770c0dd489f4008852ea9633604995addd0cd76e293aed9e521'
    #)

    dataset = load_dataset(
        "openwebtext",
        cache_dir=cache_dir,
        num_proc=num_processes,
        save_infos=True,
        writer_batch_size=batch_size
    )

    split_dataset = dataset["train"].train_test_split(test_size=0.1, seed=42, shuffle=False)
    train_dataset = split_dataset["train"]
    val_dataset = split_dataset["test"]

    prepare_data(
        start_index=0,
        end_index=1000,
        **{
            'data_writer': data_writer,
            'train_dataset': train_dataset,
            'val_dataset': val_dataset,
        }
    )
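
A small sanity-check sketch, assuming preprocessing.py has already been run and data/experiment1.db exists; it counts the rows written to the plain_text table per split:

# db_sanity_check.py (hypothetical helper, not part of the commit)
import sqlite3

conn = sqlite3.connect('data/experiment1.db')
c = conn.cursor()
for split in ('train', 'val'):
    # count how many examples were written for each split
    c.execute("SELECT COUNT(*) FROM plain_text WHERE split=?", (split,))
    print(split, c.fetchone()[0])
conn.close()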
train.py
ADDED
@@ -0,0 +1,260 @@
#./experiments/experiment1/train.py
import logging
import pickle
import sqlite3
import torch
import torchvision
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import transformers

from model import RotationallyInvariantGPT, RotationallyInvariantGPTConfig
from prereqs.nanoGPT.model import GPTConfig, GPT, MLP
from datasets import load_from_disk
from torch.utils.data import DataLoader

from transformers import GPT2TokenizerFast

from torch.nn.utils.rnn import pad_sequence

def pad_collate(batch):
    # Separating inputs and labels
    inputs = [d['input_ids'] for d in batch]
    labels = [d['labels'] for d in batch]

    # Padding the input sequences
    input_tensor = pad_sequence(inputs, batch_first=True)

    # Padding the labels sequences
    label_tensor = pad_sequence(labels, batch_first=True)

    return {'input_ids': input_tensor, 'labels': label_tensor}

class DatabaseInterface(object):
    def __init__(self, db_file):
        self.db_file = db_file

    def read(self, split):
        conn = sqlite3.connect(self.db_file)
        c = conn.cursor()
        c.execute("SELECT * FROM plain_text WHERE split=?", (split,))
        col_names = [desc[0] for desc in c.description]  # get column names
        results = [dict(zip(col_names, row)) for row in c.fetchall()]  # convert tuples to dictionaries
        conn.close()
        return results


class PlainTextDataset(torch.utils.data.Dataset):
    def __init__(self, plain_text_dataset, tokenizer, device):
        self.plain_text_dataset = plain_text_dataset
        self.tokenizer = tokenizer
        self.device = device

    def __len__(self):
        return len(self.plain_text_dataset)

    def __getitem__(self, idx):
        item = self.plain_text_dataset[idx]
        tokens = self.tokenizer.encode_plus(item["text"], truncation=True, max_length=512, padding="max_length")
        input_ids = tokens["input_ids"]
        attention_mask = tokens["attention_mask"]
        return {
            'input_ids': torch.as_tensor(input_ids[:-1], dtype=torch.long).to(self.device),
            'attention_mask': torch.as_tensor(attention_mask[:-1], dtype=torch.long).to(self.device),
            'labels': torch.as_tensor(input_ids[1:], dtype=torch.long).to(self.device)
        }

def train(model: nn.Module, optimizer: optim.Optimizer, train_loader: DataLoader) -> float:
    model.train()
    running_loss = 0
    for i, batch in enumerate(train_loader):
        inputs, targets = batch['input_ids'].to(device), batch['labels'].to(device)
        optimizer.zero_grad()
        outputs, loss = model(inputs, targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        if i % 100 == 0:
            logging.info(f"Batch {i}: Loss={loss.item()}")
    return running_loss / len(train_loader)


def evaluate(model, valid_loader) -> float:
    model.eval()
    running_loss = 0
    with torch.no_grad():
        for i, batch in enumerate(valid_loader):
            inputs, targets = batch['input_ids'].to(device), batch['labels'].to(device)
            # the model returns a (logits, loss) tuple, same as in the training loop
            outputs, loss = model(inputs, targets)
            running_loss += loss.item()
            if i % 100 == 0:
                logging.info(f"Batch {i}: Validation Loss={loss.item()}")
    return running_loss / len(valid_loader)

if __name__ == '__main__':
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO
    )
    logging.info(f"PyTorch version: {torch.__version__}")
    logging.info(f"Torchvision version: {torchvision.__version__}")
    logging.info(f"Transformers version: {transformers.__version__}")
    logging.info(f"CUDA version: {torch.version.cuda}")
    logging.info(f"cuDNN version: {torch.backends.cudnn.version()}")

    logging.info("Clearing cuda cache...")
    torch.cuda.empty_cache()

    logging.info("Setting num_threads to 1...")
    torch.set_num_threads(1)

    # Configs
    d_model = 512
    num_heads = 4
    num_layers = 1
    block_size = 512
    dropout = 0.2
    bias = True
    rotational = True
    batch_size = 32
    eval_batch_size = 64
    epochs = 10
    lr = 0.001

    vocab_size = 50304  # GPT-2 vocab size (50257) padded up to the nearest multiple of 64
    logging.info(f"Vocab size: {vocab_size}")

    logging.info(f'''
    Config:
        d_model={d_model},
        num_heads={num_heads},
        num_layers={num_layers},
        block_size={block_size},
        dropout={dropout}, bias={bias}
    '''
    )
    logging.info(
        f"Training for {epochs} epochs with a learning rate of {lr}..."
    )

    logging.info(f"Batch size: {batch_size}")
    logging.info(f"Eval batch size: {eval_batch_size}")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # device = torch.device("cpu")
    logging.info(f"Device: {device}")

    logging.info("Loading tokenizer")
    tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    # Query the database for the plain text data
    logging.info("Querying plain text data...")

    db_file_path = "data/experiment1.db"

    plain_text_train = DatabaseInterface(db_file_path).read("train")
    #logging.debug(f"Plain text train: {plain_text_train[:10]}")

    plain_text_val = DatabaseInterface(db_file_path).read("val")
    #logging.debug(f"Plain text val: {plain_text_val[:10]}")

    # Create train/val dataset objects
    train_dataset = PlainTextDataset(plain_text_train, tokenizer, device)
    valid_dataset = PlainTextDataset(plain_text_val, tokenizer, device)

    # DEBUG
    #for idx, item in enumerate(train_dataset):
    #    input_ids = item["input_ids"]
    #    attention_mask = item["attention_mask"]
    #    if input_ids.size(0) == 0:
    #        print(f"Sample index with 0 length: {idx}")
    #        print(f"Input_ids: {input_ids}")
    #        print(f"Attention_mask: {attention_mask}")

    # Calculate the number of batches
    num_train_batches = len(train_dataset) // batch_size
    num_eval_batches = len(valid_dataset) // eval_batch_size

    logging.info(f"Number of train batches: {num_train_batches}")
    logging.info(f"Number of eval batches: {num_eval_batches}")

    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=pad_collate
    )

    valid_loader = DataLoader(
        valid_dataset,
        batch_size=eval_batch_size,
        shuffle=False,
        collate_fn=pad_collate
    )

    # gpt_config = GPTConfig(
    #     vocab_size=vocab_size,
    #     n_embd=d_model,
    #     n_head=num_heads,
    #     n_layer=num_layers,
    #     block_size=block_size,
    #     dropout=dropout,
    #     bias=bias
    # )

    rigpt_config = RotationallyInvariantGPTConfig(
        vocab_size=vocab_size,
        n_embd=d_model,
        n_head=num_heads,
        n_layer=num_layers,
        block_size=block_size,
        dropout=dropout,
        bias=bias,
        rotational_invariance=rotational
    )

    logging.info("Creating models...")
    # gpt = GPT(gpt_config).to(device)
    rigpt = RotationallyInvariantGPT(rigpt_config).to(device)

    logging.info("Creating optimizers...")
    # optimizer_gpt = optim.Adam(gpt.parameters(), lr=lr)
    optimizer_rigpt = optim.Adam(rigpt.parameters(), lr=lr)

    logging.info("Training...")
    for model, optimizer, model_name in [
        # (
        #     gpt,
        #     optimizer_gpt,
        #     'GPT'
        # ),
        (
            rigpt,
            optimizer_rigpt,
            'RotationallyInvariantGPT'
        )
    ]:
        print(f"Training {model_name}")
        for epoch in range(1, epochs + 1):
            print(f"Training epoch {epoch}")
            train_loss = train(model, optimizer, train_loader)
            print(f"Validating epoch {epoch}")
            valid_loss = evaluate(model, valid_loader)
            print(
                f'''
                {model_name} -
                Epoch: {epoch},
                Train loss: {train_loss:.3f},
                Validation loss: {valid_loss:.3f}
                '''
            )

    # torch.save(gpt.state_dict(), "gpt.pt")
    torch.save(rigpt.state_dict(), "rigpt.pt")
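
An inference sketch, assuming training has completed and rigpt.pt was saved with the same config values as in train.py above; it reloads the checkpoint and samples from a short prompt:

# inference_sketch.py (hypothetical helper, not part of the commit)
import torch
from transformers import GPT2TokenizerFast
from model import RotationallyInvariantGPT, RotationallyInvariantGPTConfig

# must match the config used when the checkpoint was saved (dropout is irrelevant for loading)
config = RotationallyInvariantGPTConfig(vocab_size=50304, n_embd=512, n_head=4, n_layer=1,
                                        block_size=512, dropout=0.0, bias=True)
model = RotationallyInvariantGPT(config)
model.load_state_dict(torch.load("rigpt.pt", map_location="cpu"))
model.eval()

tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
idx = torch.tensor([tokenizer.encode("The meaning of life is")], dtype=torch.long)
out = model.generate(idx, max_new_tokens=20, temperature=0.8, top_k=50)
print(tokenizer.decode(out[0].tolist()))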