HaileyStorm committed
Commit 27f8947
Parent(s): 62e35f0

Upload chess-mamba-vs-xformer/train_bygame.py with huggingface_hub

chess-mamba-vs-xformer/train_bygame.py CHANGED
@@ -92,17 +92,22 @@ anneal_decay_iters = None # Set at init
 
 if model_type == 'mamba':
     from mamba_lm import MambaLM, MambaLMConfig
+    from mamba_ssm import MambaLMHeadModel
     model_config = MambaLMConfig(
         d_model=d_model,
-        n_layers=n_layer,
-        dt_rank=dt_rank,
-        d_state=d_state,
-        expand_factor=expand_factor,
-        bias=bias,
-        conv_bias=conv_bias,
-        pscan=pscan,
-        vocab_size=vocab_size
-    )
+        #n_layers=n_layer,
+        n_layer=n_layer,
+        ssm_cfg={
+            'dt_rank': dt_rank,
+            'd_state': d_state,
+            #'expand_factor': expand_factor,
+            'bias': bias,
+            'conv_bias': conv_bias,
+            #'pscan': pscan,
+        },
+        vocab_size=vocab_size,
+        pad_vocab_size_multiple=1
+    ).to_mamba_config()
 elif model_type == 'xformer':
     from xformer import GPTConfig, GPT
     model_config = GPTConfig(
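This hunk swaps the pure-PyTorch mamba.py configuration (n_layers, expand_factor, pscan) for one the CUDA-accelerated mamba_ssm package understands: per-block settings move into ssm_cfg, and .to_mamba_config() converts the wrapper config before it reaches MambaLMHeadModel. Below is a minimal sketch of an equivalent setup written directly against mamba_ssm's public classes; the hyperparameter values are illustrative, not the project's, and the direct-MambaConfig route is an assumption (the script goes through its local mamba_lm wrapper instead).

```python
# Sketch only: build an equivalent model directly with mamba_ssm's public classes.
# All hyperparameter values below are illustrative, not the project's settings.
import torch
from mamba_ssm.models.config_mamba import MambaConfig
from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel

config = MambaConfig(
    d_model=512,
    n_layer=16,                 # mamba_ssm uses n_layer, not mamba.py's n_layers
    vocab_size=32,
    ssm_cfg={                   # forwarded as keyword args to every Mamba block
        "dt_rank": "auto",
        "d_state": 16,
        "bias": False,
        "conv_bias": True,
    },
    pad_vocab_size_multiple=1,  # keep the LM head exactly vocab_size wide
)

# mamba_ssm's selective-scan kernels are CUDA-only, so build the model on the GPU.
model = MambaLMHeadModel(config, device="cuda", dtype=torch.float32)
print(sum(p.numel() for p in model.parameters()), "parameters")
```

Options that only exist in mamba.py (expand_factor, pscan) have no mamba_ssm equivalent under those names, which is why the diff comments them out rather than moving them into ssm_cfg.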
@@ -152,10 +157,13 @@ train_files = glob.glob(os.path.join(data_dir, 'train*.parquet')) + \
               glob.glob(os.path.join(data_dir, 'stable*.parquet')) + \
               glob.glob(os.path.join(data_dir, 'anneal*.parquet'))
 train_datasets = []
+print("Loading dataset...")
 for f in train_files:
     dataset = pq.read_table(f).to_pandas()
     dataset = dataset[dataset['tokenized'].apply(len) >= 8]
     train_datasets.append(dataset)
+    print('.', end='', flush=True)
+print("\nLoaded.")
 #val_data = pq.read_table(os.path.join(data_dir, 'val.parquet')).to_pandas()
 #val_data = val_data[val_data['tokenized'].apply(len) >= 8]
 truncated_games_count = 0
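The only change here is coarse progress output while the parquet shards are read and filtered to games with at least 8 tokens. As an illustration only, the same loop could be factored into a helper that reports per-file counts; the function name and message format below are not from the repo.

```python
# Sketch only: the shard-loading loop as a helper with per-file progress reporting.
import glob
import os
import pyarrow.parquet as pq

def load_game_frames(data_dir, min_tokens=8):
    files = sorted(glob.glob(os.path.join(data_dir, 'train*.parquet')) +
                   glob.glob(os.path.join(data_dir, 'stable*.parquet')) +
                   glob.glob(os.path.join(data_dir, 'anneal*.parquet')))
    frames = []
    for i, f in enumerate(files, 1):
        df = pq.read_table(f).to_pandas()
        df = df[df['tokenized'].apply(len) >= min_tokens]  # drop games too short to train on
        frames.append(df)
        print(f"[{i}/{len(files)}] {os.path.basename(f)}: {len(df)} games kept", flush=True)
    return frames
```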
@@ -217,7 +225,8 @@ if init_from == 'scratch':
     else:
         model_config.vocab_size = meta_vocab_size
     if model_type == 'mamba':
-        model = MambaLM(model_config)
+        #model = MambaLM(model_config)
+        model = MambaLMHeadModel(model_config)
     else:
         model = GPT(model_config)
     if auto_clip:
@@ -233,7 +242,8 @@ elif init_from == 'resume' or init_from == 'anneal':
     checkpoint = torch.load(ckpt_path, map_location=device)
     model_config = checkpoint['model_args']
     if model_type == 'mamba':
-        model = MambaLM(model_config)
+        #model = MambaLM(model_config)
+        model = MambaLMHeadModel(model_config)
     else:
         model = GPT(model_config)
     state_dict = checkpoint['model']
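Both the scratch and resume/anneal paths now construct MambaLMHeadModel from the derived or saved config. The sketch below shows how the resume branch fits together under stated assumptions: the checkpoint path, device, and the '_orig_mod.' prefix strip are illustrative (that cleanup is the usual nanoGPT convention for weights saved from a torch.compile wrapper, not necessarily present in this script).

```python
# Sketch only: resuming a MambaLMHeadModel from a checkpoint with the layout used above.
import torch
from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel

ckpt_path = 'out/ckpt.pt'   # illustrative location
device = 'cuda'

checkpoint = torch.load(ckpt_path, map_location=device)
model_config = checkpoint['model_args']      # the MambaConfig saved at training time
model = MambaLMHeadModel(model_config)

state_dict = checkpoint['model']
prefix = '_orig_mod.'                        # left behind by torch.compile wrappers
state_dict = {k[len(prefix):] if k.startswith(prefix) else k: v
              for k, v in state_dict.items()}
model.load_state_dict(state_dict)
model.to(device)
```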
@@ -309,10 +319,11 @@ if ddp:
 
 def batch_to_loss(sequences, max_length_in_batch):
     if model_type == 'mamba':
-        logits = model(sequences[:, :-1]) # Forward pass, exclude last token for input
+        logits = model(sequences[:, :-1]).logits # Forward pass, exclude last token for input
         # Compute loss (assuming next token prediction task)
         targets = sequences[:, 1:].reshape(-1) # Shifted by one for next token prediction
         return F.cross_entropy(logits.view(-1, logits.size(-1)), targets)
+        #return F.cross_entropy(logits.reshape(-1), targets)
     else:
         inputs = sequences[:, :-1]
         targets = sequences[:, 1:].reshape(-1)
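The functional change in the loss path is the '.logits' accessor: mamba_ssm's MambaLMHeadModel returns a namedtuple-style CausalLMOutput whose logits field holds the (batch, seq_len, vocab) tensor, whereas the previous MambaLM returned the tensor directly. A self-contained sketch of the same next-token objective, with shapes noted as comments:

```python
# Sketch only: next-token cross-entropy against a model whose forward returns .logits.
import torch
import torch.nn.functional as F

def next_token_loss(model, sequences):
    # sequences: (batch, seq_len) integer token ids for one padded batch of games.
    inputs = sequences[:, :-1]                    # predict token t+1 from tokens <= t
    targets = sequences[:, 1:].reshape(-1)
    logits = model(inputs).logits                 # (batch, seq_len - 1, vocab)
    return F.cross_entropy(logits.view(-1, logits.size(-1)), targets)
```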
@@ -474,7 +485,7 @@ while True:
     scaler.update()
     # flush the gradients as soon as we can, no need for this memory anymore
     optimizer.zero_grad(set_to_none=True)
-    torch.cuda.empty_cache()
+    #torch.cuda.empty_cache()
 
     # timing and logging
     t1 = time.time()
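Commenting out the per-iteration torch.cuda.empty_cache() avoids paying allocator overhead on every step; PyTorch's caching allocator reuses blocks freed by zero_grad(set_to_none=True) without returning them to the driver. If cache pressure ever needs relieving, doing it infrequently is a reasonable compromise; the helper below is only a sketch, and its name and interval are made up rather than knobs in this script.

```python
# Sketch only: clear the CUDA cache occasionally instead of every training iteration.
import torch

def maybe_empty_cache(step, every=1000):
    # Releasing cached blocks back to the driver each step adds overhead for little
    # benefit, since the caching allocator already reuses freed memory.
    if torch.cuda.is_available() and step % every == 0:
        torch.cuda.empty_cache()
```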
 