winglian committed
Commit e8cbf50
1 Parent(s): d887ad8

attention_mask not needed for training (#642)


* attention_mask not needed for training

* specifically don't use attention mask for phi

* use a different check for phi

* small fixes since phi removed some values from its config
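
At bottom, the training-side change just drops the attention_mask column from the tokenized datasets before they reach the trainer. A minimal sketch of that operation with the datasets library, on toy data rather than anything from this repo:

from datasets import Dataset

# Toy tokenized split with the usual three columns.
ds = Dataset.from_dict(
    {
        "input_ids": [[1, 2, 3]],
        "attention_mask": [[1, 1, 1]],
        "labels": [[1, 2, 3]],
    }
)

# remove_columns accepts a single column name (or a list of names) and
# returns a new dataset without it.
ds = ds.remove_columns("attention_mask")
print(ds.column_names)  # ['input_ids', 'labels']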

src/axolotl/models/phi/modeling_mixformer_sequential.py CHANGED
@@ -711,12 +711,8 @@ class ParallelBlock(nn.Module):
         self.resid_dropout = nn.Dropout(config.resid_pdrop)
         self.block_idx = block_idx
 
-        self.mixer = MHA(config=config, **mixer, layer_idx=block_idx)
-        mlp_cls = mlp.pop("mlp_cls")
-        if mlp_cls == "fused_mlp":
-            self.mlp = FusedMLP(config=config, **mlp)
-        else:
-            self.mlp = MLP(config=config, **mlp)
+        self.mixer = MHA(config, layer_idx=block_idx)
+        self.mlp = MLP(config)
 
     def forward(
         self,
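
In the hunk above, the old constructor unpacked per-block mixer and mlp kwarg dicts and dispatched on mlp_cls, values the updated Phi config no longer provides; the new constructor builds both sub-modules straight from the config. A runnable sketch of the resulting shape, with toy stand-ins for the real config, MHA, and MLP classes defined elsewhere in this file:

from dataclasses import dataclass

import torch.nn as nn

@dataclass
class ToyConfig:  # stand-in for the real MixFormer config
    n_embd: int = 64
    n_head: int = 4
    resid_pdrop: float = 0.1

class MHA(nn.Module):  # stub for the real mixer defined in this file
    def __init__(self, config, layer_idx=None):
        super().__init__()
        self.inner = nn.MultiheadAttention(
            config.n_embd, config.n_head, batch_first=True
        )
        self.layer_idx = layer_idx

class MLP(nn.Module):  # stub for the real MLP defined in this file
    def __init__(self, config):
        super().__init__()
        self.fc = nn.Linear(config.n_embd, 4 * config.n_embd)

class ParallelBlock(nn.Module):
    def __init__(self, config, block_idx=None):
        super().__init__()
        self.resid_dropout = nn.Dropout(config.resid_pdrop)
        self.block_idx = block_idx
        # After the diff: no kwarg dicts, no FusedMLP-vs-MLP dispatch.
        self.mixer = MHA(config, layer_idx=block_idx)
        self.mlp = MLP(config)

block = ParallelBlock(ToyConfig(), block_idx=0)

One side effect worth noting: configs that previously selected fused_mlp now fall through to the plain MLP implementation, since the FusedMLP branch is gone.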
src/axolotl/utils/data.py CHANGED
@@ -76,7 +76,7 @@ def prepare_dataset(cfg, tokenizer):
 
     with zero_first(is_main_process()):
         train_dataset, eval_dataset = process_datasets_for_packing(
-            cfg, train_dataset, eval_dataset
+            cfg, train_dataset, eval_dataset, tokenizer
         )
         if cfg.max_steps:
             total_num_steps = min(
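
The extra tokenizer argument exists so the packing step can tell whether it is dealing with Phi at all: Phi-1/1.5 ship a CodeGen-derived tokenizer, and the downstream check keys on its class name. A hedged illustration (the model id is an assumption for demonstration, not something this diff pins down):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("microsoft/phi-1_5")
print(tok.__class__.__name__)                        # e.g. CodeGenTokenizerFast
print("CodeGenTokenizer" in tok.__class__.__name__)  # True -> Phi-style model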
src/axolotl/utils/trainer.py CHANGED
@@ -397,7 +397,7 @@ def disable_datasets_caching():
         set_caching_enabled(True)
 
 
-def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
+def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer):
     drop_long = partial(drop_long_seq, sequence_len=cfg.sequence_len)
     with zero_first(is_main_process()):
         train_dataset = train_dataset.filter(drop_long, num_proc=os.cpu_count())
@@ -414,6 +414,13 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
             eval_dataset = eval_dataset.map(
                 add_position_ids, num_proc=os.cpu_count()
             )
+
+        # Phi doesn't want the attention_mask feature when training
+        if "CodeGenTokenizer" in tokenizer.__class__.__name__:
+            train_dataset = train_dataset.remove_columns("attention_mask")
+            if eval_dataset:
+                eval_dataset = eval_dataset.remove_columns("attention_mask")
+
     return train_dataset, eval_dataset
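
Pulled out of context, the added branch amounts to the standalone helper below; a sketch under the assumption that eval_dataset may be None, which the in-tree code guards for. The stub tokenizer class only exists so the name check fires without downloading anything:

from datasets import Dataset

def drop_attention_mask_for_phi(train_dataset, eval_dataset, tokenizer):
    # Mirrors the added branch: a CodeGen-family tokenizer class name is the
    # signal that attention_mask should not be fed to the trainer.
    if "CodeGenTokenizer" in tokenizer.__class__.__name__:
        train_dataset = train_dataset.remove_columns("attention_mask")
        if eval_dataset:
            eval_dataset = eval_dataset.remove_columns("attention_mask")
    return train_dataset, eval_dataset

class CodeGenTokenizerStub:  # toy class so the substring check matches
    pass

train = Dataset.from_dict({"input_ids": [[1, 2]], "attention_mask": [[1, 1]]})
train, _ = drop_attention_mask_for_phi(train, None, CodeGenTokenizerStub())
print(train.column_names)  # ['input_ids']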