attention_mask not needed for training (#642)

* attention_mask not needed for training
* specifically don't use attention mask for phi
* use a different check for phi
* small fixes since phi removed some values from their config
src/axolotl/models/phi/modeling_mixformer_sequential.py CHANGED

@@ -711,12 +711,8 @@ class ParallelBlock(nn.Module):
         self.resid_dropout = nn.Dropout(config.resid_pdrop)
         self.block_idx = block_idx
 
-        self.mixer = MHA(config
-
-        if mlp_cls == "fused_mlp":
-            self.mlp = FusedMLP(config=config, **mlp)
-        else:
-            self.mlp = MLP(config=config, **mlp)
+        self.mixer = MHA(config, layer_idx=block_idx)
+        self.mlp = MLP(config)
 
     def forward(
         self,
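The simplified constructor mirrors upstream phi, which (per the commit notes) dropped the per-layer mixer/mlp values from its config, so MHA and MLP are now built directly from the config. For orientation only, here is a minimal, self-contained sketch of the parallel-block pattern that ParallelBlock implements: attention and MLP both read the same layer-normed input and their outputs are summed onto the residual stream. This is not the phi code; ToyParallelBlock, the sizes, and the use of nn.MultiheadAttention in place of phi's MHA are all made up for the example.

# Illustrative sketch only, not phi's MHA/MLP; nn.MultiheadAttention stands in
# for the real mixer so the example stays self-contained and runnable.
import torch
import torch.nn as nn


class ToyParallelBlock(nn.Module):
    def __init__(self, hidden_size: int, num_heads: int, dropout: float = 0.0):
        super().__init__()
        self.ln = nn.LayerNorm(hidden_size)
        self.resid_dropout = nn.Dropout(dropout)
        # analogous to `self.mixer = MHA(config, layer_idx=block_idx)` and
        # `self.mlp = MLP(config)` in the diff above
        self.mixer = nn.MultiheadAttention(hidden_size, num_heads, batch_first=True)
        self.mlp = nn.Sequential(
            nn.Linear(hidden_size, 4 * hidden_size),
            nn.GELU(),
            nn.Linear(4 * hidden_size, hidden_size),
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        residual = hidden_states
        normed = self.ln(hidden_states)
        attn_out, _ = self.mixer(normed, normed, normed, need_weights=False)
        mlp_out = self.mlp(normed)
        # parallel residual: both branch outputs are added back to the input
        return residual + self.resid_dropout(attn_out) + self.resid_dropout(mlp_out)


block = ToyParallelBlock(hidden_size=64, num_heads=4)
out = block(torch.randn(2, 10, 64))  # (batch, seq_len, hidden_size)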
src/axolotl/utils/data.py CHANGED

@@ -76,7 +76,7 @@ def prepare_dataset(cfg, tokenizer):
 
     with zero_first(is_main_process()):
         train_dataset, eval_dataset = process_datasets_for_packing(
-            cfg, train_dataset, eval_dataset
+            cfg, train_dataset, eval_dataset, tokenizer
         )
     if cfg.max_steps:
         total_num_steps = min(
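Threading the tokenizer through here is what lets the packing step recognize phi downstream: phi ships a CodeGen-derived tokenizer, so its class name is the signal the new check relies on. A quick way to see that, assuming network access (the model id is only an example):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5", trust_remote_code=True)
print(tokenizer.__class__.__name__)                        # e.g. CodeGenTokenizerFast
print("CodeGenTokenizer" in tokenizer.__class__.__name__)  # True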
src/axolotl/utils/trainer.py CHANGED

@@ -397,7 +397,7 @@ def disable_datasets_caching():
         set_caching_enabled(True)
 
 
-def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
+def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer):
     drop_long = partial(drop_long_seq, sequence_len=cfg.sequence_len)
     with zero_first(is_main_process()):
         train_dataset = train_dataset.filter(drop_long, num_proc=os.cpu_count())
@@ -414,6 +414,13 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
                 eval_dataset = eval_dataset.map(
                     add_position_ids, num_proc=os.cpu_count()
                 )
+
+    # Phi doesn't want the attention_mask feature when training
+    if "CodeGenTokenizer" in tokenizer.__class__.__name__:
+        train_dataset = train_dataset.remove_columns("attention_mask")
+        if eval_dataset:
+            eval_dataset = eval_dataset.remove_columns("attention_mask")
+
     return train_dataset, eval_dataset
 
 
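To see what the new branch does to the dataset features in isolation, here is a toy illustration with made-up data; the stand-in tokenizer class below only mimics the name check and is not the real transformers class.

from datasets import Dataset


class CodeGenTokenizerFast:  # stand-in: only the class name matters for the check
    pass


tokenizer = CodeGenTokenizerFast()

train_dataset = Dataset.from_dict(
    {
        "input_ids": [[1, 2, 3], [4, 5, 6]],
        "attention_mask": [[1, 1, 1], [1, 1, 1]],
        "labels": [[1, 2, 3], [4, 5, 6]],
    }
)

# Phi doesn't want the attention_mask feature when training
if "CodeGenTokenizer" in tokenizer.__class__.__name__:
    train_dataset = train_dataset.remove_columns("attention_mask")

print(train_dataset.column_names)  # ['input_ids', 'labels']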