Far El winglian committed on
Commit
bcdc9b1
·
unverified ·
1 Parent(s): c19d060

Fix falcon tokenization step (#1441) [skip ci]

Browse files

* Fix falcon tokenization step

* chore: lint

---------

Co-authored-by: Wing Lian <wing.lian@gmail.com>

Files changed (1) hide show
  1. src/axolotl/utils/trainer.py +4 -3
src/axolotl/utils/trainer.py CHANGED
@@ -124,9 +124,10 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
124
  eval_dataset = eval_dataset.remove_columns("attention_mask")
125
 
126
  if cfg.model_config_type == "falcon":
127
- LOG.info("dropping token_type_ids column")
128
- train_dataset = train_dataset.remove_columns("token_type_ids")
129
- if eval_dataset:
 
130
  eval_dataset = eval_dataset.remove_columns("token_type_ids")
131
 
132
  train_dataset = train_dataset.filter(
 
124
  eval_dataset = eval_dataset.remove_columns("attention_mask")
125
 
126
  if cfg.model_config_type == "falcon":
127
+ LOG.info("dropping token_type_ids column if it exists")
128
+ if "token_type_ids" in train_dataset.column_names:
129
+ train_dataset = train_dataset.remove_columns("token_type_ids")
130
+ if eval_dataset and "token_type_ids" in eval_dataset.column_names:
131
  eval_dataset = eval_dataset.remove_columns("token_type_ids")
132
 
133
  train_dataset = train_dataset.filter(