Fix falcon tokenization step (#1441) [skip ci]
Browse files
* Fix falcon tokenization step
* chore: lint
---------
Co-authored-by: Wing Lian <wing.lian@gmail.com>
src/axolotl/utils/trainer.py
CHANGED
@@ -124,9 +124,10 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
|
|
124 |
eval_dataset = eval_dataset.remove_columns("attention_mask")
|
125 |
|
126 |
if cfg.model_config_type == "falcon":
|
127 |
-
LOG.info("dropping token_type_ids column")
|
128 |
-
|
129 |
-
|
|
|
130 |
eval_dataset = eval_dataset.remove_columns("token_type_ids")
|
131 |
|
132 |
train_dataset = train_dataset.filter(
|
|
|
124 |
eval_dataset = eval_dataset.remove_columns("attention_mask")
|
125 |
|
126 |
if cfg.model_config_type == "falcon":
|
127 |
+
LOG.info("dropping token_type_ids column if it exists")
|
128 |
+
if "token_type_ids" in train_dataset.column_names:
|
129 |
+
train_dataset = train_dataset.remove_columns("token_type_ids")
|
130 |
+
if eval_dataset and "token_type_ids" in eval_dataset.column_names:
|
131 |
eval_dataset = eval_dataset.remove_columns("token_type_ids")
|
132 |
|
133 |
train_dataset = train_dataset.filter(
|