report min length of tokenized data (#1186) [skip ci]
Browse files
src/axolotl/utils/trainer.py
CHANGED
@@ -110,6 +110,8 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
|
|
110 |
drop_long = partial(drop_long_seq, sequence_len=cfg.sequence_len)
|
111 |
with zero_first(is_main_process()):
|
112 |
if cfg.is_preprocess:
|
|
|
|
|
113 |
max_input_len = np.max(get_dataset_lengths(train_dataset))
|
114 |
LOG.debug(f"max_input_len: {max_input_len}", main_process_only=True)
|
115 |
|
|
|
110 |
drop_long = partial(drop_long_seq, sequence_len=cfg.sequence_len)
|
111 |
with zero_first(is_main_process()):
|
112 |
if cfg.is_preprocess:
|
113 |
+
min_input_len = np.min(get_dataset_lengths(train_dataset))
|
114 |
+
LOG.debug(f"min_input_len: {min_input_len}", main_process_only=True)
|
115 |
max_input_len = np.max(get_dataset_lengths(train_dataset))
|
116 |
LOG.debug(f"max_input_len: {max_input_len}", main_process_only=True)
|
117 |
|