winglian committed on
Commit
d85d494
·
unverified ·
1 Parent(s): 02f2c72

report min length of tokenized data (#1186) [skip ci]

Browse files
Files changed (1) hide show
  1. src/axolotl/utils/trainer.py +2 -0
src/axolotl/utils/trainer.py CHANGED
@@ -110,6 +110,8 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
110
  drop_long = partial(drop_long_seq, sequence_len=cfg.sequence_len)
111
  with zero_first(is_main_process()):
112
  if cfg.is_preprocess:
 
 
113
  max_input_len = np.max(get_dataset_lengths(train_dataset))
114
  LOG.debug(f"max_input_len: {max_input_len}", main_process_only=True)
115
 
 
110
  drop_long = partial(drop_long_seq, sequence_len=cfg.sequence_len)
111
  with zero_first(is_main_process()):
112
  if cfg.is_preprocess:
113
+ min_input_len = np.min(get_dataset_lengths(train_dataset))
114
+ LOG.debug(f"min_input_len: {min_input_len}", main_process_only=True)
115
  max_input_len = np.max(get_dataset_lengths(train_dataset))
116
  LOG.debug(f"max_input_len: {max_input_len}", main_process_only=True)
117