- dataset_max_seq_length=1024, dataset_sample_size=1000000, dataset_subset=20231101.en, dataset_uri=wikimedia_wikipedia, per_device_train_batch_size=8
- dataset_max_seq_length=1024, dataset_sample_size=1000000, dataset_subset=20231101.en, dataset_uri=wikimedia_wikipedia
- dataset_max_seq_length=1024, dataset_sample_size=1000000, dataset_subset=None, dataset_uri=distily_filtered_redpajama_en, per_device_train_batch_size=8
- dataset_max_seq_length=1024, dataset_sample_size=1000000, dataset_subset=sample-10BT, dataset_uri=HuggingFaceFW_fineweb, learning_rate=6e-05, per_device_train_batch_size=8
- dataset_max_seq_length=1024, dataset_sample_size=1000000, dataset_subset=sample-10BT, dataset_uri=HuggingFaceFW_fineweb, logits_loss_fn=liger_kl, per_device_train_batch_size=8
- dataset_max_seq_length=1024, dataset_sample_size=1000000, dataset_subset=sample-10BT, dataset_uri=HuggingFaceFW_fineweb, per_device_train_batch_size=8
- dataset_max_seq_length=1024, dataset_sample_size=1000000, dataset_subset=sample-10BT, dataset_uri=HuggingFaceFW_fineweb-edu, learning_rate=6e-05, per_device_train_batch_size=8
- dataset_max_seq_length=1024, dataset_sample_size=1000000, dataset_subset=sample-10BT, dataset_uri=HuggingFaceFW_fineweb-edu, per_device_train_batch_size=8
- dataset_max_seq_length=1024, dataset_sample_size=4000000, dataset_subset=20231101.en, dataset_uri=wikimedia_wikipedia, per_device_train_batch_size=8
- dataset_sample_size=1000000, dataset_subset=20231101.en, dataset_uri=wikimedia_wikipedia, per_device_train_batch_size=8
- dataset_sample_size=4000000, dataset_subset=20231101.en, dataset_uri=wikimedia_wikipedia