NEW RUN 2023-02-15-06-28-07 {'data_order': '', 'load_model': 'out/symatho/rwkv-6.pth', 'wandb': '', 'proj_dir': 'out', 'random_seed': -1, 'data_file': '../data/tho.dense.xyz', 'data_type': 'symato', 'vocab_size': 2944, 'ctx_len': 256, 'epoch_steps': 2500, 'epoch_count': 50, 'epoch_begin': 0, 'epoch_save': 2, 'micro_bsz': 112, 'n_layer': 6, 'n_embd': 512, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 7e-05, 'lr_final': 1e-05, 'warmup_steps': 0, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-08, 'grad_cp': 0, 'my_pile_stage': 0, 'my_pile_shift': -1, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 200, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 0, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'ddp_find_unused_parameters_false', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2023-02-15-06-28-07', 'betas': (0.9, 0.99), 'real_bsz': 112, 'run_name': '0 ctx256 L6 D512'} 0 0.857739 2.3578 0.00006733 2023-02-15 06:46:01.519094 0 1 0.854229 2.3496 0.00006476 2023-02-15 07:03:00.012250 1 2 0.851497 2.3432 0.00006229 2023-02-15 07:19:57.307930 2 3 0.846927 2.3325 0.00005991 2023-02-15 07:36:54.553249 3 4 0.843902 2.3254 0.00005762 2023-02-15 07:53:51.418850 4 5 0.839696 2.3157 0.00005542 2023-02-15 08:10:48.303610 5 6 0.835853 2.3068 0.00005331 2023-02-15 08:27:45.794078 6 NEW RUN 2023-02-15-08-29-25 {'data_order': '', 'load_model': 'out/rwkv-6.pth', 'wandb': '', 'proj_dir': 'out', 'random_seed': -1, 'data_file': '../data/tho.dense.xyz', 'data_type': 'symato', 'vocab_size': 2944, 'ctx_len': 256, 'epoch_steps': 2500, 'epoch_count': 50, 'epoch_begin': 0, 'epoch_save': 2, 'micro_bsz': 112, 'n_layer': 6, 'n_embd': 512, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 6e-05, 'lr_final': 1e-05, 'warmup_steps': 0, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-08, 'grad_cp': 0, 'my_pile_stage': 0, 'my_pile_shift': -1, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 200, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 0, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'ddp_find_unused_parameters_false', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2023-02-15-08-29-25', 'betas': (0.9, 0.99), 'real_bsz': 112, 'run_name': '0 ctx256 L6 D512'} NEW RUN 2023-02-15-08-31-41 {'data_order': '', 'load_model': 'out/rwkv-6.pth', 'wandb': '', 'proj_dir': 'out', 'random_seed': -1, 'data_file': '../data/tho.dense.xyz', 'data_type': 'symato', 'vocab_size': 2944, 'ctx_len': 256, 'epoch_steps': 2500, 'epoch_count': 50, 'epoch_begin': 0, 'epoch_save': 2, 'micro_bsz': 112, 'n_layer': 6, 'n_embd': 512, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 6e-05, 'lr_final': 1e-05, 'warmup_steps': 0, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-08, 'grad_cp': 0, 'my_pile_stage': 0, 'my_pile_shift': -1, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 200, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 0, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'ddp_find_unused_parameters_false', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2023-02-15-08-31-41', 'betas': (0.9, 0.99), 'real_bsz': 112, 'run_name': '0 ctx256 L6 D512'} 0 0.834407 2.3034 0.00005789 2023-02-15 08:49:25.234338 0 1 0.831531 2.2968 0.00005585 2023-02-15 09:06:20.485094 1 2 0.829730 2.2927 0.00005388 2023-02-15 09:23:14.424752 2 3 0.826368 2.2850 0.00005199 2023-02-15 09:40:08.736941 3 4 0.822394 2.2759 0.00005016 2023-02-15 09:57:03.307645 4 5 0.818884 2.2680 0.00004839 2023-02-15 10:13:58.151200 5 6 0.817309 2.2644 0.00004669 2023-02-15 10:30:52.723144 6 7 0.813574 2.2560 0.00004505 2023-02-15 10:47:47.135797 7 8 0.811447 2.2512 0.00004346 2023-02-15 11:04:41.410944 8 9 0.809101 2.2459 0.00004193 2023-02-15 11:21:36.330813 9 10 0.806216 2.2394 0.00004045 2023-02-15 11:38:30.884686 10 11 0.802590 2.2313 0.00003903 2023-02-15 11:55:27.530524 11 12 0.801704 2.2293 0.00003766 2023-02-15 12:12:21.848468 12 13 0.798498 2.2222 0.00003633 2023-02-15 12:29:16.886593 13 14 0.797000 2.2189 0.00003505 2023-02-15 12:46:11.329404 14 15 0.794856 2.2141 0.00003382 2023-02-15 13:03:05.716940 15 16 0.792826 2.2096 0.00003263 2023-02-15 13:20:00.207152 16 17 0.790887 2.2054 0.00003148 2023-02-15 13:36:54.751903 17 18 0.788561 2.2002 0.00003037 2023-02-15 13:53:49.569712 18 19 0.786226 2.1951 0.00002930 2023-02-15 14:10:44.069815 19 20 0.785394 2.1933 0.00002827 2023-02-15 14:27:38.966822 20 21 0.782471 2.1869 0.00002727 2023-02-15 14:44:33.500812 21 22 0.781069 2.1838 0.00002631 2023-02-15 15:01:28.827403 22 23 0.780484 2.1825 0.00002539 2023-02-15 15:18:23.439338 23 24 0.778297 2.1778 0.00002449 2023-02-15 15:35:17.878754 24 25 0.776599 2.1741 0.00002363 2023-02-15 15:52:12.615589 25 26 0.774825 2.1702 0.00002280 2023-02-15 16:09:07.316768 26