diff --git a/full_checkpoints/Model-B-16_Data-2B_Samples-13B_lr-1e-3_bs-88k/epoch_1.pt.txt b/full_checkpoints/Model-B-16_Data-2B_Samples-13B_lr-1e-3_bs-88k/epoch_1.pt.txt
deleted file mode 100644
index 1a9d46cfa102d30e8dd18a9b60031c55b19d0bb3..0000000000000000000000000000000000000000
--- a/full_checkpoints/Model-B-16_Data-2B_Samples-13B_lr-1e-3_bs-88k/epoch_1.pt.txt
+++ /dev/null
@@ -1,432 +0,0 @@
-2022-12-21,09:36:42 | INFO | Running with a single process. Device cuda:0.
-2022-12-21,09:36:42 | INFO | Loading ViT-B-16 model config.
-2022-12-21,09:36:43 | INFO | Loading pretrained ViT-B-16 weights (logs/cvpr_stability/ViT-B-16_2B_13b/checkpoints/epoch_1.pt).
-2022-12-21,09:36:52 | INFO | Model:
-2022-12-21,09:36:52 | INFO | CLIP(
-  (visual): VisualTransformer(
-    (conv1): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), bias=False)
-    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-    (transformer): Transformer(
-      (resblocks): ModuleList(
-        (0): ResidualAttentionBlock(
-          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (attn): MultiheadAttention(
-            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
-          )
-          (ln_attn): Identity()
-          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (mlp): Sequential(
-            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
-            (ln): Identity()
-            (gelu): GELU(approximate=none)
-            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
-          )
-        )
-        (1): ResidualAttentionBlock(
-          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (attn): MultiheadAttention(
-            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
-          )
-          (ln_attn): Identity()
-          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (mlp): Sequential(
-            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
-            (ln): Identity()
-            (gelu): GELU(approximate=none)
-            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
-          )
-        )
-        (2): ResidualAttentionBlock(
-          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (attn): MultiheadAttention(
-            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
-          )
-          (ln_attn): Identity()
-          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (mlp): Sequential(
-            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
-            (ln): Identity()
-            (gelu): GELU(approximate=none)
-            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
-          )
-        )
-        (3): ResidualAttentionBlock(
-          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (attn): MultiheadAttention(
-            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
-          )
-          (ln_attn): Identity()
-          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (mlp): Sequential(
-            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
-            (ln): Identity()
-            (gelu): GELU(approximate=none)
-            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
-          )
-        )
-        (4): ResidualAttentionBlock(
-          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (attn): MultiheadAttention(
-            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
-          )
-          (ln_attn): Identity()
-          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (mlp): Sequential(
-            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
-            (ln): Identity()
-            (gelu): GELU(approximate=none)
-            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
-          )
-        )
-        (5): ResidualAttentionBlock(
-          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (attn): MultiheadAttention(
-            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
-          )
-          (ln_attn): Identity()
-          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (mlp): Sequential(
-            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
-            (ln): Identity()
-            (gelu): GELU(approximate=none)
-            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
-          )
-        )
-        (6): ResidualAttentionBlock(
-          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (attn): MultiheadAttention(
-            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
-          )
-          (ln_attn): Identity()
-          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (mlp): Sequential(
-            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
-            (ln): Identity()
-            (gelu): GELU(approximate=none)
-            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
-          )
-        )
-        (7): ResidualAttentionBlock(
-          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (attn): MultiheadAttention(
-            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
-          )
-          (ln_attn): Identity()
-          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (mlp): Sequential(
-            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
-            (ln): Identity()
-            (gelu): GELU(approximate=none)
-            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
-          )
-        )
-        (8): ResidualAttentionBlock(
-          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (attn): MultiheadAttention(
-            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
-          )
-          (ln_attn): Identity()
-          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (mlp): Sequential(
-            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
-            (ln): Identity()
-            (gelu): GELU(approximate=none)
-            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
-          )
-        )
-        (9): ResidualAttentionBlock(
-          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (attn): MultiheadAttention(
-            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
-          )
-          (ln_attn): Identity()
-          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (mlp): Sequential(
-            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
-            (ln): Identity()
-            (gelu): GELU(approximate=none)
-            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
-          )
-        )
-        (10): ResidualAttentionBlock(
-          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (attn): MultiheadAttention(
-            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
-          )
-          (ln_attn): Identity()
-          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (mlp): Sequential(
-            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
-            (ln): Identity()
-            (gelu): GELU(approximate=none)
-            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
-          )
-        )
-        (11): ResidualAttentionBlock(
-          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (attn): MultiheadAttention(
-            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
-          )
-          (ln_attn): Identity()
-          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (mlp): Sequential(
-            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
-            (ln): Identity()
-            (gelu): GELU(approximate=none)
-            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
-          )
-        )
-      )
-    )
-    (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-  )
-  (transformer): Transformer(
-    (resblocks): ModuleList(
-      (0): ResidualAttentionBlock(
-        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
-        (attn): MultiheadAttention(
-          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
-        )
-        (ln_attn): Identity()
-        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
-        (mlp): Sequential(
-          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
-          (ln): Identity()
-          (gelu): GELU(approximate=none)
-          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
-        )
-      )
-      (1): ResidualAttentionBlock(
-        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
-        (attn): MultiheadAttention(
-          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
-        )
-        (ln_attn): Identity()
-        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
-        (mlp): Sequential(
-          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
-          (ln): Identity()
-          (gelu): GELU(approximate=none)
-          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
-        )
-      )
-      (2): ResidualAttentionBlock(
-        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
-        (attn): MultiheadAttention(
-          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
-        )
-        (ln_attn): Identity()
-        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
-        (mlp): Sequential(
-          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
-          (ln): Identity()
-          (gelu): GELU(approximate=none)
-          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
-        )
-      )
-      (3): ResidualAttentionBlock(
-        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
-        (attn): MultiheadAttention(
-          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
-        )
-        (ln_attn): Identity()
-        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
-        (mlp): Sequential(
-          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
-          (ln): Identity()
-          (gelu): GELU(approximate=none)
-          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
-        )
-      )
-      (4): ResidualAttentionBlock(
-        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
-        (attn): MultiheadAttention(
-          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
-        )
-        (ln_attn): Identity()
-        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
-        (mlp): Sequential(
-          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
-          (ln): Identity()
-          (gelu): GELU(approximate=none)
-          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
-        )
-      )
-      (5): ResidualAttentionBlock(
-        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
-        (attn): MultiheadAttention(
-          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
-        )
-        (ln_attn): Identity()
-        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
-        (mlp): Sequential(
-          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
-          (ln): Identity()
-          (gelu): GELU(approximate=none)
-          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
-        )
-      )
-      (6): ResidualAttentionBlock(
-        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
-        (attn): MultiheadAttention(
-          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
-        )
-        (ln_attn): Identity()
-        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
-        (mlp): Sequential(
-          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
-          (ln): Identity()
-          (gelu): GELU(approximate=none)
-          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
-        )
-      )
-      (7): ResidualAttentionBlock(
-        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
-        (attn): MultiheadAttention(
-          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
-        )
-        (ln_attn): Identity()
-        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
-        (mlp): Sequential(
-          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
-          (ln): Identity()
-          (gelu): GELU(approximate=none)
-          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
-        )
-      )
-      (8): ResidualAttentionBlock(
-        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
-        (attn): MultiheadAttention(
-          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
-        )
-        (ln_attn): Identity()
-        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
-        (mlp): Sequential(
-          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
-          (ln): Identity()
-          (gelu): GELU(approximate=none)
-          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
-        )
-      )
-      (9): ResidualAttentionBlock(
-        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
-        (attn): MultiheadAttention(
-          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
-        )
-        (ln_attn): Identity()
-        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
-        (mlp): Sequential(
-          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
-          (ln): Identity()
-          (gelu): GELU(approximate=none)
-          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
-        )
-      )
-      (10): ResidualAttentionBlock(
-        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
-        (attn): MultiheadAttention(
-          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
-        )
-        (ln_attn): Identity()
-        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
-        (mlp): Sequential(
-          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
-          (ln): Identity()
-          (gelu): GELU(approximate=none)
-          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
-        )
-      )
-      (11): ResidualAttentionBlock(
-        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
-        (attn): MultiheadAttention(
-          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
-        )
-        (ln_attn): Identity()
-        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
-        (mlp): Sequential(
-          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
-          (ln): Identity()
-          (gelu): GELU(approximate=none)
-          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
-        )
-      )
-    )
-  )
-  (token_embedding): Embedding(49408, 512)
-  (ln_final): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
-)
-2022-12-21,09:36:52 | INFO | Params:
-2022-12-21,09:36:52 | INFO | batch_size: 256
-2022-12-21,09:36:52 | INFO | beta1: 0.9
-2022-12-21,09:36:52 | INFO | beta2: 0.98
-2022-12-21,09:36:52 | INFO | checkpoint_path: logs/cvpr_stability/ViT-B-16_2B_13b/checkpoints/eval/epoch_1.pt/checkpoints
-2022-12-21,09:36:52 | INFO | copy_codebase: False
-2022-12-21,09:36:52 | INFO | csv_caption_key: title
-2022-12-21,09:36:52 | INFO | csv_img_key: filepath
-2022-12-21,09:36:52 | INFO | csv_separator: 
-2022-12-21,09:36:52 | INFO | dataset_resampled: False
-2022-12-21,09:36:52 | INFO | dataset_type: auto
-2022-12-21,09:36:52 | INFO | ddp_static_graph: False
-2022-12-21,09:36:52 | INFO | debug: False
-2022-12-21,09:36:52 | INFO | device: cuda:0
-2022-12-21,09:36:52 | INFO | dist_backend: nccl
-2022-12-21,09:36:52 | INFO | dist_url: env://
-2022-12-21,09:36:52 | INFO | distributed: False
-2022-12-21,09:36:52 | INFO | epochs: 32
-2022-12-21,09:36:52 | INFO | eps: 1e-06
-2022-12-21,09:36:52 | INFO | force_quick_gelu: False
-2022-12-21,09:36:52 | INFO | gather_with_grad: False
-2022-12-21,09:36:52 | INFO | grad_checkpointing: False
-2022-12-21,09:36:52 | INFO | horovod: False
-2022-12-21,09:36:52 | INFO | image_mean: None
-2022-12-21,09:36:52 | INFO | image_std: None
-2022-12-21,09:36:52 | INFO | imagenet_v2: None
-2022-12-21,09:36:52 | INFO | imagenet_val: /p/fastdata/mmlaion/imagenet_val
-2022-12-21,09:36:52 | INFO | local_loss: False
-2022-12-21,09:36:52 | INFO | local_rank: 0
-2022-12-21,09:36:52 | INFO | lock_image: False
-2022-12-21,09:36:52 | INFO | lock_image_freeze_bn_stats: False
-2022-12-21,09:36:52 | INFO | lock_image_unlocked_groups: 0
-2022-12-21,09:36:52 | INFO | log_level: 20
-2022-12-21,09:36:52 | INFO | log_local: False
-2022-12-21,09:36:52 | INFO | log_path: logs/cvpr_stability/ViT-B-16_2B_13b/checkpoints/eval/epoch_1.pt/out.log
-2022-12-21,09:36:52 | INFO | logs: logs/cvpr_stability/ViT-B-16_2B_13b/checkpoints/eval
-2022-12-21,09:36:52 | INFO | lr: 0.0005
-2022-12-21,09:36:52 | INFO | model: ViT-B-16
-2022-12-21,09:36:52 | INFO | name: epoch_1.pt
-2022-12-21,09:36:52 | INFO | no_set_device_rank: False
-2022-12-21,09:36:52 | INFO | norm_gradient_clip: None
-2022-12-21,09:36:52 | INFO | precision: amp_bfloat16
-2022-12-21,09:36:52 | INFO | pretrained: logs/cvpr_stability/ViT-B-16_2B_13b/checkpoints/epoch_1.pt
-2022-12-21,09:36:52 | INFO | pretrained_image: False
-2022-12-21,09:36:52 | INFO | rank: 0
-2022-12-21,09:36:52 | INFO | report_to: 
-2022-12-21,09:36:52 | INFO | resume: None
-2022-12-21,09:36:52 | INFO | save_frequency: 1
-2022-12-21,09:36:52 | INFO | save_most_recent: False
-2022-12-21,09:36:52 | INFO | seed: 0
-2022-12-21,09:36:52 | INFO | skip_scheduler: False
-2022-12-21,09:36:52 | INFO | tensorboard: False
-2022-12-21,09:36:52 | INFO | tensorboard_path: 
-2022-12-21,09:36:52 | INFO | torchscript: False
-2022-12-21,09:36:52 | INFO | trace: False
-2022-12-21,09:36:52 | INFO | train_data: None
-2022-12-21,09:36:52 | INFO | train_num_samples: None
-2022-12-21,09:36:52 | INFO | use_bn_sync: False
-2022-12-21,09:36:52 | INFO | val_data: None
-2022-12-21,09:36:52 | INFO | val_frequency: 1
-2022-12-21,09:36:52 | INFO | val_num_samples: None
-2022-12-21,09:36:52 | INFO | wandb: False
-2022-12-21,09:36:52 | INFO | wandb_notes: 
-2022-12-21,09:36:52 | INFO | warmup: 10000
-2022-12-21,09:36:52 | INFO | wd: 0.2
-2022-12-21,09:36:52 | INFO | workers: 1
-2022-12-21,09:36:52 | INFO | world_size: 1
-2022-12-21,09:36:52 | INFO | zeroshot_frequency: 2
-2022-12-21,09:36:53 | INFO | Starting zero-shot imagenet.
-2022-12-21,09:36:53 | INFO | Building zero-shot classifier
- 0%| | 0/1000 [00:00