Upload folder using huggingface_hub
Browse files- parity_n_40_k_3_N_1000_100/fcn_relu_100_0.00_default/sgd_bs_32_lr_0.10/wd_0.01_do_0_no_bn_syn_0_no_ln_red_0/seed_0/checkpoints/checkpoints_1400.zip +3 -0
- parity_n_40_k_3_N_1000_100/fcn_relu_100_0.00_default/sgd_bs_32_lr_0.10/wd_0.01_do_0_no_bn_syn_0_no_ln_red_0/seed_0/config.yaml +26 -34
- parity_n_40_k_3_N_1000_100/fcn_relu_100_0.00_default/sgd_bs_32_lr_0.10/wd_0.01_do_0_no_bn_syn_0_no_ln_red_0/seed_0/epoch_results_1400_seed_0.pkl +3 -0
- parity_n_40_k_3_N_1000_100/fcn_relu_100_0.00_default/sgd_bs_32_lr_0.10/wd_0.01_do_0_no_bn_syn_0_no_ln_red_0/seed_0/optimal_results_seed_1400_0.pkl +3 -0
- parity_n_40_k_3_N_1000_100/fcn_relu_100_0.00_default/sgd_bs_32_lr_0.10/wd_0.01_do_0_no_bn_syn_0_no_ln_red_0/seed_1/config.yaml +26 -34
- parity_n_40_k_3_N_1000_100/fcn_relu_100_0.00_default/sgd_bs_32_lr_0.10/wd_0.01_do_0_no_bn_syn_0_no_ln_red_0/seed_2/config.yaml +26 -34
- parity_n_40_k_3_N_1000_100/fcn_relu_100_0.00_default/sgd_bs_32_lr_0.10/wd_0.01_do_0_no_bn_syn_0_no_ln_red_0/seed_3/config.yaml +26 -34
- parity_n_40_k_3_N_1000_100/fcn_relu_100_0.00_default/sgd_bs_32_lr_0.10/wd_0.01_do_0_no_bn_syn_0_no_ln_red_0/seed_97/config.yaml +26 -34
parity_n_40_k_3_N_1000_100/fcn_relu_100_0.00_default/sgd_bs_32_lr_0.10/wd_0.01_do_0_no_bn_syn_0_no_ln_red_0/seed_0/checkpoints/checkpoints_1400.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e174d3901f5508911e129b07501fb42e472ab95801d3bec9b3ccda0f5e0d2c26
|
| 3 |
+
size 23670534
|
parity_n_40_k_3_N_1000_100/fcn_relu_100_0.00_default/sgd_bs_32_lr_0.10/wd_0.01_do_0_no_bn_syn_0_no_ln_red_0/seed_0/config.yaml
CHANGED
|
@@ -6,7 +6,7 @@ huggingface:
|
|
| 6 |
synergymask: false
|
| 7 |
sparsity_sampling: 10
|
| 8 |
scheduler: null
|
| 9 |
-
device:
|
| 10 |
test_seed: 123
|
| 11 |
evaluate_oinformation: false
|
| 12 |
evaluate_sparsity: false
|
|
@@ -25,11 +25,11 @@ oinformation:
|
|
| 25 |
layer: fc2_post
|
| 26 |
njobs: 16
|
| 27 |
layers:
|
| 28 |
-
-
|
| 29 |
train: true
|
| 30 |
features_type:
|
| 31 |
- train
|
| 32 |
-
max_batch_exhaustive:
|
| 33 |
loss:
|
| 34 |
_target_: model.neural_network.MyHingeLoss
|
| 35 |
train:
|
|
@@ -37,7 +37,7 @@ train:
|
|
| 37 |
train_batch_size: 32
|
| 38 |
num_workers: 6
|
| 39 |
eval_batch_size: 32
|
| 40 |
-
max_epochs:
|
| 41 |
max_steps: 1000000
|
| 42 |
regularization:
|
| 43 |
weight_decay:
|
|
@@ -98,10 +98,11 @@ paths:
|
|
| 98 |
data_dir: ${paths.root_dir}/data/
|
| 99 |
log_dir: ${paths.root_dir}/runs/
|
| 100 |
output_dir: /kyukon/scratch/gent/433/vsc43397/oinformation-grokking/runs/parity_n_40_k_3_N_1000_100/fcn_relu_100_0.00_default/sgd_bs_32_lr_0.10/wd_0.01_do_0_no_bn_syn_0_no_ln_red_0
|
| 101 |
-
plot_dir: ./plots/
|
| 102 |
-
plot_dir_all: ./plots/
|
|
|
|
| 103 |
work_dir: ${hydra:runtime.cwd}
|
| 104 |
-
experiment_name:
|
| 105 |
seeds:
|
| 106 |
- 0
|
| 107 |
- 1
|
|
@@ -110,45 +111,36 @@ seeds:
|
|
| 110 |
- 97
|
| 111 |
evaluate_generalizing_models_only: false
|
| 112 |
remove_files: true
|
| 113 |
-
train_seeds_parallel:
|
| 114 |
-
plots:
|
| 115 |
-
experiment1:
|
| 116 |
-
title:
|
| 117 |
-
- train.max_epochs
|
| 118 |
-
type: heatmap
|
| 119 |
-
x: dataset.train_samples
|
| 120 |
-
x_label: dataset size
|
| 121 |
-
y_label: learning rate
|
| 122 |
-
'y':
|
| 123 |
-
- test_acc
|
| 124 |
-
- synergy
|
| 125 |
-
- redundancy
|
| 126 |
grid_search:
|
| 127 |
dataset.train_samples:
|
| 128 |
array:
|
|
|
|
|
|
|
| 129 |
- 800
|
|
|
|
| 130 |
- 1000
|
| 131 |
-
-
|
| 132 |
-
|
| 133 |
-
- 2000
|
| 134 |
-
- 3000
|
| 135 |
train.lr:
|
| 136 |
array:
|
|
|
|
| 137 |
- 0.1
|
|
|
|
| 138 |
train.train_batch_size:
|
| 139 |
array:
|
| 140 |
- 32
|
|
|
|
|
|
|
|
|
|
| 141 |
dataset.parameters.n:
|
| 142 |
array:
|
| 143 |
-
- 30
|
| 144 |
- 40
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
-
|
| 151 |
-
-
|
| 152 |
-
- 20
|
| 153 |
-
- 100
|
| 154 |
experiment_oinfo_title: null
|
|
|
|
| 6 |
synergymask: false
|
| 7 |
sparsity_sampling: 10
|
| 8 |
scheduler: null
|
| 9 |
+
device: cuda
|
| 10 |
test_seed: 123
|
| 11 |
evaluate_oinformation: false
|
| 12 |
evaluate_sparsity: false
|
|
|
|
| 25 |
layer: fc2_post
|
| 26 |
njobs: 16
|
| 27 |
layers:
|
| 28 |
+
- fc3_post
|
| 29 |
train: true
|
| 30 |
features_type:
|
| 31 |
- train
|
| 32 |
+
max_batch_exhaustive: 10
|
| 33 |
loss:
|
| 34 |
_target_: model.neural_network.MyHingeLoss
|
| 35 |
train:
|
|
|
|
| 37 |
train_batch_size: 32
|
| 38 |
num_workers: 6
|
| 39 |
eval_batch_size: 32
|
| 40 |
+
max_epochs: 1400
|
| 41 |
max_steps: 1000000
|
| 42 |
regularization:
|
| 43 |
weight_decay:
|
|
|
|
| 98 |
data_dir: ${paths.root_dir}/data/
|
| 99 |
log_dir: ${paths.root_dir}/runs/
|
| 100 |
output_dir: /kyukon/scratch/gent/433/vsc43397/oinformation-grokking/runs/parity_n_40_k_3_N_1000_100/fcn_relu_100_0.00_default/sgd_bs_32_lr_0.10/wd_0.01_do_0_no_bn_syn_0_no_ln_red_0
|
| 101 |
+
plot_dir: ./plots/gridsearch_100
|
| 102 |
+
plot_dir_all: ./plots/gridsearch_100/all/fcn_relu/sgd_bs_32_lr_0.10/wd_0.01_do_0_no_bn_syn_0_no_ln_red_0
|
| 103 |
+
run_dir: /kyukon/scratch/gent/433/vsc43397/oinformation-grokking/runs
|
| 104 |
work_dir: ${hydra:runtime.cwd}
|
| 105 |
+
experiment_name: gridsearch_100
|
| 106 |
seeds:
|
| 107 |
- 0
|
| 108 |
- 1
|
|
|
|
| 111 |
- 97
|
| 112 |
evaluate_generalizing_models_only: false
|
| 113 |
remove_files: true
|
| 114 |
+
train_seeds_parallel: false
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
grid_search:
|
| 116 |
dataset.train_samples:
|
| 117 |
array:
|
| 118 |
+
- 500
|
| 119 |
+
- 600
|
| 120 |
- 800
|
| 121 |
+
- 900
|
| 122 |
- 1000
|
| 123 |
+
- 1100
|
| 124 |
+
name: train_samples
|
|
|
|
|
|
|
| 125 |
train.lr:
|
| 126 |
array:
|
| 127 |
+
- 0.05
|
| 128 |
- 0.1
|
| 129 |
+
name: lr
|
| 130 |
train.train_batch_size:
|
| 131 |
array:
|
| 132 |
- 32
|
| 133 |
+
- 64
|
| 134 |
+
- 128
|
| 135 |
+
name: train_bs
|
| 136 |
dataset.parameters.n:
|
| 137 |
array:
|
|
|
|
| 138 |
- 40
|
| 139 |
+
- 50
|
| 140 |
+
name: 'n'
|
| 141 |
+
plots:
|
| 142 |
+
training_heatmaps:
|
| 143 |
+
plot_1:
|
| 144 |
+
- dataset.train_samples
|
| 145 |
+
- train.train_batch_size
|
|
|
|
|
|
|
| 146 |
experiment_oinfo_title: null
|
parity_n_40_k_3_N_1000_100/fcn_relu_100_0.00_default/sgd_bs_32_lr_0.10/wd_0.01_do_0_no_bn_syn_0_no_ln_red_0/seed_0/epoch_results_1400_seed_0.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1faf94efe8e1ac182cdd6e61fed6088c9e5b501815e27299bc6be37e037c2dee
|
| 3 |
+
size 101686
|
parity_n_40_k_3_N_1000_100/fcn_relu_100_0.00_default/sgd_bs_32_lr_0.10/wd_0.01_do_0_no_bn_syn_0_no_ln_red_0/seed_0/optimal_results_seed_1400_0.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8498fd021a0ed5b5716b682b2575834da0cd3bb2366ff29bf3baceb24fb171c1
|
| 3 |
+
size 98
|
parity_n_40_k_3_N_1000_100/fcn_relu_100_0.00_default/sgd_bs_32_lr_0.10/wd_0.01_do_0_no_bn_syn_0_no_ln_red_0/seed_1/config.yaml
CHANGED
|
@@ -6,7 +6,7 @@ huggingface:
|
|
| 6 |
synergymask: false
|
| 7 |
sparsity_sampling: 10
|
| 8 |
scheduler: null
|
| 9 |
-
device:
|
| 10 |
test_seed: 123
|
| 11 |
evaluate_oinformation: false
|
| 12 |
evaluate_sparsity: false
|
|
@@ -25,11 +25,11 @@ oinformation:
|
|
| 25 |
layer: fc2_post
|
| 26 |
njobs: 16
|
| 27 |
layers:
|
| 28 |
-
-
|
| 29 |
train: true
|
| 30 |
features_type:
|
| 31 |
- train
|
| 32 |
-
max_batch_exhaustive:
|
| 33 |
loss:
|
| 34 |
_target_: model.neural_network.MyHingeLoss
|
| 35 |
train:
|
|
@@ -37,7 +37,7 @@ train:
|
|
| 37 |
train_batch_size: 32
|
| 38 |
num_workers: 6
|
| 39 |
eval_batch_size: 32
|
| 40 |
-
max_epochs:
|
| 41 |
max_steps: 1000000
|
| 42 |
regularization:
|
| 43 |
weight_decay:
|
|
@@ -98,10 +98,11 @@ paths:
|
|
| 98 |
data_dir: ${paths.root_dir}/data/
|
| 99 |
log_dir: ${paths.root_dir}/runs/
|
| 100 |
output_dir: /kyukon/scratch/gent/433/vsc43397/oinformation-grokking/runs/parity_n_40_k_3_N_1000_100/fcn_relu_100_0.00_default/sgd_bs_32_lr_0.10/wd_0.01_do_0_no_bn_syn_0_no_ln_red_0
|
| 101 |
-
plot_dir: ./plots/
|
| 102 |
-
plot_dir_all: ./plots/
|
|
|
|
| 103 |
work_dir: ${hydra:runtime.cwd}
|
| 104 |
-
experiment_name:
|
| 105 |
seeds:
|
| 106 |
- 0
|
| 107 |
- 1
|
|
@@ -110,45 +111,36 @@ seeds:
|
|
| 110 |
- 97
|
| 111 |
evaluate_generalizing_models_only: false
|
| 112 |
remove_files: true
|
| 113 |
-
train_seeds_parallel:
|
| 114 |
-
plots:
|
| 115 |
-
experiment1:
|
| 116 |
-
title:
|
| 117 |
-
- train.max_epochs
|
| 118 |
-
type: heatmap
|
| 119 |
-
x: dataset.train_samples
|
| 120 |
-
x_label: dataset size
|
| 121 |
-
y_label: learning rate
|
| 122 |
-
'y':
|
| 123 |
-
- test_acc
|
| 124 |
-
- synergy
|
| 125 |
-
- redundancy
|
| 126 |
grid_search:
|
| 127 |
dataset.train_samples:
|
| 128 |
array:
|
|
|
|
|
|
|
| 129 |
- 800
|
|
|
|
| 130 |
- 1000
|
| 131 |
-
-
|
| 132 |
-
|
| 133 |
-
- 2000
|
| 134 |
-
- 3000
|
| 135 |
train.lr:
|
| 136 |
array:
|
|
|
|
| 137 |
- 0.1
|
|
|
|
| 138 |
train.train_batch_size:
|
| 139 |
array:
|
| 140 |
- 32
|
|
|
|
|
|
|
|
|
|
| 141 |
dataset.parameters.n:
|
| 142 |
array:
|
| 143 |
-
- 30
|
| 144 |
- 40
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
-
|
| 151 |
-
-
|
| 152 |
-
- 20
|
| 153 |
-
- 100
|
| 154 |
experiment_oinfo_title: null
|
|
|
|
| 6 |
synergymask: false
|
| 7 |
sparsity_sampling: 10
|
| 8 |
scheduler: null
|
| 9 |
+
device: cuda
|
| 10 |
test_seed: 123
|
| 11 |
evaluate_oinformation: false
|
| 12 |
evaluate_sparsity: false
|
|
|
|
| 25 |
layer: fc2_post
|
| 26 |
njobs: 16
|
| 27 |
layers:
|
| 28 |
+
- fc3_post
|
| 29 |
train: true
|
| 30 |
features_type:
|
| 31 |
- train
|
| 32 |
+
max_batch_exhaustive: 10
|
| 33 |
loss:
|
| 34 |
_target_: model.neural_network.MyHingeLoss
|
| 35 |
train:
|
|
|
|
| 37 |
train_batch_size: 32
|
| 38 |
num_workers: 6
|
| 39 |
eval_batch_size: 32
|
| 40 |
+
max_epochs: 1400
|
| 41 |
max_steps: 1000000
|
| 42 |
regularization:
|
| 43 |
weight_decay:
|
|
|
|
| 98 |
data_dir: ${paths.root_dir}/data/
|
| 99 |
log_dir: ${paths.root_dir}/runs/
|
| 100 |
output_dir: /kyukon/scratch/gent/433/vsc43397/oinformation-grokking/runs/parity_n_40_k_3_N_1000_100/fcn_relu_100_0.00_default/sgd_bs_32_lr_0.10/wd_0.01_do_0_no_bn_syn_0_no_ln_red_0
|
| 101 |
+
plot_dir: ./plots/gridsearch_100
|
| 102 |
+
plot_dir_all: ./plots/gridsearch_100/all/fcn_relu/sgd_bs_32_lr_0.10/wd_0.01_do_0_no_bn_syn_0_no_ln_red_0
|
| 103 |
+
run_dir: /kyukon/scratch/gent/433/vsc43397/oinformation-grokking/runs
|
| 104 |
work_dir: ${hydra:runtime.cwd}
|
| 105 |
+
experiment_name: gridsearch_100
|
| 106 |
seeds:
|
| 107 |
- 0
|
| 108 |
- 1
|
|
|
|
| 111 |
- 97
|
| 112 |
evaluate_generalizing_models_only: false
|
| 113 |
remove_files: true
|
| 114 |
+
train_seeds_parallel: false
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
grid_search:
|
| 116 |
dataset.train_samples:
|
| 117 |
array:
|
| 118 |
+
- 500
|
| 119 |
+
- 600
|
| 120 |
- 800
|
| 121 |
+
- 900
|
| 122 |
- 1000
|
| 123 |
+
- 1100
|
| 124 |
+
name: train_samples
|
|
|
|
|
|
|
| 125 |
train.lr:
|
| 126 |
array:
|
| 127 |
+
- 0.05
|
| 128 |
- 0.1
|
| 129 |
+
name: lr
|
| 130 |
train.train_batch_size:
|
| 131 |
array:
|
| 132 |
- 32
|
| 133 |
+
- 64
|
| 134 |
+
- 128
|
| 135 |
+
name: train_bs
|
| 136 |
dataset.parameters.n:
|
| 137 |
array:
|
|
|
|
| 138 |
- 40
|
| 139 |
+
- 50
|
| 140 |
+
name: 'n'
|
| 141 |
+
plots:
|
| 142 |
+
training_heatmaps:
|
| 143 |
+
plot_1:
|
| 144 |
+
- dataset.train_samples
|
| 145 |
+
- train.train_batch_size
|
|
|
|
|
|
|
| 146 |
experiment_oinfo_title: null
|
parity_n_40_k_3_N_1000_100/fcn_relu_100_0.00_default/sgd_bs_32_lr_0.10/wd_0.01_do_0_no_bn_syn_0_no_ln_red_0/seed_2/config.yaml
CHANGED
|
@@ -6,7 +6,7 @@ huggingface:
|
|
| 6 |
synergymask: false
|
| 7 |
sparsity_sampling: 10
|
| 8 |
scheduler: null
|
| 9 |
-
device:
|
| 10 |
test_seed: 123
|
| 11 |
evaluate_oinformation: false
|
| 12 |
evaluate_sparsity: false
|
|
@@ -25,11 +25,11 @@ oinformation:
|
|
| 25 |
layer: fc2_post
|
| 26 |
njobs: 16
|
| 27 |
layers:
|
| 28 |
-
-
|
| 29 |
train: true
|
| 30 |
features_type:
|
| 31 |
- train
|
| 32 |
-
max_batch_exhaustive:
|
| 33 |
loss:
|
| 34 |
_target_: model.neural_network.MyHingeLoss
|
| 35 |
train:
|
|
@@ -37,7 +37,7 @@ train:
|
|
| 37 |
train_batch_size: 32
|
| 38 |
num_workers: 6
|
| 39 |
eval_batch_size: 32
|
| 40 |
-
max_epochs:
|
| 41 |
max_steps: 1000000
|
| 42 |
regularization:
|
| 43 |
weight_decay:
|
|
@@ -98,10 +98,11 @@ paths:
|
|
| 98 |
data_dir: ${paths.root_dir}/data/
|
| 99 |
log_dir: ${paths.root_dir}/runs/
|
| 100 |
output_dir: /kyukon/scratch/gent/433/vsc43397/oinformation-grokking/runs/parity_n_40_k_3_N_1000_100/fcn_relu_100_0.00_default/sgd_bs_32_lr_0.10/wd_0.01_do_0_no_bn_syn_0_no_ln_red_0
|
| 101 |
-
plot_dir: ./plots/
|
| 102 |
-
plot_dir_all: ./plots/
|
|
|
|
| 103 |
work_dir: ${hydra:runtime.cwd}
|
| 104 |
-
experiment_name:
|
| 105 |
seeds:
|
| 106 |
- 0
|
| 107 |
- 1
|
|
@@ -110,45 +111,36 @@ seeds:
|
|
| 110 |
- 97
|
| 111 |
evaluate_generalizing_models_only: false
|
| 112 |
remove_files: true
|
| 113 |
-
train_seeds_parallel:
|
| 114 |
-
plots:
|
| 115 |
-
experiment1:
|
| 116 |
-
title:
|
| 117 |
-
- train.max_epochs
|
| 118 |
-
type: heatmap
|
| 119 |
-
x: dataset.train_samples
|
| 120 |
-
x_label: dataset size
|
| 121 |
-
y_label: learning rate
|
| 122 |
-
'y':
|
| 123 |
-
- test_acc
|
| 124 |
-
- synergy
|
| 125 |
-
- redundancy
|
| 126 |
grid_search:
|
| 127 |
dataset.train_samples:
|
| 128 |
array:
|
|
|
|
|
|
|
| 129 |
- 800
|
|
|
|
| 130 |
- 1000
|
| 131 |
-
-
|
| 132 |
-
|
| 133 |
-
- 2000
|
| 134 |
-
- 3000
|
| 135 |
train.lr:
|
| 136 |
array:
|
|
|
|
| 137 |
- 0.1
|
|
|
|
| 138 |
train.train_batch_size:
|
| 139 |
array:
|
| 140 |
- 32
|
|
|
|
|
|
|
|
|
|
| 141 |
dataset.parameters.n:
|
| 142 |
array:
|
| 143 |
-
- 30
|
| 144 |
- 40
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
-
|
| 151 |
-
-
|
| 152 |
-
- 20
|
| 153 |
-
- 100
|
| 154 |
experiment_oinfo_title: null
|
|
|
|
| 6 |
synergymask: false
|
| 7 |
sparsity_sampling: 10
|
| 8 |
scheduler: null
|
| 9 |
+
device: cuda
|
| 10 |
test_seed: 123
|
| 11 |
evaluate_oinformation: false
|
| 12 |
evaluate_sparsity: false
|
|
|
|
| 25 |
layer: fc2_post
|
| 26 |
njobs: 16
|
| 27 |
layers:
|
| 28 |
+
- fc3_post
|
| 29 |
train: true
|
| 30 |
features_type:
|
| 31 |
- train
|
| 32 |
+
max_batch_exhaustive: 10
|
| 33 |
loss:
|
| 34 |
_target_: model.neural_network.MyHingeLoss
|
| 35 |
train:
|
|
|
|
| 37 |
train_batch_size: 32
|
| 38 |
num_workers: 6
|
| 39 |
eval_batch_size: 32
|
| 40 |
+
max_epochs: 1400
|
| 41 |
max_steps: 1000000
|
| 42 |
regularization:
|
| 43 |
weight_decay:
|
|
|
|
| 98 |
data_dir: ${paths.root_dir}/data/
|
| 99 |
log_dir: ${paths.root_dir}/runs/
|
| 100 |
output_dir: /kyukon/scratch/gent/433/vsc43397/oinformation-grokking/runs/parity_n_40_k_3_N_1000_100/fcn_relu_100_0.00_default/sgd_bs_32_lr_0.10/wd_0.01_do_0_no_bn_syn_0_no_ln_red_0
|
| 101 |
+
plot_dir: ./plots/gridsearch_100
|
| 102 |
+
plot_dir_all: ./plots/gridsearch_100/all/fcn_relu/sgd_bs_32_lr_0.10/wd_0.01_do_0_no_bn_syn_0_no_ln_red_0
|
| 103 |
+
run_dir: /kyukon/scratch/gent/433/vsc43397/oinformation-grokking/runs
|
| 104 |
work_dir: ${hydra:runtime.cwd}
|
| 105 |
+
experiment_name: gridsearch_100
|
| 106 |
seeds:
|
| 107 |
- 0
|
| 108 |
- 1
|
|
|
|
| 111 |
- 97
|
| 112 |
evaluate_generalizing_models_only: false
|
| 113 |
remove_files: true
|
| 114 |
+
train_seeds_parallel: false
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
grid_search:
|
| 116 |
dataset.train_samples:
|
| 117 |
array:
|
| 118 |
+
- 500
|
| 119 |
+
- 600
|
| 120 |
- 800
|
| 121 |
+
- 900
|
| 122 |
- 1000
|
| 123 |
+
- 1100
|
| 124 |
+
name: train_samples
|
|
|
|
|
|
|
| 125 |
train.lr:
|
| 126 |
array:
|
| 127 |
+
- 0.05
|
| 128 |
- 0.1
|
| 129 |
+
name: lr
|
| 130 |
train.train_batch_size:
|
| 131 |
array:
|
| 132 |
- 32
|
| 133 |
+
- 64
|
| 134 |
+
- 128
|
| 135 |
+
name: train_bs
|
| 136 |
dataset.parameters.n:
|
| 137 |
array:
|
|
|
|
| 138 |
- 40
|
| 139 |
+
- 50
|
| 140 |
+
name: 'n'
|
| 141 |
+
plots:
|
| 142 |
+
training_heatmaps:
|
| 143 |
+
plot_1:
|
| 144 |
+
- dataset.train_samples
|
| 145 |
+
- train.train_batch_size
|
|
|
|
|
|
|
| 146 |
experiment_oinfo_title: null
|
parity_n_40_k_3_N_1000_100/fcn_relu_100_0.00_default/sgd_bs_32_lr_0.10/wd_0.01_do_0_no_bn_syn_0_no_ln_red_0/seed_3/config.yaml
CHANGED
|
@@ -6,7 +6,7 @@ huggingface:
|
|
| 6 |
synergymask: false
|
| 7 |
sparsity_sampling: 10
|
| 8 |
scheduler: null
|
| 9 |
-
device:
|
| 10 |
test_seed: 123
|
| 11 |
evaluate_oinformation: false
|
| 12 |
evaluate_sparsity: false
|
|
@@ -25,11 +25,11 @@ oinformation:
|
|
| 25 |
layer: fc2_post
|
| 26 |
njobs: 16
|
| 27 |
layers:
|
| 28 |
-
-
|
| 29 |
train: true
|
| 30 |
features_type:
|
| 31 |
- train
|
| 32 |
-
max_batch_exhaustive:
|
| 33 |
loss:
|
| 34 |
_target_: model.neural_network.MyHingeLoss
|
| 35 |
train:
|
|
@@ -37,7 +37,7 @@ train:
|
|
| 37 |
train_batch_size: 32
|
| 38 |
num_workers: 6
|
| 39 |
eval_batch_size: 32
|
| 40 |
-
max_epochs:
|
| 41 |
max_steps: 1000000
|
| 42 |
regularization:
|
| 43 |
weight_decay:
|
|
@@ -98,10 +98,11 @@ paths:
|
|
| 98 |
data_dir: ${paths.root_dir}/data/
|
| 99 |
log_dir: ${paths.root_dir}/runs/
|
| 100 |
output_dir: /kyukon/scratch/gent/433/vsc43397/oinformation-grokking/runs/parity_n_40_k_3_N_1000_100/fcn_relu_100_0.00_default/sgd_bs_32_lr_0.10/wd_0.01_do_0_no_bn_syn_0_no_ln_red_0
|
| 101 |
-
plot_dir: ./plots/
|
| 102 |
-
plot_dir_all: ./plots/
|
|
|
|
| 103 |
work_dir: ${hydra:runtime.cwd}
|
| 104 |
-
experiment_name:
|
| 105 |
seeds:
|
| 106 |
- 0
|
| 107 |
- 1
|
|
@@ -110,45 +111,36 @@ seeds:
|
|
| 110 |
- 97
|
| 111 |
evaluate_generalizing_models_only: false
|
| 112 |
remove_files: true
|
| 113 |
-
train_seeds_parallel:
|
| 114 |
-
plots:
|
| 115 |
-
experiment1:
|
| 116 |
-
title:
|
| 117 |
-
- train.max_epochs
|
| 118 |
-
type: heatmap
|
| 119 |
-
x: dataset.train_samples
|
| 120 |
-
x_label: dataset size
|
| 121 |
-
y_label: learning rate
|
| 122 |
-
'y':
|
| 123 |
-
- test_acc
|
| 124 |
-
- synergy
|
| 125 |
-
- redundancy
|
| 126 |
grid_search:
|
| 127 |
dataset.train_samples:
|
| 128 |
array:
|
|
|
|
|
|
|
| 129 |
- 800
|
|
|
|
| 130 |
- 1000
|
| 131 |
-
-
|
| 132 |
-
|
| 133 |
-
- 2000
|
| 134 |
-
- 3000
|
| 135 |
train.lr:
|
| 136 |
array:
|
|
|
|
| 137 |
- 0.1
|
|
|
|
| 138 |
train.train_batch_size:
|
| 139 |
array:
|
| 140 |
- 32
|
|
|
|
|
|
|
|
|
|
| 141 |
dataset.parameters.n:
|
| 142 |
array:
|
| 143 |
-
- 30
|
| 144 |
- 40
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
-
|
| 151 |
-
-
|
| 152 |
-
- 20
|
| 153 |
-
- 100
|
| 154 |
experiment_oinfo_title: null
|
|
|
|
| 6 |
synergymask: false
|
| 7 |
sparsity_sampling: 10
|
| 8 |
scheduler: null
|
| 9 |
+
device: cuda
|
| 10 |
test_seed: 123
|
| 11 |
evaluate_oinformation: false
|
| 12 |
evaluate_sparsity: false
|
|
|
|
| 25 |
layer: fc2_post
|
| 26 |
njobs: 16
|
| 27 |
layers:
|
| 28 |
+
- fc3_post
|
| 29 |
train: true
|
| 30 |
features_type:
|
| 31 |
- train
|
| 32 |
+
max_batch_exhaustive: 10
|
| 33 |
loss:
|
| 34 |
_target_: model.neural_network.MyHingeLoss
|
| 35 |
train:
|
|
|
|
| 37 |
train_batch_size: 32
|
| 38 |
num_workers: 6
|
| 39 |
eval_batch_size: 32
|
| 40 |
+
max_epochs: 1400
|
| 41 |
max_steps: 1000000
|
| 42 |
regularization:
|
| 43 |
weight_decay:
|
|
|
|
| 98 |
data_dir: ${paths.root_dir}/data/
|
| 99 |
log_dir: ${paths.root_dir}/runs/
|
| 100 |
output_dir: /kyukon/scratch/gent/433/vsc43397/oinformation-grokking/runs/parity_n_40_k_3_N_1000_100/fcn_relu_100_0.00_default/sgd_bs_32_lr_0.10/wd_0.01_do_0_no_bn_syn_0_no_ln_red_0
|
| 101 |
+
plot_dir: ./plots/gridsearch_100
|
| 102 |
+
plot_dir_all: ./plots/gridsearch_100/all/fcn_relu/sgd_bs_32_lr_0.10/wd_0.01_do_0_no_bn_syn_0_no_ln_red_0
|
| 103 |
+
run_dir: /kyukon/scratch/gent/433/vsc43397/oinformation-grokking/runs
|
| 104 |
work_dir: ${hydra:runtime.cwd}
|
| 105 |
+
experiment_name: gridsearch_100
|
| 106 |
seeds:
|
| 107 |
- 0
|
| 108 |
- 1
|
|
|
|
| 111 |
- 97
|
| 112 |
evaluate_generalizing_models_only: false
|
| 113 |
remove_files: true
|
| 114 |
+
train_seeds_parallel: false
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
grid_search:
|
| 116 |
dataset.train_samples:
|
| 117 |
array:
|
| 118 |
+
- 500
|
| 119 |
+
- 600
|
| 120 |
- 800
|
| 121 |
+
- 900
|
| 122 |
- 1000
|
| 123 |
+
- 1100
|
| 124 |
+
name: train_samples
|
|
|
|
|
|
|
| 125 |
train.lr:
|
| 126 |
array:
|
| 127 |
+
- 0.05
|
| 128 |
- 0.1
|
| 129 |
+
name: lr
|
| 130 |
train.train_batch_size:
|
| 131 |
array:
|
| 132 |
- 32
|
| 133 |
+
- 64
|
| 134 |
+
- 128
|
| 135 |
+
name: train_bs
|
| 136 |
dataset.parameters.n:
|
| 137 |
array:
|
|
|
|
| 138 |
- 40
|
| 139 |
+
- 50
|
| 140 |
+
name: 'n'
|
| 141 |
+
plots:
|
| 142 |
+
training_heatmaps:
|
| 143 |
+
plot_1:
|
| 144 |
+
- dataset.train_samples
|
| 145 |
+
- train.train_batch_size
|
|
|
|
|
|
|
| 146 |
experiment_oinfo_title: null
|
parity_n_40_k_3_N_1000_100/fcn_relu_100_0.00_default/sgd_bs_32_lr_0.10/wd_0.01_do_0_no_bn_syn_0_no_ln_red_0/seed_97/config.yaml
CHANGED
|
@@ -6,7 +6,7 @@ huggingface:
|
|
| 6 |
synergymask: false
|
| 7 |
sparsity_sampling: 10
|
| 8 |
scheduler: null
|
| 9 |
-
device:
|
| 10 |
test_seed: 123
|
| 11 |
evaluate_oinformation: false
|
| 12 |
evaluate_sparsity: false
|
|
@@ -25,11 +25,11 @@ oinformation:
|
|
| 25 |
layer: fc2_post
|
| 26 |
njobs: 16
|
| 27 |
layers:
|
| 28 |
-
-
|
| 29 |
train: true
|
| 30 |
features_type:
|
| 31 |
- train
|
| 32 |
-
max_batch_exhaustive:
|
| 33 |
loss:
|
| 34 |
_target_: model.neural_network.MyHingeLoss
|
| 35 |
train:
|
|
@@ -37,7 +37,7 @@ train:
|
|
| 37 |
train_batch_size: 32
|
| 38 |
num_workers: 6
|
| 39 |
eval_batch_size: 32
|
| 40 |
-
max_epochs:
|
| 41 |
max_steps: 1000000
|
| 42 |
regularization:
|
| 43 |
weight_decay:
|
|
@@ -98,10 +98,11 @@ paths:
|
|
| 98 |
data_dir: ${paths.root_dir}/data/
|
| 99 |
log_dir: ${paths.root_dir}/runs/
|
| 100 |
output_dir: /kyukon/scratch/gent/433/vsc43397/oinformation-grokking/runs/parity_n_40_k_3_N_1000_100/fcn_relu_100_0.00_default/sgd_bs_32_lr_0.10/wd_0.01_do_0_no_bn_syn_0_no_ln_red_0
|
| 101 |
-
plot_dir: ./plots/
|
| 102 |
-
plot_dir_all: ./plots/
|
|
|
|
| 103 |
work_dir: ${hydra:runtime.cwd}
|
| 104 |
-
experiment_name:
|
| 105 |
seeds:
|
| 106 |
- 0
|
| 107 |
- 1
|
|
@@ -110,45 +111,36 @@ seeds:
|
|
| 110 |
- 97
|
| 111 |
evaluate_generalizing_models_only: false
|
| 112 |
remove_files: true
|
| 113 |
-
train_seeds_parallel:
|
| 114 |
-
plots:
|
| 115 |
-
experiment1:
|
| 116 |
-
title:
|
| 117 |
-
- train.max_epochs
|
| 118 |
-
type: heatmap
|
| 119 |
-
x: dataset.train_samples
|
| 120 |
-
x_label: dataset size
|
| 121 |
-
y_label: learning rate
|
| 122 |
-
'y':
|
| 123 |
-
- test_acc
|
| 124 |
-
- synergy
|
| 125 |
-
- redundancy
|
| 126 |
grid_search:
|
| 127 |
dataset.train_samples:
|
| 128 |
array:
|
|
|
|
|
|
|
| 129 |
- 800
|
|
|
|
| 130 |
- 1000
|
| 131 |
-
-
|
| 132 |
-
|
| 133 |
-
- 2000
|
| 134 |
-
- 3000
|
| 135 |
train.lr:
|
| 136 |
array:
|
|
|
|
| 137 |
- 0.1
|
|
|
|
| 138 |
train.train_batch_size:
|
| 139 |
array:
|
| 140 |
- 32
|
|
|
|
|
|
|
|
|
|
| 141 |
dataset.parameters.n:
|
| 142 |
array:
|
| 143 |
-
- 30
|
| 144 |
- 40
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
-
|
| 151 |
-
-
|
| 152 |
-
- 20
|
| 153 |
-
- 100
|
| 154 |
experiment_oinfo_title: null
|
|
|
|
| 6 |
synergymask: false
|
| 7 |
sparsity_sampling: 10
|
| 8 |
scheduler: null
|
| 9 |
+
device: cuda
|
| 10 |
test_seed: 123
|
| 11 |
evaluate_oinformation: false
|
| 12 |
evaluate_sparsity: false
|
|
|
|
| 25 |
layer: fc2_post
|
| 26 |
njobs: 16
|
| 27 |
layers:
|
| 28 |
+
- fc3_post
|
| 29 |
train: true
|
| 30 |
features_type:
|
| 31 |
- train
|
| 32 |
+
max_batch_exhaustive: 10
|
| 33 |
loss:
|
| 34 |
_target_: model.neural_network.MyHingeLoss
|
| 35 |
train:
|
|
|
|
| 37 |
train_batch_size: 32
|
| 38 |
num_workers: 6
|
| 39 |
eval_batch_size: 32
|
| 40 |
+
max_epochs: 1400
|
| 41 |
max_steps: 1000000
|
| 42 |
regularization:
|
| 43 |
weight_decay:
|
|
|
|
| 98 |
data_dir: ${paths.root_dir}/data/
|
| 99 |
log_dir: ${paths.root_dir}/runs/
|
| 100 |
output_dir: /kyukon/scratch/gent/433/vsc43397/oinformation-grokking/runs/parity_n_40_k_3_N_1000_100/fcn_relu_100_0.00_default/sgd_bs_32_lr_0.10/wd_0.01_do_0_no_bn_syn_0_no_ln_red_0
|
| 101 |
+
plot_dir: ./plots/gridsearch_100
|
| 102 |
+
plot_dir_all: ./plots/gridsearch_100/all/fcn_relu/sgd_bs_32_lr_0.10/wd_0.01_do_0_no_bn_syn_0_no_ln_red_0
|
| 103 |
+
run_dir: /kyukon/scratch/gent/433/vsc43397/oinformation-grokking/runs
|
| 104 |
work_dir: ${hydra:runtime.cwd}
|
| 105 |
+
experiment_name: gridsearch_100
|
| 106 |
seeds:
|
| 107 |
- 0
|
| 108 |
- 1
|
|
|
|
| 111 |
- 97
|
| 112 |
evaluate_generalizing_models_only: false
|
| 113 |
remove_files: true
|
| 114 |
+
train_seeds_parallel: false
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
grid_search:
|
| 116 |
dataset.train_samples:
|
| 117 |
array:
|
| 118 |
+
- 500
|
| 119 |
+
- 600
|
| 120 |
- 800
|
| 121 |
+
- 900
|
| 122 |
- 1000
|
| 123 |
+
- 1100
|
| 124 |
+
name: train_samples
|
|
|
|
|
|
|
| 125 |
train.lr:
|
| 126 |
array:
|
| 127 |
+
- 0.05
|
| 128 |
- 0.1
|
| 129 |
+
name: lr
|
| 130 |
train.train_batch_size:
|
| 131 |
array:
|
| 132 |
- 32
|
| 133 |
+
- 64
|
| 134 |
+
- 128
|
| 135 |
+
name: train_bs
|
| 136 |
dataset.parameters.n:
|
| 137 |
array:
|
|
|
|
| 138 |
- 40
|
| 139 |
+
- 50
|
| 140 |
+
name: 'n'
|
| 141 |
+
plots:
|
| 142 |
+
training_heatmaps:
|
| 143 |
+
plot_1:
|
| 144 |
+
- dataset.train_samples
|
| 145 |
+
- train.train_batch_size
|
|
|
|
|
|
|
| 146 |
experiment_oinfo_title: null
|