aapot committed on
Commit
5b4c301
1 Parent(s): 2cc762b

Add 50k train step model

Browse files
config.gin CHANGED
@@ -2,7 +2,7 @@ from __gin__ import dynamic_registration
2
  import __main__ as train_script
3
  import seqio
4
  from t5x import adafactor
5
- from t5x.examples.scalable_t5 import network
6
  from t5x import gin_utils
7
  from t5x import models
8
  from t5x import partitioning
@@ -116,8 +116,6 @@ network.T5Config.mlp_dim = 2048
116
  network.T5Config.num_decoder_layers = 16
117
  network.T5Config.num_encoder_layers = 16
118
  network.T5Config.num_heads = 8
119
- network.T5Config.remat_policy = 'minimal'
120
- network.T5Config.scan_layers = True
121
  network.T5Config.vocab_size = 32128
122
 
123
  # Parameters for train_script.train:
@@ -135,7 +133,6 @@ train_script.train.total_steps = %TRAIN_STEPS
135
  train_script.train.train_dataset_cfg = @train/utils.DatasetConfig()
136
  train_script.train.train_eval_dataset_cfg = @train_eval/utils.DatasetConfig()
137
  train_script.train.trainer_cls = @trainer.Trainer
138
- train_script.train.use_gda = False
139
  train_script.train.use_hardware_rng = %USE_HARDWARE_RNG
140
 
141
  # Parameters for trainer.Trainer:
2
  import __main__ as train_script
3
  import seqio
4
  from t5x import adafactor
5
+ from t5x.examples.t5 import network
6
  from t5x import gin_utils
7
  from t5x import models
8
  from t5x import partitioning
116
  network.T5Config.num_decoder_layers = 16
117
  network.T5Config.num_encoder_layers = 16
118
  network.T5Config.num_heads = 8
 
 
119
  network.T5Config.vocab_size = 32128
120
 
121
  # Parameters for train_script.train:
133
  train_script.train.train_dataset_cfg = @train/utils.DatasetConfig()
134
  train_script.train.train_eval_dataset_cfg = @train_eval/utils.DatasetConfig()
135
  train_script.train.trainer_cls = @trainer.Trainer
 
136
  train_script.train.use_hardware_rng = %USE_HARDWARE_RNG
137
 
138
  # Parameters for trainer.Trainer:
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "/researchdisk/t5-small-nl16-finnish",
3
  "architectures": [
4
  "T5ForConditionalGeneration"
5
  ],
@@ -26,7 +26,7 @@
26
  "relative_attention_num_buckets": 32,
27
  "tie_word_embeddings": false,
28
  "torch_dtype": "float32",
29
- "transformers_version": "4.21.2",
30
  "use_cache": true,
31
  "vocab_size": 32128
32
  }
1
  {
2
+ "_name_or_path": "./",
3
  "architectures": [
4
  "T5ForConditionalGeneration"
5
  ],
26
  "relative_attention_num_buckets": 32,
27
  "tie_word_embeddings": false,
28
  "torch_dtype": "float32",
29
+ "transformers_version": "4.22.1",
30
  "use_cache": true,
31
  "vocab_size": 32128
32
  }
convert_t5x_checkpoint_to_flax.py CHANGED
@@ -12,36 +12,37 @@ def convert_t5x_checkpoint_to_flax(t5x_checkpoint_path, config_name, flax_dump_f
12
  flax_model = FlaxT5ForConditionalGeneration(config=config)
13
  t5x_model = checkpoints.load_t5x_checkpoint(t5x_checkpoint_path)
14
 
15
- split_mlp_wi = "wi_0" in t5x_model["target"]["encoder"]["encoder"]["mlp"]
16
 
17
  # Encoder
18
  for layer_index in range(config.num_layers):
 
19
 
20
  # Self-Attention
21
- t5x_attention_key = t5x_model["target"]["encoder"]["encoder"]["attention"]["key"]["kernel"][:, layer_index, :, :]
22
- t5x_attention_out = t5x_model["target"]["encoder"]["encoder"]["attention"]["out"]["kernel"][:, layer_index, :, :]
23
- t5x_attention_query = t5x_model["target"]["encoder"]["encoder"]["attention"]["query"]["kernel"][:, layer_index, :, :]
24
- t5x_attention_value = t5x_model["target"]["encoder"]["encoder"]["attention"]["value"]["kernel"][:, layer_index, :, :]
25
 
26
  ## Layer Normalization
27
- t5x_attention_layer_norm = t5x_model["target"]["encoder"]["encoder"]["pre_attention_layer_norm"]["scale"][:, layer_index]
28
 
29
  if split_mlp_wi:
30
- t5x_mlp_wi_0 = t5x_model["target"]["encoder"]["encoder"]["mlp"]["wi_0"]["kernel"][:, layer_index, :]
31
- t5x_mlp_wi_1 = t5x_model["target"]["encoder"]["encoder"]["mlp"]["wi_1"]["kernel"][:, layer_index, :]
32
  else:
33
- t5x_mlp_wi = t5x_model["target"]["encoder"]["encoder"]["mlp"]["wi"]["kernel"][:, layer_index, :]
34
 
35
- t5x_mlp_wo = t5x_model["target"]["encoder"]["encoder"]["mlp"]["wo"]["kernel"][:, layer_index, :]
36
 
37
  ## Layer Normalization
38
- t5x_mlp_layer_norm = t5x_model["target"]["encoder"]["encoder"]["pre_mlp_layer_norm"]["scale"][:, layer_index]
39
 
40
  # Assigning
41
- flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["0"]["SelfAttention"]["k"]["kernel"] = t5x_attention_key.reshape(*t5x_attention_key.shape[:-2], -1)
42
- flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["0"]["SelfAttention"]["o"]["kernel"] = t5x_attention_out.reshape(-1, t5x_attention_out.shape[-1])
43
- flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["0"]["SelfAttention"]["q"]["kernel"] = t5x_attention_query.reshape(*t5x_attention_query.shape[:-2], -1)
44
- flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["0"]["SelfAttention"]["v"]["kernel"] = t5x_attention_value.reshape(*t5x_attention_value.shape[:-2], -1)
45
 
46
  flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["0"]["layer_norm"]["weight"] = t5x_attention_layer_norm
47
 
@@ -55,8 +56,8 @@ def convert_t5x_checkpoint_to_flax(t5x_checkpoint_path, config_name, flax_dump_f
55
  flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["1"]["layer_norm"]["weight"] = t5x_mlp_layer_norm
56
 
57
  # Only for layer 0:
58
- t5x_encoder_rel_embedding = t5x_model["target"]["encoder"]["encoder"]["relpos_bias"]["rel_embedding"].T
59
- flax_model.params["encoder"]["block"]["0"]["layer"]["0"]["SelfAttention"]["relative_attention_bias"]["embedding"] = t5x_encoder_rel_embedding[:, 0, :]
60
 
61
  # Assigning
62
  t5x_encoder_norm = t5x_model["target"]["encoder"]["encoder_norm"]["scale"]
@@ -64,49 +65,50 @@ def convert_t5x_checkpoint_to_flax(t5x_checkpoint_path, config_name, flax_dump_f
64
 
65
  # Decoder
66
  for layer_index in range(config.num_decoder_layers):
 
67
 
68
  # Self-Attention
69
- t5x_attention_key = t5x_model["target"]["decoder"]["decoder"]["self_attention"]["key"]["kernel"][:, layer_index, :, :]
70
- t5x_attention_out = t5x_model["target"]["decoder"]["decoder"]["self_attention"]["out"]["kernel"][:, layer_index, :, :]
71
- t5x_attention_query = t5x_model["target"]["decoder"]["decoder"]["self_attention"]["query"]["kernel"][:, layer_index, :, :]
72
- t5x_attention_value = t5x_model["target"]["decoder"]["decoder"]["self_attention"]["value"]["kernel"][:, layer_index, :, :]
73
 
74
  ## Layer Normalization
75
- t5x_pre_attention_layer_norm = t5x_model["target"]["decoder"]["decoder"]["pre_self_attention_layer_norm"]["scale"][:, layer_index]
76
 
77
  # Encoder-Decoder-Attention
78
- t5x_enc_dec_attention_key = t5x_model["target"]["decoder"]["decoder"]["encoder_decoder_attention"]["key"]["kernel"][:, layer_index, :, :]
79
- t5x_enc_dec_attention_out = t5x_model["target"]["decoder"]["decoder"]["encoder_decoder_attention"]["out"]["kernel"][:, layer_index, :, :]
80
- t5x_enc_dec_attention_query = t5x_model["target"]["decoder"]["decoder"]["encoder_decoder_attention"]["query"]["kernel"][:, layer_index, :, :]
81
- t5x_enc_dec_attention_value = t5x_model["target"]["decoder"]["decoder"]["encoder_decoder_attention"]["value"]["kernel"][:, layer_index, :, :]
82
 
83
  ## Layer Normalization
84
- t5x_cross_layer_norm = t5x_model["target"]["decoder"]["decoder"]["pre_cross_attention_layer_norm"]["scale"][:, layer_index]
85
 
86
  # MLP
87
  if split_mlp_wi:
88
- t5x_mlp_wi_0 = t5x_model["target"]["decoder"]["decoder"]["mlp"]["wi_0"]["kernel"][:, layer_index, :]
89
- t5x_mlp_wi_1 = t5x_model["target"]["decoder"]["decoder"]["mlp"]["wi_1"]["kernel"][:, layer_index, :]
90
  else:
91
- t5x_mlp_wi = t5x_model["target"]["decoder"]["decoder"]["mlp"]["wi"]["kernel"][:, layer_index, :]
92
 
93
- t5x_mlp_wo = t5x_model["target"]["decoder"]["decoder"]["mlp"]["wo"]["kernel"][:, layer_index, :]
94
 
95
  ## Layer Normalization
96
- tx5_mlp_layer_norm = t5x_model["target"]["decoder"]["decoder"]["pre_mlp_layer_norm"]["scale"][:, layer_index]
97
 
98
  # Assigning
99
- flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["0"]["SelfAttention"]["k"]["kernel"] = t5x_attention_key.reshape(*t5x_attention_key.shape[:-2], -1)
100
- flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["0"]["SelfAttention"]["o"]["kernel"] = t5x_attention_out.reshape(-1, t5x_attention_out.shape[-1])
101
- flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["0"]["SelfAttention"]["q"]["kernel"] = t5x_attention_query.reshape(*t5x_attention_query.shape[:-2], -1)
102
- flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["0"]["SelfAttention"]["v"]["kernel"] = t5x_attention_value.reshape(*t5x_attention_value.shape[:-2], -1)
103
 
104
  flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["0"]["layer_norm"]["weight"] = t5x_pre_attention_layer_norm
105
 
106
- flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["1"]["EncDecAttention"]["k"]["kernel"] = t5x_enc_dec_attention_key.reshape(*t5x_enc_dec_attention_key.shape[:-2], -1)
107
- flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["1"]["EncDecAttention"]["o"]["kernel"] = t5x_enc_dec_attention_out.reshape(-1, t5x_enc_dec_attention_out.shape[-1])
108
- flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["1"]["EncDecAttention"]["q"]["kernel"] = t5x_enc_dec_attention_query.reshape(*t5x_enc_dec_attention_query.shape[:-2], -1)
109
- flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["1"]["EncDecAttention"]["v"]["kernel"] = t5x_enc_dec_attention_value.reshape(*t5x_enc_dec_attention_value.shape[:-2], -1)
110
 
111
  flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["1"]["layer_norm"]["weight"] = t5x_cross_layer_norm
112
 
@@ -125,8 +127,8 @@ def convert_t5x_checkpoint_to_flax(t5x_checkpoint_path, config_name, flax_dump_f
125
  flax_model.params["decoder"]["final_layer_norm"]["weight"] = tx5_decoder_norm
126
 
127
  # Only for layer 0:
128
- t5x_decoder_rel_embedding = t5x_model["target"]["decoder"]["decoder"]["relpos_bias"]["rel_embedding"].T
129
- flax_model.params["decoder"]["block"]["0"]["layer"]["0"]["SelfAttention"]["relative_attention_bias"]["embedding"] = t5x_decoder_rel_embedding[:, 0, :]
130
 
131
  # Token Embeddings
132
  tx5_token_embeddings = t5x_model["target"]["token_embedder"]["embedding"]
@@ -158,5 +160,4 @@ if __name__ == "__main__":
158
  args = parser.parse_args()
159
  convert_t5x_checkpoint_to_flax(args.t5x_checkpoint_path, args.config_name, args.flax_dump_folder_path)
160
  convert_flax_to_pytorch(args.flax_dump_folder_path, args.flax_dump_folder_path)
161
-
162
 
12
  flax_model = FlaxT5ForConditionalGeneration(config=config)
13
  t5x_model = checkpoints.load_t5x_checkpoint(t5x_checkpoint_path)
14
 
15
+ split_mlp_wi = "wi_0" in t5x_model["target"]["encoder"]["layers_0"]["mlp"]
16
 
17
  # Encoder
18
  for layer_index in range(config.num_layers):
19
+ layer_name = f"layers_{str(layer_index)}"
20
 
21
  # Self-Attention
22
+ t5x_attention_key = t5x_model["target"]["encoder"][layer_name]["attention"]["key"]["kernel"]
23
+ t5x_attention_out = t5x_model["target"]["encoder"][layer_name]["attention"]["out"]["kernel"]
24
+ t5x_attention_query = t5x_model["target"]["encoder"][layer_name]["attention"]["query"]["kernel"]
25
+ t5x_attention_value = t5x_model["target"]["encoder"][layer_name]["attention"]["value"]["kernel"]
26
 
27
  ## Layer Normalization
28
+ t5x_attention_layer_norm = t5x_model["target"]["encoder"][layer_name]["pre_attention_layer_norm"]["scale"]
29
 
30
  if split_mlp_wi:
31
+ t5x_mlp_wi_0 = t5x_model["target"]["encoder"][layer_name]["mlp"]["wi_0"]["kernel"]
32
+ t5x_mlp_wi_1 = t5x_model["target"]["encoder"][layer_name]["mlp"]["wi_1"]["kernel"]
33
  else:
34
+ t5x_mlp_wi = t5x_model["target"]["encoder"][layer_name]["mlp"]["wi"]["kernel"]
35
 
36
+ t5x_mlp_wo = t5x_model["target"]["encoder"][layer_name]["mlp"]["wo"]["kernel"]
37
 
38
  ## Layer Normalization
39
+ t5x_mlp_layer_norm = t5x_model["target"]["encoder"][layer_name]["pre_mlp_layer_norm"]["scale"]
40
 
41
  # Assigning
42
+ flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["0"]["SelfAttention"]["k"]["kernel"] = t5x_attention_key
43
+ flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["0"]["SelfAttention"]["o"]["kernel"] = t5x_attention_out
44
+ flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["0"]["SelfAttention"]["q"]["kernel"] = t5x_attention_query
45
+ flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["0"]["SelfAttention"]["v"]["kernel"] = t5x_attention_value
46
 
47
  flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["0"]["layer_norm"]["weight"] = t5x_attention_layer_norm
48
 
56
  flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["1"]["layer_norm"]["weight"] = t5x_mlp_layer_norm
57
 
58
  # Only for layer 0:
59
+ t5x_encoder_rel_embedding = t5x_model["target"]["encoder"]["relpos_bias"]["rel_embedding"].T
60
+ flax_model.params["encoder"]["block"]["0"]["layer"]["0"]["SelfAttention"]["relative_attention_bias"]["embedding"] = t5x_encoder_rel_embedding
61
 
62
  # Assigning
63
  t5x_encoder_norm = t5x_model["target"]["encoder"]["encoder_norm"]["scale"]
65
 
66
  # Decoder
67
  for layer_index in range(config.num_decoder_layers):
68
+ layer_name = f"layers_{str(layer_index)}"
69
 
70
  # Self-Attention
71
+ t5x_attention_key = t5x_model["target"]["decoder"][layer_name]["self_attention"]["key"]["kernel"]
72
+ t5x_attention_out = t5x_model["target"]["decoder"][layer_name]["self_attention"]["out"]["kernel"]
73
+ t5x_attention_query = t5x_model["target"]["decoder"][layer_name]["self_attention"]["query"]["kernel"]
74
+ t5x_attention_value = t5x_model["target"]["decoder"][layer_name]["self_attention"]["value"]["kernel"]
75
 
76
  ## Layer Normalization
77
+ t5x_pre_attention_layer_norm = t5x_model["target"]["decoder"][layer_name]["pre_self_attention_layer_norm"]["scale"]
78
 
79
  # Encoder-Decoder-Attention
80
+ t5x_enc_dec_attention_key = t5x_model["target"]["decoder"][layer_name]["encoder_decoder_attention"]["key"]["kernel"]
81
+ t5x_enc_dec_attention_out = t5x_model["target"]["decoder"][layer_name]["encoder_decoder_attention"]["out"]["kernel"]
82
+ t5x_enc_dec_attention_query = t5x_model["target"]["decoder"][layer_name]["encoder_decoder_attention"]["query"]["kernel"]
83
+ t5x_enc_dec_attention_value = t5x_model["target"]["decoder"][layer_name]["encoder_decoder_attention"]["value"]["kernel"]
84
 
85
  ## Layer Normalization
86
+ t5x_cross_layer_norm = t5x_model["target"]["decoder"][layer_name]["pre_cross_attention_layer_norm"]["scale"]
87
 
88
  # MLP
89
  if split_mlp_wi:
90
+ t5x_mlp_wi_0 = t5x_model["target"]["decoder"][layer_name]["mlp"]["wi_0"]["kernel"]
91
+ t5x_mlp_wi_1 = t5x_model["target"]["decoder"][layer_name]["mlp"]["wi_1"]["kernel"]
92
  else:
93
+ t5x_mlp_wi = t5x_model["target"]["decoder"][layer_name]["mlp"]["wi"]["kernel"]
94
 
95
+ t5x_mlp_wo = t5x_model["target"]["decoder"][layer_name]["mlp"]["wo"]["kernel"]
96
 
97
  ## Layer Normalization
98
+ tx5_mlp_layer_norm = t5x_model["target"]["decoder"][layer_name]["pre_mlp_layer_norm"]["scale"]
99
 
100
  # Assigning
101
+ flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["0"]["SelfAttention"]["k"]["kernel"] = t5x_attention_key
102
+ flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["0"]["SelfAttention"]["o"]["kernel"] = t5x_attention_out
103
+ flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["0"]["SelfAttention"]["q"]["kernel"] = t5x_attention_query
104
+ flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["0"]["SelfAttention"]["v"]["kernel"] = t5x_attention_value
105
 
106
  flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["0"]["layer_norm"]["weight"] = t5x_pre_attention_layer_norm
107
 
108
+ flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["1"]["EncDecAttention"]["k"]["kernel"] = t5x_enc_dec_attention_key
109
+ flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["1"]["EncDecAttention"]["o"]["kernel"] = t5x_enc_dec_attention_out
110
+ flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["1"]["EncDecAttention"]["q"]["kernel"] = t5x_enc_dec_attention_query
111
+ flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["1"]["EncDecAttention"]["v"]["kernel"] = t5x_enc_dec_attention_value
112
 
113
  flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["1"]["layer_norm"]["weight"] = t5x_cross_layer_norm
114
 
127
  flax_model.params["decoder"]["final_layer_norm"]["weight"] = tx5_decoder_norm
128
 
129
  # Only for layer 0:
130
+ t5x_decoder_rel_embedding = t5x_model["target"]["decoder"]["relpos_bias"]["rel_embedding"].T
131
+ flax_model.params["decoder"]["block"]["0"]["layer"]["0"]["SelfAttention"]["relative_attention_bias"]["embedding"] = t5x_decoder_rel_embedding
132
 
133
  # Token Embeddings
134
  tx5_token_embeddings = t5x_model["target"]["token_embedder"]["embedding"]
160
  args = parser.parse_args()
161
  convert_t5x_checkpoint_to_flax(args.t5x_checkpoint_path, args.config_name, args.flax_dump_folder_path)
162
  convert_flax_to_pytorch(args.flax_dump_folder_path, args.flax_dump_folder_path)
 
163
 
flax_model.msgpack CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8d2c726083b92da996735a77026451a7e5b42627a81a0b01dbc34c3ff2002468
3
  size 735762207
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d82a40e2a628be30c18aec9a0e0a7dd7e73eef7ec43ec213ad0c883f616c1b69
3
  size 735762207
model-info.txt CHANGED
The diff for this file is too large to render. See raw diff
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0b58f3924c39b22b631c2aac1eaa4fb74334d43a8460d34eb3308b8d9d0b32c8
3
  size 735867349
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94b61f6dc4f056da83ffa8f0786219a3eb1a297e88096524ed6a2c003200249b
3
  size 735867349
small_nl16.gin CHANGED
@@ -1,7 +1,7 @@
1
  # T5.1.1 Efficient small nl16 model.
2
 
3
  import seqio
4
- include 't5x/examples/scalable_t5/t5_1_1/base.gin' # imports vocab, optimizer and model.
5
 
6
  # ------------------- Network specification overrides --------------------------
7
  network.Transformer.config = @network.T5Config()
1
  # T5.1.1 Efficient small nl16 model.
2
 
3
  import seqio
4
+ include 't5x/examples/t5/t5_1_1/base.gin' # imports vocab, optimizer and model.
5
 
6
  # ------------------- Network specification overrides --------------------------
7
  network.Transformer.config = @network.T5Config()
small_nl16_pretrain.gin CHANGED
@@ -12,7 +12,6 @@ include 't5x/configs/runs/pretrain.gin'
12
  # ------------------- Training specification overrides --------------------------
13
  train_script.train:
14
  eval_period = 10000
15
- use_gda = False
16
 
17
  utils.SaveCheckpointConfig:
18
  period = 10000
12
  # ------------------- Training specification overrides --------------------------
13
  train_script.train:
14
  eval_period = 10000
 
15
 
16
  utils.SaveCheckpointConfig:
17
  period = 10000
train/{events.out.tfevents.1661710468.t1v-n-12f94ad0-w-0.60675.0.v2 → events.out.tfevents.1664039578.t1v-n-12f94ad0-w-0.2066226.0.v2} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cc35fbcf1aae134d37c8b1835b980a3a231015174e2f0151a381a3158e01a5df
3
- size 83260
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:efe928bebef1c949dca5e29eec1f2b4b26d042761ef3f6aa77f029a6d9538624
3
+ size 16734
training_eval/pretrain_finnish/{events.out.tfevents.1661710468.t1v-n-12f94ad0-w-0.60675.1.v2 → events.out.tfevents.1664039578.t1v-n-12f94ad0-w-0.2066226.1.v2} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2c362fea4a3c4e8f1ff7f459bbaec9e8224c189528fd13acd10026b714773969
3
- size 65359
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb328b235587d5a36430d35207227a5c66091fc9e2b52bcb127b089b112215ca
3
+ size 9244