mazesmazes committed
Commit c26201f · verified · 1 Parent(s): 9fb1159

Training in progress, step 1000

Files changed (4):
  1. config.json +63 -15
  2. generation_config.json +4 -4
  3. model.safetensors +1 -1
  4. training_args.bin +1 -1
config.json CHANGED
@@ -279,20 +279,19 @@
   "system_prompt": "",
   "temperature": 0.7,
   "text_config": {
-    "_name_or_path": "Qwen/Qwen3-1.7B",
+    "_name_or_path": "HuggingFaceTB/SmolLM3-3B",
     "architectures": [
-      "Qwen3ForCausalLM"
+      "SmolLM3ForCausalLM"
     ],
     "attention_bias": false,
     "attention_dropout": 0.0,
     "bos_token_id": null,
     "dtype": "bfloat16",
-    "eos_token_id": 151645,
-    "head_dim": 128,
+    "eos_token_id": 128012,
     "hidden_act": "silu",
     "hidden_size": 2048,
     "initializer_range": 0.02,
-    "intermediate_size": 6144,
+    "intermediate_size": 11008,
     "layer_types": [
       "full_attention",
       "full_attention",
@@ -321,27 +320,76 @@
       "full_attention",
       "full_attention",
       "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
       "full_attention"
     ],
-    "max_position_embeddings": 40960,
+    "max_position_embeddings": 65536,
     "max_window_layers": 28,
-    "model_type": "qwen3",
+    "mlp_bias": false,
+    "model_type": "smollm3",
+    "no_rope_layer_interval": 4,
+    "no_rope_layers": [
+      1,
+      1,
+      1,
+      0,
+      1,
+      1,
+      1,
+      0,
+      1,
+      1,
+      1,
+      0,
+      1,
+      1,
+      1,
+      0,
+      1,
+      1,
+      1,
+      0,
+      1,
+      1,
+      1,
+      0,
+      1,
+      1,
+      1,
+      0,
+      1,
+      1,
+      1,
+      0,
+      1,
+      1,
+      1,
+      0
+    ],
     "num_attention_heads": 16,
-    "num_hidden_layers": 28,
-    "num_key_value_heads": 8,
-    "pad_token_id": 151643,
+    "num_hidden_layers": 36,
+    "num_key_value_heads": 4,
+    "pad_token_id": 128004,
+    "pretraining_tp": 2,
     "rms_norm_eps": 1e-06,
     "rope_parameters": {
-      "rope_theta": 1000000,
+      "rope_theta": 5000000.0,
       "rope_type": "default"
     },
     "sliding_window": null,
     "tie_word_embeddings": true,
-    "use_cache": true,
+    "use_cache": false,
     "use_sliding_window": false,
-    "vocab_size": 151670
+    "vocab_size": 128257
   },
-  "text_model_id": "Qwen/Qwen3-1.7B",
+  "text_model_id": "HuggingFaceTB/SmolLM3-3B",
   "time_mask_length": 100,
   "top_k": null,
   "top_p": null,
@@ -349,5 +397,5 @@
   "use_cache": false,
   "use_lora": false,
   "use_specaugment": true,
-  "vocab_size": 151670
+  "vocab_size": 128257
 }
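This commit swaps the text backbone of the model from Qwen/Qwen3-1.7B to HuggingFaceTB/SmolLM3-3B, which changes both the tokenizer-dependent ids (EOS 151645 → 128012, pad 151643 → 128004, vocab 151670 → 128257) and the architecture shape (28 → 36 hidden layers, 8 → 4 KV heads, plus SmolLM3's NoPE mask dropping RoPE on every 4th layer). A minimal sketch of inspecting the nested text_config, assuming a local checkout of the repo; plain json is used here because the top-level config belongs to a custom audio-text wrapper, so AutoConfig may require trust_remote_code:

```python
import json

# Minimal sketch: inspect the nested text_config after this commit.
# Assumes config.json from a local checkout; the path is a placeholder.
with open("config.json") as f:
    config = json.load(f)

text = config["text_config"]
print(text["model_type"])           # "smollm3" (was "qwen3")
print(text["num_hidden_layers"])    # 36 (was 28)
print(text["num_key_value_heads"])  # 4 (was 8)
print(text["vocab_size"])           # 128257 (was 151670)
```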
generation_config.json CHANGED
@@ -1,16 +1,16 @@
 {
-  "bos_token_id": 151643,
+  "bos_token_id": 128000,
   "do_sample": true,
   "eos_token_id": [
-    151645,
-    151643
+    128012,
+    null
   ],
   "length_penalty": 1.0,
   "max_new_tokens": 128,
   "min_new_tokens": 0,
   "no_repeat_ngram_size": 0,
   "num_beams": 1,
-  "pad_token_id": 151643,
+  "pad_token_id": 128004,
   "repetition_penalty": 1.05,
   "temperature": 0.7,
   "transformers_version": "5.0.0.dev0",
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:21bc671971dc0afa4c8724009fa53e8939e00b4a2c2248e4f97caca316c1f236
+oid sha256:f2edc6701b98756dfb86241a8eccc63a7f29eedf9898bf9cf85742e7b2f3be77
 size 58732960
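Only the Git LFS pointer's SHA-256 changes; the payload size stays at 58,732,960 bytes, which suggests the stored tensors have the same shapes as before. A minimal sketch of verifying a downloaded file against the pointer's hash; the local path is a placeholder:

```python
import hashlib

# Minimal sketch: check a downloaded model.safetensors against the
# sha256 recorded in the Git LFS pointer above.
EXPECTED = "f2edc6701b98756dfb86241a8eccc63a7f29eedf9898bf9cf85742e7b2f3be77"

digest = hashlib.sha256()
with open("model.safetensors", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # stream in 1 MiB chunks
        digest.update(chunk)

assert digest.hexdigest() == EXPECTED, "checksum mismatch"
print("model.safetensors matches the LFS pointer")
```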
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:060e9c0aa3404e278bdec0b14f0a68b938b1ec109a6bbd119f1c4dfcbb908299
+oid sha256:dae774817b9d14faaedcd0353adbe6446f55e18bc682604a3a87a18f9e9ec0cb
 size 5265
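training_args.bin is the pickled TrainingArguments object that transformers.Trainer writes alongside each checkpoint; the identical size with a new hash suggests the same argument set with changed values. A minimal sketch of reading it back, assuming a trusted local copy; unpickling runs arbitrary code, so only do this for repos you trust:

```python
import torch

# Minimal sketch: load the pickled TrainingArguments saved by Trainer.
# weights_only=False is needed on recent torch because this is a full
# Python object, not a tensor checkpoint. Trust the source first.
args = torch.load("training_args.bin", weights_only=False)
print(args.learning_rate, args.per_device_train_batch_size)
```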