Dampish committed on
Commit 870f230
Parent: 049140f

Upload 7 files

config.json CHANGED
@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "Dampish/StellarX-4B-V1",
+  "_name_or_path": "Dampish/stellar7CHPT",
   "architectures": [
     "GPTNeoXForCausalLM"
   ],
@@ -14,7 +14,7 @@
   "max_position_embeddings": 2048,
   "model_type": "gpt_neox",
   "num_attention_heads": 32,
-  "num_hidden_layers": 47,
+  "num_hidden_layers": 32,
   "rotary_emb_base": 10000,
   "rotary_pct": 1.0,
   "tie_word_embeddings": false,
pytorch_model.bin.index.json CHANGED
@@ -1,6 +1,6 @@
 {
   "metadata": {
-    "total_size": 15848695324.0
+    "total_size": 11120239744.0
   },
   "weight_map": {
     "embed_out.weight": "pytorch_model-00002-of-00002.bin",
@@ -397,126 +397,6 @@
     "gpt_neox.layers.31.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00002.bin",
     "gpt_neox.layers.31.post_attention_layernorm.bias": "pytorch_model-00002-of-00002.bin",
     "gpt_neox.layers.31.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.32.attention.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.32.attention.dense.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.32.attention.dense.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.32.attention.masked_bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.32.attention.query_key_value.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.32.attention.query_key_value.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.32.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.32.input_layernorm.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.32.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.32.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.32.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.32.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.32.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.32.post_attention_layernorm.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.32.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.33.attention.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.33.attention.dense.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.33.attention.dense.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.33.attention.masked_bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.33.attention.query_key_value.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.33.attention.query_key_value.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.33.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.33.input_layernorm.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.33.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.33.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.33.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.33.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.33.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.33.post_attention_layernorm.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.33.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.34.attention.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.34.attention.dense.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.34.attention.dense.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.34.attention.masked_bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.34.attention.query_key_value.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.34.attention.query_key_value.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.34.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.34.input_layernorm.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.34.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.34.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.34.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.34.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.34.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.34.post_attention_layernorm.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.34.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.35.attention.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.35.attention.dense.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.35.attention.dense.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.35.attention.masked_bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.35.attention.query_key_value.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.35.attention.query_key_value.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.35.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.35.input_layernorm.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.35.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.35.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.35.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.35.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.35.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.35.post_attention_layernorm.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.35.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.36.attention.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.36.attention.dense.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.36.attention.dense.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.36.attention.masked_bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.36.attention.query_key_value.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.36.attention.query_key_value.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.36.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.36.input_layernorm.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.36.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.36.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.36.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.36.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.36.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.36.post_attention_layernorm.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.36.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.37.attention.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.37.attention.dense.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.37.attention.dense.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.37.attention.masked_bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.37.attention.query_key_value.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.37.attention.query_key_value.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.37.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.37.input_layernorm.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.37.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.37.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.37.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.37.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.37.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.37.post_attention_layernorm.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.37.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.38.attention.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.38.attention.dense.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.38.attention.dense.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.38.attention.masked_bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.38.attention.query_key_value.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.38.attention.query_key_value.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.38.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.38.input_layernorm.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.38.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.38.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.38.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.38.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.38.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.38.post_attention_layernorm.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.38.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.39.attention.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.39.attention.dense.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.39.attention.dense.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.39.attention.masked_bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.39.attention.query_key_value.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.39.attention.query_key_value.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.39.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.39.input_layernorm.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.39.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.39.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.39.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.39.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.39.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.39.post_attention_layernorm.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.39.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
     "gpt_neox.layers.4.attention.bias": "pytorch_model-00001-of-00002.bin",
     "gpt_neox.layers.4.attention.dense.bias": "pytorch_model-00001-of-00002.bin",
     "gpt_neox.layers.4.attention.dense.weight": "pytorch_model-00001-of-00002.bin",
@@ -532,111 +412,6 @@
     "gpt_neox.layers.4.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin",
     "gpt_neox.layers.4.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin",
     "gpt_neox.layers.4.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
-    "gpt_neox.layers.40.attention.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.40.attention.dense.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.40.attention.dense.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.40.attention.masked_bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.40.attention.query_key_value.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.40.attention.query_key_value.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.40.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.40.input_layernorm.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.40.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.40.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.40.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.40.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.40.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.40.post_attention_layernorm.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.40.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.41.attention.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.41.attention.dense.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.41.attention.dense.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.41.attention.masked_bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.41.attention.query_key_value.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.41.attention.query_key_value.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.41.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.41.input_layernorm.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.41.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.41.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.41.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.41.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.41.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.41.post_attention_layernorm.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.41.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.42.attention.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.42.attention.dense.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.42.attention.dense.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.42.attention.masked_bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.42.attention.query_key_value.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.42.attention.query_key_value.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.42.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.42.input_layernorm.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.42.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.42.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.42.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.42.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.42.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.42.post_attention_layernorm.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.42.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.43.attention.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.43.attention.dense.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.43.attention.dense.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.43.attention.masked_bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.43.attention.query_key_value.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.43.attention.query_key_value.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.43.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.43.input_layernorm.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.43.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.43.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.43.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.43.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.43.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.43.post_attention_layernorm.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.43.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.44.attention.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.44.attention.dense.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.44.attention.dense.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.44.attention.masked_bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.44.attention.query_key_value.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.44.attention.query_key_value.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.44.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.44.input_layernorm.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.44.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.44.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.44.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.44.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.44.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.44.post_attention_layernorm.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.44.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.45.attention.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.45.attention.dense.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.45.attention.dense.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.45.attention.masked_bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.45.attention.query_key_value.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.45.attention.query_key_value.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.45.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.45.input_layernorm.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.45.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.45.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.45.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.45.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.45.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.45.post_attention_layernorm.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.45.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.46.attention.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.46.attention.dense.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.46.attention.dense.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.46.attention.masked_bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.46.attention.query_key_value.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.46.attention.query_key_value.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.46.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.46.input_layernorm.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.46.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.46.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.46.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.46.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.46.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.46.post_attention_layernorm.bias": "pytorch_model-00002-of-00002.bin",
-    "gpt_neox.layers.46.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
     "gpt_neox.layers.5.attention.bias": "pytorch_model-00001-of-00002.bin",
     "gpt_neox.layers.5.attention.dense.bias": "pytorch_model-00001-of-00002.bin",
     "gpt_neox.layers.5.attention.dense.weight": "pytorch_model-00001-of-00002.bin",
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
+{
+  "bos_token": "<|endoftext|>",
+  "eos_token": "<|endoftext|>",
+  "pad_token": "<|endoftext|>",
+  "unk_token": "<|endoftext|>"
+}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,9 @@
+{
+  "add_prefix_space": false,
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|endoftext|>",
+  "model_max_length": 2048,
+  "tokenizer_class": "GPTNeoXTokenizer",
+  "unk_token": "<|endoftext|>"
+}
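The two tokenizer configs above are consumed together by AutoTokenizer: special_tokens_map.json aliases bos/eos/pad/unk to the single <|endoftext|> token (the usual GPT-NeoX convention, so padding shares an id with end-of-sequence), and model_max_length matches the model's 2048-token context. A short check, under the same repo-id assumption as earlier:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("Dampish/StellarX-4B-V1")

# special_tokens_map.json: all four special tokens alias <|endoftext|>.
assert tok.pad_token == tok.eos_token == "<|endoftext|>"

# tokenizer_config.json: model_max_length caps encodes when truncation is on.
ids = tok("word " * 5000, truncation=True)["input_ids"]
assert len(ids) <= 2048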
training_args.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c7f5b60fb2e763a22cc0eb206f3b00f7f9f904fed78253adb0c5535ab00f74d3
+size 3899
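training_args.bin is stored through Git LFS, so the diff shows only the pointer above; the 3.9 kB payload is a pickled transformers TrainingArguments object. After fetching the real file it can be inspected as sketched below — unpickling executes code, so only do this for a repo you trust:

import torch

# The file is a pickled TrainingArguments, not a tensor archive, so on
# recent PyTorch versions weights_only must be set to False.
args = torch.load("training_args.bin", weights_only=False)
print(args)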