zhangzhao219 committed on
Commit 5ac2298
1 Parent(s): a542899

Delete pretrained

Files changed (31)
  1. pretrained/nomic-ai/nomic-embed-text-v1/.gitattributes +0 -35
  2. pretrained/nomic-ai/nomic-embed-text-v1/1_Pooling/config.json +0 -9
  3. pretrained/nomic-ai/nomic-embed-text-v1/README.md +0 -2736
  4. pretrained/nomic-ai/nomic-embed-text-v1/config.json +0 -56
  5. pretrained/nomic-ai/nomic-embed-text-v1/config_sentence_transformers.json +0 -7
  6. pretrained/nomic-ai/nomic-embed-text-v1/configuration_hf_nomic_bert.py +0 -53
  7. pretrained/nomic-ai/nomic-embed-text-v1/model.safetensors +0 -3
  8. pretrained/nomic-ai/nomic-embed-text-v1/modeling_hf_nomic_bert.py +0 -1238
  9. pretrained/nomic-ai/nomic-embed-text-v1/modules.json +0 -20
  10. pretrained/nomic-ai/nomic-embed-text-v1/onnx/model.onnx +0 -3
  11. pretrained/nomic-ai/nomic-embed-text-v1/onnx/model_quantized.onnx +0 -3
  12. pretrained/nomic-ai/nomic-embed-text-v1/pytorch_model.bin +0 -3
  13. pretrained/nomic-ai/nomic-embed-text-v1/sentence_bert_config.json +0 -4
  14. pretrained/nomic-ai/nomic-embed-text-v1/special_tokens_map.json +0 -7
  15. pretrained/nomic-ai/nomic-embed-text-v1/tokenizer.json +0 -0
  16. pretrained/nomic-ai/nomic-embed-text-v1/tokenizer_config.json +0 -55
  17. pretrained/nomic-ai/nomic-embed-text-v1/vocab.txt +0 -0
  18. pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/.gitattributes +0 -35
  19. pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/README.md +0 -181
  20. pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/config.json +0 -28
  21. pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/generation_config.json +0 -8
  22. pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/model-00001-of-00005.safetensors +0 -3
  23. pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/model-00002-of-00005.safetensors +0 -3
  24. pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/model-00003-of-00005.safetensors +0 -3
  25. pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/model-00004-of-00005.safetensors +0 -3
  26. pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/model-00005-of-00005.safetensors +0 -3
  27. pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/model.safetensors.index.json +0 -442
  28. pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/solar_logo.png +0 -0
  29. pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/tokenizer.json +0 -0
  30. pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/tokenizer.model +0 -3
  31. pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/tokenizer_config.json +0 -43
pretrained/nomic-ai/nomic-embed-text-v1/.gitattributes DELETED
@@ -1,35 +0,0 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
pretrained/nomic-ai/nomic-embed-text-v1/1_Pooling/config.json DELETED
@@ -1,9 +0,0 @@
- {
- "word_embedding_dimension": 768,
- "pooling_mode_cls_token": false,
- "pooling_mode_mean_tokens": true,
- "pooling_mode_max_tokens": false,
- "pooling_mode_mean_sqrt_len_tokens": false,
- "pooling_mode_weightedmean_tokens": false,
- "pooling_mode_lasttoken": false
- }
pretrained/nomic-ai/nomic-embed-text-v1/README.md DELETED
@@ -1,2736 +0,0 @@
1
- ---
2
- library_name: sentence-transformers
3
- pipeline_tag: sentence-similarity
4
- tags:
5
- - feature-extraction
6
- - sentence-similarity
7
- - mteb
8
- - transformers
9
- - transformers.js
10
- model-index:
11
- - name: epoch_0_model
12
- results:
13
- - task:
14
- type: Classification
15
- dataset:
16
- type: mteb/amazon_counterfactual
17
- name: MTEB AmazonCounterfactualClassification (en)
18
- config: en
19
- split: test
20
- revision: e8379541af4e31359cca9fbcf4b00f2671dba205
21
- metrics:
22
- - type: accuracy
23
- value: 76.8507462686567
24
- - type: ap
25
- value: 40.592189159090495
26
- - type: f1
27
- value: 71.01634655512476
28
- - task:
29
- type: Classification
30
- dataset:
31
- type: mteb/amazon_polarity
32
- name: MTEB AmazonPolarityClassification
33
- config: default
34
- split: test
35
- revision: e2d317d38cd51312af73b3d32a06d1a08b442046
36
- metrics:
37
- - type: accuracy
38
- value: 91.51892500000001
39
- - type: ap
40
- value: 88.50346762975335
41
- - type: f1
42
- value: 91.50342077459624
43
- - task:
44
- type: Classification
45
- dataset:
46
- type: mteb/amazon_reviews_multi
47
- name: MTEB AmazonReviewsClassification (en)
48
- config: en
49
- split: test
50
- revision: 1399c76144fd37290681b995c656ef9b2e06e26d
51
- metrics:
52
- - type: accuracy
53
- value: 47.364
54
- - type: f1
55
- value: 46.72708080922794
56
- - task:
57
- type: Retrieval
58
- dataset:
59
- type: arguana
60
- name: MTEB ArguAna
61
- config: default
62
- split: test
63
- revision: None
64
- metrics:
65
- - type: map_at_1
66
- value: 25.178
67
- - type: map_at_10
68
- value: 40.244
69
- - type: map_at_100
70
- value: 41.321999999999996
71
- - type: map_at_1000
72
- value: 41.331
73
- - type: map_at_3
74
- value: 35.016999999999996
75
- - type: map_at_5
76
- value: 37.99
77
- - type: mrr_at_1
78
- value: 25.605
79
- - type: mrr_at_10
80
- value: 40.422000000000004
81
- - type: mrr_at_100
82
- value: 41.507
83
- - type: mrr_at_1000
84
- value: 41.516
85
- - type: mrr_at_3
86
- value: 35.23
87
- - type: mrr_at_5
88
- value: 38.15
89
- - type: ndcg_at_1
90
- value: 25.178
91
- - type: ndcg_at_10
92
- value: 49.258
93
- - type: ndcg_at_100
94
- value: 53.776
95
- - type: ndcg_at_1000
96
- value: 53.995000000000005
97
- - type: ndcg_at_3
98
- value: 38.429
99
- - type: ndcg_at_5
100
- value: 43.803
101
- - type: precision_at_1
102
- value: 25.178
103
- - type: precision_at_10
104
- value: 7.831
105
- - type: precision_at_100
106
- value: 0.979
107
- - type: precision_at_1000
108
- value: 0.1
109
- - type: precision_at_3
110
- value: 16.121
111
- - type: precision_at_5
112
- value: 12.29
113
- - type: recall_at_1
114
- value: 25.178
115
- - type: recall_at_10
116
- value: 78.307
117
- - type: recall_at_100
118
- value: 97.866
119
- - type: recall_at_1000
120
- value: 99.57300000000001
121
- - type: recall_at_3
122
- value: 48.364000000000004
123
- - type: recall_at_5
124
- value: 61.451
125
- - task:
126
- type: Clustering
127
- dataset:
128
- type: mteb/arxiv-clustering-p2p
129
- name: MTEB ArxivClusteringP2P
130
- config: default
131
- split: test
132
- revision: a122ad7f3f0291bf49cc6f4d32aa80929df69d5d
133
- metrics:
134
- - type: v_measure
135
- value: 45.93034494751465
136
- - task:
137
- type: Clustering
138
- dataset:
139
- type: mteb/arxiv-clustering-s2s
140
- name: MTEB ArxivClusteringS2S
141
- config: default
142
- split: test
143
- revision: f910caf1a6075f7329cdf8c1a6135696f37dbd53
144
- metrics:
145
- - type: v_measure
146
- value: 36.64579480054327
147
- - task:
148
- type: Reranking
149
- dataset:
150
- type: mteb/askubuntudupquestions-reranking
151
- name: MTEB AskUbuntuDupQuestions
152
- config: default
153
- split: test
154
- revision: 2000358ca161889fa9c082cb41daa8dcfb161a54
155
- metrics:
156
- - type: map
157
- value: 60.601310529222054
158
- - type: mrr
159
- value: 75.04484896451656
160
- - task:
161
- type: STS
162
- dataset:
163
- type: mteb/biosses-sts
164
- name: MTEB BIOSSES
165
- config: default
166
- split: test
167
- revision: d3fb88f8f02e40887cd149695127462bbcf29b4a
168
- metrics:
169
- - type: cos_sim_pearson
170
- value: 88.57797718095814
171
- - type: cos_sim_spearman
172
- value: 86.47064499110101
173
- - type: euclidean_pearson
174
- value: 87.4559602783142
175
- - type: euclidean_spearman
176
- value: 86.47064499110101
177
- - type: manhattan_pearson
178
- value: 87.7232764230245
179
- - type: manhattan_spearman
180
- value: 86.91222131777742
181
- - task:
182
- type: Classification
183
- dataset:
184
- type: mteb/banking77
185
- name: MTEB Banking77Classification
186
- config: default
187
- split: test
188
- revision: 0fd18e25b25c072e09e0d92ab615fda904d66300
189
- metrics:
190
- - type: accuracy
191
- value: 84.5422077922078
192
- - type: f1
193
- value: 84.47657456950589
194
- - task:
195
- type: Clustering
196
- dataset:
197
- type: mteb/biorxiv-clustering-p2p
198
- name: MTEB BiorxivClusteringP2P
199
- config: default
200
- split: test
201
- revision: 65b79d1d13f80053f67aca9498d9402c2d9f1f40
202
- metrics:
203
- - type: v_measure
204
- value: 38.48953561974464
205
- - task:
206
- type: Clustering
207
- dataset:
208
- type: mteb/biorxiv-clustering-s2s
209
- name: MTEB BiorxivClusteringS2S
210
- config: default
211
- split: test
212
- revision: 258694dd0231531bc1fd9de6ceb52a0853c6d908
213
- metrics:
214
- - type: v_measure
215
- value: 32.75995857510105
216
- - task:
217
- type: Retrieval
218
- dataset:
219
- type: BeIR/cqadupstack
220
- name: MTEB CQADupstackAndroidRetrieval
221
- config: default
222
- split: test
223
- revision: None
224
- metrics:
225
- - type: map_at_1
226
- value: 30.008000000000003
227
- - type: map_at_10
228
- value: 39.51
229
- - type: map_at_100
230
- value: 40.841
231
- - type: map_at_1000
232
- value: 40.973
233
- - type: map_at_3
234
- value: 36.248999999999995
235
- - type: map_at_5
236
- value: 38.096999999999994
237
- - type: mrr_at_1
238
- value: 36.481
239
- - type: mrr_at_10
240
- value: 44.818000000000005
241
- - type: mrr_at_100
242
- value: 45.64
243
- - type: mrr_at_1000
244
- value: 45.687
245
- - type: mrr_at_3
246
- value: 42.036
247
- - type: mrr_at_5
248
- value: 43.782
249
- - type: ndcg_at_1
250
- value: 36.481
251
- - type: ndcg_at_10
252
- value: 45.152
253
- - type: ndcg_at_100
254
- value: 50.449
255
- - type: ndcg_at_1000
256
- value: 52.76499999999999
257
- - type: ndcg_at_3
258
- value: 40.161
259
- - type: ndcg_at_5
260
- value: 42.577999999999996
261
- - type: precision_at_1
262
- value: 36.481
263
- - type: precision_at_10
264
- value: 8.369
265
- - type: precision_at_100
266
- value: 1.373
267
- - type: precision_at_1000
268
- value: 0.186
269
- - type: precision_at_3
270
- value: 18.693
271
- - type: precision_at_5
272
- value: 13.533999999999999
273
- - type: recall_at_1
274
- value: 30.008000000000003
275
- - type: recall_at_10
276
- value: 56.108999999999995
277
- - type: recall_at_100
278
- value: 78.55499999999999
279
- - type: recall_at_1000
280
- value: 93.659
281
- - type: recall_at_3
282
- value: 41.754999999999995
283
- - type: recall_at_5
284
- value: 48.296
285
- - task:
286
- type: Retrieval
287
- dataset:
288
- type: BeIR/cqadupstack
289
- name: MTEB CQADupstackEnglishRetrieval
290
- config: default
291
- split: test
292
- revision: None
293
- metrics:
294
- - type: map_at_1
295
- value: 30.262
296
- - type: map_at_10
297
- value: 40.139
298
- - type: map_at_100
299
- value: 41.394
300
- - type: map_at_1000
301
- value: 41.526
302
- - type: map_at_3
303
- value: 37.155
304
- - type: map_at_5
305
- value: 38.785
306
- - type: mrr_at_1
307
- value: 38.153
308
- - type: mrr_at_10
309
- value: 46.369
310
- - type: mrr_at_100
311
- value: 47.072
312
- - type: mrr_at_1000
313
- value: 47.111999999999995
314
- - type: mrr_at_3
315
- value: 44.268
316
- - type: mrr_at_5
317
- value: 45.389
318
- - type: ndcg_at_1
319
- value: 38.153
320
- - type: ndcg_at_10
321
- value: 45.925
322
- - type: ndcg_at_100
323
- value: 50.394000000000005
324
- - type: ndcg_at_1000
325
- value: 52.37500000000001
326
- - type: ndcg_at_3
327
- value: 41.754000000000005
328
- - type: ndcg_at_5
329
- value: 43.574
330
- - type: precision_at_1
331
- value: 38.153
332
- - type: precision_at_10
333
- value: 8.796
334
- - type: precision_at_100
335
- value: 1.432
336
- - type: precision_at_1000
337
- value: 0.189
338
- - type: precision_at_3
339
- value: 20.318
340
- - type: precision_at_5
341
- value: 14.395
342
- - type: recall_at_1
343
- value: 30.262
344
- - type: recall_at_10
345
- value: 55.72200000000001
346
- - type: recall_at_100
347
- value: 74.97500000000001
348
- - type: recall_at_1000
349
- value: 87.342
350
- - type: recall_at_3
351
- value: 43.129
352
- - type: recall_at_5
353
- value: 48.336
354
- - task:
355
- type: Retrieval
356
- dataset:
357
- type: BeIR/cqadupstack
358
- name: MTEB CQADupstackGamingRetrieval
359
- config: default
360
- split: test
361
- revision: None
362
- metrics:
363
- - type: map_at_1
364
- value: 39.951
365
- - type: map_at_10
366
- value: 51.248000000000005
367
- - type: map_at_100
368
- value: 52.188
369
- - type: map_at_1000
370
- value: 52.247
371
- - type: map_at_3
372
- value: 48.211
373
- - type: map_at_5
374
- value: 49.797000000000004
375
- - type: mrr_at_1
376
- value: 45.329
377
- - type: mrr_at_10
378
- value: 54.749
379
- - type: mrr_at_100
380
- value: 55.367999999999995
381
- - type: mrr_at_1000
382
- value: 55.400000000000006
383
- - type: mrr_at_3
384
- value: 52.382
385
- - type: mrr_at_5
386
- value: 53.649
387
- - type: ndcg_at_1
388
- value: 45.329
389
- - type: ndcg_at_10
390
- value: 56.847
391
- - type: ndcg_at_100
392
- value: 60.738
393
- - type: ndcg_at_1000
394
- value: 61.976
395
- - type: ndcg_at_3
396
- value: 51.59
397
- - type: ndcg_at_5
398
- value: 53.915
399
- - type: precision_at_1
400
- value: 45.329
401
- - type: precision_at_10
402
- value: 8.959
403
- - type: precision_at_100
404
- value: 1.187
405
- - type: precision_at_1000
406
- value: 0.134
407
- - type: precision_at_3
408
- value: 22.612
409
- - type: precision_at_5
410
- value: 15.273
411
- - type: recall_at_1
412
- value: 39.951
413
- - type: recall_at_10
414
- value: 70.053
415
- - type: recall_at_100
416
- value: 86.996
417
- - type: recall_at_1000
418
- value: 95.707
419
- - type: recall_at_3
420
- value: 56.032000000000004
421
- - type: recall_at_5
422
- value: 61.629999999999995
423
- - task:
424
- type: Retrieval
425
- dataset:
426
- type: BeIR/cqadupstack
427
- name: MTEB CQADupstackGisRetrieval
428
- config: default
429
- split: test
430
- revision: None
431
- metrics:
432
- - type: map_at_1
433
- value: 25.566
434
- - type: map_at_10
435
- value: 33.207
436
- - type: map_at_100
437
- value: 34.166000000000004
438
- - type: map_at_1000
439
- value: 34.245
440
- - type: map_at_3
441
- value: 30.94
442
- - type: map_at_5
443
- value: 32.01
444
- - type: mrr_at_1
445
- value: 27.345000000000002
446
- - type: mrr_at_10
447
- value: 35.193000000000005
448
- - type: mrr_at_100
449
- value: 35.965
450
- - type: mrr_at_1000
451
- value: 36.028999999999996
452
- - type: mrr_at_3
453
- value: 32.806000000000004
454
- - type: mrr_at_5
455
- value: 34.021
456
- - type: ndcg_at_1
457
- value: 27.345000000000002
458
- - type: ndcg_at_10
459
- value: 37.891999999999996
460
- - type: ndcg_at_100
461
- value: 42.664
462
- - type: ndcg_at_1000
463
- value: 44.757000000000005
464
- - type: ndcg_at_3
465
- value: 33.123000000000005
466
- - type: ndcg_at_5
467
- value: 35.035
468
- - type: precision_at_1
469
- value: 27.345000000000002
470
- - type: precision_at_10
471
- value: 5.763
472
- - type: precision_at_100
473
- value: 0.859
474
- - type: precision_at_1000
475
- value: 0.108
476
- - type: precision_at_3
477
- value: 13.71
478
- - type: precision_at_5
479
- value: 9.401
480
- - type: recall_at_1
481
- value: 25.566
482
- - type: recall_at_10
483
- value: 50.563
484
- - type: recall_at_100
485
- value: 72.86399999999999
486
- - type: recall_at_1000
487
- value: 88.68599999999999
488
- - type: recall_at_3
489
- value: 37.43
490
- - type: recall_at_5
491
- value: 41.894999999999996
492
- - task:
493
- type: Retrieval
494
- dataset:
495
- type: BeIR/cqadupstack
496
- name: MTEB CQADupstackMathematicaRetrieval
497
- config: default
498
- split: test
499
- revision: None
500
- metrics:
501
- - type: map_at_1
502
- value: 16.663
503
- - type: map_at_10
504
- value: 23.552
505
- - type: map_at_100
506
- value: 24.538
507
- - type: map_at_1000
508
- value: 24.661
509
- - type: map_at_3
510
- value: 21.085
511
- - type: map_at_5
512
- value: 22.391
513
- - type: mrr_at_1
514
- value: 20.025000000000002
515
- - type: mrr_at_10
516
- value: 27.643
517
- - type: mrr_at_100
518
- value: 28.499999999999996
519
- - type: mrr_at_1000
520
- value: 28.582
521
- - type: mrr_at_3
522
- value: 25.083
523
- - type: mrr_at_5
524
- value: 26.544
525
- - type: ndcg_at_1
526
- value: 20.025000000000002
527
- - type: ndcg_at_10
528
- value: 28.272000000000002
529
- - type: ndcg_at_100
530
- value: 33.353
531
- - type: ndcg_at_1000
532
- value: 36.454
533
- - type: ndcg_at_3
534
- value: 23.579
535
- - type: ndcg_at_5
536
- value: 25.685000000000002
537
- - type: precision_at_1
538
- value: 20.025000000000002
539
- - type: precision_at_10
540
- value: 5.187
541
- - type: precision_at_100
542
- value: 0.897
543
- - type: precision_at_1000
544
- value: 0.13
545
- - type: precision_at_3
546
- value: 10.987
547
- - type: precision_at_5
548
- value: 8.06
549
- - type: recall_at_1
550
- value: 16.663
551
- - type: recall_at_10
552
- value: 38.808
553
- - type: recall_at_100
554
- value: 61.305
555
- - type: recall_at_1000
556
- value: 83.571
557
- - type: recall_at_3
558
- value: 25.907999999999998
559
- - type: recall_at_5
560
- value: 31.214
561
- - task:
562
- type: Retrieval
563
- dataset:
564
- type: BeIR/cqadupstack
565
- name: MTEB CQADupstackPhysicsRetrieval
566
- config: default
567
- split: test
568
- revision: None
569
- metrics:
570
- - type: map_at_1
571
- value: 27.695999999999998
572
- - type: map_at_10
573
- value: 37.018
574
- - type: map_at_100
575
- value: 38.263000000000005
576
- - type: map_at_1000
577
- value: 38.371
578
- - type: map_at_3
579
- value: 34.226
580
- - type: map_at_5
581
- value: 35.809999999999995
582
- - type: mrr_at_1
583
- value: 32.916000000000004
584
- - type: mrr_at_10
585
- value: 42.067
586
- - type: mrr_at_100
587
- value: 42.925000000000004
588
- - type: mrr_at_1000
589
- value: 42.978
590
- - type: mrr_at_3
591
- value: 39.637
592
- - type: mrr_at_5
593
- value: 41.134
594
- - type: ndcg_at_1
595
- value: 32.916000000000004
596
- - type: ndcg_at_10
597
- value: 42.539
598
- - type: ndcg_at_100
599
- value: 47.873
600
- - type: ndcg_at_1000
601
- value: 50.08200000000001
602
- - type: ndcg_at_3
603
- value: 37.852999999999994
604
- - type: ndcg_at_5
605
- value: 40.201
606
- - type: precision_at_1
607
- value: 32.916000000000004
608
- - type: precision_at_10
609
- value: 7.5840000000000005
610
- - type: precision_at_100
611
- value: 1.199
612
- - type: precision_at_1000
613
- value: 0.155
614
- - type: precision_at_3
615
- value: 17.485
616
- - type: precision_at_5
617
- value: 12.512
618
- - type: recall_at_1
619
- value: 27.695999999999998
620
- - type: recall_at_10
621
- value: 53.638
622
- - type: recall_at_100
623
- value: 76.116
624
- - type: recall_at_1000
625
- value: 91.069
626
- - type: recall_at_3
627
- value: 41.13
628
- - type: recall_at_5
629
- value: 46.872
630
- - task:
631
- type: Retrieval
632
- dataset:
633
- type: BeIR/cqadupstack
634
- name: MTEB CQADupstackProgrammersRetrieval
635
- config: default
636
- split: test
637
- revision: None
638
- metrics:
639
- - type: map_at_1
640
- value: 24.108
641
- - type: map_at_10
642
- value: 33.372
643
- - type: map_at_100
644
- value: 34.656
645
- - type: map_at_1000
646
- value: 34.768
647
- - type: map_at_3
648
- value: 30.830999999999996
649
- - type: map_at_5
650
- value: 32.204
651
- - type: mrr_at_1
652
- value: 29.110000000000003
653
- - type: mrr_at_10
654
- value: 37.979
655
- - type: mrr_at_100
656
- value: 38.933
657
- - type: mrr_at_1000
658
- value: 38.988
659
- - type: mrr_at_3
660
- value: 35.731
661
- - type: mrr_at_5
662
- value: 36.963
663
- - type: ndcg_at_1
664
- value: 29.110000000000003
665
- - type: ndcg_at_10
666
- value: 38.635000000000005
667
- - type: ndcg_at_100
668
- value: 44.324999999999996
669
- - type: ndcg_at_1000
670
- value: 46.747
671
- - type: ndcg_at_3
672
- value: 34.37
673
- - type: ndcg_at_5
674
- value: 36.228
675
- - type: precision_at_1
676
- value: 29.110000000000003
677
- - type: precision_at_10
678
- value: 6.963
679
- - type: precision_at_100
680
- value: 1.146
681
- - type: precision_at_1000
682
- value: 0.152
683
- - type: precision_at_3
684
- value: 16.400000000000002
685
- - type: precision_at_5
686
- value: 11.552999999999999
687
- - type: recall_at_1
688
- value: 24.108
689
- - type: recall_at_10
690
- value: 49.597
691
- - type: recall_at_100
692
- value: 73.88900000000001
693
- - type: recall_at_1000
694
- value: 90.62400000000001
695
- - type: recall_at_3
696
- value: 37.662
697
- - type: recall_at_5
698
- value: 42.565
699
- - task:
700
- type: Retrieval
701
- dataset:
702
- type: BeIR/cqadupstack
703
- name: MTEB CQADupstackRetrieval
704
- config: default
705
- split: test
706
- revision: None
707
- metrics:
708
- - type: map_at_1
709
- value: 25.00791666666667
710
- - type: map_at_10
711
- value: 33.287749999999996
712
- - type: map_at_100
713
- value: 34.41141666666667
714
- - type: map_at_1000
715
- value: 34.52583333333333
716
- - type: map_at_3
717
- value: 30.734416666666668
718
- - type: map_at_5
719
- value: 32.137166666666666
720
- - type: mrr_at_1
721
- value: 29.305666666666664
722
- - type: mrr_at_10
723
- value: 37.22966666666666
724
- - type: mrr_at_100
725
- value: 38.066583333333334
726
- - type: mrr_at_1000
727
- value: 38.12616666666667
728
- - type: mrr_at_3
729
- value: 34.92275
730
- - type: mrr_at_5
731
- value: 36.23333333333334
732
- - type: ndcg_at_1
733
- value: 29.305666666666664
734
- - type: ndcg_at_10
735
- value: 38.25533333333333
736
- - type: ndcg_at_100
737
- value: 43.25266666666666
738
- - type: ndcg_at_1000
739
- value: 45.63583333333334
740
- - type: ndcg_at_3
741
- value: 33.777166666666666
742
- - type: ndcg_at_5
743
- value: 35.85
744
- - type: precision_at_1
745
- value: 29.305666666666664
746
- - type: precision_at_10
747
- value: 6.596416666666667
748
- - type: precision_at_100
749
- value: 1.0784166666666668
750
- - type: precision_at_1000
751
- value: 0.14666666666666664
752
- - type: precision_at_3
753
- value: 15.31075
754
- - type: precision_at_5
755
- value: 10.830916666666667
756
- - type: recall_at_1
757
- value: 25.00791666666667
758
- - type: recall_at_10
759
- value: 49.10933333333333
760
- - type: recall_at_100
761
- value: 71.09216666666667
762
- - type: recall_at_1000
763
- value: 87.77725000000001
764
- - type: recall_at_3
765
- value: 36.660916666666665
766
- - type: recall_at_5
767
- value: 41.94149999999999
768
- - task:
769
- type: Retrieval
770
- dataset:
771
- type: BeIR/cqadupstack
772
- name: MTEB CQADupstackStatsRetrieval
773
- config: default
774
- split: test
775
- revision: None
776
- metrics:
777
- - type: map_at_1
778
- value: 23.521
779
- - type: map_at_10
780
- value: 30.043
781
- - type: map_at_100
782
- value: 30.936000000000003
783
- - type: map_at_1000
784
- value: 31.022
785
- - type: map_at_3
786
- value: 27.926000000000002
787
- - type: map_at_5
788
- value: 29.076999999999998
789
- - type: mrr_at_1
790
- value: 26.227
791
- - type: mrr_at_10
792
- value: 32.822
793
- - type: mrr_at_100
794
- value: 33.61
795
- - type: mrr_at_1000
796
- value: 33.672000000000004
797
- - type: mrr_at_3
798
- value: 30.776999999999997
799
- - type: mrr_at_5
800
- value: 31.866
801
- - type: ndcg_at_1
802
- value: 26.227
803
- - type: ndcg_at_10
804
- value: 34.041
805
- - type: ndcg_at_100
806
- value: 38.394
807
- - type: ndcg_at_1000
808
- value: 40.732
809
- - type: ndcg_at_3
810
- value: 30.037999999999997
811
- - type: ndcg_at_5
812
- value: 31.845000000000002
813
- - type: precision_at_1
814
- value: 26.227
815
- - type: precision_at_10
816
- value: 5.244999999999999
817
- - type: precision_at_100
818
- value: 0.808
819
- - type: precision_at_1000
820
- value: 0.107
821
- - type: precision_at_3
822
- value: 12.679000000000002
823
- - type: precision_at_5
824
- value: 8.773
825
- - type: recall_at_1
826
- value: 23.521
827
- - type: recall_at_10
828
- value: 43.633
829
- - type: recall_at_100
830
- value: 63.126000000000005
831
- - type: recall_at_1000
832
- value: 80.765
833
- - type: recall_at_3
834
- value: 32.614
835
- - type: recall_at_5
836
- value: 37.15
837
- - task:
838
- type: Retrieval
839
- dataset:
840
- type: BeIR/cqadupstack
841
- name: MTEB CQADupstackTexRetrieval
842
- config: default
843
- split: test
844
- revision: None
845
- metrics:
846
- - type: map_at_1
847
- value: 16.236
848
- - type: map_at_10
849
- value: 22.898
850
- - type: map_at_100
851
- value: 23.878
852
- - type: map_at_1000
853
- value: 24.009
854
- - type: map_at_3
855
- value: 20.87
856
- - type: map_at_5
857
- value: 22.025
858
- - type: mrr_at_1
859
- value: 19.339000000000002
860
- - type: mrr_at_10
861
- value: 26.382
862
- - type: mrr_at_100
863
- value: 27.245
864
- - type: mrr_at_1000
865
- value: 27.33
866
- - type: mrr_at_3
867
- value: 24.386
868
- - type: mrr_at_5
869
- value: 25.496000000000002
870
- - type: ndcg_at_1
871
- value: 19.339000000000002
872
- - type: ndcg_at_10
873
- value: 27.139999999999997
874
- - type: ndcg_at_100
875
- value: 31.944
876
- - type: ndcg_at_1000
877
- value: 35.077999999999996
878
- - type: ndcg_at_3
879
- value: 23.424
880
- - type: ndcg_at_5
881
- value: 25.188
882
- - type: precision_at_1
883
- value: 19.339000000000002
884
- - type: precision_at_10
885
- value: 4.8309999999999995
886
- - type: precision_at_100
887
- value: 0.845
888
- - type: precision_at_1000
889
- value: 0.128
890
- - type: precision_at_3
891
- value: 10.874
892
- - type: precision_at_5
893
- value: 7.825
894
- - type: recall_at_1
895
- value: 16.236
896
- - type: recall_at_10
897
- value: 36.513
898
- - type: recall_at_100
899
- value: 57.999
900
- - type: recall_at_1000
901
- value: 80.512
902
- - type: recall_at_3
903
- value: 26.179999999999996
904
- - type: recall_at_5
905
- value: 30.712
906
- - task:
907
- type: Retrieval
908
- dataset:
909
- type: BeIR/cqadupstack
910
- name: MTEB CQADupstackUnixRetrieval
911
- config: default
912
- split: test
913
- revision: None
914
- metrics:
915
- - type: map_at_1
916
- value: 24.11
917
- - type: map_at_10
918
- value: 31.566
919
- - type: map_at_100
920
- value: 32.647
921
- - type: map_at_1000
922
- value: 32.753
923
- - type: map_at_3
924
- value: 29.24
925
- - type: map_at_5
926
- value: 30.564999999999998
927
- - type: mrr_at_1
928
- value: 28.265
929
- - type: mrr_at_10
930
- value: 35.504000000000005
931
- - type: mrr_at_100
932
- value: 36.436
933
- - type: mrr_at_1000
934
- value: 36.503
935
- - type: mrr_at_3
936
- value: 33.349000000000004
937
- - type: mrr_at_5
938
- value: 34.622
939
- - type: ndcg_at_1
940
- value: 28.265
941
- - type: ndcg_at_10
942
- value: 36.192
943
- - type: ndcg_at_100
944
- value: 41.388000000000005
945
- - type: ndcg_at_1000
946
- value: 43.948
947
- - type: ndcg_at_3
948
- value: 31.959
949
- - type: ndcg_at_5
950
- value: 33.998
951
- - type: precision_at_1
952
- value: 28.265
953
- - type: precision_at_10
954
- value: 5.989
955
- - type: precision_at_100
956
- value: 0.9650000000000001
957
- - type: precision_at_1000
958
- value: 0.13
959
- - type: precision_at_3
960
- value: 14.335
961
- - type: precision_at_5
962
- value: 10.112
963
- - type: recall_at_1
964
- value: 24.11
965
- - type: recall_at_10
966
- value: 46.418
967
- - type: recall_at_100
968
- value: 69.314
969
- - type: recall_at_1000
970
- value: 87.397
971
- - type: recall_at_3
972
- value: 34.724
973
- - type: recall_at_5
974
- value: 39.925
975
- - task:
976
- type: Retrieval
977
- dataset:
978
- type: BeIR/cqadupstack
979
- name: MTEB CQADupstackWebmastersRetrieval
980
- config: default
981
- split: test
982
- revision: None
983
- metrics:
984
- - type: map_at_1
985
- value: 22.091
986
- - type: map_at_10
987
- value: 29.948999999999998
988
- - type: map_at_100
989
- value: 31.502000000000002
990
- - type: map_at_1000
991
- value: 31.713
992
- - type: map_at_3
993
- value: 27.464
994
- - type: map_at_5
995
- value: 28.968
996
- - type: mrr_at_1
997
- value: 26.482
998
- - type: mrr_at_10
999
- value: 34.009
1000
- - type: mrr_at_100
1001
- value: 35.081
1002
- - type: mrr_at_1000
1003
- value: 35.138000000000005
1004
- - type: mrr_at_3
1005
- value: 31.785000000000004
1006
- - type: mrr_at_5
1007
- value: 33.178999999999995
1008
- - type: ndcg_at_1
1009
- value: 26.482
1010
- - type: ndcg_at_10
1011
- value: 35.008
1012
- - type: ndcg_at_100
1013
- value: 41.272999999999996
1014
- - type: ndcg_at_1000
1015
- value: 43.972
1016
- - type: ndcg_at_3
1017
- value: 30.804
1018
- - type: ndcg_at_5
1019
- value: 33.046
1020
- - type: precision_at_1
1021
- value: 26.482
1022
- - type: precision_at_10
1023
- value: 6.462
1024
- - type: precision_at_100
1025
- value: 1.431
1026
- - type: precision_at_1000
1027
- value: 0.22899999999999998
1028
- - type: precision_at_3
1029
- value: 14.360999999999999
1030
- - type: precision_at_5
1031
- value: 10.474
1032
- - type: recall_at_1
1033
- value: 22.091
1034
- - type: recall_at_10
1035
- value: 45.125
1036
- - type: recall_at_100
1037
- value: 72.313
1038
- - type: recall_at_1000
1039
- value: 89.503
1040
- - type: recall_at_3
1041
- value: 33.158
1042
- - type: recall_at_5
1043
- value: 39.086999999999996
1044
- - task:
1045
- type: Retrieval
1046
- dataset:
1047
- type: BeIR/cqadupstack
1048
- name: MTEB CQADupstackWordpressRetrieval
1049
- config: default
1050
- split: test
1051
- revision: None
1052
- metrics:
1053
- - type: map_at_1
1054
- value: 19.883
1055
- - type: map_at_10
1056
- value: 26.951000000000004
1057
- - type: map_at_100
1058
- value: 27.927999999999997
1059
- - type: map_at_1000
1060
- value: 28.022000000000002
1061
- - type: map_at_3
1062
- value: 24.616
1063
- - type: map_at_5
1064
- value: 25.917
1065
- - type: mrr_at_1
1066
- value: 21.996
1067
- - type: mrr_at_10
1068
- value: 29.221000000000004
1069
- - type: mrr_at_100
1070
- value: 30.024
1071
- - type: mrr_at_1000
1072
- value: 30.095
1073
- - type: mrr_at_3
1074
- value: 26.833000000000002
1075
- - type: mrr_at_5
1076
- value: 28.155
1077
- - type: ndcg_at_1
1078
- value: 21.996
1079
- - type: ndcg_at_10
1080
- value: 31.421
1081
- - type: ndcg_at_100
1082
- value: 36.237
1083
- - type: ndcg_at_1000
1084
- value: 38.744
1085
- - type: ndcg_at_3
1086
- value: 26.671
1087
- - type: ndcg_at_5
1088
- value: 28.907
1089
- - type: precision_at_1
1090
- value: 21.996
1091
- - type: precision_at_10
1092
- value: 5.009
1093
- - type: precision_at_100
1094
- value: 0.799
1095
- - type: precision_at_1000
1096
- value: 0.11199999999999999
1097
- - type: precision_at_3
1098
- value: 11.275
1099
- - type: precision_at_5
1100
- value: 8.059
1101
- - type: recall_at_1
1102
- value: 19.883
1103
- - type: recall_at_10
1104
- value: 43.132999999999996
1105
- - type: recall_at_100
1106
- value: 65.654
1107
- - type: recall_at_1000
1108
- value: 84.492
1109
- - type: recall_at_3
1110
- value: 30.209000000000003
1111
- - type: recall_at_5
1112
- value: 35.616
1113
- - task:
1114
- type: Retrieval
1115
- dataset:
1116
- type: climate-fever
1117
- name: MTEB ClimateFEVER
1118
- config: default
1119
- split: test
1120
- revision: None
1121
- metrics:
1122
- - type: map_at_1
1123
- value: 17.756
1124
- - type: map_at_10
1125
- value: 30.378
1126
- - type: map_at_100
1127
- value: 32.537
1128
- - type: map_at_1000
1129
- value: 32.717
1130
- - type: map_at_3
1131
- value: 25.599
1132
- - type: map_at_5
1133
- value: 28.372999999999998
1134
- - type: mrr_at_1
1135
- value: 41.303
1136
- - type: mrr_at_10
1137
- value: 53.483999999999995
1138
- - type: mrr_at_100
1139
- value: 54.106
1140
- - type: mrr_at_1000
1141
- value: 54.127
1142
- - type: mrr_at_3
1143
- value: 50.315
1144
- - type: mrr_at_5
1145
- value: 52.396
1146
- - type: ndcg_at_1
1147
- value: 41.303
1148
- - type: ndcg_at_10
1149
- value: 40.503
1150
- - type: ndcg_at_100
1151
- value: 47.821000000000005
1152
- - type: ndcg_at_1000
1153
- value: 50.788
1154
- - type: ndcg_at_3
1155
- value: 34.364
1156
- - type: ndcg_at_5
1157
- value: 36.818
1158
- - type: precision_at_1
1159
- value: 41.303
1160
- - type: precision_at_10
1161
- value: 12.463000000000001
1162
- - type: precision_at_100
1163
- value: 2.037
1164
- - type: precision_at_1000
1165
- value: 0.26
1166
- - type: precision_at_3
1167
- value: 25.798
1168
- - type: precision_at_5
1169
- value: 19.896
1170
- - type: recall_at_1
1171
- value: 17.756
1172
- - type: recall_at_10
1173
- value: 46.102
1174
- - type: recall_at_100
1175
- value: 70.819
1176
- - type: recall_at_1000
1177
- value: 87.21799999999999
1178
- - type: recall_at_3
1179
- value: 30.646
1180
- - type: recall_at_5
1181
- value: 38.022
1182
- - task:
1183
- type: Retrieval
1184
- dataset:
1185
- type: dbpedia-entity
1186
- name: MTEB DBPedia
1187
- config: default
1188
- split: test
1189
- revision: None
1190
- metrics:
1191
- - type: map_at_1
1192
- value: 9.033
1193
- - type: map_at_10
1194
- value: 20.584
1195
- - type: map_at_100
1196
- value: 29.518
1197
- - type: map_at_1000
1198
- value: 31.186000000000003
1199
- - type: map_at_3
1200
- value: 14.468
1201
- - type: map_at_5
1202
- value: 17.177
1203
- - type: mrr_at_1
1204
- value: 69.75
1205
- - type: mrr_at_10
1206
- value: 77.025
1207
- - type: mrr_at_100
1208
- value: 77.36699999999999
1209
- - type: mrr_at_1000
1210
- value: 77.373
1211
- - type: mrr_at_3
1212
- value: 75.583
1213
- - type: mrr_at_5
1214
- value: 76.396
1215
- - type: ndcg_at_1
1216
- value: 58.5
1217
- - type: ndcg_at_10
1218
- value: 45.033
1219
- - type: ndcg_at_100
1220
- value: 49.071
1221
- - type: ndcg_at_1000
1222
- value: 56.056
1223
- - type: ndcg_at_3
1224
- value: 49.936
1225
- - type: ndcg_at_5
1226
- value: 47.471999999999994
1227
- - type: precision_at_1
1228
- value: 69.75
1229
- - type: precision_at_10
1230
- value: 35.775
1231
- - type: precision_at_100
1232
- value: 11.594999999999999
1233
- - type: precision_at_1000
1234
- value: 2.062
1235
- - type: precision_at_3
1236
- value: 52.5
1237
- - type: precision_at_5
1238
- value: 45.300000000000004
1239
- - type: recall_at_1
1240
- value: 9.033
1241
- - type: recall_at_10
1242
- value: 26.596999999999998
1243
- - type: recall_at_100
1244
- value: 54.607000000000006
1245
- - type: recall_at_1000
1246
- value: 76.961
1247
- - type: recall_at_3
1248
- value: 15.754999999999999
1249
- - type: recall_at_5
1250
- value: 20.033
1251
- - task:
1252
- type: Classification
1253
- dataset:
1254
- type: mteb/emotion
1255
- name: MTEB EmotionClassification
1256
- config: default
1257
- split: test
1258
- revision: 4f58c6b202a23cf9a4da393831edf4f9183cad37
1259
- metrics:
1260
- - type: accuracy
1261
- value: 48.345000000000006
1262
- - type: f1
1263
- value: 43.4514918068706
1264
- - task:
1265
- type: Retrieval
1266
- dataset:
1267
- type: fever
1268
- name: MTEB FEVER
1269
- config: default
1270
- split: test
1271
- revision: None
1272
- metrics:
1273
- - type: map_at_1
1274
- value: 71.29100000000001
1275
- - type: map_at_10
1276
- value: 81.059
1277
- - type: map_at_100
1278
- value: 81.341
1279
- - type: map_at_1000
1280
- value: 81.355
1281
- - type: map_at_3
1282
- value: 79.74799999999999
1283
- - type: map_at_5
1284
- value: 80.612
1285
- - type: mrr_at_1
1286
- value: 76.40299999999999
1287
- - type: mrr_at_10
1288
- value: 84.615
1289
- - type: mrr_at_100
1290
- value: 84.745
1291
- - type: mrr_at_1000
1292
- value: 84.748
1293
- - type: mrr_at_3
1294
- value: 83.776
1295
- - type: mrr_at_5
1296
- value: 84.343
1297
- - type: ndcg_at_1
1298
- value: 76.40299999999999
1299
- - type: ndcg_at_10
1300
- value: 84.981
1301
- - type: ndcg_at_100
1302
- value: 86.00999999999999
1303
- - type: ndcg_at_1000
1304
- value: 86.252
1305
- - type: ndcg_at_3
1306
- value: 82.97
1307
- - type: ndcg_at_5
1308
- value: 84.152
1309
- - type: precision_at_1
1310
- value: 76.40299999999999
1311
- - type: precision_at_10
1312
- value: 10.446
1313
- - type: precision_at_100
1314
- value: 1.1199999999999999
1315
- - type: precision_at_1000
1316
- value: 0.116
1317
- - type: precision_at_3
1318
- value: 32.147999999999996
1319
- - type: precision_at_5
1320
- value: 20.135
1321
- - type: recall_at_1
1322
- value: 71.29100000000001
1323
- - type: recall_at_10
1324
- value: 93.232
1325
- - type: recall_at_100
1326
- value: 97.363
1327
- - type: recall_at_1000
1328
- value: 98.905
1329
- - type: recall_at_3
1330
- value: 87.893
1331
- - type: recall_at_5
1332
- value: 90.804
1333
- - task:
1334
- type: Retrieval
1335
- dataset:
1336
- type: fiqa
1337
- name: MTEB FiQA2018
1338
- config: default
1339
- split: test
1340
- revision: None
1341
- metrics:
1342
- - type: map_at_1
1343
- value: 18.667
1344
- - type: map_at_10
1345
- value: 30.853
1346
- - type: map_at_100
1347
- value: 32.494
1348
- - type: map_at_1000
1349
- value: 32.677
1350
- - type: map_at_3
1351
- value: 26.91
1352
- - type: map_at_5
1353
- value: 29.099000000000004
1354
- - type: mrr_at_1
1355
- value: 37.191
1356
- - type: mrr_at_10
1357
- value: 46.171
1358
- - type: mrr_at_100
1359
- value: 47.056
1360
- - type: mrr_at_1000
1361
- value: 47.099000000000004
1362
- - type: mrr_at_3
1363
- value: 44.059
1364
- - type: mrr_at_5
1365
- value: 45.147
1366
- - type: ndcg_at_1
1367
- value: 37.191
1368
- - type: ndcg_at_10
1369
- value: 38.437
1370
- - type: ndcg_at_100
1371
- value: 44.62
1372
- - type: ndcg_at_1000
1373
- value: 47.795
1374
- - type: ndcg_at_3
1375
- value: 35.003
1376
- - type: ndcg_at_5
1377
- value: 36.006
1378
- - type: precision_at_1
1379
- value: 37.191
1380
- - type: precision_at_10
1381
- value: 10.586
1382
- - type: precision_at_100
1383
- value: 1.688
1384
- - type: precision_at_1000
1385
- value: 0.22699999999999998
1386
- - type: precision_at_3
1387
- value: 23.302
1388
- - type: precision_at_5
1389
- value: 17.006
1390
- - type: recall_at_1
1391
- value: 18.667
1392
- - type: recall_at_10
1393
- value: 45.367000000000004
1394
- - type: recall_at_100
1395
- value: 68.207
1396
- - type: recall_at_1000
1397
- value: 87.072
1398
- - type: recall_at_3
1399
- value: 32.129000000000005
1400
- - type: recall_at_5
1401
- value: 37.719
1402
- - task:
1403
- type: Retrieval
1404
- dataset:
1405
- type: hotpotqa
1406
- name: MTEB HotpotQA
1407
- config: default
1408
- split: test
1409
- revision: None
1410
- metrics:
1411
- - type: map_at_1
1412
- value: 39.494
1413
- - type: map_at_10
1414
- value: 66.223
1415
- - type: map_at_100
1416
- value: 67.062
1417
- - type: map_at_1000
1418
- value: 67.11500000000001
1419
- - type: map_at_3
1420
- value: 62.867
1421
- - type: map_at_5
1422
- value: 64.994
1423
- - type: mrr_at_1
1424
- value: 78.987
1425
- - type: mrr_at_10
1426
- value: 84.585
1427
- - type: mrr_at_100
1428
- value: 84.773
1429
- - type: mrr_at_1000
1430
- value: 84.77900000000001
1431
- - type: mrr_at_3
1432
- value: 83.592
1433
- - type: mrr_at_5
1434
- value: 84.235
1435
- - type: ndcg_at_1
1436
- value: 78.987
1437
- - type: ndcg_at_10
1438
- value: 73.64
1439
- - type: ndcg_at_100
1440
- value: 76.519
1441
- - type: ndcg_at_1000
1442
- value: 77.51
1443
- - type: ndcg_at_3
1444
- value: 68.893
1445
- - type: ndcg_at_5
1446
- value: 71.585
1447
- - type: precision_at_1
1448
- value: 78.987
1449
- - type: precision_at_10
1450
- value: 15.529000000000002
1451
- - type: precision_at_100
1452
- value: 1.7770000000000001
1453
- - type: precision_at_1000
1454
- value: 0.191
1455
- - type: precision_at_3
1456
- value: 44.808
1457
- - type: precision_at_5
1458
- value: 29.006999999999998
1459
- - type: recall_at_1
1460
- value: 39.494
1461
- - type: recall_at_10
1462
- value: 77.643
1463
- - type: recall_at_100
1464
- value: 88.825
1465
- - type: recall_at_1000
1466
- value: 95.321
1467
- - type: recall_at_3
1468
- value: 67.211
1469
- - type: recall_at_5
1470
- value: 72.519
1471
- - task:
1472
- type: Classification
1473
- dataset:
1474
- type: mteb/imdb
1475
- name: MTEB ImdbClassification
1476
- config: default
1477
- split: test
1478
- revision: 3d86128a09e091d6018b6d26cad27f2739fc2db7
1479
- metrics:
1480
- - type: accuracy
1481
- value: 85.55959999999999
1482
- - type: ap
1483
- value: 80.7246500384617
1484
- - type: f1
1485
- value: 85.52336485065454
1486
- - task:
1487
- type: Retrieval
1488
- dataset:
1489
- type: msmarco
1490
- name: MTEB MSMARCO
1491
- config: default
1492
- split: dev
1493
- revision: None
1494
- metrics:
1495
- - type: map_at_1
1496
- value: 23.631
1497
- - type: map_at_10
1498
- value: 36.264
1499
- - type: map_at_100
1500
- value: 37.428
1501
- - type: map_at_1000
1502
- value: 37.472
1503
- - type: map_at_3
1504
- value: 32.537
1505
- - type: map_at_5
1506
- value: 34.746
1507
- - type: mrr_at_1
1508
- value: 24.312
1509
- - type: mrr_at_10
1510
- value: 36.858000000000004
1511
- - type: mrr_at_100
1512
- value: 37.966
1513
- - type: mrr_at_1000
1514
- value: 38.004
1515
- - type: mrr_at_3
1516
- value: 33.188
1517
- - type: mrr_at_5
1518
- value: 35.367
1519
- - type: ndcg_at_1
1520
- value: 24.312
1521
- - type: ndcg_at_10
1522
- value: 43.126999999999995
1523
- - type: ndcg_at_100
1524
- value: 48.642
1525
- - type: ndcg_at_1000
1526
- value: 49.741
1527
- - type: ndcg_at_3
1528
- value: 35.589
1529
- - type: ndcg_at_5
1530
- value: 39.515
1531
- - type: precision_at_1
1532
- value: 24.312
1533
- - type: precision_at_10
1534
- value: 6.699
1535
- - type: precision_at_100
1536
- value: 0.9450000000000001
1537
- - type: precision_at_1000
1538
- value: 0.104
1539
- - type: precision_at_3
1540
- value: 15.153
1541
- - type: precision_at_5
1542
- value: 11.065999999999999
1543
- - type: recall_at_1
1544
- value: 23.631
1545
- - type: recall_at_10
1546
- value: 64.145
1547
- - type: recall_at_100
1548
- value: 89.41
1549
- - type: recall_at_1000
1550
- value: 97.83500000000001
1551
- - type: recall_at_3
1552
- value: 43.769000000000005
1553
- - type: recall_at_5
1554
- value: 53.169
1555
- - task:
1556
- type: Classification
1557
- dataset:
1558
- type: mteb/mtop_domain
1559
- name: MTEB MTOPDomainClassification (en)
1560
- config: en
1561
- split: test
1562
- revision: d80d48c1eb48d3562165c59d59d0034df9fff0bf
1563
- metrics:
1564
- - type: accuracy
1565
- value: 93.4108527131783
1566
- - type: f1
1567
- value: 93.1415880261038
1568
- - task:
1569
- type: Classification
1570
- dataset:
1571
- type: mteb/mtop_intent
1572
- name: MTEB MTOPIntentClassification (en)
1573
- config: en
1574
- split: test
1575
- revision: ae001d0e6b1228650b7bd1c2c65fb50ad11a8aba
1576
- metrics:
1577
- - type: accuracy
1578
- value: 77.24806201550388
1579
- - type: f1
1580
- value: 60.531916308197175
1581
- - task:
1582
- type: Classification
1583
- dataset:
1584
- type: mteb/amazon_massive_intent
1585
- name: MTEB MassiveIntentClassification (en)
1586
- config: en
1587
- split: test
1588
- revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7
1589
- metrics:
1590
- - type: accuracy
1591
- value: 73.71553463349024
1592
- - type: f1
1593
- value: 71.70753174900791
1594
- - task:
1595
- type: Classification
1596
- dataset:
1597
- type: mteb/amazon_massive_scenario
1598
- name: MTEB MassiveScenarioClassification (en)
1599
- config: en
1600
- split: test
1601
- revision: 7d571f92784cd94a019292a1f45445077d0ef634
1602
- metrics:
1603
- - type: accuracy
1604
- value: 77.79757901815736
1605
- - type: f1
1606
- value: 77.83719850433258
1607
- - task:
1608
- type: Clustering
1609
- dataset:
1610
- type: mteb/medrxiv-clustering-p2p
1611
- name: MTEB MedrxivClusteringP2P
1612
- config: default
1613
- split: test
1614
- revision: e7a26af6f3ae46b30dde8737f02c07b1505bcc73
1615
- metrics:
1616
- - type: v_measure
1617
- value: 33.74193296622113
1618
- - task:
1619
- type: Clustering
1620
- dataset:
1621
- type: mteb/medrxiv-clustering-s2s
1622
- name: MTEB MedrxivClusteringS2S
1623
- config: default
1624
- split: test
1625
- revision: 35191c8c0dca72d8ff3efcd72aa802307d469663
1626
- metrics:
1627
- - type: v_measure
1628
- value: 30.64257594108566
1629
- - task:
1630
- type: Reranking
1631
- dataset:
1632
- type: mteb/mind_small
1633
- name: MTEB MindSmallReranking
1634
- config: default
1635
- split: test
1636
- revision: 3bdac13927fdc888b903db93b2ffdbd90b295a69
1637
- metrics:
1638
- - type: map
1639
- value: 30.811018518883625
1640
- - type: mrr
1641
- value: 31.910376577445003
1642
- - task:
1643
- type: Retrieval
1644
- dataset:
1645
- type: nfcorpus
1646
- name: MTEB NFCorpus
1647
- config: default
1648
- split: test
1649
- revision: None
1650
- metrics:
1651
- - type: map_at_1
1652
- value: 5.409
1653
- - type: map_at_10
1654
- value: 13.093
1655
- - type: map_at_100
1656
- value: 16.256999999999998
1657
- - type: map_at_1000
1658
- value: 17.617
1659
- - type: map_at_3
1660
- value: 9.555
1661
- - type: map_at_5
1662
- value: 11.428
1663
- - type: mrr_at_1
1664
- value: 45.201
1665
- - type: mrr_at_10
1666
- value: 54.179
1667
- - type: mrr_at_100
1668
- value: 54.812000000000005
1669
- - type: mrr_at_1000
1670
- value: 54.840999999999994
1671
- - type: mrr_at_3
1672
- value: 51.909000000000006
1673
- - type: mrr_at_5
1674
- value: 53.519000000000005
1675
- - type: ndcg_at_1
1676
- value: 43.189
1677
- - type: ndcg_at_10
1678
- value: 35.028
1679
- - type: ndcg_at_100
1680
- value: 31.226
1681
- - type: ndcg_at_1000
1682
- value: 39.678000000000004
1683
- - type: ndcg_at_3
1684
- value: 40.596
1685
- - type: ndcg_at_5
1686
- value: 38.75
1687
- - type: precision_at_1
1688
- value: 44.582
1689
- - type: precision_at_10
1690
- value: 25.974999999999998
1691
- - type: precision_at_100
1692
- value: 7.793
1693
- - type: precision_at_1000
1694
- value: 2.036
1695
- - type: precision_at_3
1696
- value: 38.493
1697
- - type: precision_at_5
1698
- value: 33.994
1699
- - type: recall_at_1
1700
- value: 5.409
1701
- - type: recall_at_10
1702
- value: 16.875999999999998
1703
- - type: recall_at_100
1704
- value: 30.316
1705
- - type: recall_at_1000
1706
- value: 60.891
1707
- - type: recall_at_3
1708
- value: 10.688
1709
- - type: recall_at_5
1710
- value: 13.832
1711
- - task:
1712
- type: Retrieval
1713
- dataset:
1714
- type: nq
1715
- name: MTEB NQ
1716
- config: default
1717
- split: test
1718
- revision: None
1719
- metrics:
1720
- - type: map_at_1
1721
- value: 36.375
1722
- - type: map_at_10
1723
- value: 51.991
1724
- - type: map_at_100
1725
- value: 52.91400000000001
1726
- - type: map_at_1000
1727
- value: 52.93600000000001
1728
- - type: map_at_3
1729
- value: 48.014
1730
- - type: map_at_5
1731
- value: 50.381
1732
- - type: mrr_at_1
1733
- value: 40.759
1734
- - type: mrr_at_10
1735
- value: 54.617000000000004
1736
- - type: mrr_at_100
1737
- value: 55.301
1738
- - type: mrr_at_1000
1739
- value: 55.315000000000005
1740
- - type: mrr_at_3
1741
- value: 51.516
1742
- - type: mrr_at_5
1743
- value: 53.435
1744
- - type: ndcg_at_1
1745
- value: 40.759
1746
- - type: ndcg_at_10
1747
- value: 59.384
1748
- - type: ndcg_at_100
1749
- value: 63.157
1750
- - type: ndcg_at_1000
1751
- value: 63.654999999999994
1752
- - type: ndcg_at_3
1753
- value: 52.114000000000004
1754
- - type: ndcg_at_5
1755
- value: 55.986000000000004
1756
- - type: precision_at_1
1757
- value: 40.759
1758
- - type: precision_at_10
1759
- value: 9.411999999999999
1760
- - type: precision_at_100
1761
- value: 1.153
1762
- - type: precision_at_1000
1763
- value: 0.12
1764
- - type: precision_at_3
1765
- value: 23.329
1766
- - type: precision_at_5
1767
- value: 16.256999999999998
1768
- - type: recall_at_1
1769
- value: 36.375
1770
- - type: recall_at_10
1771
- value: 79.053
1772
- - type: recall_at_100
1773
- value: 95.167
1774
- - type: recall_at_1000
1775
- value: 98.82
1776
- - type: recall_at_3
1777
- value: 60.475
1778
- - type: recall_at_5
1779
- value: 69.327
1780
- - task:
1781
- type: Retrieval
1782
- dataset:
1783
- type: quora
1784
- name: MTEB QuoraRetrieval
1785
- config: default
1786
- split: test
1787
- revision: None
1788
- metrics:
1789
- - type: map_at_1
1790
- value: 70.256
1791
- - type: map_at_10
1792
- value: 83.8
1793
- - type: map_at_100
1794
- value: 84.425
1795
- - type: map_at_1000
1796
- value: 84.444
1797
- - type: map_at_3
1798
- value: 80.906
1799
- - type: map_at_5
1800
- value: 82.717
1801
- - type: mrr_at_1
1802
- value: 80.97999999999999
1803
- - type: mrr_at_10
1804
- value: 87.161
1805
- - type: mrr_at_100
1806
- value: 87.262
1807
- - type: mrr_at_1000
1808
- value: 87.263
1809
- - type: mrr_at_3
1810
- value: 86.175
1811
- - type: mrr_at_5
1812
- value: 86.848
1813
- - type: ndcg_at_1
1814
- value: 80.97999999999999
1815
- - type: ndcg_at_10
1816
- value: 87.697
1817
- - type: ndcg_at_100
1818
- value: 88.959
1819
- - type: ndcg_at_1000
1820
- value: 89.09899999999999
1821
- - type: ndcg_at_3
1822
- value: 84.83800000000001
1823
- - type: ndcg_at_5
1824
- value: 86.401
1825
- - type: precision_at_1
1826
- value: 80.97999999999999
1827
- - type: precision_at_10
1828
- value: 13.261000000000001
1829
- - type: precision_at_100
1830
- value: 1.5150000000000001
1831
- - type: precision_at_1000
1832
- value: 0.156
1833
- - type: precision_at_3
1834
- value: 37.01
1835
- - type: precision_at_5
1836
- value: 24.298000000000002
1837
- - type: recall_at_1
1838
- value: 70.256
1839
- - type: recall_at_10
1840
- value: 94.935
1841
- - type: recall_at_100
1842
- value: 99.274
1843
- - type: recall_at_1000
1844
- value: 99.928
1845
- - type: recall_at_3
1846
- value: 86.602
1847
- - type: recall_at_5
1848
- value: 91.133
1849
- - task:
1850
- type: Clustering
1851
- dataset:
1852
- type: mteb/reddit-clustering
1853
- name: MTEB RedditClustering
1854
- config: default
1855
- split: test
1856
- revision: 24640382cdbf8abc73003fb0fa6d111a705499eb
1857
- metrics:
1858
- - type: v_measure
1859
- value: 56.322692497613104
1860
- - task:
1861
- type: Clustering
1862
- dataset:
1863
- type: mteb/reddit-clustering-p2p
1864
- name: MTEB RedditClusteringP2P
1865
- config: default
1866
- split: test
1867
- revision: 282350215ef01743dc01b456c7f5241fa8937f16
1868
- metrics:
1869
- - type: v_measure
1870
- value: 61.895813503775074
1871
- - task:
1872
- type: Retrieval
1873
- dataset:
1874
- type: scidocs
1875
- name: MTEB SCIDOCS
1876
- config: default
1877
- split: test
1878
- revision: None
1879
- metrics:
1880
- - type: map_at_1
1881
- value: 4.338
1882
- - type: map_at_10
1883
- value: 10.767
1884
- - type: map_at_100
1885
- value: 12.537999999999998
1886
- - type: map_at_1000
1887
- value: 12.803999999999998
1888
- - type: map_at_3
1889
- value: 7.788
1890
- - type: map_at_5
1891
- value: 9.302000000000001
1892
- - type: mrr_at_1
1893
- value: 21.4
1894
- - type: mrr_at_10
1895
- value: 31.637999999999998
1896
- - type: mrr_at_100
1897
- value: 32.688
1898
- - type: mrr_at_1000
1899
- value: 32.756
1900
- - type: mrr_at_3
1901
- value: 28.433000000000003
1902
- - type: mrr_at_5
1903
- value: 30.178
1904
- - type: ndcg_at_1
1905
- value: 21.4
1906
- - type: ndcg_at_10
1907
- value: 18.293
1908
- - type: ndcg_at_100
1909
- value: 25.274
1910
- - type: ndcg_at_1000
1911
- value: 30.284
1912
- - type: ndcg_at_3
1913
- value: 17.391000000000002
1914
- - type: ndcg_at_5
1915
- value: 15.146999999999998
1916
- - type: precision_at_1
1917
- value: 21.4
1918
- - type: precision_at_10
1919
- value: 9.48
1920
- - type: precision_at_100
1921
- value: 1.949
1922
- - type: precision_at_1000
1923
- value: 0.316
1924
- - type: precision_at_3
1925
- value: 16.167
1926
- - type: precision_at_5
1927
- value: 13.22
1928
- - type: recall_at_1
1929
- value: 4.338
1930
- - type: recall_at_10
1931
- value: 19.213
1932
- - type: recall_at_100
1933
- value: 39.562999999999995
1934
- - type: recall_at_1000
1935
- value: 64.08
1936
- - type: recall_at_3
1937
- value: 9.828000000000001
1938
- - type: recall_at_5
1939
- value: 13.383000000000001
1940
- - task:
1941
- type: STS
1942
- dataset:
1943
- type: mteb/sickr-sts
1944
- name: MTEB SICK-R
1945
- config: default
1946
- split: test
1947
- revision: a6ea5a8cab320b040a23452cc28066d9beae2cee
1948
- metrics:
1949
- - type: cos_sim_pearson
1950
- value: 82.42568163642142
1951
- - type: cos_sim_spearman
1952
- value: 78.5797159641342
1953
- - type: euclidean_pearson
1954
- value: 80.22151260811604
1955
- - type: euclidean_spearman
1956
- value: 78.5797151953878
1957
- - type: manhattan_pearson
1958
- value: 80.21224215864788
1959
- - type: manhattan_spearman
1960
- value: 78.55641478381344
1961
- - task:
1962
- type: STS
1963
- dataset:
1964
- type: mteb/sts12-sts
1965
- name: MTEB STS12
1966
- config: default
1967
- split: test
1968
- revision: a0d554a64d88156834ff5ae9920b964011b16384
1969
- metrics:
1970
- - type: cos_sim_pearson
1971
- value: 85.44020710812569
1972
- - type: cos_sim_spearman
1973
- value: 78.91631735081286
1974
- - type: euclidean_pearson
1975
- value: 81.64188964182102
1976
- - type: euclidean_spearman
1977
- value: 78.91633286881678
1978
- - type: manhattan_pearson
1979
- value: 81.69294748512496
1980
- - type: manhattan_spearman
1981
- value: 78.93438558002656
1982
- - task:
1983
- type: STS
1984
- dataset:
1985
- type: mteb/sts13-sts
1986
- name: MTEB STS13
1987
- config: default
1988
- split: test
1989
- revision: 7e90230a92c190f1bf69ae9002b8cea547a64cca
1990
- metrics:
1991
- - type: cos_sim_pearson
1992
- value: 84.27165426412311
1993
- - type: cos_sim_spearman
1994
- value: 85.40429140249618
1995
- - type: euclidean_pearson
1996
- value: 84.7509580724893
1997
- - type: euclidean_spearman
1998
- value: 85.40429140249618
1999
- - type: manhattan_pearson
2000
- value: 84.76488289321308
2001
- - type: manhattan_spearman
2002
- value: 85.4256793698708
2003
- - task:
2004
- type: STS
2005
- dataset:
2006
- type: mteb/sts14-sts
2007
- name: MTEB STS14
2008
- config: default
2009
- split: test
2010
- revision: 6031580fec1f6af667f0bd2da0a551cf4f0b2375
2011
- metrics:
2012
- - type: cos_sim_pearson
2013
- value: 83.138851760732
2014
- - type: cos_sim_spearman
2015
- value: 81.64101363896586
2016
- - type: euclidean_pearson
2017
- value: 82.55165038934942
2018
- - type: euclidean_spearman
2019
- value: 81.64105257080502
2020
- - type: manhattan_pearson
2021
- value: 82.52802949883335
2022
- - type: manhattan_spearman
2023
- value: 81.61255430718158
2024
- - task:
2025
- type: STS
2026
- dataset:
2027
- type: mteb/sts15-sts
2028
- name: MTEB STS15
2029
- config: default
2030
- split: test
2031
- revision: ae752c7c21bf194d8b67fd573edf7ae58183cbe3
2032
- metrics:
2033
- - type: cos_sim_pearson
2034
- value: 86.0654695484029
2035
- - type: cos_sim_spearman
2036
- value: 87.20408521902229
2037
- - type: euclidean_pearson
2038
- value: 86.8110651362115
2039
- - type: euclidean_spearman
2040
- value: 87.20408521902229
2041
- - type: manhattan_pearson
2042
- value: 86.77984656478691
2043
- - type: manhattan_spearman
2044
- value: 87.1719947099227
2045
- - task:
2046
- type: STS
2047
- dataset:
2048
- type: mteb/sts16-sts
2049
- name: MTEB STS16
2050
- config: default
2051
- split: test
2052
- revision: 4d8694f8f0e0100860b497b999b3dbed754a0513
2053
- metrics:
2054
- - type: cos_sim_pearson
2055
- value: 83.77823915496512
2056
- - type: cos_sim_spearman
2057
- value: 85.43566325729779
2058
- - type: euclidean_pearson
2059
- value: 84.5396956658821
2060
- - type: euclidean_spearman
2061
- value: 85.43566325729779
2062
- - type: manhattan_pearson
2063
- value: 84.5665398848169
2064
- - type: manhattan_spearman
2065
- value: 85.44375870303232
2066
- - task:
2067
- type: STS
2068
- dataset:
2069
- type: mteb/sts17-crosslingual-sts
2070
- name: MTEB STS17 (en-en)
2071
- config: en-en
2072
- split: test
2073
- revision: af5e6fb845001ecf41f4c1e033ce921939a2a68d
2074
- metrics:
2075
- - type: cos_sim_pearson
2076
- value: 87.20030208471798
2077
- - type: cos_sim_spearman
2078
- value: 87.20485505076539
2079
- - type: euclidean_pearson
2080
- value: 88.10588324368722
2081
- - type: euclidean_spearman
2082
- value: 87.20485505076539
2083
- - type: manhattan_pearson
2084
- value: 87.92324770415183
2085
- - type: manhattan_spearman
2086
- value: 87.0571314561877
2087
- - task:
2088
- type: STS
2089
- dataset:
2090
- type: mteb/sts22-crosslingual-sts
2091
- name: MTEB STS22 (en)
2092
- config: en
2093
- split: test
2094
- revision: 6d1ba47164174a496b7fa5d3569dae26a6813b80
2095
- metrics:
2096
- - type: cos_sim_pearson
2097
- value: 63.06093161604453
2098
- - type: cos_sim_spearman
2099
- value: 64.2163140357722
2100
- - type: euclidean_pearson
2101
- value: 65.27589680994006
2102
- - type: euclidean_spearman
2103
- value: 64.2163140357722
2104
- - type: manhattan_pearson
2105
- value: 65.45904383711101
2106
- - type: manhattan_spearman
2107
- value: 64.55404716679305
2108
- - task:
2109
- type: STS
2110
- dataset:
2111
- type: mteb/stsbenchmark-sts
2112
- name: MTEB STSBenchmark
2113
- config: default
2114
- split: test
2115
- revision: b0fddb56ed78048fa8b90373c8a3cfc37b684831
2116
- metrics:
2117
- - type: cos_sim_pearson
2118
- value: 84.32976164578706
2119
- - type: cos_sim_spearman
2120
- value: 85.54302197678368
2121
- - type: euclidean_pearson
2122
- value: 85.26307149193056
2123
- - type: euclidean_spearman
2124
- value: 85.54302197678368
2125
- - type: manhattan_pearson
2126
- value: 85.26647282029371
2127
- - type: manhattan_spearman
2128
- value: 85.5316135265568
2129
- - task:
2130
- type: Reranking
2131
- dataset:
2132
- type: mteb/scidocs-reranking
2133
- name: MTEB SciDocsRR
2134
- config: default
2135
- split: test
2136
- revision: d3c5e1fc0b855ab6097bf1cda04dd73947d7caab
2137
- metrics:
2138
- - type: map
2139
- value: 81.44675968318754
2140
- - type: mrr
2141
- value: 94.92741826075158
2142
- - task:
2143
- type: Retrieval
2144
- dataset:
2145
- type: scifact
2146
- name: MTEB SciFact
2147
- config: default
2148
- split: test
2149
- revision: None
2150
- metrics:
2151
- - type: map_at_1
2152
- value: 56.34400000000001
2153
- - type: map_at_10
2154
- value: 65.927
2155
- - type: map_at_100
2156
- value: 66.431
2157
- - type: map_at_1000
2158
- value: 66.461
2159
- - type: map_at_3
2160
- value: 63.529
2161
- - type: map_at_5
2162
- value: 64.818
2163
- - type: mrr_at_1
2164
- value: 59.333000000000006
2165
- - type: mrr_at_10
2166
- value: 67.54599999999999
2167
- - type: mrr_at_100
2168
- value: 67.892
2169
- - type: mrr_at_1000
2170
- value: 67.917
2171
- - type: mrr_at_3
2172
- value: 65.778
2173
- - type: mrr_at_5
2174
- value: 66.794
2175
- - type: ndcg_at_1
2176
- value: 59.333000000000006
2177
- - type: ndcg_at_10
2178
- value: 70.5
2179
- - type: ndcg_at_100
2180
- value: 72.688
2181
- - type: ndcg_at_1000
2182
- value: 73.483
2183
- - type: ndcg_at_3
2184
- value: 66.338
2185
- - type: ndcg_at_5
2186
- value: 68.265
2187
- - type: precision_at_1
2188
- value: 59.333000000000006
2189
- - type: precision_at_10
2190
- value: 9.3
2191
- - type: precision_at_100
2192
- value: 1.053
2193
- - type: precision_at_1000
2194
- value: 0.11199999999999999
2195
- - type: precision_at_3
2196
- value: 25.889
2197
- - type: precision_at_5
2198
- value: 16.866999999999997
2199
- - type: recall_at_1
2200
- value: 56.34400000000001
2201
- - type: recall_at_10
2202
- value: 82.789
2203
- - type: recall_at_100
2204
- value: 92.767
2205
- - type: recall_at_1000
2206
- value: 99
2207
- - type: recall_at_3
2208
- value: 71.64399999999999
2209
- - type: recall_at_5
2210
- value: 76.322
2211
- - task:
2212
- type: PairClassification
2213
- dataset:
2214
- type: mteb/sprintduplicatequestions-pairclassification
2215
- name: MTEB SprintDuplicateQuestions
2216
- config: default
2217
- split: test
2218
- revision: d66bd1f72af766a5cc4b0ca5e00c162f89e8cc46
2219
- metrics:
2220
- - type: cos_sim_accuracy
2221
- value: 99.75742574257426
2222
- - type: cos_sim_ap
2223
- value: 93.52081548447406
2224
- - type: cos_sim_f1
2225
- value: 87.33850129198966
2226
- - type: cos_sim_precision
2227
- value: 90.37433155080214
2228
- - type: cos_sim_recall
2229
- value: 84.5
2230
- - type: dot_accuracy
2231
- value: 99.75742574257426
2232
- - type: dot_ap
2233
- value: 93.52081548447406
2234
- - type: dot_f1
2235
- value: 87.33850129198966
2236
- - type: dot_precision
2237
- value: 90.37433155080214
2238
- - type: dot_recall
2239
- value: 84.5
2240
- - type: euclidean_accuracy
2241
- value: 99.75742574257426
2242
- - type: euclidean_ap
2243
- value: 93.52081548447406
2244
- - type: euclidean_f1
2245
- value: 87.33850129198966
2246
- - type: euclidean_precision
2247
- value: 90.37433155080214
2248
- - type: euclidean_recall
2249
- value: 84.5
2250
- - type: manhattan_accuracy
2251
- value: 99.75841584158415
2252
- - type: manhattan_ap
2253
- value: 93.4975678585854
2254
- - type: manhattan_f1
2255
- value: 87.26708074534162
2256
- - type: manhattan_precision
2257
- value: 90.45064377682404
2258
- - type: manhattan_recall
2259
- value: 84.3
2260
- - type: max_accuracy
2261
- value: 99.75841584158415
2262
- - type: max_ap
2263
- value: 93.52081548447406
2264
- - type: max_f1
2265
- value: 87.33850129198966
2266
- - task:
2267
- type: Clustering
2268
- dataset:
2269
- type: mteb/stackexchange-clustering
2270
- name: MTEB StackExchangeClustering
2271
- config: default
2272
- split: test
2273
- revision: 6cbc1f7b2bc0622f2e39d2c77fa502909748c259
2274
- metrics:
2275
- - type: v_measure
2276
- value: 64.31437036686651
2277
- - task:
2278
- type: Clustering
2279
- dataset:
2280
- type: mteb/stackexchange-clustering-p2p
2281
- name: MTEB StackExchangeClusteringP2P
2282
- config: default
2283
- split: test
2284
- revision: 815ca46b2622cec33ccafc3735d572c266efdb44
2285
- metrics:
2286
- - type: v_measure
2287
- value: 33.25569319007206
2288
- - task:
2289
- type: Reranking
2290
- dataset:
2291
- type: mteb/stackoverflowdupquestions-reranking
2292
- name: MTEB StackOverflowDupQuestions
2293
- config: default
2294
- split: test
2295
- revision: e185fbe320c72810689fc5848eb6114e1ef5ec69
2296
- metrics:
2297
- - type: map
2298
- value: 49.90474939720706
2299
- - type: mrr
2300
- value: 50.568115503777264
2301
- - task:
2302
- type: Summarization
2303
- dataset:
2304
- type: mteb/summeval
2305
- name: MTEB SummEval
2306
- config: default
2307
- split: test
2308
- revision: cda12ad7615edc362dbf25a00fdd61d3b1eaf93c
2309
- metrics:
2310
- - type: cos_sim_pearson
2311
- value: 29.866828641244712
2312
- - type: cos_sim_spearman
2313
- value: 30.077555055873866
2314
- - type: dot_pearson
2315
- value: 29.866832988572266
2316
- - type: dot_spearman
2317
- value: 30.077555055873866
2318
- - task:
2319
- type: Retrieval
2320
- dataset:
2321
- type: trec-covid
2322
- name: MTEB TRECCOVID
2323
- config: default
2324
- split: test
2325
- revision: None
2326
- metrics:
2327
- - type: map_at_1
2328
- value: 0.232
2329
- - type: map_at_10
2330
- value: 2.094
2331
- - type: map_at_100
2332
- value: 11.971
2333
- - type: map_at_1000
2334
- value: 28.158
2335
- - type: map_at_3
2336
- value: 0.688
2337
- - type: map_at_5
2338
- value: 1.114
2339
- - type: mrr_at_1
2340
- value: 88
2341
- - type: mrr_at_10
2342
- value: 93.4
2343
- - type: mrr_at_100
2344
- value: 93.4
2345
- - type: mrr_at_1000
2346
- value: 93.4
2347
- - type: mrr_at_3
2348
- value: 93
2349
- - type: mrr_at_5
2350
- value: 93.4
2351
- - type: ndcg_at_1
2352
- value: 84
2353
- - type: ndcg_at_10
2354
- value: 79.923
2355
- - type: ndcg_at_100
2356
- value: 61.17
2357
- - type: ndcg_at_1000
2358
- value: 53.03
2359
- - type: ndcg_at_3
2360
- value: 84.592
2361
- - type: ndcg_at_5
2362
- value: 82.821
2363
- - type: precision_at_1
2364
- value: 88
2365
- - type: precision_at_10
2366
- value: 85
2367
- - type: precision_at_100
2368
- value: 63.019999999999996
2369
- - type: precision_at_1000
2370
- value: 23.554
2371
- - type: precision_at_3
2372
- value: 89.333
2373
- - type: precision_at_5
2374
- value: 87.2
2375
- - type: recall_at_1
2376
- value: 0.232
2377
- - type: recall_at_10
2378
- value: 2.255
2379
- - type: recall_at_100
2380
- value: 14.823
2381
- - type: recall_at_1000
2382
- value: 49.456
2383
- - type: recall_at_3
2384
- value: 0.718
2385
- - type: recall_at_5
2386
- value: 1.175
2387
- - task:
2388
- type: Retrieval
2389
- dataset:
2390
- type: webis-touche2020
2391
- name: MTEB Touche2020
2392
- config: default
2393
- split: test
2394
- revision: None
2395
- metrics:
2396
- - type: map_at_1
2397
- value: 2.547
2398
- - type: map_at_10
2399
- value: 11.375
2400
- - type: map_at_100
2401
- value: 18.194
2402
- - type: map_at_1000
2403
- value: 19.749
2404
- - type: map_at_3
2405
- value: 5.825
2406
- - type: map_at_5
2407
- value: 8.581
2408
- - type: mrr_at_1
2409
- value: 32.653
2410
- - type: mrr_at_10
2411
- value: 51.32
2412
- - type: mrr_at_100
2413
- value: 51.747
2414
- - type: mrr_at_1000
2415
- value: 51.747
2416
- - type: mrr_at_3
2417
- value: 47.278999999999996
2418
- - type: mrr_at_5
2419
- value: 48.605
2420
- - type: ndcg_at_1
2421
- value: 29.592000000000002
2422
- - type: ndcg_at_10
2423
- value: 28.151
2424
- - type: ndcg_at_100
2425
- value: 39.438
2426
- - type: ndcg_at_1000
2427
- value: 50.769
2428
- - type: ndcg_at_3
2429
- value: 30.758999999999997
2430
- - type: ndcg_at_5
2431
- value: 30.366
2432
- - type: precision_at_1
2433
- value: 32.653
2434
- - type: precision_at_10
2435
- value: 25.714
2436
- - type: precision_at_100
2437
- value: 8.041
2438
- - type: precision_at_1000
2439
- value: 1.555
2440
- - type: precision_at_3
2441
- value: 33.333
2442
- - type: precision_at_5
2443
- value: 31.837
2444
- - type: recall_at_1
2445
- value: 2.547
2446
- - type: recall_at_10
2447
- value: 18.19
2448
- - type: recall_at_100
2449
- value: 49.538
2450
- - type: recall_at_1000
2451
- value: 83.86
2452
- - type: recall_at_3
2453
- value: 7.329
2454
- - type: recall_at_5
2455
- value: 11.532
2456
- - task:
2457
- type: Classification
2458
- dataset:
2459
- type: mteb/toxic_conversations_50k
2460
- name: MTEB ToxicConversationsClassification
2461
- config: default
2462
- split: test
2463
- revision: d7c0de2777da35d6aae2200a62c6e0e5af397c4c
2464
- metrics:
2465
- - type: accuracy
2466
- value: 71.4952
2467
- - type: ap
2468
- value: 14.793362635531409
2469
- - type: f1
2470
- value: 55.204635551516915
2471
- - task:
2472
- type: Classification
2473
- dataset:
2474
- type: mteb/tweet_sentiment_extraction
2475
- name: MTEB TweetSentimentExtractionClassification
2476
- config: default
2477
- split: test
2478
- revision: d604517c81ca91fe16a244d1248fc021f9ecee7a
2479
- metrics:
2480
- - type: accuracy
2481
- value: 61.5365025466893
2482
- - type: f1
2483
- value: 61.81742556334845
2484
- - task:
2485
- type: Clustering
2486
- dataset:
2487
- type: mteb/twentynewsgroups-clustering
2488
- name: MTEB TwentyNewsgroupsClustering
2489
- config: default
2490
- split: test
2491
- revision: 6125ec4e24fa026cec8a478383ee943acfbd5449
2492
- metrics:
2493
- - type: v_measure
2494
- value: 49.05531070301185
2495
- - task:
2496
- type: PairClassification
2497
- dataset:
2498
- type: mteb/twittersemeval2015-pairclassification
2499
- name: MTEB TwitterSemEval2015
2500
- config: default
2501
- split: test
2502
- revision: 70970daeab8776df92f5ea462b6173c0b46fd2d1
2503
- metrics:
2504
- - type: cos_sim_accuracy
2505
- value: 86.51725576682364
2506
- - type: cos_sim_ap
2507
- value: 75.2292304265163
2508
- - type: cos_sim_f1
2509
- value: 69.54022988505749
2510
- - type: cos_sim_precision
2511
- value: 63.65629110039457
2512
- - type: cos_sim_recall
2513
- value: 76.62269129287598
2514
- - type: dot_accuracy
2515
- value: 86.51725576682364
2516
- - type: dot_ap
2517
- value: 75.22922386081054
2518
- - type: dot_f1
2519
- value: 69.54022988505749
2520
- - type: dot_precision
2521
- value: 63.65629110039457
2522
- - type: dot_recall
2523
- value: 76.62269129287598
2524
- - type: euclidean_accuracy
2525
- value: 86.51725576682364
2526
- - type: euclidean_ap
2527
- value: 75.22925730473472
2528
- - type: euclidean_f1
2529
- value: 69.54022988505749
2530
- - type: euclidean_precision
2531
- value: 63.65629110039457
2532
- - type: euclidean_recall
2533
- value: 76.62269129287598
2534
- - type: manhattan_accuracy
2535
- value: 86.52321630804077
2536
- - type: manhattan_ap
2537
- value: 75.20608115037336
2538
- - type: manhattan_f1
2539
- value: 69.60000000000001
2540
- - type: manhattan_precision
2541
- value: 64.37219730941705
2542
- - type: manhattan_recall
2543
- value: 75.75197889182058
2544
- - type: max_accuracy
2545
- value: 86.52321630804077
2546
- - type: max_ap
2547
- value: 75.22925730473472
2548
- - type: max_f1
2549
- value: 69.60000000000001
2550
- - task:
2551
- type: PairClassification
2552
- dataset:
2553
- type: mteb/twitterurlcorpus-pairclassification
2554
- name: MTEB TwitterURLCorpus
2555
- config: default
2556
- split: test
2557
- revision: 8b6510b0b1fa4e4c4f879467980e9be563ec1cdf
2558
- metrics:
2559
- - type: cos_sim_accuracy
2560
- value: 89.34877944657896
2561
- - type: cos_sim_ap
2562
- value: 86.71257569277373
2563
- - type: cos_sim_f1
2564
- value: 79.10386355986088
2565
- - type: cos_sim_precision
2566
- value: 76.91468470434214
2567
- - type: cos_sim_recall
2568
- value: 81.4213119802895
2569
- - type: dot_accuracy
2570
- value: 89.34877944657896
2571
- - type: dot_ap
2572
- value: 86.71257133133368
2573
- - type: dot_f1
2574
- value: 79.10386355986088
2575
- - type: dot_precision
2576
- value: 76.91468470434214
2577
- - type: dot_recall
2578
- value: 81.4213119802895
2579
- - type: euclidean_accuracy
2580
- value: 89.34877944657896
2581
- - type: euclidean_ap
2582
- value: 86.71257651501476
2583
- - type: euclidean_f1
2584
- value: 79.10386355986088
2585
- - type: euclidean_precision
2586
- value: 76.91468470434214
2587
- - type: euclidean_recall
2588
- value: 81.4213119802895
2589
- - type: manhattan_accuracy
2590
- value: 89.35848177901967
2591
- - type: manhattan_ap
2592
- value: 86.69330615469126
2593
- - type: manhattan_f1
2594
- value: 79.13867741453949
2595
- - type: manhattan_precision
2596
- value: 76.78881807647741
2597
- - type: manhattan_recall
2598
- value: 81.63689559593472
2599
- - type: max_accuracy
2600
- value: 89.35848177901967
2601
- - type: max_ap
2602
- value: 86.71257651501476
2603
- - type: max_f1
2604
- value: 79.13867741453949
2605
- license: apache-2.0
2606
- language:
2607
- - en
2608
- ---
2609
-
2610
-
2611
- # nomic-embed-text-v1: A Reproducible Long Context (8192) Text Embedder
2612
-
2613
- `nomic-embed-text-v1` is an 8192 context length text encoder that surpasses OpenAI text-embedding-ada-002 and text-embedding-3-small on both short and long context tasks.
2614
-
2615
-
2616
-
2617
- | Name | SeqLen | MTEB | LoCo | Jina Long Context | Open Weights | Open Training Code | Open Data |
2618
- | :-------------------------------:| :----- | :-------- | :------: | :---------------: | :-----------: | :----------------: | :---------- |
2619
- | nomic-embed-text-v1 | 8192 | **62.39** |**85.53** | 54.16 | ✅ | ✅ | ✅ |
2620
- | jina-embeddings-v2-base-en | 8192 | 60.39 | 85.45 | 51.90 | ✅ | ❌ | ❌ |
2621
- | text-embedding-3-small | 8191 | 62.26 | 82.40 | **58.20** | ❌ | ❌ | ❌ |
2622
- | text-embedding-ada-002 | 8191 | 60.99 | 52.7 | 55.25 | ❌ | ❌ | ❌ |
2623
-
2624
-
2625
- ## Hosted Inference API
2626
-
2627
- The easiest way to get started with Nomic Embed is through the Nomic Embedding API.
2628
-
2629
- Generating embeddings with the `nomic` Python client is as easy as
2630
-
2631
- ```python
2632
- from nomic import embed
2633
-
2634
- output = embed.text(
2635
- texts=['Nomic Embedding API', '#keepAIOpen'],
2636
- model='nomic-embed-text-v1',
2637
- task_type='search_document'
2638
- )
2639
-
2640
- print(output)
2641
- ```
2642
-
2643
- For more information, see the [API reference](https://docs.nomic.ai/reference/endpoints/nomic-embed-text).
2644
-
2645
- ## Data Visualization
2646
- Click the Nomic Atlas map below to visualize a 5M sample of our contrastive pretraining data!
2647
-
2648
-
2649
- [![image/webp](https://cdn-uploads.huggingface.co/production/uploads/607997c83a565c15675055b3/pjhJhuNyRfPagRd_c_iUz.webp)](https://atlas.nomic.ai/map/nomic-text-embed-v1-5m-sample)
2650
-
2651
- ## Training Details
2652
-
2653
- We train our embedder using a multi-stage training pipeline. Starting from a long-context [BERT model](https://huggingface.co/nomic-ai/nomic-bert-2048),
2654
- the first unsupervised contrastive stage trains on a dataset generated from weakly related text pairs, such as question-answer pairs from forums like StackExchange and Quora, title-body pairs from Amazon reviews, and summarizations from news articles.
2655
-
2656
- In the second finetuning stage, higher quality labeled datasets such as search queries and answers from web searches are leveraged. Data curation and hard-example mining are crucial in this stage.
2657
-
2658
- For more details, see the Nomic Embed [Technical Report](https://static.nomic.ai/reports/2024_Nomic_Embed_Text_Technical_Report.pdf) and corresponding [blog post](https://blog.nomic.ai/posts/nomic-embed-text-v1).
2659
-
2660
- The training data used to train the models is released in its entirety. For more details, see the `contrastors` [repository](https://github.com/nomic-ai/contrastors).
2661
-
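The full training code lives in the `contrastors` repository linked above; the snippet below is only a minimal, illustrative sketch of an InfoNCE-style contrastive objective with in-batch negatives, the general family of losses used in both stages. The temperature, batch construction, and (absent) hard-negative mining here are assumptions, not the exact recipe used for `nomic-embed-text-v1`.

```python
import torch
import torch.nn.functional as F

def info_nce_loss(query_emb: torch.Tensor, doc_emb: torch.Tensor, temperature: float = 0.05) -> torch.Tensor:
    """Contrastive loss with in-batch negatives.

    query_emb, doc_emb: (batch, dim) embeddings; row i of each tensor forms a positive pair.
    Every other document in the batch serves as a negative for query i.
    """
    query_emb = F.normalize(query_emb, p=2, dim=1)
    doc_emb = F.normalize(doc_emb, p=2, dim=1)
    logits = query_emb @ doc_emb.T / temperature  # (batch, batch) cosine similarities
    labels = torch.arange(logits.size(0), device=logits.device)  # positives sit on the diagonal
    return F.cross_entropy(logits, labels)

# Toy usage with random tensors standing in for encoder outputs
queries, docs = torch.randn(8, 768), torch.randn(8, 768)
print(info_nce_loss(queries, docs))
```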
2662
- ## Usage
2663
-
2664
- Note `nomic-embed-text` requires prefixes! We support the prefixes `[search_query, search_document, classification, clustering]`.
2665
- For retrieval applications, you should prepend `search_document` for all your documents and `search_query` for your queries.
2666
-
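As a concrete (hypothetical) retrieval sketch, documents are embedded with the `search_document` prefix and queries with the `search_query` prefix before computing cosine similarity; the example texts below are made up.

```python
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)

# Corpus gets the search_document prefix, the query gets the search_query prefix
docs = [
    "search_document: Nomic Embed is an open source long-context text embedding model.",
    "search_document: t-SNE is a dimensionality reduction technique by Laurens van der Maaten.",
]
query = "search_query: Who created t-SNE?"

doc_emb = model.encode(docs, convert_to_tensor=True)
query_emb = model.encode(query, convert_to_tensor=True)
print(util.cos_sim(query_emb, doc_emb))  # the t-SNE document should score highest
```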
2667
- ### Sentence Transformers
2668
- ```python
2669
- from sentence_transformers import SentenceTransformer
2670
-
2671
- model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)
2672
- sentences = ['search_query: What is TSNE?', 'search_query: Who is Laurens van der Maaten?']
2673
- embeddings = model.encode(sentences)
2674
- print(embeddings)
2675
- ```
2676
-
2677
- ### Transformers
2678
-
2679
- ```python
2680
- import torch
2681
- import torch.nn.functional as F
2682
- from transformers import AutoTokenizer, AutoModel
2683
-
2684
- def mean_pooling(model_output, attention_mask):
2685
- token_embeddings = model_output[0]
2686
- input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
2687
- return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
2688
-
2689
- sentences = ['search_query: What is TSNE?', 'search_query: Who is Laurens van der Maaten?']
2690
-
2691
- tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
2692
- model = AutoModel.from_pretrained('nomic-ai/nomic-embed-text-v1', trust_remote_code=True)
2693
- model.eval()
2694
-
2695
- encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
2696
-
2697
- with torch.no_grad():
2698
- model_output = model(**encoded_input)
2699
-
2700
- embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
2701
- embeddings = F.normalize(embeddings, p=2, dim=1)
2702
- print(embeddings)
2703
- ```
2704
-
2705
- The model natively supports scaling of the sequence length past 2048 tokens. To do so, apply the following changes:
2706
-
2707
- ```diff
2708
- - tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
2709
- + tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', model_max_length=8192)
2710
-
2711
-
2712
- - model = AutoModel.from_pretrained('nomic-ai/nomic-embed-text-v1', trust_remote_code=True)
2713
- + model = AutoModel.from_pretrained('nomic-ai/nomic-embed-text-v1', trust_remote_code=True, rotary_scaling_factor=2)
2714
- ```
2715
-
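Putting both changes together, a minimal end-to-end sketch for embedding a document longer than 2048 tokens (the placeholder text below is made up) looks like:

```python
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', model_max_length=8192)
model = AutoModel.from_pretrained('nomic-ai/nomic-embed-text-v1', trust_remote_code=True, rotary_scaling_factor=2)
model.eval()

long_document = "search_document: " + "some very long text ... " * 2000  # stand-in for a real document
encoded = tokenizer([long_document], padding=True, truncation=True, return_tensors='pt')

with torch.no_grad():
    output = model(**encoded)

# Mean-pool over the attention mask (as in the example above), then L2-normalize
mask = encoded['attention_mask'].unsqueeze(-1).float()
embedding = (output[0] * mask).sum(1) / mask.sum(1).clamp(min=1e-9)
embedding = F.normalize(embedding, p=2, dim=1)
print(embedding.shape)
```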
2716
- ### Transformers.js
2717
-
2718
- ```js
2719
- import { pipeline } from '@xenova/transformers';
2720
-
2721
- // Create a feature extraction pipeline
2722
- const extractor = await pipeline('feature-extraction', 'nomic-ai/nomic-embed-text-v1', {
2723
- quantized: false, // Comment out this line to use the quantized version
2724
- });
2725
-
2726
- // Compute sentence embeddings
2727
- const texts = ['What is TSNE?', 'Who is Laurens van der Maaten?'];
2728
- const embeddings = await extractor(texts, { pooling: 'mean', normalize: true });
2729
- console.log(embeddings);
2730
- ```
2731
-
2732
- # Join the Nomic Community
2733
-
2734
- - Nomic: [https://nomic.ai](https://nomic.ai)
2735
- - Discord: [https://discord.gg/myY5YDR8z8](https://discord.gg/myY5YDR8z8)
2736
- - Twitter: [https://twitter.com/nomic_ai](https://twitter.com/nomic_ai)
 
pretrained/nomic-ai/nomic-embed-text-v1/config.json DELETED
@@ -1,56 +0,0 @@
1
- {
2
- "activation_function": "swiglu",
3
- "architectures": [
4
- "NomicBertModel"
5
- ],
6
- "attn_pdrop": 0.0,
7
- "auto_map": {
8
- "AutoConfig": "configuration_hf_nomic_bert.NomicBertConfig",
9
- "AutoModel": "modeling_hf_nomic_bert.NomicBertModel",
10
- "AutoModelForMaskedLM": "nomic-ai/nomic-bert-2048--modeling_hf_nomic_bert.NomicBertForPreTraining"
11
- },
12
- "bos_token_id": null,
13
- "causal": false,
14
- "dense_seq_output": true,
15
- "embd_pdrop": 0.0,
16
- "eos_token_id": null,
17
- "fused_bias_fc": true,
18
- "fused_dropout_add_ln": true,
19
- "initializer_range": 0.02,
20
- "layer_norm_epsilon": 1e-12,
21
- "mlp_fc1_bias": false,
22
- "mlp_fc2_bias": false,
23
- "model_type": "nomic_bert",
24
- "n_embd": 768,
25
- "n_head": 12,
26
- "n_inner": 3072,
27
- "n_layer": 12,
28
- "n_positions": 8192,
29
- "pad_vocab_size_multiple": 64,
30
- "parallel_block": false,
31
- "parallel_block_tied_norm": false,
32
- "prenorm": false,
33
- "qkv_proj_bias": false,
34
- "reorder_and_upcast_attn": false,
35
- "resid_pdrop": 0.0,
36
- "rotary_emb_base": 1000,
37
- "rotary_emb_fraction": 1.0,
38
- "rotary_emb_interleaved": false,
39
- "rotary_emb_scale_base": null,
40
- "rotary_scaling_factor": 2,
41
- "scale_attn_by_inverse_layer_idx": false,
42
- "scale_attn_weights": true,
43
- "summary_activation": null,
44
- "summary_first_dropout": 0.0,
45
- "summary_proj_to_labels": true,
46
- "summary_type": "cls_index",
47
- "summary_use_proj": true,
48
- "torch_dtype": "float32",
49
- "transformers_version": "4.34.0",
50
- "type_vocab_size": 2,
51
- "use_cache": true,
52
- "use_flash_attn": true,
53
- "use_rms_norm": false,
54
- "use_xentropy": true,
55
- "vocab_size": 30528
56
- }
 
pretrained/nomic-ai/nomic-embed-text-v1/config_sentence_transformers.json DELETED
@@ -1,7 +0,0 @@
1
- {
2
- "__version__": {
3
- "sentence_transformers": "2.4.0.dev0",
4
- "transformers": "4.37.2",
5
- "pytorch": "2.1.0+cu121"
6
- }
7
- }
 
pretrained/nomic-ai/nomic-embed-text-v1/configuration_hf_nomic_bert.py DELETED
@@ -1,53 +0,0 @@
1
- from transformers import GPT2Config
2
-
3
-
4
- class NomicBertConfig(GPT2Config):
5
- model_type = "nomic_bert"
6
-
7
- def __init__(self,
8
- prenorm=False,
9
- parallel_block=False,
10
- parallel_block_tied_norm=False,
11
- rotary_emb_fraction=0.0,
12
- fused_dropout_add_ln=False,
13
- fused_bias_fc=False,
14
- use_flash_attn=False,
15
- use_xentropy=False,
16
- qkv_proj_bias=True,
17
- rotary_emb_base=1000,
18
- rotary_emb_scale_base=None,
19
- rotary_emb_interleaved=False,
20
- mlp_fc1_bias=True,
21
- mlp_fc2_bias=True,
22
- use_rms_norm=False,
23
- causal=False,
24
- type_vocab_size=2,
25
- dense_seq_output=True,
26
- pad_vocab_size_multiple=1,
27
- tie_word_embeddings=True,
28
- rotary_scaling_factor=1.0,
29
- **kwargs,
30
- ):
31
- self.prenorm = prenorm
32
- self.parallel_block = parallel_block
33
- self.parallel_block_tied_norm = parallel_block_tied_norm
34
- self.rotary_emb_fraction = rotary_emb_fraction
35
- self.tie_word_embeddings = tie_word_embeddings
36
- self.fused_dropout_add_ln = fused_dropout_add_ln
37
- self.fused_bias_fc = fused_bias_fc
38
- self.use_flash_attn = use_flash_attn
39
- self.use_xentropy = use_xentropy
40
- self.qkv_proj_bias = qkv_proj_bias
41
- self.rotary_emb_base = rotary_emb_base
42
- self.rotary_emb_scale_base = rotary_emb_scale_base
43
- self.rotary_emb_interleaved = rotary_emb_interleaved
44
- self.mlp_fc1_bias = mlp_fc1_bias
45
- self.mlp_fc2_bias = mlp_fc2_bias
46
- self.use_rms_norm = use_rms_norm
47
- self.causal = causal
48
- self.type_vocab_size = type_vocab_size
49
- self.dense_seq_output = dense_seq_output
50
- self.pad_vocab_size_multiple = pad_vocab_size_multiple
51
- self.rotary_scaling_factor = rotary_scaling_factor
52
-
53
- super().__init__(**kwargs)
 
pretrained/nomic-ai/nomic-embed-text-v1/model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:47e396424a085a613034450cd4bf9e8acfb568b19089ae1c5c4e7051ae286877
3
- size 546938168
 
pretrained/nomic-ai/nomic-embed-text-v1/modeling_hf_nomic_bert.py DELETED
@@ -1,1238 +0,0 @@
1
- # Copyright (c) 2022, Tri Dao.
2
- # This BERT implementation is based on our MLPerf 2.0 and MLPerf 2.1 BERT implementation.
3
- # https://github.com/mlcommons/training_results_v2.0/blob/main/HazyResearch/benchmarks/bert/implementations/pytorch/modeling.py
4
- # https://github.com/mlcommons/training_results_v2.1/blob/main/Azure-HazyResearch/benchmarks/bert/implementations/ND96amsr_A100_v4/modeling.py
5
-
6
- # Inspired by https://github.com/huggingface/transformers/blob/main/src/transformers/models/bert/modeling_bert.py
7
- import os
8
- import logging
9
- from functools import partial
10
- from typing import Optional, List, Tuple, Union
11
-
12
- import torch
13
- import torch.nn as nn
14
- import torch.nn.functional as F
15
- from einops import rearrange, repeat
16
- from transformers import GPT2Config, PreTrainedModel
17
- from transformers.models.bert.modeling_bert import (
18
- BaseModelOutputWithPoolingAndCrossAttentions,
19
- MaskedLMOutput,
20
- SequenceClassifierOutput
21
- )
22
-
23
- import re
24
- from collections import OrderedDict
25
- from safetensors.torch import load_file as safe_load_file
26
- from transformers.utils import (
27
- SAFE_WEIGHTS_INDEX_NAME,
28
- SAFE_WEIGHTS_NAME,
29
- WEIGHTS_INDEX_NAME,
30
- WEIGHTS_NAME,
31
- )
32
- from transformers.utils.hub import cached_file, get_checkpoint_shard_files
33
-
34
-
35
- from .configuration_hf_nomic_bert import NomicBertConfig
36
-
37
- logger = logging.getLogger(__name__)
38
-
39
- # adapted from flash attention, added safe serialization option for hf models
40
- def state_dict_from_pretrained(model_name, safe_serialization=False, device=None, dtype=None):
41
- # If not fp32, then we don't want to load directly to the GPU
42
- mapped_device = "cpu" if dtype not in [torch.float32, None] else device
43
- is_sharded = False
44
- load_safe = False
45
- resolved_archive_file = None
46
-
47
- weights_path = os.path.join(model_name, WEIGHTS_NAME)
48
- weights_index_path = os.path.join(model_name, WEIGHTS_INDEX_NAME)
49
- safe_weights_path = os.path.join(model_name, SAFE_WEIGHTS_NAME)
50
- safe_weights_index_path = os.path.join(model_name, SAFE_WEIGHTS_INDEX_NAME)
51
-
52
- if os.path.isfile(weights_path):
53
- resolved_archive_file = cached_file(
54
- model_name, WEIGHTS_NAME, _raise_exceptions_for_missing_entries=False
55
- )
56
- elif os.path.isfile(weights_index_path):
57
- resolved_archive_file = cached_file(
58
- model_name, WEIGHTS_INDEX_NAME, _raise_exceptions_for_missing_entries=False
59
- )
60
- is_sharded = True
61
- elif os.path.isfile(safe_weights_path):
62
- resolved_archive_file = cached_file(
63
- model_name, SAFE_WEIGHTS_NAME, _raise_exceptions_for_missing_entries=False
64
- )
65
- load_safe = True
66
- elif os.path.isfile(safe_weights_index_path):
67
- resolved_archive_file = cached_file(
68
- model_name, SAFE_WEIGHTS_INDEX_NAME, _raise_exceptions_for_missing_entries=False
69
- )
70
- is_sharded = True
71
- load_safe = True
72
- else: # Try loading from HF hub instead of from local files
73
- weight_name = WEIGHTS_NAME if not safe_serialization else SAFE_WEIGHTS_NAME
74
- resolved_archive_file = cached_file(model_name, weight_name, _raise_exceptions_for_missing_entries=False)
75
- if resolved_archive_file is None:
76
- weight_index = WEIGHTS_INDEX_NAME if not safe_serialization else SAFE_WEIGHTS_INDEX_NAME
77
- resolved_archive_file = cached_file(model_name, weight_index,
78
- _raise_exceptions_for_missing_entries=False)
79
- if resolved_archive_file is not None:
80
- is_sharded = True
81
-
82
- load_safe = safe_serialization
83
-
84
- if resolved_archive_file is None:
85
- raise EnvironmentError(f"Model name {model_name} was not found.")
86
-
87
- if load_safe:
88
- loader = partial(safe_load_file, device=mapped_device)
89
- else:
90
- loader = partial(torch.load, map_location=mapped_device)
91
-
92
- if is_sharded:
93
- # resolved_archive_file becomes a list of files that point to the different
94
- # checkpoint shards in this case.
95
- resolved_archive_file, sharded_metadata = get_checkpoint_shard_files(
96
- model_name, resolved_archive_file
97
- )
98
- state_dict = {}
99
- for sharded_file in resolved_archive_file:
100
- state_dict.update(loader(sharded_file))
101
- else:
102
- state_dict = loader(resolved_archive_file)
103
- # Convert dtype before moving to GPU to save memory
104
- if dtype is not None:
105
- state_dict = {k: v.to(dtype=dtype) for k, v in state_dict.items()}
106
- state_dict = {k: v.to(device=device) for k, v in state_dict.items()}
107
- return state_dict
108
-
109
-
110
- def filter_shapes(state_dict, model):
111
- """
112
- Filters the state dict to match the current model shape.
113
- """
114
- filtered_state_dict = {}
115
- for key, value in state_dict.items():
116
- if key in model.state_dict():
117
- if value.shape == model.state_dict()[key].shape:
118
- filtered_state_dict[key] = value
119
- return filtered_state_dict
120
-
121
-
122
- def remap_bert_state_dict(state_dict, config, remove_bert=False, remove_cls_weights=False, add_pooling_layer=False):
123
- """
124
- Map the state_dict of a Huggingface BERT model to be flash_attn compatible.
125
- """
126
- def add_bert_prefix(key):
127
- # prepend bert. to the key
128
- if key.startswith("bert.") or key.startswith("cls."):
129
- return key
130
- return f"bert.{key}"
131
-
132
- state_dict = OrderedDict((add_bert_prefix(k), v) for k, v in state_dict.items())
133
-
134
- # LayerNorm
135
- def key_mapping_ln_gamma_beta(key):
136
- key = re.sub(r"LayerNorm.gamma$", "LayerNorm.weight", key)
137
- key = re.sub(r"LayerNorm.beta$", "LayerNorm.bias", key)
138
- return key
139
-
140
- state_dict = OrderedDict((key_mapping_ln_gamma_beta(k), v) for k, v in state_dict.items())
141
-
142
- # Layers
143
- def key_mapping_layers(key):
144
- return re.sub(r"^bert.encoder.layer\.", "bert.encoder.layers.", key)
145
-
146
- state_dict = OrderedDict((key_mapping_layers(k), v) for k, v in state_dict.items())
147
-
148
- # LayerNorm
149
- def key_mapping_ln(key):
150
- key = re.sub(r"^bert.embeddings.LayerNorm.", "bert.emb_ln.", key)
151
- key = re.sub(
152
- r"^bert.encoder.layers.(\d+).attention.output.LayerNorm.(weight|bias)",
153
- r"bert.encoder.layers.\1.norm1.\2",
154
- key,
155
- )
156
- key = re.sub(
157
- r"^bert.encoder.layers.(\d+).output.LayerNorm.(weight|bias)",
158
- r"bert.encoder.layers.\1.norm2.\2",
159
- key,
160
- )
161
- key = re.sub(
162
- r"^cls.predictions.transform.LayerNorm.(weight|bias)",
163
- r"cls.predictions.transform.layer_norm.\1",
164
- key,
165
- )
166
- return key
167
-
168
- state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items())
169
-
170
- # MLP
171
- def key_mapping_mlp(key):
172
- key = re.sub(
173
- r"^bert.encoder.layers.(\d+).intermediate.dense.(weight|bias)",
174
- r"bert.encoder.layers.\1.mlp.fc1.\2",
175
- key,
176
- )
177
- key = re.sub(
178
- r"^bert.encoder.layers.(\d+).output.dense.(weight|bias)",
179
- r"bert.encoder.layers.\1.mlp.fc2.\2",
180
- key,
181
- )
182
- return key
183
-
184
- state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items())
185
-
186
- # Attention
187
- last_layer_subset = getattr(config, "last_layer_subset", False)
188
- for d in range(config.num_hidden_layers):
189
- if f"bert.encoder.layers.{d}.attention.self.query.weight" not in state_dict:
190
- continue
191
- Wq = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.query.weight")
192
- Wk = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.key.weight")
193
- Wv = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.value.weight")
194
- bq = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.query.bias")
195
- bk = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.key.bias")
196
- bv = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.value.bias")
197
- if not (last_layer_subset and d == config.num_hidden_layers - 1):
198
- state_dict[f"bert.encoder.layers.{d}.attn.Wqkv.weight"] = torch.cat(
199
- [Wq, Wk, Wv], dim=0
200
- )
201
- state_dict[f"bert.encoder.layers.{d}.attn.Wqkv.bias"] = torch.cat([bq, bk, bv], dim=0)
202
- else:
203
- state_dict[f"bert.encoder.layers.{d}.attn.Wq.weight"] = Wq
204
- state_dict[f"bert.encoder.layers.{d}.attn.Wkv.weight"] = torch.cat([Wk, Wv], dim=0)
205
- state_dict[f"bert.encoder.layers.{d}.attn.Wq.bias"] = bq
206
- state_dict[f"bert.encoder.layers.{d}.attn.Wkv.bias"] = torch.cat([bk, bv], dim=0)
207
-
208
- def key_mapping_attn(key):
209
- return re.sub(
210
- r"^bert.encoder.layers.(\d+).attention.output.dense.(weight|bias)",
211
- r"bert.encoder.layers.\1.attn.out_proj.\2",
212
- key,
213
- )
214
-
215
- state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())
216
-
217
- def key_mapping_decoder_bias(key):
218
- return re.sub(r"^cls.predictions.bias", "cls.predictions.decoder.bias", key)
219
-
220
-
221
- # remove nsp weights, we don't use
222
- state_dict.pop("cls.seq_relationship.weight", None)
223
- state_dict.pop("cls.seq_relationship.bias", None)
224
- state_dict.pop("bert.embeddings.position_ids", None)
225
-
226
- state_dict = OrderedDict((key_mapping_decoder_bias(k), v) for k, v in state_dict.items())
227
-
228
- if remove_cls_weights:
229
- cls_weights = ["cls.predictions.decoder.bias",
230
- "cls.predictions.transform.dense.weight",
231
- "cls.predictions.transform.dense.bias",
232
- "cls.predictions.transform.layer_norm.weight",
233
- "cls.predictions.transform.layer_norm.bias",
234
- "cls.predictions.decoder.weight"]
235
- for weight in cls_weights:
236
- state_dict.pop(weight, None)
237
-
238
- # Word embedding
239
- pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
240
- if pad_vocab_size_multiple > 1:
241
- word_embeddings = state_dict["bert.embeddings.word_embeddings.weight"]
242
- state_dict["bert.embeddings.word_embeddings.weight"] = F.pad(
243
- word_embeddings, (0, 0, 0, config.vocab_size - word_embeddings.shape[0])
244
- )
245
- if not remove_cls_weights:
246
- decoder_weight = state_dict["cls.predictions.decoder.weight"]
247
- state_dict["cls.predictions.decoder.weight"] = F.pad(
248
- decoder_weight, (0, 0, 0, config.vocab_size - decoder_weight.shape[0])
249
- )
250
- # If the vocab was padded, we want to set the decoder bias for those padded indices to be
251
- # strongly negative (i.e. the decoder shouldn't predict those indices).
252
- # TD [2022-05-09]: I don't think it affects the MLPerf training.
253
- if "cls.predictions.decoder.bias" in state_dict:
254
- decoder_bias = state_dict["cls.predictions.decoder.bias"]
255
- state_dict["cls.predictions.decoder.bias"] = F.pad(
256
- decoder_bias, (0, config.vocab_size - decoder_bias.shape[0]), value=-100.0
257
- )
258
-
259
- if add_pooling_layer is False:
260
- pooler_weights = ["bert.pooler.dense.weight",
261
- "bert.pooler.dense.bias",
262
- ]
263
- for key in pooler_weights:
264
- state_dict.pop(key, None)
265
-
266
- if remove_bert:
267
- def remove_bert_prefix(key):
268
- key = re.sub(r"^bert.", "", key)
269
- return key
270
-
271
- state_dict = OrderedDict((remove_bert_prefix(k), v) for k, v in state_dict.items())
272
-
273
-
274
- return state_dict
275
-
276
-
277
- class NomicBertPreTrainedModel(PreTrainedModel):
278
- """An abstract class to handle weights initialization and
279
- a simple interface for downloading and loading pretrained models.
280
- """
281
- config_class = NomicBertConfig
282
- base_model_prefix = "model"
283
- supports_gradient_checkpointing = True
284
- _no_split_modules = ["Block"]
285
- _skip_keys_device_placement = "past_key_values"
286
-
287
- def __init__(self, config, *inputs, **kwargs):
288
- super().__init__(config)
289
- if not isinstance(config, GPT2Config):
290
- raise ValueError(
291
- "Parameter config in `{}(config)` should be an instance of class `GPT2Config`. "
292
- "To create a model from a Google pretrained model use "
293
- "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
294
- self.__class__.__name__, self.__class__.__name__
295
- )
296
- )
297
- self.config = config
298
-
299
- @classmethod
300
- def from_pretrained(cls, model_name, config=None, *inputs, **kwargs):
301
- """
302
- Instantiate a NomicBertPreTrainedModel from a pre-trained model file or a pytorch state dict.
303
- Download and cache the pre-trained model file if needed.
304
-
305
- Params:
306
- pretrained_model_name_or_path: either:
307
- - a path or url to a pretrained model archive containing:
308
- . `bert_config.json` a configuration file for the model
309
- . `pytorch_model.bin` a PyTorch dump of a NomicBertForPretraining instance
310
- - a path or url to a pretrained model archive containing:
311
- . `bert_config.json` a configuration file for the model
312
- . `model.chkpt` a TensorFlow checkpoint
313
- *inputs, **kwargs: additional input for the specific NomicBert class
314
- (ex: num_labels for NomicBertForSequenceClassification)
315
- """
316
- # Instantiate model.
317
- if config is None:
318
- config = cls.config_class.from_pretrained(model_name)
319
- remove_cls = cls != NomicBertForPreTraining
320
- remove_bert_prefix = cls != NomicBertForPreTraining
321
- ignore_mismatched_shapes = kwargs.pop("ignore_mismatched_sizes", False)
322
- num_labels = kwargs.pop("num_labels", None)
323
- rotary_scaling_factor = kwargs.pop("rotary_scaling_factor", None)
324
- if rotary_scaling_factor:
325
- config.rotary_scaling_factor = rotary_scaling_factor
326
- else:
327
- config.rotary_scaling_factor = None
328
- if config.n_positions <= 0 and config.rotary_emb_fraction > 0:
329
- config.n_positions = 2048
330
- if num_labels:
331
- config.num_labels = num_labels
332
-
333
- if "add_pooling_layer" in kwargs:
334
- model = cls(config, *inputs, add_pooling_layer=kwargs.pop("add_pooling_layer"))
335
- else:
336
- if cls == NomicBertModel:
337
- model = cls(config, *inputs, add_pooling_layer=False)
338
- else:
339
- model = cls(config, *inputs)
340
- # TODO: fix this
341
- # Assuming we know what we're doing when loading from disk
342
- # Prob a bad assumption but i'm tired and want to train this asap
343
- if os.path.exists(model_name):
344
- state_dict = torch.load(f"{model_name}/pytorch_model.bin")
345
- if ignore_mismatched_shapes:
346
- state_dict = filter_shapes(state_dict, model)
347
- load_return = model.load_state_dict(state_dict, strict=False)
348
- else:
349
- # TODO: can probably check config class and see if we need to remap from a bert model
350
- state_dict = state_dict_from_pretrained(model_name)
351
- state_dict = remap_bert_state_dict(state_dict,
352
- config,
353
- remove_bert=remove_bert_prefix,
354
- remove_cls_weights=remove_cls,
355
- add_pooling_layer=getattr(config, "add_pooling_layer", False)
356
- )
357
- if ignore_mismatched_shapes:
358
- state_dict = filter_shapes(state_dict, model)
359
-
360
- load_return = model.load_state_dict(
361
- state_dict,
362
- strict=True
363
- )
364
- logger.warning(load_return)
365
- return model
366
-
367
- def _set_gradient_checkpointing(self, module, value=False):
368
- if isinstance(module, NomicBertEncoder):
369
- module.gradient_checkpointing = value
370
-
371
-
372
- # https://github.com/huggingface/transformers/blob/7032e0203262ebb2ebf55da8d2e01f873973e835/src/transformers/models/bert/modeling_bert.py#L748
373
- def _init_weights(module, initializer_range=0.02):
374
- if isinstance(module, nn.Linear):
375
- nn.init.normal_(module.weight, std=initializer_range)
376
- if module.bias is not None:
377
- nn.init.zeros_(module.bias)
378
- elif isinstance(module, nn.Embedding):
379
- nn.init.normal_(module.weight, std=initializer_range)
380
- if module.padding_idx is not None:
381
- nn.init.zeros_(module.weight[module.padding_idx])
382
-
383
-
384
- class NomicBertEmbeddings(nn.Module):
385
- def __init__(
386
- self,
387
- config
388
- ):
389
- """
390
- If max_position_embeddings <= 0, there's no position embeddings
391
- If type_vocab_size <= 0, there's no token type embeddings
392
- """
393
- super().__init__()
394
- self.word_embeddings = nn.Embedding(
395
- config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id
396
- )
397
- self.max_position_embeddings = config.max_position_embeddings if config.rotary_emb_fraction <= 0 else 0
398
- self.type_vocab_size = config.type_vocab_size
399
- if self.max_position_embeddings > 0 and config.rotary_emb_fraction <= 0:
400
- self.position_embeddings = nn.Embedding(
401
- config.max_position_embeddings, config.hidden_size,
402
- )
403
- if self.type_vocab_size > 0:
404
- self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
405
-
406
- def forward(self, input_ids, position_ids=None, token_type_ids=None):
407
- """
408
- input_ids: (batch, seqlen)
409
- position_ids: (batch, seqlen)
410
- token_type_ids: (batch, seqlen)
411
- """
412
- batch_size, seqlen = input_ids.shape
413
- embeddings = self.word_embeddings(input_ids)
414
-
415
- if self.type_vocab_size > 0:
416
- if token_type_ids is None:
417
- token_type_ids = torch.zeros(seqlen, dtype=torch.long, device=input_ids.device)
418
- token_type_embeddings = self.token_type_embeddings(token_type_ids)
419
- embeddings = embeddings + token_type_embeddings
420
-
421
- if self.max_position_embeddings > 0:
422
- if position_ids is None:
423
- position_ids = torch.arange(seqlen, dtype=torch.long, device=input_ids.device)
424
- position_embeddings = self.position_embeddings(position_ids)
425
- embeddings = embeddings + position_embeddings
426
- return embeddings
427
-
428
- class NomicBertMLP(nn.Module):
429
- def __init__(
430
- self,
431
- in_features,
432
- hidden_features=None,
433
- out_features=None,
434
- activation=F.gelu,
435
- bias1=True,
436
- bias2=True,
437
- return_residual=False,
438
- fused_bias_fc=False,
439
- ):
440
- super().__init__()
441
- out_features = out_features if out_features is not None else in_features
442
- hidden_features = hidden_features if hidden_features is not None else in_features * 4
443
- self.return_residual = return_residual
444
- self.fc1 = nn.Linear(in_features, hidden_features, bias=bias1)
445
- approximate = (
446
- "tanh"
447
- if activation in ["gelu_new", "gelu_fast", "gelu_pytorch_tanh"]
448
- else "none"
449
- )
450
- self.activation = nn.GELU(approximate=approximate) if activation == "gelu" else activation
451
- self.fc2 = nn.Linear(hidden_features, out_features, bias=bias2)
452
-
453
- def forward(self, x):
454
- y = self.fc1(x)
455
- y = self.activation(y)
456
- y = self.fc2(y)
457
- return y if not self.return_residual else (y, x)
458
-
459
-
460
- class NomciBertGatedMLP(nn.Module):
461
- def __init__(
462
- self,
463
- in_features,
464
- hidden_features=None,
465
- out_features=None,
466
- activation=F.sigmoid,
467
- bias1=True,
468
- bias2=True,
469
- multiple_of=256,
470
- return_residual=False,
471
- fused_bias_fc=True,
472
- device=None,
473
- dtype=None,
474
- ):
475
- super().__init__()
476
- out_features = out_features if out_features is not None else in_features
477
- hidden_features = (
478
- hidden_features if hidden_features is not None else int(8 * in_features / 3)
479
- )
480
- hidden_features = (hidden_features + multiple_of - 1) // multiple_of * multiple_of
481
- self.return_residual = return_residual
482
-
483
- self.fc11 = nn.Linear(in_features, hidden_features, bias=bias1)
484
- self.fc12 = nn.Linear(in_features, hidden_features, bias=bias1)
485
- self.activation = activation
486
- self.fc2 = nn.Linear(hidden_features, out_features, bias=bias2)
487
-
488
- def forward(self, x):
489
- y = self.fc11(x)
490
- gate = self.fc12(x)
491
- if self.activation == F.sigmoid: # Special case for GLU
492
- y = F.glu(torch.cat([y, gate], dim=-1), dim=-1)
493
- else:
494
- y = y * self.activation(gate)
495
- y = self.fc2(y)
496
- return y if not self.return_residual else (y, x)
497
-
498
-
499
- def rotate_half(x, interleaved=False):
500
- if not interleaved:
501
- x1, x2 = x.chunk(2, dim=-1)
502
- return torch.cat((-x2, x1), dim=-1)
503
- else:
504
- x1, x2 = x[..., ::2], x[..., 1::2]
505
- return rearrange(torch.stack((-x2, x1), dim=-1), "... d two -> ... (d two)", two=2)
506
-
507
-
508
- def apply_rotary_emb(x, cos, sin, offset=0, interleaved=False):
509
- """
510
- x: (batch_size, seqlen, nheads, headdim)
511
- cos, sin: (seqlen, rotary_dim / 2) or (batch_size, seqlen, rotary_dim / 2)
512
- """
513
- ro_dim = cos.shape[-1] * 2
514
- assert ro_dim <= x.shape[-1]
515
- cos, sin = (
516
- cos[offset: offset + x.shape[1]],
517
- sin[offset: offset + x.shape[1]],
518
- )
519
- cos = repeat(cos, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)")
520
- sin = repeat(sin, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)")
521
- return torch.cat(
522
- [x[..., :ro_dim] * cos + rotate_half(x[..., :ro_dim], interleaved) * sin, x[..., ro_dim:]],
523
- dim=-1,
524
- )
525
-
526
-
527
- class NomicBertRotaryEmbedding(nn.Module):
528
- def __init__(
529
- self,
530
- dim: int,
531
- base=10000.0,
532
- interleaved=False,
533
- scale_base=None,
534
- pos_idx_in_fp32=True,
535
- device=None,
536
- ):
537
- """
538
- interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
539
- of 1st half and 2nd half (GPT-NeoX style).
540
- pos_idx_in_fp32: if True, the position indices [0.0, ..., seqlen - 1] are in fp32,
541
- otherwise they might be in lower precision.
542
- This option was added because previously (before 2023-07-02), when we construct
543
- the position indices, we use the dtype of self.inv_freq. In most cases this would
544
- be fp32, but if the model is trained in pure bf16 (not mixed precision), then
545
- self.inv_freq would be bf16, and the position indices are also in bf16.
546
- Because of the limited precision of bf16 (e.g. 1995.0 is rounded to 2000.0), the
547
- embeddings for some positions will coincide.
548
- To maintain compatibility with models previously trained in pure bf16,
549
- we add this option.
550
- """
551
- super().__init__()
552
- self.dim = dim
553
- self.base = float(base)
554
- self.pos_idx_in_fp32 = pos_idx_in_fp32
555
- # Generate and save the inverse frequency buffer (non trainable)
556
- inv_freq = self._compute_inv_freq(device)
557
- self.register_buffer("inv_freq", inv_freq, persistent=False)
558
- self.interleaved = interleaved
559
- self.scale_base = scale_base
560
- scale = (
561
- (torch.arange(0, dim, 2, device=device, dtype=torch.float32) + 0.4 * dim) / (1.4 * dim)
562
- if scale_base is not None
563
- else None
564
- )
565
- self.register_buffer("scale", scale, persistent=False)
566
-
567
- self._seq_len_cached = 0
568
- self._cos_cached = None
569
- self._sin_cached = None
570
- self._cos_k_cached = None
571
- self._sin_k_cached = None
572
-
573
- def _compute_inv_freq(self, device=None):
574
- return 1.0 / (
575
- self.base
576
- ** (torch.arange(0, self.dim, 2, device=device, dtype=torch.float32) / self.dim)
577
- )
578
-
579
- def _update_cos_sin_cache(self, seqlen, device=None, dtype=None):
580
- # Reset the tables if the sequence length has changed,
581
- # if we're on a new device (possibly due to tracing for instance),
582
- # or if we're switching from inference mode to training
583
- if (
584
- seqlen > self._seq_len_cached
585
- or self._cos_cached is None
586
- or self._cos_cached.device != device
587
- or self._cos_cached.dtype != dtype
588
- or (self.training and self._cos_cached.is_inference())
589
- ):
590
- self._seq_len_cached = seqlen
591
- # We want fp32 here, not self.inv_freq.dtype, since the model could be loaded in bf16
592
- # And the output of arange can be quite large, so bf16 would lose a lot of precision.
593
- # However, for compatibility reason, we add an option to use the dtype of self.inv_freq.
594
- if self.pos_idx_in_fp32:
595
- t = torch.arange(seqlen, device=device, dtype=torch.float32)
596
- # We want fp32 here as well since inv_freq will be multiplied with t, and the output
597
- # will be large. Having it in bf16 will lose a lot of precision and cause the
598
- # cos & sin output to change significantly.
599
- # We want to recompute self.inv_freq if it was not loaded in fp32
600
- if self.inv_freq.dtype != torch.float32:
601
- inv_freq = self._compute_inv_freq(device=device)
602
- else:
603
- inv_freq = self.inv_freq
604
- else:
605
- t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
606
- inv_freq = self.inv_freq
607
- # Don't do einsum, it converts fp32 to fp16 under AMP
608
- # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
609
- freqs = torch.outer(t, inv_freq)
610
- self._cos_cached = torch.cos(freqs).to(dtype)
611
- self._sin_cached = torch.sin(freqs).to(dtype)
612
-
613
- def forward(
614
- self,
615
- qkv: torch.Tensor,
616
- kv: Optional[torch.Tensor] = None,
617
- seqlen_offset: Union[int, torch.Tensor] = 0,
618
- max_seqlen: Optional[int] = None,
619
- ) -> Tuple[torch.Tensor, torch.Tensor]:
620
- """
621
- qkv: (batch, seqlen, 3, nheads, headdim) if kv is none,
622
- else it's just q of shape (batch, seqlen, nheads, headdim)
623
- kv: (batch, seqlen, 2, nheads, headdim)
624
- seqlen_offset: (batch_size,) or int. Each sequence in x is shifted by this amount.
625
- Most commonly used in inference when we have KV cache.
626
- If it's a tensor of shape (batch_size,), then to update the cos / sin cache, one
627
- should pass in max_seqlen, which will update the cos / sin cache up to that length.
628
- Apply rotary embedding *inplace* to qkv and / or kv.
629
- """
630
- seqlen = qkv.shape[1]
631
- if seqlen > self._seq_len_cached:
632
- self._update_cos_sin_cache(seqlen, device=qkv.device, dtype=qkv.dtype)
633
- elif max_seqlen is not None:
634
- self._update_cos_sin_cache(max_seqlen, device=qkv.device, dtype=qkv.dtype)
635
- elif isinstance(seqlen_offset, int):
636
- self._update_cos_sin_cache(seqlen + seqlen_offset, device=qkv.device, dtype=qkv.dtype)
637
-
638
- q_rot = apply_rotary_emb(qkv[:, :, 0], self._cos_cached, self._sin_cached, seqlen_offset, self.interleaved)
639
- k_rot = apply_rotary_emb(qkv[:, :, 1], self._cos_cached, self._sin_cached, seqlen_offset, self.interleaved)
640
- return torch.stack((q_rot, k_rot, qkv[:, :, 2]), dim=2)
641
-
642
-
643
- class NomicBertDynamicNTKRotaryEmbedding(NomicBertRotaryEmbedding):
644
- def __init__(self, rotary_scaling_factor, max_position_embeddings, **kwargs):
645
- super().__init__(**kwargs)
646
- self.rotary_scaling_factor = rotary_scaling_factor
647
- self.max_position_embeddings = max_position_embeddings
648
-
649
-
650
- def _compute_inv_freq(self, base=None, device=None):
651
- if base is None:
652
- base = self.base
653
- return 1.0 / (
654
- base
655
- ** (torch.arange(0, self.dim, 2, device=device, dtype=torch.float32) / self.dim)
656
- )
657
-
658
- def _update_cos_sin_cache(self, seqlen, device=None, dtype=None):
659
- # Reset the tables if the sequence length has changed,
660
- # if we're on a new device (possibly due to tracing for instance),
661
- # or if we're switching from inference mode to training
662
- if seqlen > self.max_position_embeddings:
663
- base = self.base * (
664
- (self.rotary_scaling_factor * seqlen / self.max_position_embeddings) - (self.rotary_scaling_factor - 1)
665
- ) ** (self.dim / (self.dim - 2))
666
- inv_freq = self._compute_inv_freq(base=base, device=device)
667
- self.register_buffer("inv_freq", inv_freq, persistent=False)
668
-
669
- if (
670
- seqlen > self._seq_len_cached
671
- or self._cos_cached is None
672
- or self._cos_cached.device != device
673
- or self._cos_cached.dtype != dtype
674
- or (self.training and self._cos_cached.is_inference())
675
- ):
676
- self._seq_len_cached = seqlen
677
- # We want fp32 here, not self.inv_freq.dtype, since the model could be loaded in bf16
678
- # And the output of arange can be quite large, so bf16 would lose a lot of precision.
679
- # However, for compatibility reason, we add an option to use the dtype of self.inv_freq.
680
- if self.pos_idx_in_fp32:
681
- t = torch.arange(seqlen, device=device, dtype=torch.float32)
682
- # We want fp32 here as well since inv_freq will be multiplied with t, and the output
683
- # will be large. Having it in bf16 will lose a lot of precision and cause the
684
- # cos & sin output to change significantly.
685
- # We want to recompute self.inv_freq if it was not loaded in fp32
686
- if self.inv_freq.dtype != torch.float32:
687
- if seqlen > self.max_position_embeddings:
688
- base = self.base * (
689
- (self.scaling_factor * seqlen / self.max_position_embeddings) - (self.scaling_factor - 1)
690
- ) ** (self.dim / (self.dim - 2))
691
- else:
692
- base = self.base
693
- inv_freq = self._compute_inv_freq(device=device, base=base)
694
- else:
695
- inv_freq = self.inv_freq
696
- else:
697
- t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
698
- inv_freq = self.inv_freq
699
- # Don't do einsum, it converts fp32 to fp16 under AMP
700
- # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
701
- freqs = torch.outer(t, inv_freq)
702
- if self.scale is None:
703
- self._cos_cached = torch.cos(freqs).to(dtype)
704
- self._sin_cached = torch.sin(freqs).to(dtype)
705
- else:
706
- power = (
707
- torch.arange(seqlen, dtype=self.scale.dtype, device=self.scale.device)
708
- - seqlen // 2
709
- ) / self.scale_base
710
- scale = self.scale.to(device=power.device) ** rearrange(power, "s -> s 1")
711
- # We want the multiplication by scale to happen in fp32
712
- self._cos_cached = (torch.cos(freqs) * scale).to(dtype)
713
- self._sin_cached = (torch.sin(freqs) * scale).to(dtype)
714
- self._cos_k_cached = (torch.cos(freqs) / scale).to(dtype)
715
- self._sin_k_cached = (torch.sin(freqs) / scale).to(dtype)
716
-
717
- class NomicBertAttention(nn.Module):
718
- """Multi-head self-attention and cross-attention"""
719
-
720
- def __init__(
721
- self,
722
- config,
723
- ) -> None:
724
- """
725
- num_heads_kv: can be used to toggle MQA / GQA. If None, use num_heads.
726
- return_residual: whether to return the input x along with the output. This is for
727
- performance reason: for post-norm architecture, returning the input allows us
728
- to fuse the backward of nn.Linear with the residual connection.
729
- """
730
- super().__init__()
731
- self.embed_dim = config.n_embd
732
- self.use_flash_attn = config.use_flash_attn
733
- self.fused_bias_fc = config.fused_bias_fc
734
-
735
- self.num_heads = config.n_head
736
- self.num_heads_kv = config.num_heads_kv if getattr(config, "num_heads_kv", None) is not None else self.num_heads
737
- assert self.embed_dim % self.num_heads == 0, "embed_dim must be divisible by num_heads"
738
- self.head_dim = self.embed_dim // self.num_heads
739
- # we don't really support mqa / gqa for now
740
- qkv_dim = self.head_dim * (self.num_heads + 2 * self.num_heads_kv)
741
-
742
- self.register_buffer(
743
- "norm_factor",
744
- torch.sqrt(torch.tensor(self.head_dim, dtype=torch.float32)).to(torch.get_default_dtype()),
745
- persistent=False,
746
- )
747
-
748
- self.rotary_emb_dim = self.head_dim * config.rotary_emb_fraction
749
- if self.rotary_emb_dim > 0:
750
- if config.rotary_scaling_factor:
751
- self.rotary_emb = NomicBertDynamicNTKRotaryEmbedding(
752
- dim=self.rotary_emb_dim,
753
- base=config.rotary_emb_base,
754
- scale_base=config.rotary_emb_scale_base,
755
- interleaved=config.rotary_emb_interleaved,
756
- rotary_scaling_factor=config.rotary_scaling_factor,
757
- max_position_embeddings=config.n_positions,
758
- )
759
- else:
760
- self.rotary_emb = NomicBertRotaryEmbedding(
761
- dim=self.rotary_emb_dim,
762
- base=config.rotary_emb_base,
763
- scale_base=config.rotary_emb_scale_base,
764
- interleaved=config.rotary_emb_interleaved,
765
- )
766
- # bug in xformers: https://github.com/facebookresearch/xformers/issues/841
767
- # uses the head dimension instead of the sequence dimension
768
- self.rotary_head_dim = getattr(config, "rotary_head_dim", False)
769
-
770
- self.Wqkv = nn.Linear(self.embed_dim, qkv_dim, bias=config.qkv_proj_bias)
771
-
772
- self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_proj_bias)
773
- self.causal = config.causal
774
- self.drop = nn.Dropout(config.attn_pdrop)
775
-
776
- def forward(
777
- self,
778
- hidden_states: torch.Tensor,
779
- attention_mask: Optional[torch.Tensor] = None,
780
- position_ids: Optional[torch.LongTensor] = None,
781
- past_key_value: Optional[Tuple[torch.Tensor]] = None,
782
- output_attentions: bool = False,
783
- use_cache: bool = False,
784
- is_padded_inputs: Optional[bool] = True,
785
- cu_seqlens: Optional[torch.Tensor] = None,
786
- max_seq_len: Optional[int] = None,
787
- ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
788
-
789
- has_layer_past = past_key_value is not None
790
-
791
- if has_layer_past:
792
- past_key_value = past_key_value[0]
793
- past_len = past_key_value[1]
794
- else:
795
- past_len = 0
796
-
797
- qkv = self.Wqkv(hidden_states)
798
- qkv = rearrange(qkv, "... (three h d) -> ... three h d", three=3, d=self.head_dim)
799
-
800
- past_key_value = (past_key_value, past_len + qkv.size(1)) if use_cache else None
801
-
802
- if self.rotary_emb_dim > 0:
803
- if self.rotary_head_dim:
804
- qkv = rearrange(qkv, "b s three h d -> b h three s d")
805
- qkv = self.rotary_emb(qkv, seqlen_offset=past_len)
806
-
807
- if self.rotary_head_dim:
808
- qkv = rearrange(qkv, "b h three s d -> b s three h d")
809
-
810
- query, key, value = qkv[:, :, 0], qkv[:, :, 1], qkv[:, :, 2]
811
-
812
- query = query.permute(0, 2, 1, 3)
813
- key = key.permute(0, 2, 1, 3)
814
- value = value.permute(0, 2, 1, 3)
815
-
816
- attention_scores = torch.matmul(query, key.transpose(-1, -2)) / self.norm_factor
817
- if attention_mask is not None:
818
- attention_scores = attention_scores + attention_mask
819
-
820
- attentions_probs = F.softmax(attention_scores, dim=-1)
821
- attentions_probs = self.drop(attentions_probs)
822
-
823
- attn_output = torch.matmul(attentions_probs, value)
824
- attn_output = rearrange(attn_output.permute(0, 2, 1, 3), "... h d -> ... (h d)")
825
-
826
- attn_output = self.out_proj(attn_output)
827
-
828
- return attn_output
829
-
830
-
831
- class NomicBertBlock(nn.Module):
832
- def __init__(
833
- self,
834
- config,
835
- ):
836
- super().__init__()
837
- self.prenorm = config.prenorm
838
- self.fused_dropout_add_ln = config.fused_dropout_add_ln
839
-
840
- self.attn = NomicBertAttention(config)
841
- activation = (
842
- F.sigmoid
843
- if config.activation_function == "glu"
844
- else (F.silu if config.activation_function == "swiglu" else F.gelu)
845
- )
846
- if config.activation_function in ["glu", "swiglu", "geglu"]:
847
- self.mlp = NomciBertGatedMLP(config.n_embd, hidden_features=config.n_inner, bias1=config.mlp_fc1_bias, bias2=config.mlp_fc2_bias, activation=activation, fused_bias_fc=config.fused_bias_fc)
848
- else:
849
- self.mlp = NomicBertMLP(config.n_embd, hidden_features=config.n_inner, bias1=config.mlp_fc1_bias, bias2=config.mlp_fc2_bias, activation=activation, fused_bias_fc=config.fused_bias_fc)
850
-
851
- self.dropout1 = nn.Dropout(config.resid_pdrop)
852
- self.norm1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
853
- self.norm2 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
854
- self.dropout2 = nn.Dropout(config.resid_pdrop)
855
-
856
- def forward(
857
- self,
858
- hidden_states: torch.Tensor,
859
- hidden_states2: torch.Tensor,
860
- residual: Optional[torch.Tensor] = None,
861
- attention_mask: Optional[torch.Tensor] = None,
862
- position_ids: Optional[torch.LongTensor] = None,
863
- past_key_value: Optional[Tuple[torch.Tensor]] = None,
864
- is_padded_inputs: Optional[bool] = True,
865
- output_attentions: Optional[bool] = False,
866
- use_cache: Optional[bool] = False,
867
- cu_seqlens: Optional[torch.Tensor] = None,
868
- max_seq_len: Optional[int] = None,
869
- ):
870
- r"""Pass the input through the encoder layer.
871
-
872
- Args:
873
- hidden_states: the sequence to the encoder layer (required).
874
- residual: if postnorm, residual=None, If prenorm, hidden_states = Attn/MLP(LN(residual))
875
- mixer_subset: for cross-attention only. If not None, will take a subset of x
876
- before applying the query projection. Useful for e.g., ViT where we only care
877
- about the CLS token in the last layer.
878
- """
879
- if self.prenorm:
880
- dropped = self.dropout1(hidden_states)
881
- residual = (dropped + residual) if residual is not None else dropped
882
- hidden_states = self.norm1(residual.to(dtype=self.norm1.weight.dtype))
883
- hidden_states = self.attn(hidden_states, attention_mask=attention_mask, is_padded_inputs=is_padded_inputs, cu_seqlens=cu_seqlens, max_seq_len=max_seq_len)
884
-
885
- dropped = self.dropout2(hidden_states)
886
- residual = (dropped + residual) if residual is not None else dropped
887
- hidden_states = self.norm2(residual.to(dtype=self.norm2.weight.dtype))
888
- hidden_states = self.mlp(hidden_states)
889
-
890
- return hidden_states, None, residual
891
- else:
892
- assert residual is None
893
- attn_outputs = self.attn(hidden_states,
894
- attention_mask=attention_mask,
895
- is_padded_inputs=is_padded_inputs,
896
- cu_seqlens=cu_seqlens,
897
- max_seq_len=max_seq_len)
898
- hidden_states = self.norm1(
899
- (self.dropout1(attn_outputs) + hidden_states).to(
900
- dtype=self.norm1.weight.dtype
901
- )
902
- )
903
- mlp_out = self.mlp(hidden_states)
904
-
905
- hidden_states = self.norm2(
906
- (self.dropout2(mlp_out) + hidden_states).to(
907
- dtype=self.norm2.weight.dtype
908
- )
909
- )
910
- return hidden_states, None, None
911
-
912
-
913
- class NomicBertEncoder(nn.Module):
914
- def __init__(self, config: GPT2Config):
915
- super().__init__()
916
- self.layers = nn.ModuleList(
917
- [NomicBertBlock(config) for _ in range(config.n_layer)]
918
- )
919
- self.gradient_checkpointing = False
920
- self.config = config
921
-
922
- def forward(self,
923
- hidden_states: torch.LongTensor = None,
924
- attention_mask: Optional[torch.Tensor] = None,
925
- position_ids: Optional[torch.LongTensor] = None,
926
- past_key_values: Optional[List[torch.FloatTensor]] = None,
927
- inputs_embeds: Optional[torch.FloatTensor] = None,
928
- use_cache: Optional[bool] = None,
929
- output_attentions: Optional[bool] = None,
930
- output_hidden_states: Optional[bool] = None,
931
- return_dict: Optional[bool] = None,
932
- is_padded_inputs: Optional[bool] = True,):
933
-
934
- """If subset_mask is not None, we only want output for the subset of the sequence.
935
- This means that we only compute the last layer output for these tokens.
936
- subset_mask: (batch, seqlen), dtype=torch.bool
937
- """
938
- hidden_states2 = None
939
- residual = None
940
-
941
-
942
- for _, layer in enumerate(self.layers):
943
- if self.gradient_checkpointing and self.training:
944
-
945
- def create_custom_forward(module):
946
- def custom_forward(*inputs):
947
- # None for past_key_value
948
- return module(*inputs)
949
-
950
- return custom_forward
951
-
952
- hidden_states, hidden_states2, residual = torch.utils.checkpoint.checkpoint(
953
- create_custom_forward(layer),
954
- hidden_states,
955
- hidden_states2,
956
- residual,
957
- attention_mask,
958
- None,
959
- None,
960
- is_padded_inputs,
961
- # if you freeze ANY layers, you need `use_reentrant=False`
962
- # https://github.com/huggingface/transformers/issues/21381
963
- # https://discuss.pytorch.org/t/checkpoint-with-no-grad-requiring-inputs-problem/19117/7
964
- use_reentrant=False,
965
- )
966
-
967
- else:
968
- hidden_states, hidden_states2, residual = layer(
969
- hidden_states,
970
- hidden_states2,
971
- residual,
972
- attention_mask,
973
- position_ids,
974
- None,
975
- is_padded_inputs,
976
- output_attentions,
977
- use_cache,
978
- )
979
- return hidden_states
980
-
981
-
982
- class NomicBertPooler(nn.Module):
983
- def __init__(self, config):
984
- super().__init__()
985
- self.dense = nn.Linear(config.n_embd, config.n_embd)
986
- self.activation = nn.Tanh()
987
-
988
- def forward(self, hidden_states, pool=True):
989
- # We "pool" the model by simply taking the hidden state corresponding
990
- # to the first token.
991
- first_token_tensor = hidden_states[:, 0] if pool else hidden_states
992
- pooled_output = self.dense(first_token_tensor)
993
- pooled_output = self.activation(pooled_output)
994
- return pooled_output
995
-
996
-
997
- class NomicBertPredictionHeadTransform(nn.Module):
998
- def __init__(self, config):
999
- super().__init__()
1000
- self.dense = nn.Linear(config.n_embd, config.n_embd, bias=config.mlp_fc1_bias)
1001
- approximate = (
1002
- "tanh"
1003
- if config.activation_function in ["gelu_new", "gelu_fast", "gelu_pytorch_tanh"]
1004
- else "none"
1005
- )
1006
- if config.activation_function == "swiglu":
1007
- self.transform_act_fn = F.silu
1008
- else:
1009
- self.transform_act_fn = nn.GELU(approximate=approximate)
1010
-
1011
- self.layer_norm = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
1012
-
1013
- def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
1014
- hidden_states = self.dense(hidden_states)
1015
- hidden_states = self.transform_act_fn(hidden_states)
1016
- hidden_states = self.layer_norm(hidden_states)
1017
-
1018
- return hidden_states
1019
-
1020
-
1021
- class NomicBertLMPredictionHead(nn.Module):
1022
- def __init__(self, config):
1023
- super().__init__()
1024
-
1025
- self.transform = NomicBertPredictionHeadTransform(config)
1026
-
1027
- self.decoder = nn.Linear(config.n_embd, config.vocab_size, bias=config.mlp_fc1_bias)
1028
-
1029
- def forward(self, hidden_states):
1030
- hidden_states = self.transform(hidden_states)
1031
- hidden_states = self.decoder(hidden_states)
1032
- return hidden_states
1033
-
1034
-
1035
- class NomicBertPreTrainingHeads(nn.Module):
1036
- def __init__(self, config):
1037
- super().__init__()
1038
- self.predictions = NomicBertLMPredictionHead(config)
1039
-
1040
- def forward(self, sequence_output):
1041
- prediction_scores = self.predictions(sequence_output)
1042
- return prediction_scores
1043
-
1044
-
1045
- class NomicBertModel(NomicBertPreTrainedModel):
1046
- def __init__(self, config: GPT2Config, add_pooling_layer=True):
1047
- super().__init__(config)
1048
- self.pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
1049
- if config.vocab_size % self.pad_vocab_size_multiple != 0:
1050
- config.vocab_size += self.pad_vocab_size_multiple - (
1051
- config.vocab_size % self.pad_vocab_size_multiple
1052
- )
1053
-
1054
- assert config.activation_function in ["gelu", "gelu_new", "gelu_fast", "gelu_pytorch_tanh", "swiglu", "geglu", "glu"]
1055
-
1056
- self.embeddings = NomicBertEmbeddings(
1057
- config
1058
- )
1059
- self.emb_drop = nn.Dropout(config.resid_pdrop)
1060
- self.emb_ln = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
1061
- self.encoder = NomicBertEncoder(config)
1062
- self.pooler = NomicBertPooler(config) if add_pooling_layer else None
1063
-
1064
- self.apply(partial(_init_weights, initializer_range=config.initializer_range))
1065
-
1066
- def forward(
1067
- self,
1068
- input_ids,
1069
- position_ids=None,
1070
- token_type_ids=None,
1071
- attention_mask=None,
1072
- return_dict=None,
1073
- ):
1074
- if token_type_ids is None:
1075
- token_type_ids = torch.zeros_like(input_ids)
1076
- hidden_states = self.embeddings(
1077
- input_ids, position_ids=position_ids, token_type_ids=token_type_ids
1078
- )
1079
- hidden_states = self.emb_ln(hidden_states)
1080
- hidden_states = self.emb_drop(hidden_states)
1081
-
1082
- attention_mask = self.get_extended_attention_mask(attention_mask, input_ids.shape)
1083
- sequence_output = self.encoder(
1084
- hidden_states, attention_mask=attention_mask, return_dict=return_dict,
1085
- )
1086
-
1087
- pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
1088
-
1089
- return BaseModelOutputWithPoolingAndCrossAttentions(
1090
- last_hidden_state=sequence_output,
1091
- pooler_output=pooled_output,
1092
- )
1093
-
1094
-
1095
- class NomicBertForPreTraining(NomicBertPreTrainedModel):
1096
- _tied_weights_keys = ["predictions.decoder.bias", "cls.predictions.decoder.weight"]
1097
-
1098
- def __init__(self, config: GPT2Config):
1099
- super().__init__(config)
1100
-
1101
- self.bert = NomicBertModel(config, add_pooling_layer=getattr(config, "add_pooling_layer", False))
1102
- self.cls = NomicBertPreTrainingHeads(config)
1103
- self.mlm_loss = nn.CrossEntropyLoss()
1104
-
1105
- # Initialize weights and apply final processing
1106
- self.apply(partial(_init_weights, initializer_range=config.initializer_range))
1107
- self.tie_weights()
1108
-
1109
- def tie_weights(self):
1110
- self.cls.predictions.decoder.weight = self.bert.embeddings.word_embeddings.weight
1111
-
1112
- def forward(
1113
- self,
1114
- input_ids,
1115
- position_ids=None,
1116
- token_type_ids=None,
1117
- attention_mask=None,
1118
- labels=None,
1119
- ):
1120
- """
1121
- If labels are provided, they must be -100 for masked out tokens (as specified in the attention
1122
- mask).
1123
- Outputs:
1124
- if `labels` and `next_sentence_label` are not `None`:
1125
- Outputs the total_loss which is the sum of the masked language modeling loss and the next
1126
- sentence classification loss.
1127
- if `labels` or `next_sentence_label` is `None`:
1128
- Outputs a tuple comprising
1129
- - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and
1130
- - the next sentence classification logits of shape [batch_size, 2].
1131
-
1132
- """
1133
- outputs = self.bert(
1134
- input_ids,
1135
- position_ids=position_ids,
1136
- token_type_ids=token_type_ids,
1137
- attention_mask=attention_mask.bool() if attention_mask is not None else None,
1138
- )
1139
- sequence_output, _ = outputs.last_hidden_state, outputs.pooler_output
1140
-
1141
- prediction_scores = self.cls(sequence_output)
1142
-
1143
- total_loss = None
1144
- if labels is not None:
1145
- masked_lm_loss = self.mlm_loss(
1146
- rearrange(prediction_scores, "... v -> (...) v"),
1147
- rearrange(labels, "... -> (...)"),
1148
- )
1149
- total_loss = masked_lm_loss.float()
1150
-
1151
- return MaskedLMOutput(
1152
- loss=total_loss,
1153
- logits=prediction_scores,
1154
- hidden_states=outputs.hidden_states,
1155
- attentions=None,
1156
- )
1157
-
1158
-
1159
- class NomicBertForSequenceClassification(NomicBertPreTrainedModel):
1160
- def __init__(self, config):
1161
- super().__init__(config)
1162
- self.num_labels = config.num_labels
1163
- self.config = config
1164
-
1165
- self.bert = NomicBertModel(config)
1166
- classifier_dropout = (
1167
- getattr(config, "classifier_dropout", config.embd_pdrop)
1168
- )
1169
- self.dropout = nn.Dropout(classifier_dropout)
1170
- self.classifier = nn.Linear(config.n_embd, config.num_labels)
1171
-
1172
- # Initialize weights and apply final processing
1173
- self.post_init()
1174
-
1175
- def forward(
1176
- self,
1177
- input_ids: Optional[torch.Tensor] = None,
1178
- attention_mask: Optional[torch.Tensor] = None,
1179
- token_type_ids: Optional[torch.Tensor] = None,
1180
- position_ids: Optional[torch.Tensor] = None,
1181
- head_mask: Optional[torch.Tensor] = None,
1182
- inputs_embeds: Optional[torch.Tensor] = None,
1183
- labels: Optional[torch.Tensor] = None,
1184
- output_attentions: Optional[bool] = None,
1185
- output_hidden_states: Optional[bool] = None,
1186
- return_dict: Optional[bool] = None,
1187
- ):
1188
- r"""
1189
- labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1190
- Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1191
- config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1192
- `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1193
- """
1194
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1195
- outputs = self.bert(
1196
- input_ids,
1197
- position_ids=position_ids,
1198
- token_type_ids=token_type_ids,
1199
- attention_mask=attention_mask.bool() if attention_mask is not None else None,
1200
- )
1201
-
1202
- pooled_output = outputs[1]
1203
-
1204
- pooled_output = self.dropout(pooled_output)
1205
- logits = self.classifier(pooled_output)
1206
-
1207
- loss = None
1208
- if labels is not None:
1209
- if self.config.problem_type is None:
1210
- if self.num_labels == 1:
1211
- self.config.problem_type = "regression"
1212
- elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
1213
- self.config.problem_type = "single_label_classification"
1214
- else:
1215
- self.config.problem_type = "multi_label_classification"
1216
-
1217
- if self.config.problem_type == "regression":
1218
- loss_fct = nn.MSELoss()
1219
- if self.num_labels == 1:
1220
- loss = loss_fct(logits.squeeze(), labels.squeeze())
1221
- else:
1222
- loss = loss_fct(logits, labels)
1223
- elif self.config.problem_type == "single_label_classification":
1224
- loss_fct = nn.CrossEntropyLoss()
1225
- loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
1226
- elif self.config.problem_type == "multi_label_classification":
1227
- loss_fct = nn.BCEWithLogitsLoss()
1228
- loss = loss_fct(logits, labels)
1229
- if not return_dict:
1230
- output = (logits,) + outputs[2:]
1231
- return ((loss,) + output) if loss is not None else output
1232
-
1233
- return SequenceClassifierOutput(
1234
- loss=loss,
1235
- logits=logits,
1236
- hidden_states=outputs.hidden_states,
1237
- attentions=outputs.attentions,
1238
- )
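
The deleted `modeling_hf_nomic_bert.py` above implements the model's custom rotary position embeddings: `NomicBertRotaryEmbedding` caches cos/sin tables built from `torch.outer(t, inv_freq)`, and `NomicBertDynamicNTKRotaryEmbedding` additionally rescales the base when the sequence length exceeds `max_position_embeddings` (dynamic NTK scaling). Below is a minimal, self-contained sketch of that math for reference; `rotate_half` and `apply_rope` are illustrative helper names, not functions taken from the deleted file, and the sketch rotates the full head dimension rather than a configurable `rotary_emb_fraction`.

```python
import torch

def build_cos_sin(seqlen, dim, base=10000.0, rotary_scaling_factor=None, max_pos=None):
    # Dynamic-NTK adjustment as in NomicBertDynamicNTKRotaryEmbedding:
    # grow the base when seqlen exceeds the trained context length.
    if rotary_scaling_factor is not None and max_pos is not None and seqlen > max_pos:
        base = base * (
            (rotary_scaling_factor * seqlen / max_pos) - (rotary_scaling_factor - 1)
        ) ** (dim / (dim - 2))
    inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
    t = torch.arange(seqlen, dtype=torch.float32)   # positions kept in fp32
    freqs = torch.outer(t, inv_freq)                # (seqlen, dim / 2)
    return torch.cos(freqs), torch.sin(freqs)

def rotate_half(x):
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

def apply_rope(x, cos, sin):
    # x: (batch, seqlen, nheads, headdim), non-interleaved layout
    cos = torch.cat((cos, cos), dim=-1)[None, :, None, :]
    sin = torch.cat((sin, sin), dim=-1)[None, :, None, :]
    return x * cos + rotate_half(x) * sin

q = torch.randn(1, 16, 12, 64)
cos, sin = build_cos_sin(seqlen=16, dim=64)
print(apply_rope(q, cos, sin).shape)  # torch.Size([1, 16, 12, 64])
```
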
pretrained/nomic-ai/nomic-embed-text-v1/modules.json DELETED
@@ -1,20 +0,0 @@
1
- [
2
- {
3
- "idx": 0,
4
- "name": "0",
5
- "path": "",
6
- "type": "sentence_transformers.models.Transformer"
7
- },
8
- {
9
- "idx": 1,
10
- "name": "1",
11
- "path": "1_Pooling",
12
- "type": "sentence_transformers.models.Pooling"
13
- },
14
- {
15
- "idx": 2,
16
- "name": "2",
17
- "path": "2_Normalize",
18
- "type": "sentence_transformers.models.Normalize"
19
- }
20
- ]
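
`modules.json` wires the deleted checkpoint into a three-stage sentence-transformers pipeline: the Transformer backbone, a Pooling module (backed by the `1_Pooling` directory), and a final Normalize step. A rough equivalent of the last two stages, assuming mean pooling over non-padding tokens, looks like this:

```python
import torch
import torch.nn.functional as F

def pool_and_normalize(last_hidden_state, attention_mask):
    # Mean-pool token embeddings over non-padding positions, then L2-normalize.
    mask = attention_mask.unsqueeze(-1).to(last_hidden_state.dtype)  # (b, s, 1)
    summed = (last_hidden_state * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return F.normalize(summed / counts, p=2, dim=1)

hidden = torch.randn(2, 8, 768)
mask = torch.ones(2, 8, dtype=torch.long)
print(pool_and_normalize(hidden, mask).shape)  # torch.Size([2, 768])
```
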
pretrained/nomic-ai/nomic-embed-text-v1/onnx/model.onnx DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:347440e93b5ec979fdcf6041b72721aade7b9680c16022e2830db7115ff6fd9f
3
- size 547552426
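
The repository also carried ONNX exports: the full-precision `onnx/model.onnx` here and a quantized variant in the next section. A minimal sketch for inspecting such an export, assuming `onnxruntime` is installed and a local copy of the file still exists:

```python
import onnxruntime as ort

# Open the exported graph and list its inputs/outputs.
session = ort.InferenceSession("onnx/model.onnx", providers=["CPUExecutionProvider"])
print([inp.name for inp in session.get_inputs()])   # e.g. input_ids, attention_mask, ...
print([out.name for out in session.get_outputs()])
```
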
pretrained/nomic-ai/nomic-embed-text-v1/onnx/model_quantized.onnx DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b7941066a6529a287e2502ea6cb68ff82006d311eac53627dc88c259cbcbda64
3
- size 138355983
pretrained/nomic-ai/nomic-embed-text-v1/pytorch_model.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9fc78c00133aac4e12f358cfe9546e893cb82bb9bb7956506fbbcaa1700ce17c
3
- size 546961866
pretrained/nomic-ai/nomic-embed-text-v1/sentence_bert_config.json DELETED
@@ -1,4 +0,0 @@
1
- {
2
- "max_seq_length": 8192,
3
- "do_lower_case": false
4
- }
pretrained/nomic-ai/nomic-embed-text-v1/special_tokens_map.json DELETED
@@ -1,7 +0,0 @@
1
- {
2
- "cls_token": "[CLS]",
3
- "mask_token": "[MASK]",
4
- "pad_token": "[PAD]",
5
- "sep_token": "[SEP]",
6
- "unk_token": "[UNK]"
7
- }
pretrained/nomic-ai/nomic-embed-text-v1/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
pretrained/nomic-ai/nomic-embed-text-v1/tokenizer_config.json DELETED
@@ -1,55 +0,0 @@
1
- {
2
- "added_tokens_decoder": {
3
- "0": {
4
- "content": "[PAD]",
5
- "lstrip": false,
6
- "normalized": false,
7
- "rstrip": false,
8
- "single_word": false,
9
- "special": true
10
- },
11
- "100": {
12
- "content": "[UNK]",
13
- "lstrip": false,
14
- "normalized": false,
15
- "rstrip": false,
16
- "single_word": false,
17
- "special": true
18
- },
19
- "101": {
20
- "content": "[CLS]",
21
- "lstrip": false,
22
- "normalized": false,
23
- "rstrip": false,
24
- "single_word": false,
25
- "special": true
26
- },
27
- "102": {
28
- "content": "[SEP]",
29
- "lstrip": false,
30
- "normalized": false,
31
- "rstrip": false,
32
- "single_word": false,
33
- "special": true
34
- },
35
- "103": {
36
- "content": "[MASK]",
37
- "lstrip": false,
38
- "normalized": false,
39
- "rstrip": false,
40
- "single_word": false,
41
- "special": true
42
- }
43
- },
44
- "clean_up_tokenization_spaces": true,
45
- "cls_token": "[CLS]",
46
- "do_lower_case": true,
47
- "mask_token": "[MASK]",
48
- "model_max_length": 8192,
49
- "pad_token": "[PAD]",
50
- "sep_token": "[SEP]",
51
- "strip_accents": null,
52
- "tokenize_chinese_chars": true,
53
- "tokenizer_class": "BertTokenizer",
54
- "unk_token": "[UNK]"
55
- }
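
`tokenizer_config.json` describes a lower-casing BERT WordPiece tokenizer with a `model_max_length` of 8192 and the usual special-token ids (0, 100, 101, 102, 103). A quick sanity check, assuming the tokenizer files are loaded from a local copy of the deleted directory:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("pretrained/nomic-ai/nomic-embed-text-v1")
enc = tok("hello world")
print(enc["input_ids"])       # starts with 101 ([CLS]) and ends with 102 ([SEP])
print(tok.model_max_length)   # 8192
```
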
pretrained/nomic-ai/nomic-embed-text-v1/vocab.txt DELETED
The diff for this file is too large to render. See raw diff
 
pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/.gitattributes DELETED
@@ -1,35 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/README.md DELETED
@@ -1,181 +0,0 @@
1
- ---
2
- datasets:
3
- - c-s-ale/alpaca-gpt4-data
4
- - Open-Orca/OpenOrca
5
- - Intel/orca_dpo_pairs
6
- - allenai/ultrafeedback_binarized_cleaned
7
- language:
8
- - en
9
- license: cc-by-nc-4.0
10
- base_model:
11
- - upstage/SOLAR-10.7B-v1.0
12
- ---
13
-
14
- <p align="left">
15
- <img src="https://huggingface.co/upstage/SOLAR-10.7B-Instruct-v1.0/resolve/main/solar_logo.png" width="150"/>
16
- <p>
17
-
18
- # **Meet 10.7B Solar: Elevating Performance with Upstage Depth UP Scaling!**
19
-
20
- **(This model is [upstage/SOLAR-10.7B-v1.0](https://huggingface.co/upstage/SOLAR-10.7B-v1.0) fine-tuned version for single-turn conversation.)**
21
-
22
-
23
- # **Introduction**
24
- We introduce SOLAR-10.7B, an advanced large language model (LLM) with 10.7 billion parameters, demonstrating superior performance in various natural language processing (NLP) tasks. It's compact, yet remarkably powerful, and demonstrates unparalleled state-of-the-art performance in models with parameters under 30B.
25
-
26
- We present a methodology for scaling LLMs called depth up-scaling (DUS) , which encompasses architectural modifications and continued pretraining. In other words, we integrated Mistral 7B weights into the upscaled layers, and finally, continued pre-training for the entire model.
27
-
28
-
29
- SOLAR-10.7B has remarkable performance. It outperforms models with up to 30B parameters, even surpassing the recent Mixtral 8X7B model. For detailed information, please refer to the experimental table.
30
- Solar 10.7B is an ideal choice for fine-tuning. SOLAR-10.7B offers robustness and adaptability for your fine-tuning needs. Our simple instruction fine-tuning using the SOLAR-10.7B pre-trained model yields significant performance improvements.
31
-
32
- For full details of this model please read our [paper](https://arxiv.org/abs/2312.15166).
33
-
34
-
35
- # **Instruction Fine-Tuning Strategy**
36
-
37
- We utilize state-of-the-art instruction fine-tuning methods including supervised fine-tuning (SFT) and direct preference optimization (DPO) [1].
38
-
39
- We used a mixture of the following datasets
40
- - c-s-ale/alpaca-gpt4-data (SFT)
41
- - Open-Orca/OpenOrca (SFT)
42
- - in-house generated data utilizing Metamath [2] (SFT, DPO)
43
- - Intel/orca_dpo_pairs (DPO)
44
- - allenai/ultrafeedback_binarized_cleaned (DPO)
45
-
46
- where we were careful of data contamination by not using GSM8K samples when generating data and filtering tasks when applicable via the following list.
47
- ```python
48
- filtering_task_list = [
49
- 'task228_arc_answer_generation_easy',
50
- 'ai2_arc/ARC-Challenge:1.0.0',
51
- 'ai2_arc/ARC-Easy:1.0.0',
52
- 'task229_arc_answer_generation_hard',
53
- 'hellaswag:1.1.0',
54
- 'task1389_hellaswag_completion',
55
- 'cot_gsm8k',
56
- 'cot_gsm8k_ii',
57
- 'drop:2.0.0',
58
- 'winogrande:1.1.0'
59
- ]
60
- ```
61
-
62
- Using the datasets mentioned above, we applied SFT and iterative DPO training, a proprietary alignment strategy, to maximize the performance of our resulting model.
63
-
64
- [1] Rafailov, R., Sharma, A., Mitchell, E., Ermon, S., Manning, C.D. and Finn, C., 2023. Direct preference optimization: Your language model is secretly a reward model. NeurIPS.
65
-
66
- [2] Yu, L., Jiang, W., Shi, H., Yu, J., Liu, Z., Zhang, Y., Kwok, J.T., Li, Z., Weller, A. and Liu, W., 2023. Metamath: Bootstrap your own mathematical questions for large language models. arXiv preprint arXiv:2309.12284.
67
-
68
- # **Data Contamination Test Results**
69
-
70
- Recently, there have been contamination issues in some models on the LLM leaderboard.
71
- We note that we made every effort to exclude any benchmark-related datasets from training.
72
- We also ensured the integrity of our model by conducting a data contamination test [3] that is also used by the HuggingFace team [4, 5].
73
-
74
- Our results, with `result < 0.1, %:` being well below 0.9, indicate that our model is free from contamination.
75
-
76
- *The data contamination test results of HellaSwag and Winograde will be added once [3] supports them.*
77
-
78
- | Model | ARC | MMLU | TruthfulQA | GSM8K |
79
- |------------------------------|-------|-------|-------|-------|
80
- | **SOLAR-10.7B-Instruct-v1.0**| result < 0.1, %: 0.06 |result < 0.1, %: 0.15 | result < 0.1, %: 0.28 | result < 0.1, %: 0.70 |
81
-
82
- [3] https://github.com/swj0419/detect-pretrain-code-contamination
83
-
84
- [4] https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474#657f2245365456e362412a06
85
-
86
- [5] https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/265#657b6debf81f6b44b8966230
87
-
88
- # **Evaluation Results**
89
-
90
- | Model | H6 | Model Size |
91
- |----------------------------------------|-------|------------|
92
- | **SOLAR-10.7B-Instruct-v1.0** | **74.20** | **~ 11B** |
93
- | mistralai/Mixtral-8x7B-Instruct-v0.1 | 72.62 | ~ 46.7B |
94
- | 01-ai/Yi-34B-200K | 70.81 | ~ 34B |
95
- | 01-ai/Yi-34B | 69.42 | ~ 34B |
96
- | mistralai/Mixtral-8x7B-v0.1 | 68.42 | ~ 46.7B |
97
- | meta-llama/Llama-2-70b-hf | 67.87 | ~ 70B |
98
- | tiiuae/falcon-180B | 67.85 | ~ 180B |
99
- | **SOLAR-10.7B-v1.0** | **66.04** | **~11B** |
100
- | mistralai/Mistral-7B-Instruct-v0.2 | 65.71 | ~ 7B |
101
- | Qwen/Qwen-14B | 65.86 | ~ 14B |
102
- | 01-ai/Yi-34B-Chat | 65.32 | ~34B |
103
- | meta-llama/Llama-2-70b-chat-hf | 62.4 | ~ 70B |
104
- | mistralai/Mistral-7B-v0.1 | 60.97 | ~ 7B |
105
- | mistralai/Mistral-7B-Instruct-v0.1 | 54.96 | ~ 7B |
106
-
107
- # **Usage Instructions**
108
-
109
- This model has been fine-tuned primarily for single-turn conversation, making it less suitable for multi-turn conversations such as chat.
110
-
111
- ### **Version**
112
-
113
- Make sure you have the correct version of the transformers library installed:
114
-
115
- ```sh
116
- pip install transformers==4.35.2
117
- ```
118
-
119
- ### **Loading the Model**
120
-
121
- Use the following Python code to load the model:
122
-
123
- ```python
124
- import torch
125
- from transformers import AutoModelForCausalLM, AutoTokenizer
126
-
127
- tokenizer = AutoTokenizer.from_pretrained("Upstage/SOLAR-10.7B-Instruct-v1.0")
128
- model = AutoModelForCausalLM.from_pretrained(
129
- "Upstage/SOLAR-10.7B-Instruct-v1.0",
130
- device_map="auto",
131
- torch_dtype=torch.float16,
132
- )
133
- ```
134
-
135
- ### **Conducting Single-Turn Conversation**
136
-
137
- ```python
138
- conversation = [ {'role': 'user', 'content': 'Hello?'} ]
139
-
140
- prompt = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
141
-
142
- inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
143
- outputs = model.generate(**inputs, use_cache=True, max_length=4096)
144
- output_text = tokenizer.decode(outputs[0])
145
- print(output_text)
146
- ```
147
-
148
- Below is an example of the output.
149
- ```
150
- <s> ### User:
151
- Hello?
152
-
153
- ### Assistant:
154
- Hello, how can I assist you today? Please feel free to ask any questions or request help with a specific task.</s>
155
- ```
156
-
157
- ### **License**
158
- - [upstage/SOLAR-10.7B-v1.0](https://huggingface.co/upstage/SOLAR-10.7B-v1.0): apache-2.0
159
- - [upstage/SOLAR-10.7B-Instruct-v1.0](https://huggingface.co/upstage/SOLAR-10.7B-Instruct-v1.0): cc-by-nc-4.0
160
- - Since some non-commercial datasets such as Alpaca are used for fine-tuning, we release this model as cc-by-nc-4.0.
161
-
162
- ### **How to Cite**
163
-
164
- Please cite this model using this format.
165
-
166
- ```bibtex
167
- @misc{kim2023solar,
168
- title={SOLAR 10.7B: Scaling Large Language Models with Simple yet Effective Depth Up-Scaling},
169
- author={Dahyun Kim and Chanjun Park and Sanghoon Kim and Wonsung Lee and Wonho Song and Yunsu Kim and Hyeonwoo Kim and Yungi Kim and Hyeonju Lee and Jihoo Kim and Changbae Ahn and Seonghoon Yang and Sukyung Lee and Hyunbyung Park and Gyoungjin Gim and Mikyoung Cha and Hwalsuk Lee and Sunghun Kim},
170
- year={2023},
171
- eprint={2312.15166},
172
- archivePrefix={arXiv},
173
- primaryClass={cs.CL}
174
- }
175
- ```
176
-
177
- ### **The Upstage AI Team** ###
178
- Upstage is creating the best LLM and DocAI. Please find more information at https://upstage.ai
179
-
180
- ### **Contact Us** ###
181
- Any questions and suggestions, please use the discussion tab. If you want to contact us directly, drop an email to [contact@upstage.ai](mailto:contact@upstage.ai)
pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/config.json DELETED
@@ -1,28 +0,0 @@
1
- {
2
- "_name_or_path": "upstage/SOLAR-10.7B-Instruct-v1.0",
3
- "architectures": [
4
- "LlamaForCausalLM"
5
- ],
6
- "attention_bias": false,
7
- "bos_token_id": 1,
8
- "eos_token_id": 2,
9
- "pad_token_id": 2,
10
- "hidden_act": "silu",
11
- "hidden_size": 4096,
12
- "initializer_range": 0.02,
13
- "intermediate_size": 14336,
14
- "max_position_embeddings": 4096,
15
- "model_type": "llama",
16
- "num_attention_heads": 32,
17
- "num_hidden_layers": 48,
18
- "num_key_value_heads": 8,
19
- "pretraining_tp": 1,
20
- "rms_norm_eps": 1e-05,
21
- "rope_scaling": null,
22
- "rope_theta": 10000.0,
23
- "tie_word_embeddings": false,
24
- "torch_dtype": "float16",
25
- "transformers_version": "4.35.0",
26
- "use_cache": true,
27
- "vocab_size": 32000
28
- }
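
The deleted `config.json` spells out the depth-up-scaled architecture: a Llama-style decoder with 48 hidden layers, hidden size 4096, intermediate size 14336, 32 attention heads with 8 key-value heads (grouped-query attention), and untied 32000-token embeddings. Plugging those numbers into a rough parameter count reproduces the advertised 10.7B and matches the 21,463,048,192-byte float16 total recorded in `model.safetensors.index.json` further down:

```python
hidden, inter, layers, vocab = 4096, 14336, 48, 32000
n_heads, n_kv_heads = 32, 8
head_dim = hidden // n_heads                       # 128
kv_dim = n_kv_heads * head_dim                     # 1024 (grouped-query attention)

attn = 2 * hidden * hidden + 2 * hidden * kv_dim   # q/o projections + k/v projections
mlp = 3 * hidden * inter                           # gate, up, down projections
norms = 2 * hidden                                 # two RMSNorms per layer
per_layer = attn + mlp + norms

total = layers * per_layer + 2 * vocab * hidden + hidden  # + embeddings, lm_head, final norm
print(total)        # 10731524096  (~10.7B parameters)
print(total * 2)    # 21463048192  bytes in float16, as in the safetensors index
```
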
pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/generation_config.json DELETED
@@ -1,8 +0,0 @@
1
- {
2
- "_from_model_config": true,
3
- "bos_token_id": 1,
4
- "eos_token_id": 2,
5
- "pad_token_id": 2,
6
- "transformers_version": "4.35.2",
7
- "use_cache": false
8
- }
pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/model-00001-of-00005.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a236ffb3d7450f3525058c4a84379dbf7ec20e0cdc1786b7454e355a8899a3e7
3
- size 4943162240
pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/model-00002-of-00005.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f08fa4ecc0ad6d7d14cc00af9586991f1e9cb7d0c67edbf33c69bc6528f416f3
3
- size 4999819232
pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/model-00003-of-00005.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:6cfcaa469d97b4be11b1eeae0e8f3611f93299b4bb1b33578ac5e5fb866fb154
3
- size 4915916080
pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/model-00004-of-00005.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:18bd0920761731ba8b43bdc568d13798f30e04377d94c781d1f926cdcccce172
3
- size 4915916080
pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/model-00005-of-00005.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:46e67ee561f241973fcebca4e911c8515bcabda67e1f30fe2136ea95d400d22a
3
- size 1688284744
pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/model.safetensors.index.json DELETED
@@ -1,442 +0,0 @@
1
- {
2
- "metadata": {
3
- "total_size": 21463048192
4
- },
5
- "weight_map": {
6
- "lm_head.weight": "model-00005-of-00005.safetensors",
7
- "model.embed_tokens.weight": "model-00001-of-00005.safetensors",
8
- "model.layers.0.input_layernorm.weight": "model-00001-of-00005.safetensors",
9
- "model.layers.0.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
10
- "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
11
- "model.layers.0.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
12
- "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
13
- "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
14
- "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
15
- "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
16
- "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
17
- "model.layers.1.input_layernorm.weight": "model-00001-of-00005.safetensors",
18
- "model.layers.1.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
19
- "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
20
- "model.layers.1.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
21
- "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
22
- "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
23
- "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
24
- "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
25
- "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
26
- "model.layers.10.input_layernorm.weight": "model-00002-of-00005.safetensors",
27
- "model.layers.10.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
28
- "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
29
- "model.layers.10.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
30
- "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
31
- "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
32
- "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
33
- "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
34
- "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
35
- "model.layers.11.input_layernorm.weight": "model-00002-of-00005.safetensors",
36
- "model.layers.11.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
37
- "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
38
- "model.layers.11.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
39
- "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
40
- "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
41
- "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
42
- "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
43
- "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
44
- "model.layers.12.input_layernorm.weight": "model-00002-of-00005.safetensors",
45
- "model.layers.12.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
46
- "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
47
- "model.layers.12.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
48
- "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
49
- "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
50
- "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
51
- "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
52
- "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
53
- "model.layers.13.input_layernorm.weight": "model-00002-of-00005.safetensors",
54
- "model.layers.13.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
55
- "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
56
- "model.layers.13.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
57
- "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
58
- "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
59
- "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
60
- "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
61
- "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
62
- "model.layers.14.input_layernorm.weight": "model-00002-of-00005.safetensors",
63
- "model.layers.14.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
64
- "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
65
- "model.layers.14.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
66
- "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
67
- "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
68
- "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
69
- "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
70
- "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
71
- "model.layers.15.input_layernorm.weight": "model-00002-of-00005.safetensors",
72
- "model.layers.15.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
73
- "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
74
- "model.layers.15.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
75
- "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
76
- "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
77
- "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
78
- "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
79
- "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
80
- "model.layers.16.input_layernorm.weight": "model-00002-of-00005.safetensors",
81
- "model.layers.16.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
82
- "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
83
- "model.layers.16.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
84
- "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
85
- "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
86
- "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
87
- "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
88
- "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
89
- "model.layers.17.input_layernorm.weight": "model-00002-of-00005.safetensors",
90
- "model.layers.17.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
91
- "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
92
- "model.layers.17.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
93
- "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
94
- "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
95
- "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
96
- "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
97
- "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
98
- "model.layers.18.input_layernorm.weight": "model-00002-of-00005.safetensors",
99
- "model.layers.18.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
100
- "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
101
- "model.layers.18.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
102
- "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
103
- "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
104
- "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
105
- "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
106
- "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
107
- "model.layers.19.input_layernorm.weight": "model-00002-of-00005.safetensors",
108
- "model.layers.19.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
109
- "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
110
- "model.layers.19.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
111
- "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
112
- "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
113
- "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
114
- "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
115
- "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
116
- "model.layers.2.input_layernorm.weight": "model-00001-of-00005.safetensors",
117
- "model.layers.2.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
118
- "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
119
- "model.layers.2.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
120
- "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
121
- "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
122
- "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
123
- "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
124
- "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
125
- "model.layers.20.input_layernorm.weight": "model-00002-of-00005.safetensors",
126
- "model.layers.20.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
127
- "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
128
- "model.layers.20.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
129
- "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
130
- "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
131
- "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
132
- "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
133
- "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
134
- "model.layers.21.input_layernorm.weight": "model-00002-of-00005.safetensors",
135
- "model.layers.21.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
136
- "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
137
- "model.layers.21.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
138
- "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
139
- "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
140
- "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
141
- "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
142
- "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
143
- "model.layers.22.input_layernorm.weight": "model-00003-of-00005.safetensors",
144
- "model.layers.22.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
145
- "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
146
- "model.layers.22.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
147
- "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
148
- "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
149
- "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
150
- "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
151
- "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
152
- "model.layers.23.input_layernorm.weight": "model-00003-of-00005.safetensors",
153
- "model.layers.23.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
154
- "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
155
- "model.layers.23.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
156
- "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
157
- "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
158
- "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
159
- "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
160
- "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
161
- "model.layers.24.input_layernorm.weight": "model-00003-of-00005.safetensors",
162
- "model.layers.24.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
163
- "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
164
- "model.layers.24.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
165
- "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
166
- "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
167
- "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
168
- "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
169
- "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
170
- "model.layers.25.input_layernorm.weight": "model-00003-of-00005.safetensors",
171
- "model.layers.25.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
172
- "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
173
- "model.layers.25.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
174
- "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
175
- "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
176
- "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
177
- "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
178
- "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
179
- "model.layers.26.input_layernorm.weight": "model-00003-of-00005.safetensors",
180
- "model.layers.26.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
181
- "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
182
- "model.layers.26.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
183
- "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
184
- "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
185
- "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
186
- "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
187
- "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
188
- "model.layers.27.input_layernorm.weight": "model-00003-of-00005.safetensors",
189
- "model.layers.27.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
190
- "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
191
- "model.layers.27.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
192
- "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
193
- "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
194
- "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
195
- "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
196
- "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
197
- "model.layers.28.input_layernorm.weight": "model-00003-of-00005.safetensors",
198
- "model.layers.28.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
199
- "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
200
- "model.layers.28.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
201
- "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
202
- "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
203
- "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
204
- "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
205
- "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
206
- "model.layers.29.input_layernorm.weight": "model-00003-of-00005.safetensors",
207
- "model.layers.29.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
208
- "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
209
- "model.layers.29.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
210
- "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
211
- "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
212
- "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
213
- "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
214
- "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
215
- "model.layers.3.input_layernorm.weight": "model-00001-of-00005.safetensors",
216
- "model.layers.3.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
217
- "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
218
- "model.layers.3.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
219
- "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
220
- "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
221
- "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
222
- "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
223
- "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
224
- "model.layers.30.input_layernorm.weight": "model-00003-of-00005.safetensors",
225
- "model.layers.30.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
226
- "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
227
- "model.layers.30.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
228
- "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
229
- "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
230
- "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
231
- "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
232
- "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
233
- "model.layers.31.input_layernorm.weight": "model-00003-of-00005.safetensors",
234
- "model.layers.31.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
235
- "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
236
- "model.layers.31.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
237
- "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
238
- "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
239
- "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
240
- "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
241
- "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
242
- "model.layers.32.input_layernorm.weight": "model-00003-of-00005.safetensors",
243
- "model.layers.32.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
244
- "model.layers.32.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
245
- "model.layers.32.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
246
- "model.layers.32.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
247
- "model.layers.32.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
248
- "model.layers.32.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
249
- "model.layers.32.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
250
- "model.layers.32.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
251
- "model.layers.33.input_layernorm.weight": "model-00004-of-00005.safetensors",
252
- "model.layers.33.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
253
- "model.layers.33.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
254
- "model.layers.33.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
255
- "model.layers.33.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
256
- "model.layers.33.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
257
- "model.layers.33.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
258
- "model.layers.33.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
259
- "model.layers.33.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
260
- "model.layers.34.input_layernorm.weight": "model-00004-of-00005.safetensors",
261
- "model.layers.34.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
262
- "model.layers.34.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
263
- "model.layers.34.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
264
- "model.layers.34.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
265
- "model.layers.34.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
266
- "model.layers.34.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
267
- "model.layers.34.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
268
- "model.layers.34.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
269
- "model.layers.35.input_layernorm.weight": "model-00004-of-00005.safetensors",
270
- "model.layers.35.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
271
- "model.layers.35.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
272
- "model.layers.35.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
273
- "model.layers.35.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
274
- "model.layers.35.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
275
- "model.layers.35.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
276
- "model.layers.35.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
277
- "model.layers.35.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
278
- "model.layers.36.input_layernorm.weight": "model-00004-of-00005.safetensors",
279
- "model.layers.36.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
280
- "model.layers.36.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
281
- "model.layers.36.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
282
- "model.layers.36.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
283
- "model.layers.36.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
284
- "model.layers.36.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
285
- "model.layers.36.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
286
- "model.layers.36.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
287
- "model.layers.37.input_layernorm.weight": "model-00004-of-00005.safetensors",
288
- "model.layers.37.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
289
- "model.layers.37.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
290
- "model.layers.37.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
291
- "model.layers.37.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
292
- "model.layers.37.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
293
- "model.layers.37.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
294
- "model.layers.37.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
295
- "model.layers.37.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
296
- "model.layers.38.input_layernorm.weight": "model-00004-of-00005.safetensors",
297
- "model.layers.38.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
298
- "model.layers.38.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
299
- "model.layers.38.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
300
- "model.layers.38.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
301
- "model.layers.38.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
302
- "model.layers.38.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
303
- "model.layers.38.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
304
- "model.layers.38.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
305
- "model.layers.39.input_layernorm.weight": "model-00004-of-00005.safetensors",
306
- "model.layers.39.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
307
- "model.layers.39.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
308
- "model.layers.39.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
309
- "model.layers.39.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
310
- "model.layers.39.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
311
- "model.layers.39.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
312
- "model.layers.39.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
313
- "model.layers.39.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
314
- "model.layers.4.input_layernorm.weight": "model-00001-of-00005.safetensors",
315
- "model.layers.4.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
316
- "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
317
- "model.layers.4.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
318
- "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
319
- "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
320
- "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
321
- "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
322
- "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
323
- "model.layers.40.input_layernorm.weight": "model-00004-of-00005.safetensors",
324
- "model.layers.40.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
325
- "model.layers.40.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
326
- "model.layers.40.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
327
- "model.layers.40.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
328
- "model.layers.40.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
329
- "model.layers.40.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
330
- "model.layers.40.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
331
- "model.layers.40.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
332
- "model.layers.41.input_layernorm.weight": "model-00004-of-00005.safetensors",
333
- "model.layers.41.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
334
- "model.layers.41.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
335
- "model.layers.41.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
336
- "model.layers.41.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
337
- "model.layers.41.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
338
- "model.layers.41.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
339
- "model.layers.41.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
340
- "model.layers.41.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
341
- "model.layers.42.input_layernorm.weight": "model-00004-of-00005.safetensors",
342
- "model.layers.42.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
343
- "model.layers.42.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
344
- "model.layers.42.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
345
- "model.layers.42.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
346
- "model.layers.42.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
347
- "model.layers.42.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
348
- "model.layers.42.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
349
- "model.layers.42.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
350
- "model.layers.43.input_layernorm.weight": "model-00004-of-00005.safetensors",
351
- "model.layers.43.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
352
- "model.layers.43.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
353
- "model.layers.43.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
354
- "model.layers.43.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
355
- "model.layers.43.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
356
- "model.layers.43.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
357
- "model.layers.43.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
358
- "model.layers.43.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
359
- "model.layers.44.input_layernorm.weight": "model-00005-of-00005.safetensors",
360
- "model.layers.44.mlp.down_proj.weight": "model-00005-of-00005.safetensors",
361
- "model.layers.44.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
362
- "model.layers.44.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
363
- "model.layers.44.post_attention_layernorm.weight": "model-00005-of-00005.safetensors",
364
- "model.layers.44.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
365
- "model.layers.44.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
366
- "model.layers.44.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
367
- "model.layers.44.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
368
- "model.layers.45.input_layernorm.weight": "model-00005-of-00005.safetensors",
369
- "model.layers.45.mlp.down_proj.weight": "model-00005-of-00005.safetensors",
370
- "model.layers.45.mlp.gate_proj.weight": "model-00005-of-00005.safetensors",
371
- "model.layers.45.mlp.up_proj.weight": "model-00005-of-00005.safetensors",
372
- "model.layers.45.post_attention_layernorm.weight": "model-00005-of-00005.safetensors",
373
- "model.layers.45.self_attn.k_proj.weight": "model-00005-of-00005.safetensors",
374
- "model.layers.45.self_attn.o_proj.weight": "model-00005-of-00005.safetensors",
375
- "model.layers.45.self_attn.q_proj.weight": "model-00005-of-00005.safetensors",
376
- "model.layers.45.self_attn.v_proj.weight": "model-00005-of-00005.safetensors",
377
- "model.layers.46.input_layernorm.weight": "model-00005-of-00005.safetensors",
378
- "model.layers.46.mlp.down_proj.weight": "model-00005-of-00005.safetensors",
379
- "model.layers.46.mlp.gate_proj.weight": "model-00005-of-00005.safetensors",
380
- "model.layers.46.mlp.up_proj.weight": "model-00005-of-00005.safetensors",
381
- "model.layers.46.post_attention_layernorm.weight": "model-00005-of-00005.safetensors",
382
- "model.layers.46.self_attn.k_proj.weight": "model-00005-of-00005.safetensors",
383
- "model.layers.46.self_attn.o_proj.weight": "model-00005-of-00005.safetensors",
384
- "model.layers.46.self_attn.q_proj.weight": "model-00005-of-00005.safetensors",
385
- "model.layers.46.self_attn.v_proj.weight": "model-00005-of-00005.safetensors",
386
- "model.layers.47.input_layernorm.weight": "model-00005-of-00005.safetensors",
387
- "model.layers.47.mlp.down_proj.weight": "model-00005-of-00005.safetensors",
388
- "model.layers.47.mlp.gate_proj.weight": "model-00005-of-00005.safetensors",
389
- "model.layers.47.mlp.up_proj.weight": "model-00005-of-00005.safetensors",
390
- "model.layers.47.post_attention_layernorm.weight": "model-00005-of-00005.safetensors",
391
- "model.layers.47.self_attn.k_proj.weight": "model-00005-of-00005.safetensors",
392
- "model.layers.47.self_attn.o_proj.weight": "model-00005-of-00005.safetensors",
393
- "model.layers.47.self_attn.q_proj.weight": "model-00005-of-00005.safetensors",
394
- "model.layers.47.self_attn.v_proj.weight": "model-00005-of-00005.safetensors",
395
- "model.layers.5.input_layernorm.weight": "model-00001-of-00005.safetensors",
396
- "model.layers.5.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
397
- "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
398
- "model.layers.5.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
399
- "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
400
- "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
401
- "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
402
- "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
403
- "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
404
- "model.layers.6.input_layernorm.weight": "model-00001-of-00005.safetensors",
405
- "model.layers.6.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
406
- "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
407
- "model.layers.6.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
408
- "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
409
- "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
410
- "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
411
- "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
412
- "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
413
- "model.layers.7.input_layernorm.weight": "model-00001-of-00005.safetensors",
414
- "model.layers.7.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
415
- "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
416
- "model.layers.7.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
417
- "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
418
- "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
419
- "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
420
- "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
421
- "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
422
- "model.layers.8.input_layernorm.weight": "model-00001-of-00005.safetensors",
423
- "model.layers.8.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
424
- "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
425
- "model.layers.8.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
426
- "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
427
- "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
428
- "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
429
- "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
430
- "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
431
- "model.layers.9.input_layernorm.weight": "model-00001-of-00005.safetensors",
432
- "model.layers.9.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
433
- "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
434
- "model.layers.9.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
435
- "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
436
- "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
437
- "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
438
- "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
439
- "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
440
- "model.norm.weight": "model-00005-of-00005.safetensors"
441
- }
442
- }
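
For reference, the removed `model.safetensors.index.json` is the standard Hugging Face sharded-checkpoint index: its `weight_map` assigns every parameter name to one of the five `model-0000X-of-00005.safetensors` shards, so a loader only has to open the shards it actually needs. Below is a minimal sketch of resolving a tensor through such an index; the local path and the `shard_for`/`load_tensor` helpers are illustrative, not part of this repository.

```python
import json
import os

from safetensors import safe_open  # assumes the `safetensors` package is installed

# Hypothetical local path; this repository's copy is deleted by this commit.
INDEX_PATH = "SOLAR-10.7B-Instruct-v1.0/model.safetensors.index.json"


def shard_for(param_name: str, index_path: str = INDEX_PATH) -> str:
    """Look up which shard file stores a parameter, via the index's weight_map."""
    with open(index_path) as f:
        index = json.load(f)
    return index["weight_map"][param_name]


def load_tensor(param_name: str, index_path: str = INDEX_PATH):
    """Open only the shard that holds the requested tensor and read it."""
    shard = os.path.join(os.path.dirname(index_path), shard_for(param_name, index_path))
    with safe_open(shard, framework="pt") as f:
        return f.get_tensor(param_name)


# Per the weight_map above, the final norm lives in the last shard.
print(shard_for("model.norm.weight"))  # -> "model-00005-of-00005.safetensors"
```

In practice `transformers` performs this resolution automatically when loading a sharded checkpoint; the sketch only spells out what the deleted index encodes.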
 
pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/solar_logo.png DELETED
Binary file (77.1 kB)
 
pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/tokenizer.model DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
3
- size 493443
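
The three deleted lines above are not the tokenizer model itself but a Git LFS pointer: the actual ~493 kB blob is addressed by the SHA-256 `oid` and materialized by the LFS filter on checkout. A tiny, purely illustrative parser for this pointer format:

```python
# Minimal parser for a Git LFS pointer file ("key value" per line).
def parse_lfs_pointer(text: str) -> dict:
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields


pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
size 493443"""

info = parse_lfs_pointer(pointer)
print(info["oid"], int(info["size"]))  # sha256:dadf... 493443
```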
 
pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/tokenizer_config.json DELETED
@@ -1,43 +0,0 @@
1
- {
2
- "add_bos_token": true,
3
- "add_eos_token": false,
4
- "added_tokens_decoder": {
5
- "0": {
6
- "content": "<unk>",
7
- "lstrip": false,
8
- "normalized": false,
9
- "rstrip": false,
10
- "single_word": false,
11
- "special": true
12
- },
13
- "1": {
14
- "content": "<s>",
15
- "lstrip": false,
16
- "normalized": false,
17
- "rstrip": false,
18
- "single_word": false,
19
- "special": true
20
- },
21
- "2": {
22
- "content": "</s>",
23
- "lstrip": false,
24
- "normalized": false,
25
- "rstrip": false,
26
- "single_word": false,
27
- "special": true
28
- }
29
- },
30
- "additional_special_tokens": [],
31
- "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{% if message['content']%}{{'### System:\n' + message['content']+'\n\n'}}{% endif %}{% elif message['role'] == 'user' %}{{'### User:\n' + message['content']+'\n\n'}}{% elif message['role'] == 'assistant' %}{{'### Assistant:\n' + message['content']}}{% endif %}{% if loop.last and add_generation_prompt %}{{ '### Assistant:\n' }}{% endif %}{% endfor %}",
32
- "bos_token": "<s>",
33
- "clean_up_tokenization_spaces": false,
34
- "eos_token": "</s>",
35
- "legacy": true,
36
- "model_max_length": 1000000000000000019884624838656,
37
- "pad_token": "</s>",
38
- "sp_model_kwargs": {},
39
- "spaces_between_special_tokens": false,
40
- "tokenizer_class": "LlamaTokenizer",
41
- "unk_token": "<unk>",
42
- "use_default_system_prompt": true
43
- }
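
The deleted `tokenizer_config.json` carries the model's chat template: a Jinja template that renders system, user, and assistant turns as `### System:`, `### User:`, and `### Assistant:` blocks and appends a trailing `### Assistant:\n` when a generation prompt is requested. A hedged sketch of applying it with `transformers` (it pulls the upstream `upstage/SOLAR-10.7B-Instruct-v1.0` hub repository, since the copy under `pretrained/` is removed by this commit):

```python
from transformers import AutoTokenizer

# Load from the upstream hub repository; the local pretrained/ copy no longer exists.
tokenizer = AutoTokenizer.from_pretrained("upstage/SOLAR-10.7B-Instruct-v1.0")

messages = [
    {"role": "system", "content": "You are a concise assistant."},
    {"role": "user", "content": "What does a safetensors index file do?"},
]

# Renders the conversation through the chat_template shown above and appends the
# final "### Assistant:\n" header so the model continues as the assistant.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
```

With the template above, the rendered prompt is the `### System:` and `### User:` blocks, each followed by a blank line, ending with `### Assistant:\n`.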