zhangzhao219 committed on
Commit 5ac2298
1 Parent(s): a542899

Delete pretrained

Files changed (31)
  1. pretrained/nomic-ai/nomic-embed-text-v1/.gitattributes +0 -35
  2. pretrained/nomic-ai/nomic-embed-text-v1/1_Pooling/config.json +0 -9
  3. pretrained/nomic-ai/nomic-embed-text-v1/README.md +0 -2736
  4. pretrained/nomic-ai/nomic-embed-text-v1/config.json +0 -56
  5. pretrained/nomic-ai/nomic-embed-text-v1/config_sentence_transformers.json +0 -7
  6. pretrained/nomic-ai/nomic-embed-text-v1/configuration_hf_nomic_bert.py +0 -53
  7. pretrained/nomic-ai/nomic-embed-text-v1/model.safetensors +0 -3
  8. pretrained/nomic-ai/nomic-embed-text-v1/modeling_hf_nomic_bert.py +0 -1238
  9. pretrained/nomic-ai/nomic-embed-text-v1/modules.json +0 -20
  10. pretrained/nomic-ai/nomic-embed-text-v1/onnx/model.onnx +0 -3
  11. pretrained/nomic-ai/nomic-embed-text-v1/onnx/model_quantized.onnx +0 -3
  12. pretrained/nomic-ai/nomic-embed-text-v1/pytorch_model.bin +0 -3
  13. pretrained/nomic-ai/nomic-embed-text-v1/sentence_bert_config.json +0 -4
  14. pretrained/nomic-ai/nomic-embed-text-v1/special_tokens_map.json +0 -7
  15. pretrained/nomic-ai/nomic-embed-text-v1/tokenizer.json +0 -0
  16. pretrained/nomic-ai/nomic-embed-text-v1/tokenizer_config.json +0 -55
  17. pretrained/nomic-ai/nomic-embed-text-v1/vocab.txt +0 -0
  18. pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/.gitattributes +0 -35
  19. pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/README.md +0 -181
  20. pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/config.json +0 -28
  21. pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/generation_config.json +0 -8
  22. pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/model-00001-of-00005.safetensors +0 -3
  23. pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/model-00002-of-00005.safetensors +0 -3
  24. pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/model-00003-of-00005.safetensors +0 -3
  25. pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/model-00004-of-00005.safetensors +0 -3
  26. pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/model-00005-of-00005.safetensors +0 -3
  27. pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/model.safetensors.index.json +0 -442
  28. pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/solar_logo.png +0 -0
  29. pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/tokenizer.json +0 -0
  30. pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/tokenizer.model +0 -3
  31. pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/tokenizer_config.json +0 -43
pretrained/nomic-ai/nomic-embed-text-v1/.gitattributes DELETED
@@ -1,35 +0,0 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
pretrained/nomic-ai/nomic-embed-text-v1/1_Pooling/config.json DELETED
@@ -1,9 +0,0 @@
- {
- "word_embedding_dimension": 768,
- "pooling_mode_cls_token": false,
- "pooling_mode_mean_tokens": true,
- "pooling_mode_max_tokens": false,
- "pooling_mode_mean_sqrt_len_tokens": false,
- "pooling_mode_weightedmean_tokens": false,
- "pooling_mode_lasttoken": false
- }
pretrained/nomic-ai/nomic-embed-text-v1/README.md DELETED
@@ -1,2736 +0,0 @@
1
- ---
2
- library_name: sentence-transformers
3
- pipeline_tag: sentence-similarity
4
- tags:
5
- - feature-extraction
6
- - sentence-similarity
7
- - mteb
8
- - transformers
9
- - transformers.js
10
- model-index:
11
- - name: epoch_0_model
12
- results:
13
- - task:
14
- type: Classification
15
- dataset:
16
- type: mteb/amazon_counterfactual
17
- name: MTEB AmazonCounterfactualClassification (en)
18
- config: en
19
- split: test
20
- revision: e8379541af4e31359cca9fbcf4b00f2671dba205
21
- metrics:
22
- - type: accuracy
23
- value: 76.8507462686567
24
- - type: ap
25
- value: 40.592189159090495
26
- - type: f1
27
- value: 71.01634655512476
28
- - task:
29
- type: Classification
30
- dataset:
31
- type: mteb/amazon_polarity
32
- name: MTEB AmazonPolarityClassification
33
- config: default
34
- split: test
35
- revision: e2d317d38cd51312af73b3d32a06d1a08b442046
36
- metrics:
37
- - type: accuracy
38
- value: 91.51892500000001
39
- - type: ap
40
- value: 88.50346762975335
41
- - type: f1
42
- value: 91.50342077459624
43
- - task:
44
- type: Classification
45
- dataset:
46
- type: mteb/amazon_reviews_multi
47
- name: MTEB AmazonReviewsClassification (en)
48
- config: en
49
- split: test
50
- revision: 1399c76144fd37290681b995c656ef9b2e06e26d
51
- metrics:
52
- - type: accuracy
53
- value: 47.364
54
- - type: f1
55
- value: 46.72708080922794
56
- - task:
57
- type: Retrieval
58
- dataset:
59
- type: arguana
60
- name: MTEB ArguAna
61
- config: default
62
- split: test
63
- revision: None
64
- metrics:
65
- - type: map_at_1
66
- value: 25.178
67
- - type: map_at_10
68
- value: 40.244
69
- - type: map_at_100
70
- value: 41.321999999999996
71
- - type: map_at_1000
72
- value: 41.331
73
- - type: map_at_3
74
- value: 35.016999999999996
75
- - type: map_at_5
76
- value: 37.99
77
- - type: mrr_at_1
78
- value: 25.605
79
- - type: mrr_at_10
80
- value: 40.422000000000004
81
- - type: mrr_at_100
82
- value: 41.507
83
- - type: mrr_at_1000
84
- value: 41.516
85
- - type: mrr_at_3
86
- value: 35.23
87
- - type: mrr_at_5
88
- value: 38.15
89
- - type: ndcg_at_1
90
- value: 25.178
91
- - type: ndcg_at_10
92
- value: 49.258
93
- - type: ndcg_at_100
94
- value: 53.776
95
- - type: ndcg_at_1000
96
- value: 53.995000000000005
97
- - type: ndcg_at_3
98
- value: 38.429
99
- - type: ndcg_at_5
100
- value: 43.803
101
- - type: precision_at_1
102
- value: 25.178
103
- - type: precision_at_10
104
- value: 7.831
105
- - type: precision_at_100
106
- value: 0.979
107
- - type: precision_at_1000
108
- value: 0.1
109
- - type: precision_at_3
110
- value: 16.121
111
- - type: precision_at_5
112
- value: 12.29
113
- - type: recall_at_1
114
- value: 25.178
115
- - type: recall_at_10
116
- value: 78.307
117
- - type: recall_at_100
118
- value: 97.866
119
- - type: recall_at_1000
120
- value: 99.57300000000001
121
- - type: recall_at_3
122
- value: 48.364000000000004
123
- - type: recall_at_5
124
- value: 61.451
125
- - task:
126
- type: Clustering
127
- dataset:
128
- type: mteb/arxiv-clustering-p2p
129
- name: MTEB ArxivClusteringP2P
130
- config: default
131
- split: test
132
- revision: a122ad7f3f0291bf49cc6f4d32aa80929df69d5d
133
- metrics:
134
- - type: v_measure
135
- value: 45.93034494751465
136
- - task:
137
- type: Clustering
138
- dataset:
139
- type: mteb/arxiv-clustering-s2s
140
- name: MTEB ArxivClusteringS2S
141
- config: default
142
- split: test
143
- revision: f910caf1a6075f7329cdf8c1a6135696f37dbd53
144
- metrics:
145
- - type: v_measure
146
- value: 36.64579480054327
147
- - task:
148
- type: Reranking
149
- dataset:
150
- type: mteb/askubuntudupquestions-reranking
151
- name: MTEB AskUbuntuDupQuestions
152
- config: default
153
- split: test
154
- revision: 2000358ca161889fa9c082cb41daa8dcfb161a54
155
- metrics:
156
- - type: map
157
- value: 60.601310529222054
158
- - type: mrr
159
- value: 75.04484896451656
160
- - task:
161
- type: STS
162
- dataset:
163
- type: mteb/biosses-sts
164
- name: MTEB BIOSSES
165
- config: default
166
- split: test
167
- revision: d3fb88f8f02e40887cd149695127462bbcf29b4a
168
- metrics:
169
- - type: cos_sim_pearson
170
- value: 88.57797718095814
171
- - type: cos_sim_spearman
172
- value: 86.47064499110101
173
- - type: euclidean_pearson
174
- value: 87.4559602783142
175
- - type: euclidean_spearman
176
- value: 86.47064499110101
177
- - type: manhattan_pearson
178
- value: 87.7232764230245
179
- - type: manhattan_spearman
180
- value: 86.91222131777742
181
- - task:
182
- type: Classification
183
- dataset:
184
- type: mteb/banking77
185
- name: MTEB Banking77Classification
186
- config: default
187
- split: test
188
- revision: 0fd18e25b25c072e09e0d92ab615fda904d66300
189
- metrics:
190
- - type: accuracy
191
- value: 84.5422077922078
192
- - type: f1
193
- value: 84.47657456950589
194
- - task:
195
- type: Clustering
196
- dataset:
197
- type: mteb/biorxiv-clustering-p2p
198
- name: MTEB BiorxivClusteringP2P
199
- config: default
200
- split: test
201
- revision: 65b79d1d13f80053f67aca9498d9402c2d9f1f40
202
- metrics:
203
- - type: v_measure
204
- value: 38.48953561974464
205
- - task:
206
- type: Clustering
207
- dataset:
208
- type: mteb/biorxiv-clustering-s2s
209
- name: MTEB BiorxivClusteringS2S
210
- config: default
211
- split: test
212
- revision: 258694dd0231531bc1fd9de6ceb52a0853c6d908
213
- metrics:
214
- - type: v_measure
215
- value: 32.75995857510105
216
- - task:
217
- type: Retrieval
218
- dataset:
219
- type: BeIR/cqadupstack
220
- name: MTEB CQADupstackAndroidRetrieval
221
- config: default
222
- split: test
223
- revision: None
224
- metrics:
225
- - type: map_at_1
226
- value: 30.008000000000003
227
- - type: map_at_10
228
- value: 39.51
229
- - type: map_at_100
230
- value: 40.841
231
- - type: map_at_1000
232
- value: 40.973
233
- - type: map_at_3
234
- value: 36.248999999999995
235
- - type: map_at_5
236
- value: 38.096999999999994
237
- - type: mrr_at_1
238
- value: 36.481
239
- - type: mrr_at_10
240
- value: 44.818000000000005
241
- - type: mrr_at_100
242
- value: 45.64
243
- - type: mrr_at_1000
244
- value: 45.687
245
- - type: mrr_at_3
246
- value: 42.036
247
- - type: mrr_at_5
248
- value: 43.782
249
- - type: ndcg_at_1
250
- value: 36.481
251
- - type: ndcg_at_10
252
- value: 45.152
253
- - type: ndcg_at_100
254
- value: 50.449
255
- - type: ndcg_at_1000
256
- value: 52.76499999999999
257
- - type: ndcg_at_3
258
- value: 40.161
259
- - type: ndcg_at_5
260
- value: 42.577999999999996
261
- - type: precision_at_1
262
- value: 36.481
263
- - type: precision_at_10
264
- value: 8.369
265
- - type: precision_at_100
266
- value: 1.373
267
- - type: precision_at_1000
268
- value: 0.186
269
- - type: precision_at_3
270
- value: 18.693
271
- - type: precision_at_5
272
- value: 13.533999999999999
273
- - type: recall_at_1
274
- value: 30.008000000000003
275
- - type: recall_at_10
276
- value: 56.108999999999995
277
- - type: recall_at_100
278
- value: 78.55499999999999
279
- - type: recall_at_1000
280
- value: 93.659
281
- - type: recall_at_3
282
- value: 41.754999999999995
283
- - type: recall_at_5
284
- value: 48.296
285
- - task:
286
- type: Retrieval
287
- dataset:
288
- type: BeIR/cqadupstack
289
- name: MTEB CQADupstackEnglishRetrieval
290
- config: default
291
- split: test
292
- revision: None
293
- metrics:
294
- - type: map_at_1
295
- value: 30.262
296
- - type: map_at_10
297
- value: 40.139
298
- - type: map_at_100
299
- value: 41.394
300
- - type: map_at_1000
301
- value: 41.526
302
- - type: map_at_3
303
- value: 37.155
304
- - type: map_at_5
305
- value: 38.785
306
- - type: mrr_at_1
307
- value: 38.153
308
- - type: mrr_at_10
309
- value: 46.369
310
- - type: mrr_at_100
311
- value: 47.072
312
- - type: mrr_at_1000
313
- value: 47.111999999999995
314
- - type: mrr_at_3
315
- value: 44.268
316
- - type: mrr_at_5
317
- value: 45.389
318
- - type: ndcg_at_1
319
- value: 38.153
320
- - type: ndcg_at_10
321
- value: 45.925
322
- - type: ndcg_at_100
323
- value: 50.394000000000005
324
- - type: ndcg_at_1000
325
- value: 52.37500000000001
326
- - type: ndcg_at_3
327
- value: 41.754000000000005
328
- - type: ndcg_at_5
329
- value: 43.574
330
- - type: precision_at_1
331
- value: 38.153
332
- - type: precision_at_10
333
- value: 8.796
334
- - type: precision_at_100
335
- value: 1.432
336
- - type: precision_at_1000
337
- value: 0.189
338
- - type: precision_at_3
339
- value: 20.318
340
- - type: precision_at_5
341
- value: 14.395
342
- - type: recall_at_1
343
- value: 30.262
344
- - type: recall_at_10
345
- value: 55.72200000000001
346
- - type: recall_at_100
347
- value: 74.97500000000001
348
- - type: recall_at_1000
349
- value: 87.342
350
- - type: recall_at_3
351
- value: 43.129
352
- - type: recall_at_5
353
- value: 48.336
354
- - task:
355
- type: Retrieval
356
- dataset:
357
- type: BeIR/cqadupstack
358
- name: MTEB CQADupstackGamingRetrieval
359
- config: default
360
- split: test
361
- revision: None
362
- metrics:
363
- - type: map_at_1
364
- value: 39.951
365
- - type: map_at_10
366
- value: 51.248000000000005
367
- - type: map_at_100
368
- value: 52.188
369
- - type: map_at_1000
370
- value: 52.247
371
- - type: map_at_3
372
- value: 48.211
373
- - type: map_at_5
374
- value: 49.797000000000004
375
- - type: mrr_at_1
376
- value: 45.329
377
- - type: mrr_at_10
378
- value: 54.749
379
- - type: mrr_at_100
380
- value: 55.367999999999995
381
- - type: mrr_at_1000
382
- value: 55.400000000000006
383
- - type: mrr_at_3
384
- value: 52.382
385
- - type: mrr_at_5
386
- value: 53.649
387
- - type: ndcg_at_1
388
- value: 45.329
389
- - type: ndcg_at_10
390
- value: 56.847
391
- - type: ndcg_at_100
392
- value: 60.738
393
- - type: ndcg_at_1000
394
- value: 61.976
395
- - type: ndcg_at_3
396
- value: 51.59
397
- - type: ndcg_at_5
398
- value: 53.915
399
- - type: precision_at_1
400
- value: 45.329
401
- - type: precision_at_10
402
- value: 8.959
403
- - type: precision_at_100
404
- value: 1.187
405
- - type: precision_at_1000
406
- value: 0.134
407
- - type: precision_at_3
408
- value: 22.612
409
- - type: precision_at_5
410
- value: 15.273
411
- - type: recall_at_1
412
- value: 39.951
413
- - type: recall_at_10
414
- value: 70.053
415
- - type: recall_at_100
416
- value: 86.996
417
- - type: recall_at_1000
418
- value: 95.707
419
- - type: recall_at_3
420
- value: 56.032000000000004
421
- - type: recall_at_5
422
- value: 61.629999999999995
423
- - task:
424
- type: Retrieval
425
- dataset:
426
- type: BeIR/cqadupstack
427
- name: MTEB CQADupstackGisRetrieval
428
- config: default
429
- split: test
430
- revision: None
431
- metrics:
432
- - type: map_at_1
433
- value: 25.566
434
- - type: map_at_10
435
- value: 33.207
436
- - type: map_at_100
437
- value: 34.166000000000004
438
- - type: map_at_1000
439
- value: 34.245
440
- - type: map_at_3
441
- value: 30.94
442
- - type: map_at_5
443
- value: 32.01
444
- - type: mrr_at_1
445
- value: 27.345000000000002
446
- - type: mrr_at_10
447
- value: 35.193000000000005
448
- - type: mrr_at_100
449
- value: 35.965
450
- - type: mrr_at_1000
451
- value: 36.028999999999996
452
- - type: mrr_at_3
453
- value: 32.806000000000004
454
- - type: mrr_at_5
455
- value: 34.021
456
- - type: ndcg_at_1
457
- value: 27.345000000000002
458
- - type: ndcg_at_10
459
- value: 37.891999999999996
460
- - type: ndcg_at_100
461
- value: 42.664
462
- - type: ndcg_at_1000
463
- value: 44.757000000000005
464
- - type: ndcg_at_3
465
- value: 33.123000000000005
466
- - type: ndcg_at_5
467
- value: 35.035
468
- - type: precision_at_1
469
- value: 27.345000000000002
470
- - type: precision_at_10
471
- value: 5.763
472
- - type: precision_at_100
473
- value: 0.859
474
- - type: precision_at_1000
475
- value: 0.108
476
- - type: precision_at_3
477
- value: 13.71
478
- - type: precision_at_5
479
- value: 9.401
480
- - type: recall_at_1
481
- value: 25.566
482
- - type: recall_at_10
483
- value: 50.563
484
- - type: recall_at_100
485
- value: 72.86399999999999
486
- - type: recall_at_1000
487
- value: 88.68599999999999
488
- - type: recall_at_3
489
- value: 37.43
490
- - type: recall_at_5
491
- value: 41.894999999999996
492
- - task:
493
- type: Retrieval
494
- dataset:
495
- type: BeIR/cqadupstack
496
- name: MTEB CQADupstackMathematicaRetrieval
497
- config: default
498
- split: test
499
- revision: None
500
- metrics:
501
- - type: map_at_1
502
- value: 16.663
503
- - type: map_at_10
504
- value: 23.552
505
- - type: map_at_100
506
- value: 24.538
507
- - type: map_at_1000
508
- value: 24.661
509
- - type: map_at_3
510
- value: 21.085
511
- - type: map_at_5
512
- value: 22.391
513
- - type: mrr_at_1
514
- value: 20.025000000000002
515
- - type: mrr_at_10
516
- value: 27.643
517
- - type: mrr_at_100
518
- value: 28.499999999999996
519
- - type: mrr_at_1000
520
- value: 28.582
521
- - type: mrr_at_3
522
- value: 25.083
523
- - type: mrr_at_5
524
- value: 26.544
525
- - type: ndcg_at_1
526
- value: 20.025000000000002
527
- - type: ndcg_at_10
528
- value: 28.272000000000002
529
- - type: ndcg_at_100
530
- value: 33.353
531
- - type: ndcg_at_1000
532
- value: 36.454
533
- - type: ndcg_at_3
534
- value: 23.579
535
- - type: ndcg_at_5
536
- value: 25.685000000000002
537
- - type: precision_at_1
538
- value: 20.025000000000002
539
- - type: precision_at_10
540
- value: 5.187
541
- - type: precision_at_100
542
- value: 0.897
543
- - type: precision_at_1000
544
- value: 0.13
545
- - type: precision_at_3
546
- value: 10.987
547
- - type: precision_at_5
548
- value: 8.06
549
- - type: recall_at_1
550
- value: 16.663
551
- - type: recall_at_10
552
- value: 38.808
553
- - type: recall_at_100
554
- value: 61.305
555
- - type: recall_at_1000
556
- value: 83.571
557
- - type: recall_at_3
558
- value: 25.907999999999998
559
- - type: recall_at_5
560
- value: 31.214
561
- - task:
562
- type: Retrieval
563
- dataset:
564
- type: BeIR/cqadupstack
565
- name: MTEB CQADupstackPhysicsRetrieval
566
- config: default
567
- split: test
568
- revision: None
569
- metrics:
570
- - type: map_at_1
571
- value: 27.695999999999998
572
- - type: map_at_10
573
- value: 37.018
574
- - type: map_at_100
575
- value: 38.263000000000005
576
- - type: map_at_1000
577
- value: 38.371
578
- - type: map_at_3
579
- value: 34.226
580
- - type: map_at_5
581
- value: 35.809999999999995
582
- - type: mrr_at_1
583
- value: 32.916000000000004
584
- - type: mrr_at_10
585
- value: 42.067
586
- - type: mrr_at_100
587
- value: 42.925000000000004
588
- - type: mrr_at_1000
589
- value: 42.978
590
- - type: mrr_at_3
591
- value: 39.637
592
- - type: mrr_at_5
593
- value: 41.134
594
- - type: ndcg_at_1
595
- value: 32.916000000000004
596
- - type: ndcg_at_10
597
- value: 42.539
598
- - type: ndcg_at_100
599
- value: 47.873
600
- - type: ndcg_at_1000
601
- value: 50.08200000000001
602
- - type: ndcg_at_3
603
- value: 37.852999999999994
604
- - type: ndcg_at_5
605
- value: 40.201
606
- - type: precision_at_1
607
- value: 32.916000000000004
608
- - type: precision_at_10
609
- value: 7.5840000000000005
610
- - type: precision_at_100
611
- value: 1.199
612
- - type: precision_at_1000
613
- value: 0.155
614
- - type: precision_at_3
615
- value: 17.485
616
- - type: precision_at_5
617
- value: 12.512
618
- - type: recall_at_1
619
- value: 27.695999999999998
620
- - type: recall_at_10
621
- value: 53.638
622
- - type: recall_at_100
623
- value: 76.116
624
- - type: recall_at_1000
625
- value: 91.069
626
- - type: recall_at_3
627
- value: 41.13
628
- - type: recall_at_5
629
- value: 46.872
630
- - task:
631
- type: Retrieval
632
- dataset:
633
- type: BeIR/cqadupstack
634
- name: MTEB CQADupstackProgrammersRetrieval
635
- config: default
636
- split: test
637
- revision: None
638
- metrics:
639
- - type: map_at_1
640
- value: 24.108
641
- - type: map_at_10
642
- value: 33.372
643
- - type: map_at_100
644
- value: 34.656
645
- - type: map_at_1000
646
- value: 34.768
647
- - type: map_at_3
648
- value: 30.830999999999996
649
- - type: map_at_5
650
- value: 32.204
651
- - type: mrr_at_1
652
- value: 29.110000000000003
653
- - type: mrr_at_10
654
- value: 37.979
655
- - type: mrr_at_100
656
- value: 38.933
657
- - type: mrr_at_1000
658
- value: 38.988
659
- - type: mrr_at_3
660
- value: 35.731
661
- - type: mrr_at_5
662
- value: 36.963
663
- - type: ndcg_at_1
664
- value: 29.110000000000003
665
- - type: ndcg_at_10
666
- value: 38.635000000000005
667
- - type: ndcg_at_100
668
- value: 44.324999999999996
669
- - type: ndcg_at_1000
670
- value: 46.747
671
- - type: ndcg_at_3
672
- value: 34.37
673
- - type: ndcg_at_5
674
- value: 36.228
675
- - type: precision_at_1
676
- value: 29.110000000000003
677
- - type: precision_at_10
678
- value: 6.963
679
- - type: precision_at_100
680
- value: 1.146
681
- - type: precision_at_1000
682
- value: 0.152
683
- - type: precision_at_3
684
- value: 16.400000000000002
685
- - type: precision_at_5
686
- value: 11.552999999999999
687
- - type: recall_at_1
688
- value: 24.108
689
- - type: recall_at_10
690
- value: 49.597
691
- - type: recall_at_100
692
- value: 73.88900000000001
693
- - type: recall_at_1000
694
- value: 90.62400000000001
695
- - type: recall_at_3
696
- value: 37.662
697
- - type: recall_at_5
698
- value: 42.565
699
- - task:
700
- type: Retrieval
701
- dataset:
702
- type: BeIR/cqadupstack
703
- name: MTEB CQADupstackRetrieval
704
- config: default
705
- split: test
706
- revision: None
707
- metrics:
708
- - type: map_at_1
709
- value: 25.00791666666667
710
- - type: map_at_10
711
- value: 33.287749999999996
712
- - type: map_at_100
713
- value: 34.41141666666667
714
- - type: map_at_1000
715
- value: 34.52583333333333
716
- - type: map_at_3
717
- value: 30.734416666666668
718
- - type: map_at_5
719
- value: 32.137166666666666
720
- - type: mrr_at_1
721
- value: 29.305666666666664
722
- - type: mrr_at_10
723
- value: 37.22966666666666
724
- - type: mrr_at_100
725
- value: 38.066583333333334
726
- - type: mrr_at_1000
727
- value: 38.12616666666667
728
- - type: mrr_at_3
729
- value: 34.92275
730
- - type: mrr_at_5
731
- value: 36.23333333333334
732
- - type: ndcg_at_1
733
- value: 29.305666666666664
734
- - type: ndcg_at_10
735
- value: 38.25533333333333
736
- - type: ndcg_at_100
737
- value: 43.25266666666666
738
- - type: ndcg_at_1000
739
- value: 45.63583333333334
740
- - type: ndcg_at_3
741
- value: 33.777166666666666
742
- - type: ndcg_at_5
743
- value: 35.85
744
- - type: precision_at_1
745
- value: 29.305666666666664
746
- - type: precision_at_10
747
- value: 6.596416666666667
748
- - type: precision_at_100
749
- value: 1.0784166666666668
750
- - type: precision_at_1000
751
- value: 0.14666666666666664
752
- - type: precision_at_3
753
- value: 15.31075
754
- - type: precision_at_5
755
- value: 10.830916666666667
756
- - type: recall_at_1
757
- value: 25.00791666666667
758
- - type: recall_at_10
759
- value: 49.10933333333333
760
- - type: recall_at_100
761
- value: 71.09216666666667
762
- - type: recall_at_1000
763
- value: 87.77725000000001
764
- - type: recall_at_3
765
- value: 36.660916666666665
766
- - type: recall_at_5
767
- value: 41.94149999999999
768
- - task:
769
- type: Retrieval
770
- dataset:
771
- type: BeIR/cqadupstack
772
- name: MTEB CQADupstackStatsRetrieval
773
- config: default
774
- split: test
775
- revision: None
776
- metrics:
777
- - type: map_at_1
778
- value: 23.521
779
- - type: map_at_10
780
- value: 30.043
781
- - type: map_at_100
782
- value: 30.936000000000003
783
- - type: map_at_1000
784
- value: 31.022
785
- - type: map_at_3
786
- value: 27.926000000000002
787
- - type: map_at_5
788
- value: 29.076999999999998
789
- - type: mrr_at_1
790
- value: 26.227
791
- - type: mrr_at_10
792
- value: 32.822
793
- - type: mrr_at_100
794
- value: 33.61
795
- - type: mrr_at_1000
796
- value: 33.672000000000004
797
- - type: mrr_at_3
798
- value: 30.776999999999997
799
- - type: mrr_at_5
800
- value: 31.866
801
- - type: ndcg_at_1
802
- value: 26.227
803
- - type: ndcg_at_10
804
- value: 34.041
805
- - type: ndcg_at_100
806
- value: 38.394
807
- - type: ndcg_at_1000
808
- value: 40.732
809
- - type: ndcg_at_3
810
- value: 30.037999999999997
811
- - type: ndcg_at_5
812
- value: 31.845000000000002
813
- - type: precision_at_1
814
- value: 26.227
815
- - type: precision_at_10
816
- value: 5.244999999999999
817
- - type: precision_at_100
818
- value: 0.808
819
- - type: precision_at_1000
820
- value: 0.107
821
- - type: precision_at_3
822
- value: 12.679000000000002
823
- - type: precision_at_5
824
- value: 8.773
825
- - type: recall_at_1
826
- value: 23.521
827
- - type: recall_at_10
828
- value: 43.633
829
- - type: recall_at_100
830
- value: 63.126000000000005
831
- - type: recall_at_1000
832
- value: 80.765
833
- - type: recall_at_3
834
- value: 32.614
835
- - type: recall_at_5
836
- value: 37.15
837
- - task:
838
- type: Retrieval
839
- dataset:
840
- type: BeIR/cqadupstack
841
- name: MTEB CQADupstackTexRetrieval
842
- config: default
843
- split: test
844
- revision: None
845
- metrics:
846
- - type: map_at_1
847
- value: 16.236
848
- - type: map_at_10
849
- value: 22.898
850
- - type: map_at_100
851
- value: 23.878
852
- - type: map_at_1000
853
- value: 24.009
854
- - type: map_at_3
855
- value: 20.87
856
- - type: map_at_5
857
- value: 22.025
858
- - type: mrr_at_1
859
- value: 19.339000000000002
860
- - type: mrr_at_10
861
- value: 26.382
862
- - type: mrr_at_100
863
- value: 27.245
864
- - type: mrr_at_1000
865
- value: 27.33
866
- - type: mrr_at_3
867
- value: 24.386
868
- - type: mrr_at_5
869
- value: 25.496000000000002
870
- - type: ndcg_at_1
871
- value: 19.339000000000002
872
- - type: ndcg_at_10
873
- value: 27.139999999999997
874
- - type: ndcg_at_100
875
- value: 31.944
876
- - type: ndcg_at_1000
877
- value: 35.077999999999996
878
- - type: ndcg_at_3
879
- value: 23.424
880
- - type: ndcg_at_5
881
- value: 25.188
882
- - type: precision_at_1
883
- value: 19.339000000000002
884
- - type: precision_at_10
885
- value: 4.8309999999999995
886
- - type: precision_at_100
887
- value: 0.845
888
- - type: precision_at_1000
889
- value: 0.128
890
- - type: precision_at_3
891
- value: 10.874
892
- - type: precision_at_5
893
- value: 7.825
894
- - type: recall_at_1
895
- value: 16.236
896
- - type: recall_at_10
897
- value: 36.513
898
- - type: recall_at_100
899
- value: 57.999
900
- - type: recall_at_1000
901
- value: 80.512
902
- - type: recall_at_3
903
- value: 26.179999999999996
904
- - type: recall_at_5
905
- value: 30.712
906
- - task:
907
- type: Retrieval
908
- dataset:
909
- type: BeIR/cqadupstack
910
- name: MTEB CQADupstackUnixRetrieval
911
- config: default
912
- split: test
913
- revision: None
914
- metrics:
915
- - type: map_at_1
916
- value: 24.11
917
- - type: map_at_10
918
- value: 31.566
919
- - type: map_at_100
920
- value: 32.647
921
- - type: map_at_1000
922
- value: 32.753
923
- - type: map_at_3
924
- value: 29.24
925
- - type: map_at_5
926
- value: 30.564999999999998
927
- - type: mrr_at_1
928
- value: 28.265
929
- - type: mrr_at_10
930
- value: 35.504000000000005
931
- - type: mrr_at_100
932
- value: 36.436
933
- - type: mrr_at_1000
934
- value: 36.503
935
- - type: mrr_at_3
936
- value: 33.349000000000004
937
- - type: mrr_at_5
938
- value: 34.622
939
- - type: ndcg_at_1
940
- value: 28.265
941
- - type: ndcg_at_10
942
- value: 36.192
943
- - type: ndcg_at_100
944
- value: 41.388000000000005
945
- - type: ndcg_at_1000
946
- value: 43.948
947
- - type: ndcg_at_3
948
- value: 31.959
949
- - type: ndcg_at_5
950
- value: 33.998
951
- - type: precision_at_1
952
- value: 28.265
953
- - type: precision_at_10
954
- value: 5.989
955
- - type: precision_at_100
956
- value: 0.9650000000000001
957
- - type: precision_at_1000
958
- value: 0.13
959
- - type: precision_at_3
960
- value: 14.335
961
- - type: precision_at_5
962
- value: 10.112
963
- - type: recall_at_1
964
- value: 24.11
965
- - type: recall_at_10
966
- value: 46.418
967
- - type: recall_at_100
968
- value: 69.314
969
- - type: recall_at_1000
970
- value: 87.397
971
- - type: recall_at_3
972
- value: 34.724
973
- - type: recall_at_5
974
- value: 39.925
975
- - task:
976
- type: Retrieval
977
- dataset:
978
- type: BeIR/cqadupstack
979
- name: MTEB CQADupstackWebmastersRetrieval
980
- config: default
981
- split: test
982
- revision: None
983
- metrics:
984
- - type: map_at_1
985
- value: 22.091
986
- - type: map_at_10
987
- value: 29.948999999999998
988
- - type: map_at_100
989
- value: 31.502000000000002
990
- - type: map_at_1000
991
- value: 31.713
992
- - type: map_at_3
993
- value: 27.464
994
- - type: map_at_5
995
- value: 28.968
996
- - type: mrr_at_1
997
- value: 26.482
998
- - type: mrr_at_10
999
- value: 34.009
1000
- - type: mrr_at_100
1001
- value: 35.081
1002
- - type: mrr_at_1000
1003
- value: 35.138000000000005
1004
- - type: mrr_at_3
1005
- value: 31.785000000000004
1006
- - type: mrr_at_5
1007
- value: 33.178999999999995
1008
- - type: ndcg_at_1
1009
- value: 26.482
1010
- - type: ndcg_at_10
1011
- value: 35.008
1012
- - type: ndcg_at_100
1013
- value: 41.272999999999996
1014
- - type: ndcg_at_1000
1015
- value: 43.972
1016
- - type: ndcg_at_3
1017
- value: 30.804
1018
- - type: ndcg_at_5
1019
- value: 33.046
1020
- - type: precision_at_1
1021
- value: 26.482
1022
- - type: precision_at_10
1023
- value: 6.462
1024
- - type: precision_at_100
1025
- value: 1.431
1026
- - type: precision_at_1000
1027
- value: 0.22899999999999998
1028
- - type: precision_at_3
1029
- value: 14.360999999999999
1030
- - type: precision_at_5
1031
- value: 10.474
1032
- - type: recall_at_1
1033
- value: 22.091
1034
- - type: recall_at_10
1035
- value: 45.125
1036
- - type: recall_at_100
1037
- value: 72.313
1038
- - type: recall_at_1000
1039
- value: 89.503
1040
- - type: recall_at_3
1041
- value: 33.158
1042
- - type: recall_at_5
1043
- value: 39.086999999999996
1044
- - task:
1045
- type: Retrieval
1046
- dataset:
1047
- type: BeIR/cqadupstack
1048
- name: MTEB CQADupstackWordpressRetrieval
1049
- config: default
1050
- split: test
1051
- revision: None
1052
- metrics:
1053
- - type: map_at_1
1054
- value: 19.883
1055
- - type: map_at_10
1056
- value: 26.951000000000004
1057
- - type: map_at_100
1058
- value: 27.927999999999997
1059
- - type: map_at_1000
1060
- value: 28.022000000000002
1061
- - type: map_at_3
1062
- value: 24.616
1063
- - type: map_at_5
1064
- value: 25.917
1065
- - type: mrr_at_1
1066
- value: 21.996
1067
- - type: mrr_at_10
1068
- value: 29.221000000000004
1069
- - type: mrr_at_100
1070
- value: 30.024
1071
- - type: mrr_at_1000
1072
- value: 30.095
1073
- - type: mrr_at_3
1074
- value: 26.833000000000002
1075
- - type: mrr_at_5
1076
- value: 28.155
1077
- - type: ndcg_at_1
1078
- value: 21.996
1079
- - type: ndcg_at_10
1080
- value: 31.421
1081
- - type: ndcg_at_100
1082
- value: 36.237
1083
- - type: ndcg_at_1000
1084
- value: 38.744
1085
- - type: ndcg_at_3
1086
- value: 26.671
1087
- - type: ndcg_at_5
1088
- value: 28.907
1089
- - type: precision_at_1
1090
- value: 21.996
1091
- - type: precision_at_10
1092
- value: 5.009
1093
- - type: precision_at_100
1094
- value: 0.799
1095
- - type: precision_at_1000
1096
- value: 0.11199999999999999
1097
- - type: precision_at_3
1098
- value: 11.275
1099
- - type: precision_at_5
1100
- value: 8.059
1101
- - type: recall_at_1
1102
- value: 19.883
1103
- - type: recall_at_10
1104
- value: 43.132999999999996
1105
- - type: recall_at_100
1106
- value: 65.654
1107
- - type: recall_at_1000
1108
- value: 84.492
1109
- - type: recall_at_3
1110
- value: 30.209000000000003
1111
- - type: recall_at_5
1112
- value: 35.616
1113
- - task:
1114
- type: Retrieval
1115
- dataset:
1116
- type: climate-fever
1117
- name: MTEB ClimateFEVER
1118
- config: default
1119
- split: test
1120
- revision: None
1121
- metrics:
1122
- - type: map_at_1
1123
- value: 17.756
1124
- - type: map_at_10
1125
- value: 30.378
1126
- - type: map_at_100
1127
- value: 32.537
1128
- - type: map_at_1000
1129
- value: 32.717
1130
- - type: map_at_3
1131
- value: 25.599
1132
- - type: map_at_5
1133
- value: 28.372999999999998
1134
- - type: mrr_at_1
1135
- value: 41.303
1136
- - type: mrr_at_10
1137
- value: 53.483999999999995
1138
- - type: mrr_at_100
1139
- value: 54.106
1140
- - type: mrr_at_1000
1141
- value: 54.127
1142
- - type: mrr_at_3
1143
- value: 50.315
1144
- - type: mrr_at_5
1145
- value: 52.396
1146
- - type: ndcg_at_1
1147
- value: 41.303
1148
- - type: ndcg_at_10
1149
- value: 40.503
1150
- - type: ndcg_at_100
1151
- value: 47.821000000000005
1152
- - type: ndcg_at_1000
1153
- value: 50.788
1154
- - type: ndcg_at_3
1155
- value: 34.364
1156
- - type: ndcg_at_5
1157
- value: 36.818
1158
- - type: precision_at_1
1159
- value: 41.303
1160
- - type: precision_at_10
1161
- value: 12.463000000000001
1162
- - type: precision_at_100
1163
- value: 2.037
1164
- - type: precision_at_1000
1165
- value: 0.26
1166
- - type: precision_at_3
1167
- value: 25.798
1168
- - type: precision_at_5
1169
- value: 19.896
1170
- - type: recall_at_1
1171
- value: 17.756
1172
- - type: recall_at_10
1173
- value: 46.102
1174
- - type: recall_at_100
1175
- value: 70.819
1176
- - type: recall_at_1000
1177
- value: 87.21799999999999
1178
- - type: recall_at_3
1179
- value: 30.646
1180
- - type: recall_at_5
1181
- value: 38.022
1182
- - task:
1183
- type: Retrieval
1184
- dataset:
1185
- type: dbpedia-entity
1186
- name: MTEB DBPedia
1187
- config: default
1188
- split: test
1189
- revision: None
1190
- metrics:
1191
- - type: map_at_1
1192
- value: 9.033
1193
- - type: map_at_10
1194
- value: 20.584
1195
- - type: map_at_100
1196
- value: 29.518
1197
- - type: map_at_1000
1198
- value: 31.186000000000003
1199
- - type: map_at_3
1200
- value: 14.468
1201
- - type: map_at_5
1202
- value: 17.177
1203
- - type: mrr_at_1
1204
- value: 69.75
1205
- - type: mrr_at_10
1206
- value: 77.025
1207
- - type: mrr_at_100
1208
- value: 77.36699999999999
1209
- - type: mrr_at_1000
1210
- value: 77.373
1211
- - type: mrr_at_3
1212
- value: 75.583
1213
- - type: mrr_at_5
1214
- value: 76.396
1215
- - type: ndcg_at_1
1216
- value: 58.5
1217
- - type: ndcg_at_10
1218
- value: 45.033
1219
- - type: ndcg_at_100
1220
- value: 49.071
1221
- - type: ndcg_at_1000
1222
- value: 56.056
1223
- - type: ndcg_at_3
1224
- value: 49.936
1225
- - type: ndcg_at_5
1226
- value: 47.471999999999994
1227
- - type: precision_at_1
1228
- value: 69.75
1229
- - type: precision_at_10
1230
- value: 35.775
1231
- - type: precision_at_100
1232
- value: 11.594999999999999
1233
- - type: precision_at_1000
1234
- value: 2.062
1235
- - type: precision_at_3
1236
- value: 52.5
1237
- - type: precision_at_5
1238
- value: 45.300000000000004
1239
- - type: recall_at_1
1240
- value: 9.033
1241
- - type: recall_at_10
1242
- value: 26.596999999999998
1243
- - type: recall_at_100
1244
- value: 54.607000000000006
1245
- - type: recall_at_1000
1246
- value: 76.961
1247
- - type: recall_at_3
1248
- value: 15.754999999999999
1249
- - type: recall_at_5
1250
- value: 20.033
1251
- - task:
1252
- type: Classification
1253
- dataset:
1254
- type: mteb/emotion
1255
- name: MTEB EmotionClassification
1256
- config: default
1257
- split: test
1258
- revision: 4f58c6b202a23cf9a4da393831edf4f9183cad37
1259
- metrics:
1260
- - type: accuracy
1261
- value: 48.345000000000006
1262
- - type: f1
1263
- value: 43.4514918068706
1264
- - task:
1265
- type: Retrieval
1266
- dataset:
1267
- type: fever
1268
- name: MTEB FEVER
1269
- config: default
1270
- split: test
1271
- revision: None
1272
- metrics:
1273
- - type: map_at_1
1274
- value: 71.29100000000001
1275
- - type: map_at_10
1276
- value: 81.059
1277
- - type: map_at_100
1278
- value: 81.341
1279
- - type: map_at_1000
1280
- value: 81.355
1281
- - type: map_at_3
1282
- value: 79.74799999999999
1283
- - type: map_at_5
1284
- value: 80.612
1285
- - type: mrr_at_1
1286
- value: 76.40299999999999
1287
- - type: mrr_at_10
1288
- value: 84.615
1289
- - type: mrr_at_100
1290
- value: 84.745
1291
- - type: mrr_at_1000
1292
- value: 84.748
1293
- - type: mrr_at_3
1294
- value: 83.776
1295
- - type: mrr_at_5
1296
- value: 84.343
1297
- - type: ndcg_at_1
1298
- value: 76.40299999999999
1299
- - type: ndcg_at_10
1300
- value: 84.981
1301
- - type: ndcg_at_100
1302
- value: 86.00999999999999
1303
- - type: ndcg_at_1000
1304
- value: 86.252
1305
- - type: ndcg_at_3
1306
- value: 82.97
1307
- - type: ndcg_at_5
1308
- value: 84.152
1309
- - type: precision_at_1
1310
- value: 76.40299999999999
1311
- - type: precision_at_10
1312
- value: 10.446
1313
- - type: precision_at_100
1314
- value: 1.1199999999999999
1315
- - type: precision_at_1000
1316
- value: 0.116
1317
- - type: precision_at_3
1318
- value: 32.147999999999996
1319
- - type: precision_at_5
1320
- value: 20.135
1321
- - type: recall_at_1
1322
- value: 71.29100000000001
1323
- - type: recall_at_10
1324
- value: 93.232
1325
- - type: recall_at_100
1326
- value: 97.363
1327
- - type: recall_at_1000
1328
- value: 98.905
1329
- - type: recall_at_3
1330
- value: 87.893
1331
- - type: recall_at_5
1332
- value: 90.804
1333
- - task:
1334
- type: Retrieval
1335
- dataset:
1336
- type: fiqa
1337
- name: MTEB FiQA2018
1338
- config: default
1339
- split: test
1340
- revision: None
1341
- metrics:
1342
- - type: map_at_1
1343
- value: 18.667
1344
- - type: map_at_10
1345
- value: 30.853
1346
- - type: map_at_100
1347
- value: 32.494
1348
- - type: map_at_1000
1349
- value: 32.677
1350
- - type: map_at_3
1351
- value: 26.91
1352
- - type: map_at_5
1353
- value: 29.099000000000004
1354
- - type: mrr_at_1
1355
- value: 37.191
1356
- - type: mrr_at_10
1357
- value: 46.171
1358
- - type: mrr_at_100
1359
- value: 47.056
1360
- - type: mrr_at_1000
1361
- value: 47.099000000000004
1362
- - type: mrr_at_3
1363
- value: 44.059
1364
- - type: mrr_at_5
1365
- value: 45.147
1366
- - type: ndcg_at_1
1367
- value: 37.191
1368
- - type: ndcg_at_10
1369
- value: 38.437
1370
- - type: ndcg_at_100
1371
- value: 44.62
1372
- - type: ndcg_at_1000
1373
- value: 47.795
1374
- - type: ndcg_at_3
1375
- value: 35.003
1376
- - type: ndcg_at_5
1377
- value: 36.006
1378
- - type: precision_at_1
1379
- value: 37.191
1380
- - type: precision_at_10
1381
- value: 10.586
1382
- - type: precision_at_100
1383
- value: 1.688
1384
- - type: precision_at_1000
1385
- value: 0.22699999999999998
1386
- - type: precision_at_3
1387
- value: 23.302
1388
- - type: precision_at_5
1389
- value: 17.006
1390
- - type: recall_at_1
1391
- value: 18.667
1392
- - type: recall_at_10
1393
- value: 45.367000000000004
1394
- - type: recall_at_100
1395
- value: 68.207
1396
- - type: recall_at_1000
1397
- value: 87.072
1398
- - type: recall_at_3
1399
- value: 32.129000000000005
1400
- - type: recall_at_5
1401
- value: 37.719
1402
- - task:
1403
- type: Retrieval
1404
- dataset:
1405
- type: hotpotqa
1406
- name: MTEB HotpotQA
1407
- config: default
1408
- split: test
1409
- revision: None
1410
- metrics:
1411
- - type: map_at_1
1412
- value: 39.494
1413
- - type: map_at_10
1414
- value: 66.223
1415
- - type: map_at_100
1416
- value: 67.062
1417
- - type: map_at_1000
1418
- value: 67.11500000000001
1419
- - type: map_at_3
1420
- value: 62.867
1421
- - type: map_at_5
1422
- value: 64.994
1423
- - type: mrr_at_1
1424
- value: 78.987
1425
- - type: mrr_at_10
1426
- value: 84.585
1427
- - type: mrr_at_100
1428
- value: 84.773
1429
- - type: mrr_at_1000
1430
- value: 84.77900000000001
1431
- - type: mrr_at_3
1432
- value: 83.592
1433
- - type: mrr_at_5
1434
- value: 84.235
1435
- - type: ndcg_at_1
1436
- value: 78.987
1437
- - type: ndcg_at_10
1438
- value: 73.64
1439
- - type: ndcg_at_100
1440
- value: 76.519
1441
- - type: ndcg_at_1000
1442
- value: 77.51
1443
- - type: ndcg_at_3
1444
- value: 68.893
1445
- - type: ndcg_at_5
1446
- value: 71.585
1447
- - type: precision_at_1
1448
- value: 78.987
1449
- - type: precision_at_10
1450
- value: 15.529000000000002
1451
- - type: precision_at_100
1452
- value: 1.7770000000000001
1453
- - type: precision_at_1000
1454
- value: 0.191
1455
- - type: precision_at_3
1456
- value: 44.808
1457
- - type: precision_at_5
1458
- value: 29.006999999999998
1459
- - type: recall_at_1
1460
- value: 39.494
1461
- - type: recall_at_10
1462
- value: 77.643
1463
- - type: recall_at_100
1464
- value: 88.825
1465
- - type: recall_at_1000
1466
- value: 95.321
1467
- - type: recall_at_3
1468
- value: 67.211
1469
- - type: recall_at_5
1470
- value: 72.519
1471
- - task:
1472
- type: Classification
1473
- dataset:
1474
- type: mteb/imdb
1475
- name: MTEB ImdbClassification
1476
- config: default
1477
- split: test
1478
- revision: 3d86128a09e091d6018b6d26cad27f2739fc2db7
1479
- metrics:
1480
- - type: accuracy
1481
- value: 85.55959999999999
1482
- - type: ap
1483
- value: 80.7246500384617
1484
- - type: f1
1485
- value: 85.52336485065454
1486
- - task:
1487
- type: Retrieval
1488
- dataset:
1489
- type: msmarco
1490
- name: MTEB MSMARCO
1491
- config: default
1492
- split: dev
1493
- revision: None
1494
- metrics:
1495
- - type: map_at_1
1496
- value: 23.631
1497
- - type: map_at_10
1498
- value: 36.264
1499
- - type: map_at_100
1500
- value: 37.428
1501
- - type: map_at_1000
1502
- value: 37.472
1503
- - type: map_at_3
1504
- value: 32.537
1505
- - type: map_at_5
1506
- value: 34.746
1507
- - type: mrr_at_1
1508
- value: 24.312
1509
- - type: mrr_at_10
1510
- value: 36.858000000000004
1511
- - type: mrr_at_100
1512
- value: 37.966
1513
- - type: mrr_at_1000
1514
- value: 38.004
1515
- - type: mrr_at_3
1516
- value: 33.188
1517
- - type: mrr_at_5
1518
- value: 35.367
1519
- - type: ndcg_at_1
1520
- value: 24.312
1521
- - type: ndcg_at_10
1522
- value: 43.126999999999995
1523
- - type: ndcg_at_100
1524
- value: 48.642
1525
- - type: ndcg_at_1000
1526
- value: 49.741
1527
- - type: ndcg_at_3
1528
- value: 35.589
1529
- - type: ndcg_at_5
1530
- value: 39.515
1531
- - type: precision_at_1
1532
- value: 24.312
1533
- - type: precision_at_10
1534
- value: 6.699
1535
- - type: precision_at_100
1536
- value: 0.9450000000000001
1537
- - type: precision_at_1000
1538
- value: 0.104
1539
- - type: precision_at_3
1540
- value: 15.153
1541
- - type: precision_at_5
1542
- value: 11.065999999999999
1543
- - type: recall_at_1
1544
- value: 23.631
1545
- - type: recall_at_10
1546
- value: 64.145
1547
- - type: recall_at_100
1548
- value: 89.41
1549
- - type: recall_at_1000
1550
- value: 97.83500000000001
1551
- - type: recall_at_3
1552
- value: 43.769000000000005
1553
- - type: recall_at_5
1554
- value: 53.169
1555
- - task:
1556
- type: Classification
1557
- dataset:
1558
- type: mteb/mtop_domain
1559
- name: MTEB MTOPDomainClassification (en)
1560
- config: en
1561
- split: test
1562
- revision: d80d48c1eb48d3562165c59d59d0034df9fff0bf
1563
- metrics:
1564
- - type: accuracy
1565
- value: 93.4108527131783
1566
- - type: f1
1567
- value: 93.1415880261038
1568
- - task:
1569
- type: Classification
1570
- dataset:
1571
- type: mteb/mtop_intent
1572
- name: MTEB MTOPIntentClassification (en)
1573
- config: en
1574
- split: test
1575
- revision: ae001d0e6b1228650b7bd1c2c65fb50ad11a8aba
1576
- metrics:
1577
- - type: accuracy
1578
- value: 77.24806201550388
1579
- - type: f1
1580
- value: 60.531916308197175
1581
- - task:
1582
- type: Classification
1583
- dataset:
1584
- type: mteb/amazon_massive_intent
1585
- name: MTEB MassiveIntentClassification (en)
1586
- config: en
1587
- split: test
1588
- revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7
1589
- metrics:
1590
- - type: accuracy
1591
- value: 73.71553463349024
1592
- - type: f1
1593
- value: 71.70753174900791
1594
- - task:
1595
- type: Classification
1596
- dataset:
1597
- type: mteb/amazon_massive_scenario
1598
- name: MTEB MassiveScenarioClassification (en)
1599
- config: en
1600
- split: test
1601
- revision: 7d571f92784cd94a019292a1f45445077d0ef634
1602
- metrics:
1603
- - type: accuracy
1604
- value: 77.79757901815736
1605
- - type: f1
1606
- value: 77.83719850433258
1607
- - task:
1608
- type: Clustering
1609
- dataset:
1610
- type: mteb/medrxiv-clustering-p2p
1611
- name: MTEB MedrxivClusteringP2P
1612
- config: default
1613
- split: test
1614
- revision: e7a26af6f3ae46b30dde8737f02c07b1505bcc73
1615
- metrics:
1616
- - type: v_measure
1617
- value: 33.74193296622113
1618
- - task:
1619
- type: Clustering
1620
- dataset:
1621
- type: mteb/medrxiv-clustering-s2s
1622
- name: MTEB MedrxivClusteringS2S
1623
- config: default
1624
- split: test
1625
- revision: 35191c8c0dca72d8ff3efcd72aa802307d469663
1626
- metrics:
1627
- - type: v_measure
1628
- value: 30.64257594108566
1629
- - task:
1630
- type: Reranking
1631
- dataset:
1632
- type: mteb/mind_small
1633
- name: MTEB MindSmallReranking
1634
- config: default
1635
- split: test
1636
- revision: 3bdac13927fdc888b903db93b2ffdbd90b295a69
1637
- metrics:
1638
- - type: map
1639
- value: 30.811018518883625
1640
- - type: mrr
1641
- value: 31.910376577445003
1642
- - task:
1643
- type: Retrieval
1644
- dataset:
1645
- type: nfcorpus
1646
- name: MTEB NFCorpus
1647
- config: default
1648
- split: test
1649
- revision: None
1650
- metrics:
1651
- - type: map_at_1
1652
- value: 5.409
1653
- - type: map_at_10
1654
- value: 13.093
1655
- - type: map_at_100
1656
- value: 16.256999999999998
1657
- - type: map_at_1000
1658
- value: 17.617
1659
- - type: map_at_3
1660
- value: 9.555
1661
- - type: map_at_5
1662
- value: 11.428
1663
- - type: mrr_at_1
1664
- value: 45.201
1665
- - type: mrr_at_10
1666
- value: 54.179
1667
- - type: mrr_at_100
1668
- value: 54.812000000000005
1669
- - type: mrr_at_1000
1670
- value: 54.840999999999994
1671
- - type: mrr_at_3
1672
- value: 51.909000000000006
1673
- - type: mrr_at_5
1674
- value: 53.519000000000005
1675
- - type: ndcg_at_1
1676
- value: 43.189
1677
- - type: ndcg_at_10
1678
- value: 35.028
1679
- - type: ndcg_at_100
1680
- value: 31.226
1681
- - type: ndcg_at_1000
1682
- value: 39.678000000000004
1683
- - type: ndcg_at_3
1684
- value: 40.596
1685
- - type: ndcg_at_5
1686
- value: 38.75
1687
- - type: precision_at_1
1688
- value: 44.582
1689
- - type: precision_at_10
1690
- value: 25.974999999999998
1691
- - type: precision_at_100
1692
- value: 7.793
1693
- - type: precision_at_1000
1694
- value: 2.036
1695
- - type: precision_at_3
1696
- value: 38.493
1697
- - type: precision_at_5
1698
- value: 33.994
1699
- - type: recall_at_1
1700
- value: 5.409
1701
- - type: recall_at_10
1702
- value: 16.875999999999998
1703
- - type: recall_at_100
1704
- value: 30.316
1705
- - type: recall_at_1000
1706
- value: 60.891
1707
- - type: recall_at_3
1708
- value: 10.688
1709
- - type: recall_at_5
1710
- value: 13.832
1711
- - task:
1712
- type: Retrieval
1713
- dataset:
1714
- type: nq
1715
- name: MTEB NQ
1716
- config: default
1717
- split: test
1718
- revision: None
1719
- metrics:
1720
- - type: map_at_1
1721
- value: 36.375
1722
- - type: map_at_10
1723
- value: 51.991
1724
- - type: map_at_100
1725
- value: 52.91400000000001
1726
- - type: map_at_1000
1727
- value: 52.93600000000001
1728
- - type: map_at_3
1729
- value: 48.014
1730
- - type: map_at_5
1731
- value: 50.381
1732
- - type: mrr_at_1
1733
- value: 40.759
1734
- - type: mrr_at_10
1735
- value: 54.617000000000004
1736
- - type: mrr_at_100
1737
- value: 55.301
1738
- - type: mrr_at_1000
1739
- value: 55.315000000000005
1740
- - type: mrr_at_3
1741
- value: 51.516
1742
- - type: mrr_at_5
1743
- value: 53.435
1744
- - type: ndcg_at_1
1745
- value: 40.759
1746
- - type: ndcg_at_10
1747
- value: 59.384
1748
- - type: ndcg_at_100
1749
- value: 63.157
1750
- - type: ndcg_at_1000
1751
- value: 63.654999999999994
1752
- - type: ndcg_at_3
1753
- value: 52.114000000000004
1754
- - type: ndcg_at_5
1755
- value: 55.986000000000004
1756
- - type: precision_at_1
1757
- value: 40.759
1758
- - type: precision_at_10
1759
- value: 9.411999999999999
1760
- - type: precision_at_100
1761
- value: 1.153
1762
- - type: precision_at_1000
1763
- value: 0.12
1764
- - type: precision_at_3
1765
- value: 23.329
1766
- - type: precision_at_5
1767
- value: 16.256999999999998
1768
- - type: recall_at_1
1769
- value: 36.375
1770
- - type: recall_at_10
1771
- value: 79.053
1772
- - type: recall_at_100
1773
- value: 95.167
1774
- - type: recall_at_1000
1775
- value: 98.82
1776
- - type: recall_at_3
1777
- value: 60.475
1778
- - type: recall_at_5
1779
- value: 69.327
1780
- - task:
1781
- type: Retrieval
1782
- dataset:
1783
- type: quora
1784
- name: MTEB QuoraRetrieval
1785
- config: default
1786
- split: test
1787
- revision: None
1788
- metrics:
1789
- - type: map_at_1
1790
- value: 70.256
1791
- - type: map_at_10
1792
- value: 83.8
1793
- - type: map_at_100
1794
- value: 84.425
1795
- - type: map_at_1000
1796
- value: 84.444
1797
- - type: map_at_3
1798
- value: 80.906
1799
- - type: map_at_5
1800
- value: 82.717
1801
- - type: mrr_at_1
1802
- value: 80.97999999999999
1803
- - type: mrr_at_10
1804
- value: 87.161
1805
- - type: mrr_at_100
1806
- value: 87.262
1807
- - type: mrr_at_1000
1808
- value: 87.263
1809
- - type: mrr_at_3
1810
- value: 86.175
1811
- - type: mrr_at_5
1812
- value: 86.848
1813
- - type: ndcg_at_1
1814
- value: 80.97999999999999
1815
- - type: ndcg_at_10
1816
- value: 87.697
1817
- - type: ndcg_at_100
1818
- value: 88.959
1819
- - type: ndcg_at_1000
1820
- value: 89.09899999999999
1821
- - type: ndcg_at_3
1822
- value: 84.83800000000001
1823
- - type: ndcg_at_5
1824
- value: 86.401
1825
- - type: precision_at_1
1826
- value: 80.97999999999999
1827
- - type: precision_at_10
1828
- value: 13.261000000000001
1829
- - type: precision_at_100
1830
- value: 1.5150000000000001
1831
- - type: precision_at_1000
1832
- value: 0.156
1833
- - type: precision_at_3
1834
- value: 37.01
1835
- - type: precision_at_5
1836
- value: 24.298000000000002
1837
- - type: recall_at_1
1838
- value: 70.256
1839
- - type: recall_at_10
1840
- value: 94.935
1841
- - type: recall_at_100
1842
- value: 99.274
1843
- - type: recall_at_1000
1844
- value: 99.928
1845
- - type: recall_at_3
1846
- value: 86.602
1847
- - type: recall_at_5
1848
- value: 91.133
1849
- - task:
1850
- type: Clustering
1851
- dataset:
1852
- type: mteb/reddit-clustering
1853
- name: MTEB RedditClustering
1854
- config: default
1855
- split: test
1856
- revision: 24640382cdbf8abc73003fb0fa6d111a705499eb
1857
- metrics:
1858
- - type: v_measure
1859
- value: 56.322692497613104
1860
- - task:
1861
- type: Clustering
1862
- dataset:
1863
- type: mteb/reddit-clustering-p2p
1864
- name: MTEB RedditClusteringP2P
1865
- config: default
1866
- split: test
1867
- revision: 282350215ef01743dc01b456c7f5241fa8937f16
1868
- metrics:
1869
- - type: v_measure
1870
- value: 61.895813503775074
1871
- - task:
1872
- type: Retrieval
1873
- dataset:
1874
- type: scidocs
1875
- name: MTEB SCIDOCS
1876
- config: default
1877
- split: test
1878
- revision: None
1879
- metrics:
1880
- - type: map_at_1
1881
- value: 4.338
1882
- - type: map_at_10
1883
- value: 10.767
1884
- - type: map_at_100
1885
- value: 12.537999999999998
1886
- - type: map_at_1000
1887
- value: 12.803999999999998
1888
- - type: map_at_3
1889
- value: 7.788
1890
- - type: map_at_5
1891
- value: 9.302000000000001
1892
- - type: mrr_at_1
1893
- value: 21.4
1894
- - type: mrr_at_10
1895
- value: 31.637999999999998
1896
- - type: mrr_at_100
1897
- value: 32.688
1898
- - type: mrr_at_1000
1899
- value: 32.756
1900
- - type: mrr_at_3
1901
- value: 28.433000000000003
1902
- - type: mrr_at_5
1903
- value: 30.178
1904
- - type: ndcg_at_1
1905
- value: 21.4
1906
- - type: ndcg_at_10
1907
- value: 18.293
1908
- - type: ndcg_at_100
1909
- value: 25.274
1910
- - type: ndcg_at_1000
1911
- value: 30.284
1912
- - type: ndcg_at_3
1913
- value: 17.391000000000002
1914
- - type: ndcg_at_5
1915
- value: 15.146999999999998
1916
- - type: precision_at_1
1917
- value: 21.4
1918
- - type: precision_at_10
1919
- value: 9.48
1920
- - type: precision_at_100
1921
- value: 1.949
1922
- - type: precision_at_1000
1923
- value: 0.316
1924
- - type: precision_at_3
1925
- value: 16.167
1926
- - type: precision_at_5
1927
- value: 13.22
1928
- - type: recall_at_1
1929
- value: 4.338
1930
- - type: recall_at_10
1931
- value: 19.213
1932
- - type: recall_at_100
1933
- value: 39.562999999999995
1934
- - type: recall_at_1000
1935
- value: 64.08
1936
- - type: recall_at_3
1937
- value: 9.828000000000001
1938
- - type: recall_at_5
1939
- value: 13.383000000000001
1940
- - task:
1941
- type: STS
1942
- dataset:
1943
- type: mteb/sickr-sts
1944
- name: MTEB SICK-R
1945
- config: default
1946
- split: test
1947
- revision: a6ea5a8cab320b040a23452cc28066d9beae2cee
1948
- metrics:
1949
- - type: cos_sim_pearson
1950
- value: 82.42568163642142
1951
- - type: cos_sim_spearman
1952
- value: 78.5797159641342
1953
- - type: euclidean_pearson
1954
- value: 80.22151260811604
1955
- - type: euclidean_spearman
1956
- value: 78.5797151953878
1957
- - type: manhattan_pearson
1958
- value: 80.21224215864788
1959
- - type: manhattan_spearman
1960
- value: 78.55641478381344
1961
- - task:
1962
- type: STS
1963
- dataset:
1964
- type: mteb/sts12-sts
1965
- name: MTEB STS12
1966
- config: default
1967
- split: test
1968
- revision: a0d554a64d88156834ff5ae9920b964011b16384
1969
- metrics:
1970
- - type: cos_sim_pearson
1971
- value: 85.44020710812569
1972
- - type: cos_sim_spearman
1973
- value: 78.91631735081286
1974
- - type: euclidean_pearson
1975
- value: 81.64188964182102
1976
- - type: euclidean_spearman
1977
- value: 78.91633286881678
1978
- - type: manhattan_pearson
1979
- value: 81.69294748512496
1980
- - type: manhattan_spearman
1981
- value: 78.93438558002656
1982
- - task:
1983
- type: STS
1984
- dataset:
1985
- type: mteb/sts13-sts
1986
- name: MTEB STS13
1987
- config: default
1988
- split: test
1989
- revision: 7e90230a92c190f1bf69ae9002b8cea547a64cca
1990
- metrics:
1991
- - type: cos_sim_pearson
1992
- value: 84.27165426412311
1993
- - type: cos_sim_spearman
1994
- value: 85.40429140249618
1995
- - type: euclidean_pearson
1996
- value: 84.7509580724893
1997
- - type: euclidean_spearman
1998
- value: 85.40429140249618
1999
- - type: manhattan_pearson
2000
- value: 84.76488289321308
2001
- - type: manhattan_spearman
2002
- value: 85.4256793698708
2003
- - task:
2004
- type: STS
2005
- dataset:
2006
- type: mteb/sts14-sts
2007
- name: MTEB STS14
2008
- config: default
2009
- split: test
2010
- revision: 6031580fec1f6af667f0bd2da0a551cf4f0b2375
2011
- metrics:
2012
- - type: cos_sim_pearson
2013
- value: 83.138851760732
2014
- - type: cos_sim_spearman
2015
- value: 81.64101363896586
2016
- - type: euclidean_pearson
2017
- value: 82.55165038934942
2018
- - type: euclidean_spearman
2019
- value: 81.64105257080502
2020
- - type: manhattan_pearson
2021
- value: 82.52802949883335
2022
- - type: manhattan_spearman
2023
- value: 81.61255430718158
2024
- - task:
2025
- type: STS
2026
- dataset:
2027
- type: mteb/sts15-sts
2028
- name: MTEB STS15
2029
- config: default
2030
- split: test
2031
- revision: ae752c7c21bf194d8b67fd573edf7ae58183cbe3
2032
- metrics:
2033
- - type: cos_sim_pearson
2034
- value: 86.0654695484029
2035
- - type: cos_sim_spearman
2036
- value: 87.20408521902229
2037
- - type: euclidean_pearson
2038
- value: 86.8110651362115
2039
- - type: euclidean_spearman
2040
- value: 87.20408521902229
2041
- - type: manhattan_pearson
2042
- value: 86.77984656478691
2043
- - type: manhattan_spearman
2044
- value: 87.1719947099227
2045
- - task:
2046
- type: STS
2047
- dataset:
2048
- type: mteb/sts16-sts
2049
- name: MTEB STS16
2050
- config: default
2051
- split: test
2052
- revision: 4d8694f8f0e0100860b497b999b3dbed754a0513
2053
- metrics:
2054
- - type: cos_sim_pearson
2055
- value: 83.77823915496512
2056
- - type: cos_sim_spearman
2057
- value: 85.43566325729779
2058
- - type: euclidean_pearson
2059
- value: 84.5396956658821
2060
- - type: euclidean_spearman
2061
- value: 85.43566325729779
2062
- - type: manhattan_pearson
2063
- value: 84.5665398848169
2064
- - type: manhattan_spearman
2065
- value: 85.44375870303232
2066
- - task:
2067
- type: STS
2068
- dataset:
2069
- type: mteb/sts17-crosslingual-sts
2070
- name: MTEB STS17 (en-en)
2071
- config: en-en
2072
- split: test
2073
- revision: af5e6fb845001ecf41f4c1e033ce921939a2a68d
2074
- metrics:
2075
- - type: cos_sim_pearson
2076
- value: 87.20030208471798
2077
- - type: cos_sim_spearman
2078
- value: 87.20485505076539
2079
- - type: euclidean_pearson
2080
- value: 88.10588324368722
2081
- - type: euclidean_spearman
2082
- value: 87.20485505076539
2083
- - type: manhattan_pearson
2084
- value: 87.92324770415183
2085
- - type: manhattan_spearman
2086
- value: 87.0571314561877
2087
- - task:
2088
- type: STS
2089
- dataset:
2090
- type: mteb/sts22-crosslingual-sts
2091
- name: MTEB STS22 (en)
2092
- config: en
2093
- split: test
2094
- revision: 6d1ba47164174a496b7fa5d3569dae26a6813b80
2095
- metrics:
2096
- - type: cos_sim_pearson
2097
- value: 63.06093161604453
2098
- - type: cos_sim_spearman
2099
- value: 64.2163140357722
2100
- - type: euclidean_pearson
2101
- value: 65.27589680994006
2102
- - type: euclidean_spearman
2103
- value: 64.2163140357722
2104
- - type: manhattan_pearson
2105
- value: 65.45904383711101
2106
- - type: manhattan_spearman
2107
- value: 64.55404716679305
2108
- - task:
2109
- type: STS
2110
- dataset:
2111
- type: mteb/stsbenchmark-sts
2112
- name: MTEB STSBenchmark
2113
- config: default
2114
- split: test
2115
- revision: b0fddb56ed78048fa8b90373c8a3cfc37b684831
2116
- metrics:
2117
- - type: cos_sim_pearson
2118
- value: 84.32976164578706
2119
- - type: cos_sim_spearman
2120
- value: 85.54302197678368
2121
- - type: euclidean_pearson
2122
- value: 85.26307149193056
2123
- - type: euclidean_spearman
2124
- value: 85.54302197678368
2125
- - type: manhattan_pearson
2126
- value: 85.26647282029371
2127
- - type: manhattan_spearman
2128
- value: 85.5316135265568
2129
- - task:
2130
- type: Reranking
2131
- dataset:
2132
- type: mteb/scidocs-reranking
2133
- name: MTEB SciDocsRR
2134
- config: default
2135
- split: test
2136
- revision: d3c5e1fc0b855ab6097bf1cda04dd73947d7caab
2137
- metrics:
2138
- - type: map
2139
- value: 81.44675968318754
2140
- - type: mrr
2141
- value: 94.92741826075158
2142
- - task:
2143
- type: Retrieval
2144
- dataset:
2145
- type: scifact
2146
- name: MTEB SciFact
2147
- config: default
2148
- split: test
2149
- revision: None
2150
- metrics:
2151
- - type: map_at_1
2152
- value: 56.34400000000001
2153
- - type: map_at_10
2154
- value: 65.927
2155
- - type: map_at_100
2156
- value: 66.431
2157
- - type: map_at_1000
2158
- value: 66.461
2159
- - type: map_at_3
2160
- value: 63.529
2161
- - type: map_at_5
2162
- value: 64.818
2163
- - type: mrr_at_1
2164
- value: 59.333000000000006
2165
- - type: mrr_at_10
2166
- value: 67.54599999999999
2167
- - type: mrr_at_100
2168
- value: 67.892
2169
- - type: mrr_at_1000
2170
- value: 67.917
2171
- - type: mrr_at_3
2172
- value: 65.778
2173
- - type: mrr_at_5
2174
- value: 66.794
2175
- - type: ndcg_at_1
2176
- value: 59.333000000000006
2177
- - type: ndcg_at_10
2178
- value: 70.5
2179
- - type: ndcg_at_100
2180
- value: 72.688
2181
- - type: ndcg_at_1000
2182
- value: 73.483
2183
- - type: ndcg_at_3
2184
- value: 66.338
2185
- - type: ndcg_at_5
2186
- value: 68.265
2187
- - type: precision_at_1
2188
- value: 59.333000000000006
2189
- - type: precision_at_10
2190
- value: 9.3
2191
- - type: precision_at_100
2192
- value: 1.053
2193
- - type: precision_at_1000
2194
- value: 0.11199999999999999
2195
- - type: precision_at_3
2196
- value: 25.889
2197
- - type: precision_at_5
2198
- value: 16.866999999999997
2199
- - type: recall_at_1
2200
- value: 56.34400000000001
2201
- - type: recall_at_10
2202
- value: 82.789
2203
- - type: recall_at_100
2204
- value: 92.767
2205
- - type: recall_at_1000
2206
- value: 99
2207
- - type: recall_at_3
2208
- value: 71.64399999999999
2209
- - type: recall_at_5
2210
- value: 76.322
2211
- - task:
2212
- type: PairClassification
2213
- dataset:
2214
- type: mteb/sprintduplicatequestions-pairclassification
2215
- name: MTEB SprintDuplicateQuestions
2216
- config: default
2217
- split: test
2218
- revision: d66bd1f72af766a5cc4b0ca5e00c162f89e8cc46
2219
- metrics:
2220
- - type: cos_sim_accuracy
2221
- value: 99.75742574257426
2222
- - type: cos_sim_ap
2223
- value: 93.52081548447406
2224
- - type: cos_sim_f1
2225
- value: 87.33850129198966
2226
- - type: cos_sim_precision
2227
- value: 90.37433155080214
2228
- - type: cos_sim_recall
2229
- value: 84.5
2230
- - type: dot_accuracy
2231
- value: 99.75742574257426
2232
- - type: dot_ap
2233
- value: 93.52081548447406
2234
- - type: dot_f1
2235
- value: 87.33850129198966
2236
- - type: dot_precision
2237
- value: 90.37433155080214
2238
- - type: dot_recall
2239
- value: 84.5
2240
- - type: euclidean_accuracy
2241
- value: 99.75742574257426
2242
- - type: euclidean_ap
2243
- value: 93.52081548447406
2244
- - type: euclidean_f1
2245
- value: 87.33850129198966
2246
- - type: euclidean_precision
2247
- value: 90.37433155080214
2248
- - type: euclidean_recall
2249
- value: 84.5
2250
- - type: manhattan_accuracy
2251
- value: 99.75841584158415
2252
- - type: manhattan_ap
2253
- value: 93.4975678585854
2254
- - type: manhattan_f1
2255
- value: 87.26708074534162
2256
- - type: manhattan_precision
2257
- value: 90.45064377682404
2258
- - type: manhattan_recall
2259
- value: 84.3
2260
- - type: max_accuracy
2261
- value: 99.75841584158415
2262
- - type: max_ap
2263
- value: 93.52081548447406
2264
- - type: max_f1
2265
- value: 87.33850129198966
2266
- - task:
2267
- type: Clustering
2268
- dataset:
2269
- type: mteb/stackexchange-clustering
2270
- name: MTEB StackExchangeClustering
2271
- config: default
2272
- split: test
2273
- revision: 6cbc1f7b2bc0622f2e39d2c77fa502909748c259
2274
- metrics:
2275
- - type: v_measure
2276
- value: 64.31437036686651
2277
- - task:
2278
- type: Clustering
2279
- dataset:
2280
- type: mteb/stackexchange-clustering-p2p
2281
- name: MTEB StackExchangeClusteringP2P
2282
- config: default
2283
- split: test
2284
- revision: 815ca46b2622cec33ccafc3735d572c266efdb44
2285
- metrics:
2286
- - type: v_measure
2287
- value: 33.25569319007206
2288
- - task:
2289
- type: Reranking
2290
- dataset:
2291
- type: mteb/stackoverflowdupquestions-reranking
2292
- name: MTEB StackOverflowDupQuestions
2293
- config: default
2294
- split: test
2295
- revision: e185fbe320c72810689fc5848eb6114e1ef5ec69
2296
- metrics:
2297
- - type: map
2298
- value: 49.90474939720706
2299
- - type: mrr
2300
- value: 50.568115503777264
2301
- - task:
2302
- type: Summarization
2303
- dataset:
2304
- type: mteb/summeval
2305
- name: MTEB SummEval
2306
- config: default
2307
- split: test
2308
- revision: cda12ad7615edc362dbf25a00fdd61d3b1eaf93c
2309
- metrics:
2310
- - type: cos_sim_pearson
2311
- value: 29.866828641244712
2312
- - type: cos_sim_spearman
2313
- value: 30.077555055873866
2314
- - type: dot_pearson
2315
- value: 29.866832988572266
2316
- - type: dot_spearman
2317
- value: 30.077555055873866
2318
- - task:
2319
- type: Retrieval
2320
- dataset:
2321
- type: trec-covid
2322
- name: MTEB TRECCOVID
2323
- config: default
2324
- split: test
2325
- revision: None
2326
- metrics:
2327
- - type: map_at_1
2328
- value: 0.232
2329
- - type: map_at_10
2330
- value: 2.094
2331
- - type: map_at_100
2332
- value: 11.971
2333
- - type: map_at_1000
2334
- value: 28.158
2335
- - type: map_at_3
2336
- value: 0.688
2337
- - type: map_at_5
2338
- value: 1.114
2339
- - type: mrr_at_1
2340
- value: 88
2341
- - type: mrr_at_10
2342
- value: 93.4
2343
- - type: mrr_at_100
2344
- value: 93.4
2345
- - type: mrr_at_1000
2346
- value: 93.4
2347
- - type: mrr_at_3
2348
- value: 93
2349
- - type: mrr_at_5
2350
- value: 93.4
2351
- - type: ndcg_at_1
2352
- value: 84
2353
- - type: ndcg_at_10
2354
- value: 79.923
2355
- - type: ndcg_at_100
2356
- value: 61.17
2357
- - type: ndcg_at_1000
2358
- value: 53.03
2359
- - type: ndcg_at_3
2360
- value: 84.592
2361
- - type: ndcg_at_5
2362
- value: 82.821
2363
- - type: precision_at_1
2364
- value: 88
2365
- - type: precision_at_10
2366
- value: 85
2367
- - type: precision_at_100
2368
- value: 63.019999999999996
2369
- - type: precision_at_1000
2370
- value: 23.554
2371
- - type: precision_at_3
2372
- value: 89.333
2373
- - type: precision_at_5
2374
- value: 87.2
2375
- - type: recall_at_1
2376
- value: 0.232
2377
- - type: recall_at_10
2378
- value: 2.255
2379
- - type: recall_at_100
2380
- value: 14.823
2381
- - type: recall_at_1000
2382
- value: 49.456
2383
- - type: recall_at_3
2384
- value: 0.718
2385
- - type: recall_at_5
2386
- value: 1.175
2387
- - task:
2388
- type: Retrieval
2389
- dataset:
2390
- type: webis-touche2020
2391
- name: MTEB Touche2020
2392
- config: default
2393
- split: test
2394
- revision: None
2395
- metrics:
2396
- - type: map_at_1
2397
- value: 2.547
2398
- - type: map_at_10
2399
- value: 11.375
2400
- - type: map_at_100
2401
- value: 18.194
2402
- - type: map_at_1000
2403
- value: 19.749
2404
- - type: map_at_3
2405
- value: 5.825
2406
- - type: map_at_5
2407
- value: 8.581
2408
- - type: mrr_at_1
2409
- value: 32.653
2410
- - type: mrr_at_10
2411
- value: 51.32
2412
- - type: mrr_at_100
2413
- value: 51.747
2414
- - type: mrr_at_1000
2415
- value: 51.747
2416
- - type: mrr_at_3
2417
- value: 47.278999999999996
2418
- - type: mrr_at_5
2419
- value: 48.605
2420
- - type: ndcg_at_1
2421
- value: 29.592000000000002
2422
- - type: ndcg_at_10
2423
- value: 28.151
2424
- - type: ndcg_at_100
2425
- value: 39.438
2426
- - type: ndcg_at_1000
2427
- value: 50.769
2428
- - type: ndcg_at_3
2429
- value: 30.758999999999997
2430
- - type: ndcg_at_5
2431
- value: 30.366
2432
- - type: precision_at_1
2433
- value: 32.653
2434
- - type: precision_at_10
2435
- value: 25.714
2436
- - type: precision_at_100
2437
- value: 8.041
2438
- - type: precision_at_1000
2439
- value: 1.555
2440
- - type: precision_at_3
2441
- value: 33.333
2442
- - type: precision_at_5
2443
- value: 31.837
2444
- - type: recall_at_1
2445
- value: 2.547
2446
- - type: recall_at_10
2447
- value: 18.19
2448
- - type: recall_at_100
2449
- value: 49.538
2450
- - type: recall_at_1000
2451
- value: 83.86
2452
- - type: recall_at_3
2453
- value: 7.329
2454
- - type: recall_at_5
2455
- value: 11.532
2456
- - task:
2457
- type: Classification
2458
- dataset:
2459
- type: mteb/toxic_conversations_50k
2460
- name: MTEB ToxicConversationsClassification
2461
- config: default
2462
- split: test
2463
- revision: d7c0de2777da35d6aae2200a62c6e0e5af397c4c
2464
- metrics:
2465
- - type: accuracy
2466
- value: 71.4952
2467
- - type: ap
2468
- value: 14.793362635531409
2469
- - type: f1
2470
- value: 55.204635551516915
2471
- - task:
2472
- type: Classification
2473
- dataset:
2474
- type: mteb/tweet_sentiment_extraction
2475
- name: MTEB TweetSentimentExtractionClassification
2476
- config: default
2477
- split: test
2478
- revision: d604517c81ca91fe16a244d1248fc021f9ecee7a
2479
- metrics:
2480
- - type: accuracy
2481
- value: 61.5365025466893
2482
- - type: f1
2483
- value: 61.81742556334845
2484
- - task:
2485
- type: Clustering
2486
- dataset:
2487
- type: mteb/twentynewsgroups-clustering
2488
- name: MTEB TwentyNewsgroupsClustering
2489
- config: default
2490
- split: test
2491
- revision: 6125ec4e24fa026cec8a478383ee943acfbd5449
2492
- metrics:
2493
- - type: v_measure
2494
- value: 49.05531070301185
2495
- - task:
2496
- type: PairClassification
2497
- dataset:
2498
- type: mteb/twittersemeval2015-pairclassification
2499
- name: MTEB TwitterSemEval2015
2500
- config: default
2501
- split: test
2502
- revision: 70970daeab8776df92f5ea462b6173c0b46fd2d1
2503
- metrics:
2504
- - type: cos_sim_accuracy
2505
- value: 86.51725576682364
2506
- - type: cos_sim_ap
2507
- value: 75.2292304265163
2508
- - type: cos_sim_f1
2509
- value: 69.54022988505749
2510
- - type: cos_sim_precision
2511
- value: 63.65629110039457
2512
- - type: cos_sim_recall
2513
- value: 76.62269129287598
2514
- - type: dot_accuracy
2515
- value: 86.51725576682364
2516
- - type: dot_ap
2517
- value: 75.22922386081054
2518
- - type: dot_f1
2519
- value: 69.54022988505749
2520
- - type: dot_precision
2521
- value: 63.65629110039457
2522
- - type: dot_recall
2523
- value: 76.62269129287598
2524
- - type: euclidean_accuracy
2525
- value: 86.51725576682364
2526
- - type: euclidean_ap
2527
- value: 75.22925730473472
2528
- - type: euclidean_f1
2529
- value: 69.54022988505749
2530
- - type: euclidean_precision
2531
- value: 63.65629110039457
2532
- - type: euclidean_recall
2533
- value: 76.62269129287598
2534
- - type: manhattan_accuracy
2535
- value: 86.52321630804077
2536
- - type: manhattan_ap
2537
- value: 75.20608115037336
2538
- - type: manhattan_f1
2539
- value: 69.60000000000001
2540
- - type: manhattan_precision
2541
- value: 64.37219730941705
2542
- - type: manhattan_recall
2543
- value: 75.75197889182058
2544
- - type: max_accuracy
2545
- value: 86.52321630804077
2546
- - type: max_ap
2547
- value: 75.22925730473472
2548
- - type: max_f1
2549
- value: 69.60000000000001
2550
- - task:
2551
- type: PairClassification
2552
- dataset:
2553
- type: mteb/twitterurlcorpus-pairclassification
2554
- name: MTEB TwitterURLCorpus
2555
- config: default
2556
- split: test
2557
- revision: 8b6510b0b1fa4e4c4f879467980e9be563ec1cdf
2558
- metrics:
2559
- - type: cos_sim_accuracy
2560
- value: 89.34877944657896
2561
- - type: cos_sim_ap
2562
- value: 86.71257569277373
2563
- - type: cos_sim_f1
2564
- value: 79.10386355986088
2565
- - type: cos_sim_precision
2566
- value: 76.91468470434214
2567
- - type: cos_sim_recall
2568
- value: 81.4213119802895
2569
- - type: dot_accuracy
2570
- value: 89.34877944657896
2571
- - type: dot_ap
2572
- value: 86.71257133133368
2573
- - type: dot_f1
2574
- value: 79.10386355986088
2575
- - type: dot_precision
2576
- value: 76.91468470434214
2577
- - type: dot_recall
2578
- value: 81.4213119802895
2579
- - type: euclidean_accuracy
2580
- value: 89.34877944657896
2581
- - type: euclidean_ap
2582
- value: 86.71257651501476
2583
- - type: euclidean_f1
2584
- value: 79.10386355986088
2585
- - type: euclidean_precision
2586
- value: 76.91468470434214
2587
- - type: euclidean_recall
2588
- value: 81.4213119802895
2589
- - type: manhattan_accuracy
2590
- value: 89.35848177901967
2591
- - type: manhattan_ap
2592
- value: 86.69330615469126
2593
- - type: manhattan_f1
2594
- value: 79.13867741453949
2595
- - type: manhattan_precision
2596
- value: 76.78881807647741
2597
- - type: manhattan_recall
2598
- value: 81.63689559593472
2599
- - type: max_accuracy
2600
- value: 89.35848177901967
2601
- - type: max_ap
2602
- value: 86.71257651501476
2603
- - type: max_f1
2604
- value: 79.13867741453949
2605
- license: apache-2.0
2606
- language:
2607
- - en
2608
- ---
2609
-
2610
-
2611
- # nomic-embed-text-v1: A Reproducible Long Context (8192) Text Embedder
2612
-
2613
- `nomic-embed-text-v1` is an 8192 context length text encoder that surpasses OpenAI text-embedding-ada-002 and text-embedding-3-small on both short and long context tasks.
2614
-
2615
-
2616
-
2617
- | Name | SeqLen | MTEB | LoCo | Jina Long Context | Open Weights | Open Training Code | Open Data |
2618
- | :-------------------------------:| :----- | :-------- | :------: | :---------------: | :-----------: | :----------------: | :---------- |
2619
- | nomic-embed-text-v1 | 8192 | **62.39** |**85.53** | 54.16 | ✅ | ✅ | ✅ |
2620
- | jina-embeddings-v2-base-en | 8192 | 60.39 | 85.45 | 51.90 | ✅ | ❌ | ❌ |
2621
- | text-embedding-3-small | 8191 | 62.26 | 82.40 | **58.20** | ❌ | ❌ | ❌ |
2622
- | text-embedding-ada-002 | 8191 | 60.99 | 52.7 | 55.25 | ❌ | ❌ | ❌ |
2623
-
2624
-
2625
- ## Hosted Inference API
2626
-
2627
- The easiest way to get started with Nomic Embed is through the Nomic Embedding API.
2628
-
2629
- Generating embeddings with the `nomic` Python client is as easy as
2630
-
2631
- ```python
2632
- from nomic import embed
2633
-
2634
- output = embed.text(
2635
- texts=['Nomic Embedding API', '#keepAIOpen'],
2636
- model='nomic-embed-text-v1',
2637
- task_type='search_document'
2638
- )
2639
-
2640
- print(output)
2641
- ```
2642
-
2643
- For more information, see the [API reference](https://docs.nomic.ai/reference/endpoints/nomic-embed-text).
2644
-
2645
- ## Data Visualization
2646
- Click the Nomic Atlas map below to visualize a 5M sample of our contrastive pretraining data!
2647
-
2648
-
2649
- [![image/webp](https://cdn-uploads.huggingface.co/production/uploads/607997c83a565c15675055b3/pjhJhuNyRfPagRd_c_iUz.webp)](https://atlas.nomic.ai/map/nomic-text-embed-v1-5m-sample)
2650
-
2651
- ## Training Details
2652
-
2653
- We train our embedder using a multi-stage training pipeline. Starting from a long-context [BERT model](https://huggingface.co/nomic-ai/nomic-bert-2048),
2654
- the first unsupervised contrastive stage trains on a dataset generated from weakly related text pairs, such as question-answer pairs from forums like StackExchange and Quora, title-body pairs from Amazon reviews, and summarizations from news articles.
2655
-
2656
- In the second finetuning stage, higher quality labeled datasets such as search queries and answers from web searches are leveraged. Data curation and hard-example mining are crucial in this stage.
2657
-
2658
- For more details, see the Nomic Embed [Technical Report](https://static.nomic.ai/reports/2024_Nomic_Embed_Text_Technical_Report.pdf) and corresponding [blog post](https://blog.nomic.ai/posts/nomic-embed-text-v1).
2659
-
2660
- The training data used to train the models is released in its entirety. For more details, see the `contrastors` [repository](https://github.com/nomic-ai/contrastors).
2661
-
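The full training code lives in the `contrastors` repository linked above; the snippet below is only a minimal, illustrative sketch of an InfoNCE-style contrastive objective with in-batch negatives, the general family of losses used in both stages. The temperature, batch construction, and (absent) hard-negative mining here are assumptions, not the exact recipe used for `nomic-embed-text-v1`.

```python
import torch
import torch.nn.functional as F

def info_nce_loss(query_emb: torch.Tensor, doc_emb: torch.Tensor, temperature: float = 0.05) -> torch.Tensor:
    """Contrastive loss with in-batch negatives.

    query_emb, doc_emb: (batch, dim) embeddings; row i of each tensor forms a positive pair.
    Every other document in the batch serves as a negative for query i.
    """
    query_emb = F.normalize(query_emb, p=2, dim=1)
    doc_emb = F.normalize(doc_emb, p=2, dim=1)
    logits = query_emb @ doc_emb.T / temperature  # (batch, batch) cosine similarities
    labels = torch.arange(logits.size(0), device=logits.device)  # positives sit on the diagonal
    return F.cross_entropy(logits, labels)

# Toy usage with random tensors standing in for encoder outputs
queries, docs = torch.randn(8, 768), torch.randn(8, 768)
print(info_nce_loss(queries, docs))
```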
2662
- ## Usage
2663
-
2664
- Note `nomic-embed-text` requires prefixes! We support the prefixes `[search_query, search_document, classification, clustering]`.
2665
- For retrieval applications, you should prepend `search_document` for all your documents and `search_query` for your queries.
2666
-
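As a concrete (hypothetical) retrieval sketch, documents are embedded with the `search_document` prefix and queries with the `search_query` prefix before computing cosine similarity; the example texts below are made up.

```python
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)

# Corpus gets the search_document prefix, the query gets the search_query prefix
docs = [
    "search_document: Nomic Embed is an open source long-context text embedding model.",
    "search_document: t-SNE is a dimensionality reduction technique by Laurens van der Maaten.",
]
query = "search_query: Who created t-SNE?"

doc_emb = model.encode(docs, convert_to_tensor=True)
query_emb = model.encode(query, convert_to_tensor=True)
print(util.cos_sim(query_emb, doc_emb))  # the t-SNE document should score highest
```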
2667
- ### Sentence Transformers
2668
- ```python
2669
- from sentence_transformers import SentenceTransformer
2670
-
2671
- model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)
2672
- sentences = ['search_query: What is TSNE?', 'search_query: Who is Laurens van der Maaten?']
2673
- embeddings = model.encode(sentences)
2674
- print(embeddings)
2675
- ```
2676
-
2677
- ### Transformers
2678
-
2679
- ```python
2680
- import torch
2681
- import torch.nn.functional as F
2682
- from transformers import AutoTokenizer, AutoModel
2683
-
2684
- def mean_pooling(model_output, attention_mask):
2685
- token_embeddings = model_output[0]
2686
- input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
2687
- return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
2688
-
2689
- sentences = ['search_query: What is TSNE?', 'search_query: Who is Laurens van der Maaten?']
2690
-
2691
- tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
2692
- model = AutoModel.from_pretrained('nomic-ai/nomic-embed-text-v1', trust_remote_code=True)
2693
- model.eval()
2694
-
2695
- encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
2696
-
2697
- with torch.no_grad():
2698
- model_output = model(**encoded_input)
2699
-
2700
- embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
2701
- embeddings = F.normalize(embeddings, p=2, dim=1)
2702
- print(embeddings)
2703
- ```
2704
-
2705
- The model natively supports scaling of the sequence length past 2048 tokens. To do so, apply the following changes:
2706
-
2707
- ```diff
2708
- - tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
2709
- + tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', model_max_length=8192)
2710
-
2711
-
2712
- - model = AutoModel.from_pretrained('nomic-ai/nomic-embed-text-v1', trust_remote_code=True)
2713
- + model = AutoModel.from_pretrained('nomic-ai/nomic-embed-text-v1', trust_remote_code=True, rotary_scaling_factor=2)
2714
- ```
2715
-
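Putting both changes together, a minimal end-to-end sketch for embedding a document longer than 2048 tokens (the placeholder text below is made up) looks like:

```python
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', model_max_length=8192)
model = AutoModel.from_pretrained('nomic-ai/nomic-embed-text-v1', trust_remote_code=True, rotary_scaling_factor=2)
model.eval()

long_document = "search_document: " + "some very long text ... " * 2000  # stand-in for a real document
encoded = tokenizer([long_document], padding=True, truncation=True, return_tensors='pt')

with torch.no_grad():
    output = model(**encoded)

# Mean-pool over the attention mask (as in the example above), then L2-normalize
mask = encoded['attention_mask'].unsqueeze(-1).float()
embedding = (output[0] * mask).sum(1) / mask.sum(1).clamp(min=1e-9)
embedding = F.normalize(embedding, p=2, dim=1)
print(embedding.shape)
```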
2716
- ### Transformers.js
2717
-
2718
- ```js
2719
- import { pipeline } from '@xenova/transformers';
2720
-
2721
- // Create a feature extraction pipeline
2722
- const extractor = await pipeline('feature-extraction', 'nomic-ai/nomic-embed-text-v1', {
2723
- quantized: false, // Comment out this line to use the quantized version
2724
- });
2725
-
2726
- // Compute sentence embeddings
2727
- const texts = ['What is TSNE?', 'Who is Laurens van der Maaten?'];
2728
- const embeddings = await extractor(texts, { pooling: 'mean', normalize: true });
2729
- console.log(embeddings);
2730
- ```
2731
-
2732
- # Join the Nomic Community
2733
-
2734
- - Nomic: [https://nomic.ai](https://nomic.ai)
2735
- - Discord: [https://discord.gg/myY5YDR8z8](https://discord.gg/myY5YDR8z8)
2736
- - Twitter: [https://twitter.com/nomic_ai](https://twitter.com/nomic_ai)
 
pretrained/nomic-ai/nomic-embed-text-v1/config.json DELETED
@@ -1,56 +0,0 @@
1
- {
2
- "activation_function": "swiglu",
3
- "architectures": [
4
- "NomicBertModel"
5
- ],
6
- "attn_pdrop": 0.0,
7
- "auto_map": {
8
- "AutoConfig": "configuration_hf_nomic_bert.NomicBertConfig",
9
- "AutoModel": "modeling_hf_nomic_bert.NomicBertModel",
10
- "AutoModelForMaskedLM": "nomic-ai/nomic-bert-2048--modeling_hf_nomic_bert.NomicBertForPreTraining"
11
- },
12
- "bos_token_id": null,
13
- "causal": false,
14
- "dense_seq_output": true,
15
- "embd_pdrop": 0.0,
16
- "eos_token_id": null,
17
- "fused_bias_fc": true,
18
- "fused_dropout_add_ln": true,
19
- "initializer_range": 0.02,
20
- "layer_norm_epsilon": 1e-12,
21
- "mlp_fc1_bias": false,
22
- "mlp_fc2_bias": false,
23
- "model_type": "nomic_bert",
24
- "n_embd": 768,
25
- "n_head": 12,
26
- "n_inner": 3072,
27
- "n_layer": 12,
28
- "n_positions": 8192,
29
- "pad_vocab_size_multiple": 64,
30
- "parallel_block": false,
31
- "parallel_block_tied_norm": false,
32
- "prenorm": false,
33
- "qkv_proj_bias": false,
34
- "reorder_and_upcast_attn": false,
35
- "resid_pdrop": 0.0,
36
- "rotary_emb_base": 1000,
37
- "rotary_emb_fraction": 1.0,
38
- "rotary_emb_interleaved": false,
39
- "rotary_emb_scale_base": null,
40
- "rotary_scaling_factor": 2,
41
- "scale_attn_by_inverse_layer_idx": false,
42
- "scale_attn_weights": true,
43
- "summary_activation": null,
44
- "summary_first_dropout": 0.0,
45
- "summary_proj_to_labels": true,
46
- "summary_type": "cls_index",
47
- "summary_use_proj": true,
48
- "torch_dtype": "float32",
49
- "transformers_version": "4.34.0",
50
- "type_vocab_size": 2,
51
- "use_cache": true,
52
- "use_flash_attn": true,
53
- "use_rms_norm": false,
54
- "use_xentropy": true,
55
- "vocab_size": 30528
56
- }
 
pretrained/nomic-ai/nomic-embed-text-v1/config_sentence_transformers.json DELETED
@@ -1,7 +0,0 @@
1
- {
2
- "__version__": {
3
- "sentence_transformers": "2.4.0.dev0",
4
- "transformers": "4.37.2",
5
- "pytorch": "2.1.0+cu121"
6
- }
7
- }
 
pretrained/nomic-ai/nomic-embed-text-v1/configuration_hf_nomic_bert.py DELETED
@@ -1,53 +0,0 @@
1
- from transformers import GPT2Config
2
-
3
-
4
- class NomicBertConfig(GPT2Config):
5
- model_type = "nomic_bert"
6
-
7
- def __init__(self,
8
- prenorm=False,
9
- parallel_block=False,
10
- parallel_block_tied_norm=False,
11
- rotary_emb_fraction=0.0,
12
- fused_dropout_add_ln=False,
13
- fused_bias_fc=False,
14
- use_flash_attn=False,
15
- use_xentropy=False,
16
- qkv_proj_bias=True,
17
- rotary_emb_base=1000,
18
- rotary_emb_scale_base=None,
19
- rotary_emb_interleaved=False,
20
- mlp_fc1_bias=True,
21
- mlp_fc2_bias=True,
22
- use_rms_norm=False,
23
- causal=False,
24
- type_vocab_size=2,
25
- dense_seq_output=True,
26
- pad_vocab_size_multiple=1,
27
- tie_word_embeddings=True,
28
- rotary_scaling_factor=1.0,
29
- **kwargs,
30
- ):
31
- self.prenorm = prenorm
32
- self.parallel_block = parallel_block
33
- self.parallel_block_tied_norm = parallel_block_tied_norm
34
- self.rotary_emb_fraction = rotary_emb_fraction
35
- self.tie_word_embeddings = tie_word_embeddings
36
- self.fused_dropout_add_ln = fused_dropout_add_ln
37
- self.fused_bias_fc = fused_bias_fc
38
- self.use_flash_attn = use_flash_attn
39
- self.use_xentropy = use_xentropy
40
- self.qkv_proj_bias = qkv_proj_bias
41
- self.rotary_emb_base = rotary_emb_base
42
- self.rotary_emb_scale_base = rotary_emb_scale_base
43
- self.rotary_emb_interleaved = rotary_emb_interleaved
44
- self.mlp_fc1_bias = mlp_fc1_bias
45
- self.mlp_fc2_bias = mlp_fc2_bias
46
- self.use_rms_norm = use_rms_norm
47
- self.causal = causal
48
- self.type_vocab_size = type_vocab_size
49
- self.dense_seq_output = dense_seq_output
50
- self.pad_vocab_size_multiple = pad_vocab_size_multiple
51
- self.rotary_scaling_factor = rotary_scaling_factor
52
-
53
- super().__init__(**kwargs)
 
pretrained/nomic-ai/nomic-embed-text-v1/model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:47e396424a085a613034450cd4bf9e8acfb568b19089ae1c5c4e7051ae286877
3
- size 546938168
 
pretrained/nomic-ai/nomic-embed-text-v1/modeling_hf_nomic_bert.py DELETED
@@ -1,1238 +0,0 @@
1
- # Copyright (c) 2022, Tri Dao.
2
- # This BERT implementation is based on our MLPerf 2.0 and MLPerf 2.1 BERT implementation.
3
- # https://github.com/mlcommons/training_results_v2.0/blob/main/HazyResearch/benchmarks/bert/implementations/pytorch/modeling.py
4
- # https://github.com/mlcommons/training_results_v2.1/blob/main/Azure-HazyResearch/benchmarks/bert/implementations/ND96amsr_A100_v4/modeling.py
5
-
6
- # Inspired by https://github.com/huggingface/transformers/blob/main/src/transformers/models/bert/modeling_bert.py
7
- import os
8
- import logging
9
- from functools import partial
10
- from typing import Optional, List, Tuple, Union
11
-
12
- import torch
13
- import torch.nn as nn
14
- import torch.nn.functional as F
15
- from einops import rearrange, repeat
16
- from transformers import GPT2Config, PreTrainedModel
17
- from transformers.models.bert.modeling_bert import (
18
- BaseModelOutputWithPoolingAndCrossAttentions,
19
- MaskedLMOutput,
20
- SequenceClassifierOutput
21
- )
22
-
23
- import re
24
- from collections import OrderedDict
25
- from safetensors.torch import load_file as safe_load_file
26
- from transformers.utils import (
27
- SAFE_WEIGHTS_INDEX_NAME,
28
- SAFE_WEIGHTS_NAME,
29
- WEIGHTS_INDEX_NAME,
30
- WEIGHTS_NAME,
31
- )
32
- from transformers.utils.hub import cached_file, get_checkpoint_shard_files
33
-
34
-
35
- from .configuration_hf_nomic_bert import NomicBertConfig
36
-
37
- logger = logging.getLogger(__name__)
38
-
39
- # adapted from flash attention, added safe serialization option for hf models
40
- def state_dict_from_pretrained(model_name, safe_serialization=False, device=None, dtype=None):
41
- # If not fp32, then we don't want to load directly to the GPU
42
- mapped_device = "cpu" if dtype not in [torch.float32, None] else device
43
- is_sharded = False
44
- load_safe = False
45
- resolved_archive_file = None
46
-
47
- weights_path = os.path.join(model_name, WEIGHTS_NAME)
48
- weights_index_path = os.path.join(model_name, WEIGHTS_INDEX_NAME)
49
- safe_weights_path = os.path.join(model_name, SAFE_WEIGHTS_NAME)
50
- safe_weights_index_path = os.path.join(model_name, SAFE_WEIGHTS_INDEX_NAME)
51
-
52
- if os.path.isfile(weights_path):
53
- resolved_archive_file = cached_file(
54
- model_name, WEIGHTS_NAME, _raise_exceptions_for_missing_entries=False
55
- )
56
- elif os.path.isfile(weights_index_path):
57
- resolved_archive_file = cached_file(
58
- model_name, WEIGHTS_INDEX_NAME, _raise_exceptions_for_missing_entries=False
59
- )
60
- is_sharded = True
61
- elif os.path.isfile(safe_weights_path):
62
- resolved_archive_file = cached_file(
63
- model_name, SAFE_WEIGHTS_NAME, _raise_exceptions_for_missing_entries=False
64
- )
65
- load_safe = True
66
- elif os.path.isfile(safe_weights_index_path):
67
- resolved_archive_file = cached_file(
68
- model_name, SAFE_WEIGHTS_INDEX_NAME, _raise_exceptions_for_missing_entries=False
69
- )
70
- is_sharded = True
71
- load_safe = True
72
- else: # Try loading from HF hub instead of from local files
73
- weight_name = WEIGHTS_NAME if not safe_serialization else SAFE_WEIGHTS_NAME
74
- resolved_archive_file = cached_file(model_name, weight_name, _raise_exceptions_for_missing_entries=False)
75
- if resolved_archive_file is None:
76
- weight_index = WEIGHTS_INDEX_NAME if not safe_serialization else SAFE_WEIGHTS_INDEX_NAME
77
- resolved_archive_file = cached_file(model_name, weight_index,
78
- _raise_exceptions_for_missing_entries=False)
79
- if resolved_archive_file is not None:
80
- is_sharded = True
81
-
82
- load_safe = safe_serialization
83
-
84
- if resolved_archive_file is None:
85
- raise EnvironmentError(f"Model name {model_name} was not found.")
86
-
87
- if load_safe:
88
- loader = partial(safe_load_file, device=mapped_device)
89
- else:
90
- loader = partial(torch.load, map_location=mapped_device)
91
-
92
- if is_sharded:
93
- # resolved_archive_file becomes a list of files that point to the different
94
- # checkpoint shards in this case.
95
- resolved_archive_file, sharded_metadata = get_checkpoint_shard_files(
96
- model_name, resolved_archive_file
97
- )
98
- state_dict = {}
99
- for sharded_file in resolved_archive_file:
100
- state_dict.update(loader(sharded_file))
101
- else:
102
- state_dict = loader(resolved_archive_file)
103
- # Convert dtype before moving to GPU to save memory
104
- if dtype is not None:
105
- state_dict = {k: v.to(dtype=dtype) for k, v in state_dict.items()}
106
- state_dict = {k: v.to(device=device) for k, v in state_dict.items()}
107
- return state_dict
108
-
109
-
110
- def filter_shapes(state_dict, model):
111
- """
112
- Filters the state dict to match the current model shape.
113
- """
114
- filtered_state_dict = {}
115
- for key, value in state_dict.items():
116
- if key in model.state_dict():
117
- if value.shape == model.state_dict()[key].shape:
118
- filtered_state_dict[key] = value
119
- return filtered_state_dict
120
-
121
-
122
- def remap_bert_state_dict(state_dict, config, remove_bert=False, remove_cls_weights=False, add_pooling_layer=False):
123
- """
124
- Map the state_dict of a Huggingface BERT model to be flash_attn compatible.
125
- """
126
- def add_bert_prefix(key):
127
- # prepend bert. to the key
128
- if key.startswith("bert.") or key.startswith("cls."):
129
- return key
130
- return f"bert.{key}"
131
-
132
- state_dict = OrderedDict((add_bert_prefix(k), v) for k, v in state_dict.items())
133
-
134
- # LayerNorm
135
- def key_mapping_ln_gamma_beta(key):
136
- key = re.sub(r"LayerNorm.gamma$", "LayerNorm.weight", key)
137
- key = re.sub(r"LayerNorm.beta$", "LayerNorm.bias", key)
138
- return key
139
-
140
- state_dict = OrderedDict((key_mapping_ln_gamma_beta(k), v) for k, v in state_dict.items())
141
-
142
- # Layers
143
- def key_mapping_layers(key):
144
- return re.sub(r"^bert.encoder.layer\.", "bert.encoder.layers.", key)
145
-
146
- state_dict = OrderedDict((key_mapping_layers(k), v) for k, v in state_dict.items())
147
-
148
- # LayerNorm
149
- def key_mapping_ln(key):
150
- key = re.sub(r"^bert.embeddings.LayerNorm.", "bert.emb_ln.", key)
151
- key = re.sub(
152
- r"^bert.encoder.layers.(\d+).attention.output.LayerNorm.(weight|bias)",
153
- r"bert.encoder.layers.\1.norm1.\2",
154
- key,
155
- )
156
- key = re.sub(
157
- r"^bert.encoder.layers.(\d+).output.LayerNorm.(weight|bias)",
158
- r"bert.encoder.layers.\1.norm2.\2",
159
- key,
160
- )
161
- key = re.sub(
162
- r"^cls.predictions.transform.LayerNorm.(weight|bias)",
163
- r"cls.predictions.transform.layer_norm.\1",
164
- key,
165
- )
166
- return key
167
-
168
- state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items())
169
-
170
- # MLP
171
- def key_mapping_mlp(key):
172
- key = re.sub(
173
- r"^bert.encoder.layers.(\d+).intermediate.dense.(weight|bias)",
174
- r"bert.encoder.layers.\1.mlp.fc1.\2",
175
- key,
176
- )
177
- key = re.sub(
178
- r"^bert.encoder.layers.(\d+).output.dense.(weight|bias)",
179
- r"bert.encoder.layers.\1.mlp.fc2.\2",
180
- key,
181
- )
182
- return key
183
-
184
- state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items())
185
-
186
- # Attention
187
- last_layer_subset = getattr(config, "last_layer_subset", False)
188
- for d in range(config.num_hidden_layers):
189
- if f"bert.encoder.layers.{d}.attention.self.query.weight" not in state_dict:
190
- continue
191
- Wq = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.query.weight")
192
- Wk = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.key.weight")
193
- Wv = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.value.weight")
194
- bq = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.query.bias")
195
- bk = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.key.bias")
196
- bv = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.value.bias")
197
- if not (last_layer_subset and d == config.num_hidden_layers - 1):
198
- state_dict[f"bert.encoder.layers.{d}.attn.Wqkv.weight"] = torch.cat(
199
- [Wq, Wk, Wv], dim=0
200
- )
201
- state_dict[f"bert.encoder.layers.{d}.attn.Wqkv.bias"] = torch.cat([bq, bk, bv], dim=0)
202
- else:
203
- state_dict[f"bert.encoder.layers.{d}.attn.Wq.weight"] = Wq
204
- state_dict[f"bert.encoder.layers.{d}.attn.Wkv.weight"] = torch.cat([Wk, Wv], dim=0)
205
- state_dict[f"bert.encoder.layers.{d}.attn.Wq.bias"] = bq
206
- state_dict[f"bert.encoder.layers.{d}.attn.Wkv.bias"] = torch.cat([bk, bv], dim=0)
207
-
208
- def key_mapping_attn(key):
209
- return re.sub(
210
- r"^bert.encoder.layers.(\d+).attention.output.dense.(weight|bias)",
211
- r"bert.encoder.layers.\1.attn.out_proj.\2",
212
- key,
213
- )
214
-
215
- state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())
216
-
217
- def key_mapping_decoder_bias(key):
218
- return re.sub(r"^cls.predictions.bias", "cls.predictions.decoder.bias", key)
219
-
220
-
221
- # remove nsp weights, we don't use
222
- state_dict.pop("cls.seq_relationship.weight", None)
223
- state_dict.pop("cls.seq_relationship.bias", None)
224
- state_dict.pop("bert.embeddings.position_ids", None)
225
-
226
- state_dict = OrderedDict((key_mapping_decoder_bias(k), v) for k, v in state_dict.items())
227
-
228
- if remove_cls_weights:
229
- cls_weights = ["cls.predictions.decoder.bias",
230
- "cls.predictions.transform.dense.weight",
231
- "cls.predictions.transform.dense.bias",
232
- "cls.predictions.transform.layer_norm.weight",
233
- "cls.predictions.transform.layer_norm.bias",
234
- "cls.predictions.decoder.weight"]
235
- for weight in cls_weights:
236
- state_dict.pop(weight, None)
237
-
238
- # Word embedding
239
- pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
240
- if pad_vocab_size_multiple > 1:
241
- word_embeddings = state_dict["bert.embeddings.word_embeddings.weight"]
242
- state_dict["bert.embeddings.word_embeddings.weight"] = F.pad(
243
- word_embeddings, (0, 0, 0, config.vocab_size - word_embeddings.shape[0])
244
- )
245
- if not remove_cls_weights:
246
- decoder_weight = state_dict["cls.predictions.decoder.weight"]
247
- state_dict["cls.predictions.decoder.weight"] = F.pad(
248
- decoder_weight, (0, 0, 0, config.vocab_size - decoder_weight.shape[0])
249
- )
250
- # If the vocab was padded, we want to set the decoder bias for those padded indices to be
251
- # strongly negative (i.e. the decoder shouldn't predict those indices).
252
- # TD [2022-05-09]: I don't think it affects the MLPerf training.
253
- if "cls.predictions.decoder.bias" in state_dict:
254
- decoder_bias = state_dict["cls.predictions.decoder.bias"]
255
- state_dict["cls.predictions.decoder.bias"] = F.pad(
256
- decoder_bias, (0, config.vocab_size - decoder_bias.shape[0]), value=-100.0
257
- )
258
-
259
- if add_pooling_layer is False:
260
- pooler_weights = ["bert.pooler.dense.weight",
261
- "bert.pooler.dense.bias",
262
- ]
263
- for key in pooler_weights:
264
- state_dict.pop(key, None)
265
-
266
- if remove_bert:
267
- def remove_bert_prefix(key):
268
- key = re.sub(r"^bert.", "", key)
269
- return key
270
-
271
- state_dict = OrderedDict((remove_bert_prefix(k), v) for k, v in state_dict.items())
272
-
273
-
274
- return state_dict
275
-
276
-
277
- class NomicBertPreTrainedModel(PreTrainedModel):
278
- """An abstract class to handle weights initialization and
279
- a simple interface for downloading and loading pretrained models.
280
- """
281
- config_class = NomicBertConfig
282
- base_model_prefix = "model"
283
- supports_gradient_checkpointing = True
284
- _no_split_modules = ["Block"]
285
- _skip_keys_device_placement = "past_key_values"
286
-
287
- def __init__(self, config, *inputs, **kwargs):
288
- super().__init__(config)
289
- if not isinstance(config, GPT2Config):
290
- raise ValueError(
291
- "Parameter config in `{}(config)` should be an instance of class `GPT2Config`. "
292
- "To create a model from a Google pretrained model use "
293
- "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
294
- self.__class__.__name__, self.__class__.__name__
295
- )
296
- )
297
- self.config = config
298
-
299
- @classmethod
300
- def from_pretrained(cls, model_name, config=None, *inputs, **kwargs):
301
- """
302
- Instantiate a NomicBertPreTrainedModel from a pre-trained model file or a pytorch state dict.
303
- Download and cache the pre-trained model file if needed.
304
-
305
- Params:
306
- pretrained_model_name_or_path: either:
307
- - a path or url to a pretrained model archive containing:
308
- . `bert_config.json` a configuration file for the model
309
- . `pytorch_model.bin` a PyTorch dump of a NomicBertForPretraining instance
310
- - a path or url to a pretrained model archive containing:
311
- . `bert_config.json` a configuration file for the model
312
- . `model.chkpt` a TensorFlow checkpoint
313
- *inputs, **kwargs: additional input for the specific NomicBert class
314
- (ex: num_labels for NomicBertForSequenceClassification)
315
- """
316
- # Instantiate model.
317
- if config is None:
318
- config = cls.config_class.from_pretrained(model_name)
319
- remove_cls = cls != NomicBertForPreTraining
320
- remove_bert_prefix = cls != NomicBertForPreTraining
321
- ignore_mismatched_shapes = kwargs.pop("ignore_mismatched_sizes", False)
322
- num_labels = kwargs.pop("num_labels", None)
323
- rotary_scaling_factor = kwargs.pop("rotary_scaling_factor", None)
324
- if rotary_scaling_factor:
325
- config.rotary_scaling_factor = rotary_scaling_factor
326
- else:
327
- config.rotary_scaling_factor = None
328
- if config.n_positions <= 0 and config.rotary_emb_fraction > 0:
329
- config.n_positions = 2048
330
- if num_labels:
331
- config.num_labels = num_labels
332
-
333
- if "add_pooling_layer" in kwargs:
334
- model = cls(config, *inputs, add_pooling_layer=kwargs.pop("add_pooling_layer"))
335
- else:
336
- if cls == NomicBertModel:
337
- model = cls(config, *inputs, add_pooling_layer=False)
338
- else:
339
- model = cls(config, *inputs)
340
- # TODO: fix this
341
- # Assuming we know what we're doing when loading from disk
342
- # Prob a bad assumption but i'm tired and want to train this asap
343
- if os.path.exists(model_name):
344
- state_dict = torch.load(f"{model_name}/pytorch_model.bin")
345
- if ignore_mismatched_shapes:
346
- state_dict = filter_shapes(state_dict, model)
347
- load_return = model.load_state_dict(state_dict, strict=False)
348
- else:
349
- # TODO: can probably check config class and see if we need to remap from a bert model
350
- state_dict = state_dict_from_pretrained(model_name)
351
- state_dict = remap_bert_state_dict(state_dict,
352
- config,
353
- remove_bert=remove_bert_prefix,
354
- remove_cls_weights=remove_cls,
355
- add_pooling_layer=getattr(config, "add_pooling_layer", False)
356
- )
357
- if ignore_mismatched_shapes:
358
- state_dict = filter_shapes(state_dict, model)
359
-
360
- load_return = model.load_state_dict(
361
- state_dict,
362
- strict=True
363
- )
364
- logger.warning(load_return)
365
- return model
366
-
367
- def _set_gradient_checkpointing(self, module, value=False):
368
- if isinstance(module, NomicBertEncoder):
369
- module.gradient_checkpointing = value
370
-
371
-
372
- # https://github.com/huggingface/transformers/blob/7032e0203262ebb2ebf55da8d2e01f873973e835/src/transformers/models/bert/modeling_bert.py#L748
373
- def _init_weights(module, initializer_range=0.02):
374
- if isinstance(module, nn.Linear):
375
- nn.init.normal_(module.weight, std=initializer_range)
376
- if module.bias is not None:
377
- nn.init.zeros_(module.bias)
378
- elif isinstance(module, nn.Embedding):
379
- nn.init.normal_(module.weight, std=initializer_range)
380
- if module.padding_idx is not None:
381
- nn.init.zeros_(module.weight[module.padding_idx])
382
-
383
-
384
- class NomicBertEmbeddings(nn.Module):
385
- def __init__(
386
- self,
387
- config
388
- ):
389
- """
390
- If max_position_embeddings <= 0, there's no position embeddings
391
- If type_vocab_size <= 0, there's no token type embeddings
392
- """
393
- super().__init__()
394
- self.word_embeddings = nn.Embedding(
395
- config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id
396
- )
397
- self.max_position_embeddings = config.max_position_embeddings if config.rotary_emb_fraction <= 0 else 0
398
- self.type_vocab_size = config.type_vocab_size
399
- if self.max_position_embeddings > 0 and config.rotary_emb_fraction <= 0:
400
- self.position_embeddings = nn.Embedding(
401
- config.max_position_embeddings, config.hidden_size,
402
- )
403
- if self.type_vocab_size > 0:
404
- self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
405
-
406
- def forward(self, input_ids, position_ids=None, token_type_ids=None):
407
- """
408
- input_ids: (batch, seqlen)
409
- position_ids: (batch, seqlen)
410
- token_type_ids: (batch, seqlen)
411
- """
412
- batch_size, seqlen = input_ids.shape
413
- embeddings = self.word_embeddings(input_ids)
414
-
415
- if self.type_vocab_size > 0:
416
- if token_type_ids is None:
417
- token_type_ids = torch.zeros(seqlen, dtype=torch.long, device=input_ids.device)
418
- token_type_embeddings = self.token_type_embeddings(token_type_ids)
419
- embeddings = embeddings + token_type_embeddings
420
-
421
- if self.max_position_embeddings > 0:
422
- if position_ids is None:
423
- position_ids = torch.arange(seqlen, dtype=torch.long, device=input_ids.device)
424
- position_embeddings = self.position_embeddings(position_ids)
425
- embeddings = embeddings + position_embeddings
426
- return embeddings
427
-
428
- class NomicBertMLP(nn.Module):
429
- def __init__(
430
- self,
431
- in_features,
432
- hidden_features=None,
433
- out_features=None,
434
- activation=F.gelu,
435
- bias1=True,
436
- bias2=True,
437
- return_residual=False,
438
- fused_bias_fc=False,
439
- ):
440
- super().__init__()
441
- out_features = out_features if out_features is not None else in_features
442
- hidden_features = hidden_features if hidden_features is not None else in_features * 4
443
- self.return_residual = return_residual
444
- self.fc1 = nn.Linear(in_features, hidden_features, bias=bias1)
445
- approximate = (
446
- "tanh"
447
- if activation in ["gelu_new", "gelu_fast", "gelu_pytorch_tanh"]
448
- else "none"
449
- )
450
- self.activation = nn.GELU(approximate=approximate) if activation == "gelu" else activation
451
- self.fc2 = nn.Linear(hidden_features, out_features, bias=bias2)
452
-
453
- def forward(self, x):
454
- y = self.fc1(x)
455
- y = self.activation(y)
456
- y = self.fc2(y)
457
- return y if not self.return_residual else (y, x)
458
-
459
-
460
- class NomciBertGatedMLP(nn.Module):
461
- def __init__(
462
- self,
463
- in_features,
464
- hidden_features=None,
465
- out_features=None,
466
- activation=F.sigmoid,
467
- bias1=True,
468
- bias2=True,
469
- multiple_of=256,
470
- return_residual=False,
471
- fused_bias_fc=True,
472
- device=None,
473
- dtype=None,
474
- ):
475
- super().__init__()
476
- out_features = out_features if out_features is not None else in_features
477
- hidden_features = (
478
- hidden_features if hidden_features is not None else int(8 * in_features / 3)
479
- )
480
- hidden_features = (hidden_features + multiple_of - 1) // multiple_of * multiple_of
481
- self.return_residual = return_residual
482
-
483
- self.fc11 = nn.Linear(in_features, hidden_features, bias=bias1)
484
- self.fc12 = nn.Linear(in_features, hidden_features, bias=bias1)
485
- self.activation = activation
486
- self.fc2 = nn.Linear(hidden_features, out_features, bias=bias2)
487
-
488
- def forward(self, x):
489
- y = self.fc11(x)
490
- gate = self.fc12(x)
491
- if self.activation == F.sigmoid: # Special case for GLU
492
- y = F.glu(torch.cat([y, gate], dim=-1), dim=-1)
493
- else:
494
- y = y * self.activation(gate)
495
- y = self.fc2(y)
496
- return y if not self.return_residual else (y, x)
497
-
498
-
499
- def rotate_half(x, interleaved=False):
500
- if not interleaved:
501
- x1, x2 = x.chunk(2, dim=-1)
502
- return torch.cat((-x2, x1), dim=-1)
503
- else:
504
- x1, x2 = x[..., ::2], x[..., 1::2]
505
- return rearrange(torch.stack((-x2, x1), dim=-1), "... d two -> ... (d two)", two=2)
506
-
507
-
508
- def apply_rotary_emb(x, cos, sin, offset=0, interleaved=False):
509
- """
510
- x: (batch_size, seqlen, nheads, headdim)
511
- cos, sin: (seqlen, rotary_dim / 2) or (batch_size, seqlen, rotary_dim / 2)
512
- """
513
- ro_dim = cos.shape[-1] * 2
514
- assert ro_dim <= x.shape[-1]
515
- cos, sin = (
516
- cos[offset: offset + x.shape[1]],
517
- sin[offset: offset + x.shape[1]],
518
- )
519
- cos = repeat(cos, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)")
520
- sin = repeat(sin, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)")
521
- return torch.cat(
522
- [x[..., :ro_dim] * cos + rotate_half(x[..., :ro_dim], interleaved) * sin, x[..., ro_dim:]],
523
- dim=-1,
524
- )
525
-
526
-
527
- class NomicBertRotaryEmbedding(nn.Module):
528
- def __init__(
529
- self,
530
- dim: int,
531
- base=10000.0,
532
- interleaved=False,
533
- scale_base=None,
534
- pos_idx_in_fp32=True,
535
- device=None,
536
- ):
537
- """
538
- interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
539
- of 1st half and 2nd half (GPT-NeoX style).
540
- pos_idx_in_fp32: if True, the position indices [0.0, ..., seqlen - 1] are in fp32,
541
- otherwise they might be in lower precision.
542
- This option was added because previously (before 2023-07-02), when we construct
543
- the position indices, we use the dtype of self.inv_freq. In most cases this would
544
- be fp32, but if the model is trained in pure bf16 (not mixed precision), then
545
- self.inv_freq would be bf16, and the position indices are also in bf16.
546
- Because of the limited precision of bf16 (e.g. 1995.0 is rounded to 2000.0), the
547
- embeddings for some positions will coincide.
548
- To maintain compatibility with models previously trained in pure bf16,
549
- we add this option.
550
- """
551
- super().__init__()
552
- self.dim = dim
553
- self.base = float(base)
554
- self.pos_idx_in_fp32 = pos_idx_in_fp32
555
- # Generate and save the inverse frequency buffer (non trainable)
556
- inv_freq = self._compute_inv_freq(device)
557
- self.register_buffer("inv_freq", inv_freq, persistent=False)
558
- self.interleaved = interleaved
559
- self.scale_base = scale_base
560
- scale = (
561
- (torch.arange(0, dim, 2, device=device, dtype=torch.float32) + 0.4 * dim) / (1.4 * dim)
562
- if scale_base is not None
563
- else None
564
- )
565
- self.register_buffer("scale", scale, persistent=False)
566
-
567
- self._seq_len_cached = 0
568
- self._cos_cached = None
569
- self._sin_cached = None
570
- self._cos_k_cached = None
571
- self._sin_k_cached = None
572
-
573
- def _compute_inv_freq(self, device=None):
574
- return 1.0 / (
575
- self.base
576
- ** (torch.arange(0, self.dim, 2, device=device, dtype=torch.float32) / self.dim)
577
- )
578
-
579
- def _update_cos_sin_cache(self, seqlen, device=None, dtype=None):
580
- # Reset the tables if the sequence length has changed,
581
- # if we're on a new device (possibly due to tracing for instance),
582
- # or if we're switching from inference mode to training
583
- if (
584
- seqlen > self._seq_len_cached
585
- or self._cos_cached is None
586
- or self._cos_cached.device != device
587
- or self._cos_cached.dtype != dtype
588
- or (self.training and self._cos_cached.is_inference())
589
- ):
590
- self._seq_len_cached = seqlen
591
- # We want fp32 here, not self.inv_freq.dtype, since the model could be loaded in bf16
592
- # And the output of arange can be quite large, so bf16 would lose a lot of precision.
593
- # However, for compatibility reason, we add an option to use the dtype of self.inv_freq.
594
- if self.pos_idx_in_fp32:
595
- t = torch.arange(seqlen, device=device, dtype=torch.float32)
596
- # We want fp32 here as well since inv_freq will be multiplied with t, and the output
597
- # will be large. Having it in bf16 will lose a lot of precision and cause the
598
- # cos & sin output to change significantly.
599
- # We want to recompute self.inv_freq if it was not loaded in fp32
600
- if self.inv_freq.dtype != torch.float32:
601
- inv_freq = self._compute_inv_freq(device=device)
602
- else:
603
- inv_freq = self.inv_freq
604
- else:
605
- t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
606
- inv_freq = self.inv_freq
607
- # Don't do einsum, it converts fp32 to fp16 under AMP
608
- # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
609
- freqs = torch.outer(t, inv_freq)
610
- self._cos_cached = torch.cos(freqs).to(dtype)
611
- self._sin_cached = torch.sin(freqs).to(dtype)
612
-
613
- def forward(
614
- self,
615
- qkv: torch.Tensor,
616
- kv: Optional[torch.Tensor] = None,
617
- seqlen_offset: Union[int, torch.Tensor] = 0,
618
- max_seqlen: Optional[int] = None,
619
- ) -> Tuple[torch.Tensor, torch.Tensor]:
620
- """
621
- qkv: (batch, seqlen, 3, nheads, headdim) if kv is none,
622
- else it's just q of shape (batch, seqlen, nheads, headdim)
623
- kv: (batch, seqlen, 2, nheads, headdim)
624
- seqlen_offset: (batch_size,) or int. Each sequence in x is shifted by this amount.
625
- Most commonly used in inference when we have KV cache.
626
- If it's a tensor of shape (batch_size,), then to update the cos / sin cache, one
627
- should pass in max_seqlen, which will update the cos / sin cache up to that length.
628
- Apply rotary embedding *inplace* to qkv and / or kv.
629
- """
630
- seqlen = qkv.shape[1]
631
- if seqlen > self._seq_len_cached:
632
- self._update_cos_sin_cache(seqlen, device=qkv.device, dtype=qkv.dtype)
633
- elif max_seqlen is not None:
634
- self._update_cos_sin_cache(max_seqlen, device=qkv.device, dtype=qkv.dtype)
635
- elif isinstance(seqlen_offset, int):
636
- self._update_cos_sin_cache(seqlen + seqlen_offset, device=qkv.device, dtype=qkv.dtype)
637
-
638
- q_rot = apply_rotary_emb(qkv[:, :, 0], self._cos_cached, self._sin_cached, seqlen_offset, self.interleaved)
639
- k_rot = apply_rotary_emb(qkv[:, :, 1], self._cos_cached, self._sin_cached, seqlen_offset, self.interleaved)
640
- return torch.stack((q_rot, k_rot, qkv[:, :, 2]), dim=2)
641
-
642
-
643
- class NomicBertDynamicNTKRotaryEmbedding(NomicBertRotaryEmbedding):
644
- def __init__(self, rotary_scaling_factor, max_position_embeddings, **kwargs):
645
- super().__init__(**kwargs)
646
- self.rotary_scaling_factor = rotary_scaling_factor
647
- self.max_position_embeddings = max_position_embeddings
648
-
649
-
650
- def _compute_inv_freq(self, base=None, device=None):
651
- if base is None:
652
- base = self.base
653
- return 1.0 / (
654
- base
655
- ** (torch.arange(0, self.dim, 2, device=device, dtype=torch.float32) / self.dim)
656
- )
657
-
658
- def _update_cos_sin_cache(self, seqlen, device=None, dtype=None):
659
- # Reset the tables if the sequence length has changed,
660
- # if we're on a new device (possibly due to tracing for instance),
661
- # or if we're switching from inference mode to training
662
- if seqlen > self.max_position_embeddings:
663
- base = self.base * (
664
- (self.rotary_scaling_factor * seqlen / self.max_position_embeddings) - (self.rotary_scaling_factor - 1)
665
- ) ** (self.dim / (self.dim - 2))
666
- inv_freq = self._compute_inv_freq(base=base, device=device)
667
- self.register_buffer("inv_freq", inv_freq, persistent=False)
668
-
669
- if (
670
- seqlen > self._seq_len_cached
671
- or self._cos_cached is None
672
- or self._cos_cached.device != device
673
- or self._cos_cached.dtype != dtype
674
- or (self.training and self._cos_cached.is_inference())
675
- ):
676
- self._seq_len_cached = seqlen
677
- # We want fp32 here, not self.inv_freq.dtype, since the model could be loaded in bf16
678
- # And the output of arange can be quite large, so bf16 would lose a lot of precision.
679
- # However, for compatibility reason, we add an option to use the dtype of self.inv_freq.
680
- if self.pos_idx_in_fp32:
681
- t = torch.arange(seqlen, device=device, dtype=torch.float32)
682
- # We want fp32 here as well since inv_freq will be multiplied with t, and the output
683
- # will be large. Having it in bf16 will lose a lot of precision and cause the
684
- # cos & sin output to change significantly.
685
- # We want to recompute self.inv_freq if it was not loaded in fp32
686
- if self.inv_freq.dtype != torch.float32:
687
- if seqlen > self.max_position_embeddings:
688
- base = self.base * (
689
- (self.scaling_factor * seqlen / self.max_position_embeddings) - (self.scaling_factor - 1)
690
- ) ** (self.dim / (self.dim - 2))
691
- else:
692
- base = self.base
693
- inv_freq = self._compute_inv_freq(device=device, base=base)
694
- else:
695
- inv_freq = self.inv_freq
696
- else:
697
- t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
698
- inv_freq = self.inv_freq
699
- # Don't do einsum, it converts fp32 to fp16 under AMP
700
- # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
701
- freqs = torch.outer(t, inv_freq)
702
- if self.scale is None:
703
- self._cos_cached = torch.cos(freqs).to(dtype)
704
- self._sin_cached = torch.sin(freqs).to(dtype)
705
- else:
706
- power = (
707
- torch.arange(seqlen, dtype=self.scale.dtype, device=self.scale.device)
708
- - seqlen // 2
709
- ) / self.scale_base
710
- scale = self.scale.to(device=power.device) ** rearrange(power, "s -> s 1")
711
- # We want the multiplication by scale to happen in fp32
712
- self._cos_cached = (torch.cos(freqs) * scale).to(dtype)
713
- self._sin_cached = (torch.sin(freqs) * scale).to(dtype)
714
- self._cos_k_cached = (torch.cos(freqs) / scale).to(dtype)
715
- self._sin_k_cached = (torch.sin(freqs) / scale).to(dtype)
716
-
717
- class NomicBertAttention(nn.Module):
718
- """Multi-head self-attention and cross-attention"""
719
-
720
- def __init__(
721
- self,
722
- config,
723
- ) -> None:
724
- """
725
- num_heads_kv: can be used to toggle MQA / GQA. If None, use num_heads.
726
- return_residual: whether to return the input x along with the output. This is for
727
- performance reason: for post-norm architecture, returning the input allows us
728
- to fuse the backward of nn.Linear with the residual connection.
729
- """
730
- super().__init__()
731
- self.embed_dim = config.n_embd
732
- self.use_flash_attn = config.use_flash_attn
733
- self.fused_bias_fc = config.fused_bias_fc
734
-
735
- self.num_heads = config.n_head
736
- self.num_heads_kv = config.num_heads_kv if getattr(config, "num_heads_kv", None) is not None else self.num_heads
737
- assert self.embed_dim % self.num_heads == 0, "embed_dim must be divisible by num_heads"
738
- self.head_dim = self.embed_dim // self.num_heads
739
- # we don't really support mqa / gqa for now
740
- qkv_dim = self.head_dim * (self.num_heads + 2 * self.num_heads_kv)
741
-
742
- self.register_buffer(
743
- "norm_factor",
744
- torch.sqrt(torch.tensor(self.head_dim, dtype=torch.float32)).to(torch.get_default_dtype()),
745
- persistent=False,
746
- )
747
-
748
- self.rotary_emb_dim = self.head_dim * config.rotary_emb_fraction
749
- if self.rotary_emb_dim > 0:
750
- if config.rotary_scaling_factor:
751
- self.rotary_emb = NomicBertDynamicNTKRotaryEmbedding(
752
- dim=self.rotary_emb_dim,
753
- base=config.rotary_emb_base,
754
- scale_base=config.rotary_emb_scale_base,
755
- interleaved=config.rotary_emb_interleaved,
756
- rotary_scaling_factor=config.rotary_scaling_factor,
757
- max_position_embeddings=config.n_positions,
758
- )
759
- else:
760
- self.rotary_emb = NomicBertRotaryEmbedding(
761
- dim=self.rotary_emb_dim,
762
- base=config.rotary_emb_base,
763
- scale_base=config.rotary_emb_scale_base,
764
- interleaved=config.rotary_emb_interleaved,
765
- )
766
- # bug in xformers: https://github.com/facebookresearch/xformers/issues/841
767
- # uses the head dimension instead of the sequence dimension
768
- self.rotary_head_dim = getattr(config, "rotary_head_dim", False)
769
-
770
- self.Wqkv = nn.Linear(self.embed_dim, qkv_dim, bias=config.qkv_proj_bias)
771
-
772
- self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_proj_bias)
773
- self.causal = config.causal
774
- self.drop = nn.Dropout(config.attn_pdrop)
775
-
776
- def forward(
777
- self,
778
- hidden_states: torch.Tensor,
779
- attention_mask: Optional[torch.Tensor] = None,
780
- position_ids: Optional[torch.LongTensor] = None,
781
- past_key_value: Optional[Tuple[torch.Tensor]] = None,
782
- output_attentions: bool = False,
783
- use_cache: bool = False,
784
- is_padded_inputs: Optional[bool] = True,
785
- cu_seqlens: Optional[torch.Tensor] = None,
786
- max_seq_len: Optional[int] = None,
787
- ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
788
-
789
- has_layer_past = past_key_value is not None
790
-
791
- if has_layer_past:
792
- past_key_value = past_key_value[0]
793
- past_len = past_key_value[1]
794
- else:
795
- past_len = 0
796
-
797
- qkv = self.Wqkv(hidden_states)
798
- qkv = rearrange(qkv, "... (three h d) -> ... three h d", three=3, d=self.head_dim)
799
-
800
- past_key_value = (past_key_value, past_len + qkv.size(1)) if use_cache else None
801
-
802
- if self.rotary_emb_dim > 0:
803
- if self.rotary_head_dim:
804
- qkv = rearrange(qkv, "b s three h d -> b h three s d")
805
- qkv = self.rotary_emb(qkv, seqlen_offset=past_len)
806
-
807
- if self.rotary_head_dim:
808
- qkv = rearrange(qkv, "b h three s d -> b s three h d")
809
-
810
- query, key, value = qkv[:, :, 0], qkv[:, :, 1], qkv[:, :, 2]
811
-
812
- query = query.permute(0, 2, 1, 3)
813
- key = key.permute(0, 2, 1, 3)
814
- value = value.permute(0, 2, 1, 3)
815
-
816
- attention_scores = torch.matmul(query, key.transpose(-1, -2)) / self.norm_factor
817
- if attention_mask is not None:
818
- attention_scores = attention_scores + attention_mask
819
-
820
- attentions_probs = F.softmax(attention_scores, dim=-1)
821
- attentions_probs = self.drop(attentions_probs)
822
-
823
- attn_output = torch.matmul(attentions_probs, value)
824
- attn_output = rearrange(attn_output.permute(0, 2, 1, 3), "... h d -> ... (h d)")
825
-
826
- attn_output = self.out_proj(attn_output)
827
-
828
- return attn_output
829
-
830
-
831
- class NomicBertBlock(nn.Module):
832
- def __init__(
833
- self,
834
- config,
835
- ):
836
- super().__init__()
837
- self.prenorm = config.prenorm
838
- self.fused_dropout_add_ln = config.fused_dropout_add_ln
839
-
840
- self.attn = NomicBertAttention(config)
841
- activation = (
842
- F.sigmoid
843
- if config.activation_function == "glu"
844
- else (F.silu if config.activation_function == "swiglu" else F.gelu)
845
- )
846
- if config.activation_function in ["glu", "swiglu", "geglu"]:
847
- self.mlp = NomciBertGatedMLP(config.n_embd, hidden_features=config.n_inner, bias1=config.mlp_fc1_bias, bias2=config.mlp_fc2_bias, activation=activation, fused_bias_fc=config.fused_bias_fc)
848
- else:
849
- self.mlp = NomicBertMLP(config.n_embd, hidden_features=config.n_inner, bias1=config.mlp_fc1_bias, bias2=config.mlp_fc2_bias, activation=activation, fused_bias_fc=config.fused_bias_fc)
850
-
851
- self.dropout1 = nn.Dropout(config.resid_pdrop)
852
- self.norm1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
853
- self.norm2 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
854
- self.dropout2 = nn.Dropout(config.resid_pdrop)
855
-
856
- def forward(
857
- self,
858
- hidden_states: torch.Tensor,
859
- hidden_states2: torch.Tensor,
860
- residual: Optional[torch.Tensor] = None,
861
- attention_mask: Optional[torch.Tensor] = None,
862
- position_ids: Optional[torch.LongTensor] = None,
863
- past_key_value: Optional[Tuple[torch.Tensor]] = None,
864
- is_padded_inputs: Optional[bool] = True,
865
- output_attentions: Optional[bool] = False,
866
- use_cache: Optional[bool] = False,
867
- cu_seqlens: Optional[torch.Tensor] = None,
868
- max_seq_len: Optional[int] = None,
869
- ):
870
- r"""Pass the input through the encoder layer.
871
-
872
- Args:
873
- hidden_states: the sequence to the encoder layer (required).
874
- residual: if postnorm, residual=None, If prenorm, hidden_states = Attn/MLP(LN(residual))
875
- mixer_subset: for cross-attention only. If not None, will take a subset of x
876
- before applying the query projection. Useful for e.g., ViT where we only care
877
- about the CLS token in the last layer.
878
- """
879
- if self.prenorm:
880
- dropped = self.dropout1(hidden_states)
881
- residual = (dropped + residual) if residual is not None else dropped
882
- hidden_states = self.norm1(residual.to(dtype=self.norm1.weight.dtype))
883
- hidden_states = self.attn(hidden_states, attention_mask=attention_mask, is_padded_inputs=is_padded_inputs, cu_seqlens=cu_seqlens, max_seq_len=max_seq_len)
884
-
885
- dropped = self.dropout2(hidden_states)
886
- residual = (dropped + residual) if residual is not None else dropped
887
- hidden_states = self.norm2(residual.to(dtype=self.norm2.weight.dtype))
888
- hidden_states = self.mlp(hidden_states)
889
-
890
- return hidden_states, None, residual
891
- else:
892
- assert residual is None
893
- attn_outputs = self.attn(hidden_states,
894
- attention_mask=attention_mask,
895
- is_padded_inputs=is_padded_inputs,
896
- cu_seqlens=cu_seqlens,
897
- max_seq_len=max_seq_len)
898
- hidden_states = self.norm1(
899
- (self.dropout1(attn_outputs) + hidden_states).to(
900
- dtype=self.norm1.weight.dtype
901
- )
902
- )
903
- mlp_out = self.mlp(hidden_states)
904
-
905
- hidden_states = self.norm2(
906
- (self.dropout2(mlp_out) + hidden_states).to(
907
- dtype=self.norm2.weight.dtype
908
- )
909
- )
910
- return hidden_states, None, None
911
-
912
-
913
- class NomicBertEncoder(nn.Module):
914
- def __init__(self, config: GPT2Config):
915
- super().__init__()
916
- self.layers = nn.ModuleList(
917
- [NomicBertBlock(config) for _ in range(config.n_layer)]
918
- )
919
- self.gradient_checkpointing = False
920
- self.config = config
921
-
922
- def forward(self,
923
- hidden_states: torch.LongTensor = None,
924
- attention_mask: Optional[torch.Tensor] = None,
925
- position_ids: Optional[torch.LongTensor] = None,
926
- past_key_values: Optional[List[torch.FloatTensor]] = None,
927
- inputs_embeds: Optional[torch.FloatTensor] = None,
928
- use_cache: Optional[bool] = None,
929
- output_attentions: Optional[bool] = None,
930
- output_hidden_states: Optional[bool] = None,
931
- return_dict: Optional[bool] = None,
932
- is_padded_inputs: Optional[bool] = True,):
933
-
934
- """If subset_mask is not None, we only want output for the subset of the sequence.
935
- This means that we only compute the last layer output for these tokens.
936
- subset_mask: (batch, seqlen), dtype=torch.bool
937
- """
938
- hidden_states2 = None
939
- residual = None
940
-
941
-
942
- for _, layer in enumerate(self.layers):
943
- if self.gradient_checkpointing and self.training:
944
-
945
- def create_custom_forward(module):
946
- def custom_forward(*inputs):
947
- # None for past_key_value
948
- return module(*inputs)
949
-
950
- return custom_forward
951
-
952
- hidden_states, hidden_states2, residual = torch.utils.checkpoint.checkpoint(
953
- create_custom_forward(layer),
954
- hidden_states,
955
- hidden_states2,
956
- residual,
957
- attention_mask,
958
- None,
959
- None,
960
- is_padded_inputs,
961
- # if you freeze ANY layers, you need `use_reentrant=False`
962
- # https://github.com/huggingface/transformers/issues/21381
963
- # https://discuss.pytorch.org/t/checkpoint-with-no-grad-requiring-inputs-problem/19117/7
964
- use_reentrant=False,
965
- )
966
-
967
- else:
968
- hidden_states, hidden_states2, residual = layer(
969
- hidden_states,
970
- hidden_states2,
971
- residual,
972
- attention_mask,
973
- position_ids,
974
- None,
975
- is_padded_inputs,
976
- output_attentions,
977
- use_cache,
978
- )
979
- return hidden_states
980
-
981
-
982
- class NomicBertPooler(nn.Module):
983
- def __init__(self, config):
984
- super().__init__()
985
- self.dense = nn.Linear(config.n_embd, config.n_embd)
986
- self.activation = nn.Tanh()
987
-
988
- def forward(self, hidden_states, pool=True):
989
- # We "pool" the model by simply taking the hidden state corresponding
990
- # to the first token.
991
- first_token_tensor = hidden_states[:, 0] if pool else hidden_states
992
- pooled_output = self.dense(first_token_tensor)
993
- pooled_output = self.activation(pooled_output)
994
- return pooled_output
995
-
996
-
997
- class NomicBertPredictionHeadTransform(nn.Module):
998
- def __init__(self, config):
999
- super().__init__()
1000
- self.dense = nn.Linear(config.n_embd, config.n_embd, bias=config.mlp_fc1_bias)
1001
- approximate = (
1002
- "tanh"
1003
- if config.activation_function in ["gelu_new", "gelu_fast", "gelu_pytorch_tanh"]
1004
- else "none"
1005
- )
1006
- if config.activation_function == "swiglu":
1007
- self.transform_act_fn = F.silu
1008
- else:
1009
- self.transform_act_fn = nn.GELU(approximate=approximate)
1010
-
1011
- self.layer_norm = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
1012
-
1013
- def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
1014
- hidden_states = self.dense(hidden_states)
1015
- hidden_states = self.transform_act_fn(hidden_states)
1016
- hidden_states = self.layer_norm(hidden_states)
1017
-
1018
- return hidden_states
1019
-
1020
-
1021
- class NomicBertLMPredictionHead(nn.Module):
1022
- def __init__(self, config):
1023
- super().__init__()
1024
-
1025
- self.transform = NomicBertPredictionHeadTransform(config)
1026
-
1027
- self.decoder = nn.Linear(config.n_embd, config.vocab_size, bias=config.mlp_fc1_bias)
1028
-
1029
- def forward(self, hidden_states):
1030
- hidden_states = self.transform(hidden_states)
1031
- hidden_states = self.decoder(hidden_states)
1032
- return hidden_states
1033
-
1034
-
1035
- class NomicBertPreTrainingHeads(nn.Module):
1036
- def __init__(self, config):
1037
- super().__init__()
1038
- self.predictions = NomicBertLMPredictionHead(config)
1039
-
1040
- def forward(self, sequence_output):
1041
- prediction_scores = self.predictions(sequence_output)
1042
- return prediction_scores
1043
-
1044
-
1045
- class NomicBertModel(NomicBertPreTrainedModel):
1046
- def __init__(self, config: GPT2Config, add_pooling_layer=True):
1047
- super().__init__(config)
1048
- self.pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
1049
- if config.vocab_size % self.pad_vocab_size_multiple != 0:
1050
- config.vocab_size += self.pad_vocab_size_multiple - (
1051
- config.vocab_size % self.pad_vocab_size_multiple
1052
- )
1053
-
1054
- assert config.activation_function in ["gelu", "gelu_new", "gelu_fast", "gelu_pytorch_tanh", "swiglu", "geglu", "glu"]
1055
-
1056
- self.embeddings = NomicBertEmbeddings(
1057
- config
1058
- )
1059
- self.emb_drop = nn.Dropout(config.resid_pdrop)
1060
- self.emb_ln = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
1061
- self.encoder = NomicBertEncoder(config)
1062
- self.pooler = NomicBertPooler(config) if add_pooling_layer else None
1063
-
1064
- self.apply(partial(_init_weights, initializer_range=config.initializer_range))
1065
-
1066
- def forward(
1067
- self,
1068
- input_ids,
1069
- position_ids=None,
1070
- token_type_ids=None,
1071
- attention_mask=None,
1072
- return_dict=None,
1073
- ):
1074
- if token_type_ids is None:
1075
- token_type_ids = torch.zeros_like(input_ids)
1076
- hidden_states = self.embeddings(
1077
- input_ids, position_ids=position_ids, token_type_ids=token_type_ids
1078
- )
1079
- hidden_states = self.emb_ln(hidden_states)
1080
- hidden_states = self.emb_drop(hidden_states)
1081
-
1082
- attention_mask = self.get_extended_attention_mask(attention_mask, input_ids.shape)
1083
- sequence_output = self.encoder(
1084
- hidden_states, attention_mask=attention_mask, return_dict=return_dict,
1085
- )
1086
-
1087
- pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
1088
-
1089
- return BaseModelOutputWithPoolingAndCrossAttentions(
1090
- last_hidden_state=sequence_output,
1091
- pooler_output=pooled_output,
1092
- )
1093
-
1094
-
1095
- class NomicBertForPreTraining(NomicBertPreTrainedModel):
1096
- _tied_weights_keys = ["predictions.decoder.bias", "cls.predictions.decoder.weight"]
1097
-
1098
- def __init__(self, config: GPT2Config):
1099
- super().__init__(config)
1100
-
1101
- self.bert = NomicBertModel(config, add_pooling_layer=getattr(config, "add_pooling_layer", False))
1102
- self.cls = NomicBertPreTrainingHeads(config)
1103
- self.mlm_loss = nn.CrossEntropyLoss()
1104
-
1105
- # Initialize weights and apply final processing
1106
- self.apply(partial(_init_weights, initializer_range=config.initializer_range))
1107
- self.tie_weights()
1108
-
1109
- def tie_weights(self):
1110
- self.cls.predictions.decoder.weight = self.bert.embeddings.word_embeddings.weight
1111
-
1112
- def forward(
1113
- self,
1114
- input_ids,
1115
- position_ids=None,
1116
- token_type_ids=None,
1117
- attention_mask=None,
1118
- labels=None,
1119
- ):
1120
- """
1121
- If labels are provided, they must be -100 for masked out tokens (as specified in the attention
1122
- mask).
1123
- Outputs:
1124
- if `labels` and `next_sentence_label` are not `None`:
1125
- Outputs the total_loss which is the sum of the masked language modeling loss and the next
1126
- sentence classification loss.
1127
- if `labels` or `next_sentence_label` is `None`:
1128
- Outputs a tuple comprising
1129
- - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and
1130
- - the next sentence classification logits of shape [batch_size, 2].
1131
-
1132
- """
1133
- outputs = self.bert(
1134
- input_ids,
1135
- position_ids=position_ids,
1136
- token_type_ids=token_type_ids,
1137
- attention_mask=attention_mask.bool() if attention_mask is not None else None,
1138
- )
1139
- sequence_output, _ = outputs.last_hidden_state, outputs.pooler_output
1140
-
1141
- prediction_scores = self.cls(sequence_output)
1142
-
1143
- total_loss = None
1144
- if labels is not None:
1145
- masked_lm_loss = self.mlm_loss(
1146
- rearrange(prediction_scores, "... v -> (...) v"),
1147
- rearrange(labels, "... -> (...)"),
1148
- )
1149
- total_loss = masked_lm_loss.float()
1150
-
1151
- return MaskedLMOutput(
1152
- loss=total_loss,
1153
- logits=prediction_scores,
1154
- hidden_states=outputs.hidden_states,
1155
- attentions=None,
1156
- )
1157
-
1158
-
1159
- class NomicBertForSequenceClassification(NomicBertPreTrainedModel):
1160
- def __init__(self, config):
1161
- super().__init__(config)
1162
- self.num_labels = config.num_labels
1163
- self.config = config
1164
-
1165
- self.bert = NomicBertModel(config)
1166
- classifier_dropout = (
1167
- getattr(config, "classifier_dropout", config.embd_pdrop)
1168
- )
1169
- self.dropout = nn.Dropout(classifier_dropout)
1170
- self.classifier = nn.Linear(config.n_embd, config.num_labels)
1171
-
1172
- # Initialize weights and apply final processing
1173
- self.post_init()
1174
-
1175
- def forward(
1176
- self,
1177
- input_ids: Optional[torch.Tensor] = None,
1178
- attention_mask: Optional[torch.Tensor] = None,
1179
- token_type_ids: Optional[torch.Tensor] = None,
1180
- position_ids: Optional[torch.Tensor] = None,
1181
- head_mask: Optional[torch.Tensor] = None,
1182
- inputs_embeds: Optional[torch.Tensor] = None,
1183
- labels: Optional[torch.Tensor] = None,
1184
- output_attentions: Optional[bool] = None,
1185
- output_hidden_states: Optional[bool] = None,
1186
- return_dict: Optional[bool] = None,
1187
- ):
1188
- r"""
1189
- labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1190
- Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1191
- config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1192
- `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1193
- """
1194
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1195
- outputs = self.bert(
1196
- input_ids,
1197
- position_ids=position_ids,
1198
- token_type_ids=token_type_ids,
1199
- attention_mask=attention_mask.bool() if attention_mask is not None else None,
1200
- )
1201
-
1202
- pooled_output = outputs[1]
1203
-
1204
- pooled_output = self.dropout(pooled_output)
1205
- logits = self.classifier(pooled_output)
1206
-
1207
- loss = None
1208
- if labels is not None:
1209
- if self.config.problem_type is None:
1210
- if self.num_labels == 1:
1211
- self.config.problem_type = "regression"
1212
- elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
1213
- self.config.problem_type = "single_label_classification"
1214
- else:
1215
- self.config.problem_type = "multi_label_classification"
1216
-
1217
- if self.config.problem_type == "regression":
1218
- loss_fct = nn.MSELoss()
1219
- if self.num_labels == 1:
1220
- loss = loss_fct(logits.squeeze(), labels.squeeze())
1221
- else:
1222
- loss = loss_fct(logits, labels)
1223
- elif self.config.problem_type == "single_label_classification":
1224
- loss_fct = nn.CrossEntropyLoss()
1225
- loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
1226
- elif self.config.problem_type == "multi_label_classification":
1227
- loss_fct = nn.BCEWithLogitsLoss()
1228
- loss = loss_fct(logits, labels)
1229
- if not return_dict:
1230
- output = (logits,) + outputs[2:]
1231
- return ((loss,) + output) if loss is not None else output
1232
-
1233
- return SequenceClassifierOutput(
1234
- loss=loss,
1235
- logits=logits,
1236
- hidden_states=outputs.hidden_states,
1237
- attentions=outputs.attentions,
1238
- )
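
The deleted `modeling_hf_nomic_bert.py` above implements the model's custom rotary position embeddings: `NomicBertRotaryEmbedding` caches cos/sin tables built from `torch.outer(t, inv_freq)`, and `NomicBertDynamicNTKRotaryEmbedding` additionally rescales the base when the sequence length exceeds `max_position_embeddings` (dynamic NTK scaling). Below is a minimal, self-contained sketch of that math for reference; `rotate_half` and `apply_rope` are illustrative helper names, not functions taken from the deleted file, and the sketch rotates the full head dimension rather than a configurable `rotary_emb_fraction`.

```python
import torch

def build_cos_sin(seqlen, dim, base=10000.0, rotary_scaling_factor=None, max_pos=None):
    # Dynamic-NTK adjustment as in NomicBertDynamicNTKRotaryEmbedding:
    # grow the base when seqlen exceeds the trained context length.
    if rotary_scaling_factor is not None and max_pos is not None and seqlen > max_pos:
        base = base * (
            (rotary_scaling_factor * seqlen / max_pos) - (rotary_scaling_factor - 1)
        ) ** (dim / (dim - 2))
    inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
    t = torch.arange(seqlen, dtype=torch.float32)   # positions kept in fp32
    freqs = torch.outer(t, inv_freq)                # (seqlen, dim / 2)
    return torch.cos(freqs), torch.sin(freqs)

def rotate_half(x):
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

def apply_rope(x, cos, sin):
    # x: (batch, seqlen, nheads, headdim), non-interleaved layout
    cos = torch.cat((cos, cos), dim=-1)[None, :, None, :]
    sin = torch.cat((sin, sin), dim=-1)[None, :, None, :]
    return x * cos + rotate_half(x) * sin

q = torch.randn(1, 16, 12, 64)
cos, sin = build_cos_sin(seqlen=16, dim=64)
print(apply_rope(q, cos, sin).shape)  # torch.Size([1, 16, 12, 64])
```
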
pretrained/nomic-ai/nomic-embed-text-v1/modules.json DELETED
@@ -1,20 +0,0 @@
1
- [
2
- {
3
- "idx": 0,
4
- "name": "0",
5
- "path": "",
6
- "type": "sentence_transformers.models.Transformer"
7
- },
8
- {
9
- "idx": 1,
10
- "name": "1",
11
- "path": "1_Pooling",
12
- "type": "sentence_transformers.models.Pooling"
13
- },
14
- {
15
- "idx": 2,
16
- "name": "2",
17
- "path": "2_Normalize",
18
- "type": "sentence_transformers.models.Normalize"
19
- }
20
- ]
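
`modules.json` wires the deleted checkpoint into a three-stage sentence-transformers pipeline: the Transformer backbone, a Pooling module (backed by the `1_Pooling` directory), and a final Normalize step. A rough equivalent of the last two stages, assuming mean pooling over non-padding tokens, looks like this:

```python
import torch
import torch.nn.functional as F

def pool_and_normalize(last_hidden_state, attention_mask):
    # Mean-pool token embeddings over non-padding positions, then L2-normalize.
    mask = attention_mask.unsqueeze(-1).to(last_hidden_state.dtype)  # (b, s, 1)
    summed = (last_hidden_state * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return F.normalize(summed / counts, p=2, dim=1)

hidden = torch.randn(2, 8, 768)
mask = torch.ones(2, 8, dtype=torch.long)
print(pool_and_normalize(hidden, mask).shape)  # torch.Size([2, 768])
```
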
pretrained/nomic-ai/nomic-embed-text-v1/onnx/model.onnx DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:347440e93b5ec979fdcf6041b72721aade7b9680c16022e2830db7115ff6fd9f
3
- size 547552426
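
The repository also carried ONNX exports: the full-precision `onnx/model.onnx` here and a quantized variant in the next section. A minimal sketch for inspecting such an export, assuming `onnxruntime` is installed and a local copy of the file still exists:

```python
import onnxruntime as ort

# Open the exported graph and list its inputs/outputs.
session = ort.InferenceSession("onnx/model.onnx", providers=["CPUExecutionProvider"])
print([inp.name for inp in session.get_inputs()])   # e.g. input_ids, attention_mask, ...
print([out.name for out in session.get_outputs()])
```
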
pretrained/nomic-ai/nomic-embed-text-v1/onnx/model_quantized.onnx DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b7941066a6529a287e2502ea6cb68ff82006d311eac53627dc88c259cbcbda64
3
- size 138355983
pretrained/nomic-ai/nomic-embed-text-v1/pytorch_model.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9fc78c00133aac4e12f358cfe9546e893cb82bb9bb7956506fbbcaa1700ce17c
3
- size 546961866
pretrained/nomic-ai/nomic-embed-text-v1/sentence_bert_config.json DELETED
@@ -1,4 +0,0 @@
1
- {
2
- "max_seq_length": 8192,
3
- "do_lower_case": false
4
- }
pretrained/nomic-ai/nomic-embed-text-v1/special_tokens_map.json DELETED
@@ -1,7 +0,0 @@
1
- {
2
- "cls_token": "[CLS]",
3
- "mask_token": "[MASK]",
4
- "pad_token": "[PAD]",
5
- "sep_token": "[SEP]",
6
- "unk_token": "[UNK]"
7
- }
pretrained/nomic-ai/nomic-embed-text-v1/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
pretrained/nomic-ai/nomic-embed-text-v1/tokenizer_config.json DELETED
@@ -1,55 +0,0 @@
1
- {
2
- "added_tokens_decoder": {
3
- "0": {
4
- "content": "[PAD]",
5
- "lstrip": false,
6
- "normalized": false,
7
- "rstrip": false,
8
- "single_word": false,
9
- "special": true
10
- },
11
- "100": {
12
- "content": "[UNK]",
13
- "lstrip": false,
14
- "normalized": false,
15
- "rstrip": false,
16
- "single_word": false,
17
- "special": true
18
- },
19
- "101": {
20
- "content": "[CLS]",
21
- "lstrip": false,
22
- "normalized": false,
23
- "rstrip": false,
24
- "single_word": false,
25
- "special": true
26
- },
27
- "102": {
28
- "content": "[SEP]",
29
- "lstrip": false,
30
- "normalized": false,
31
- "rstrip": false,
32
- "single_word": false,
33
- "special": true
34
- },
35
- "103": {
36
- "content": "[MASK]",
37
- "lstrip": false,
38
- "normalized": false,
39
- "rstrip": false,
40
- "single_word": false,
41
- "special": true
42
- }
43
- },
44
- "clean_up_tokenization_spaces": true,
45
- "cls_token": "[CLS]",
46
- "do_lower_case": true,
47
- "mask_token": "[MASK]",
48
- "model_max_length": 8192,
49
- "pad_token": "[PAD]",
50
- "sep_token": "[SEP]",
51
- "strip_accents": null,
52
- "tokenize_chinese_chars": true,
53
- "tokenizer_class": "BertTokenizer",
54
- "unk_token": "[UNK]"
55
- }
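
`tokenizer_config.json` describes a lower-casing BERT WordPiece tokenizer with a `model_max_length` of 8192 and the usual special-token ids (0, 100, 101, 102, 103). A quick sanity check, assuming the tokenizer files are loaded from a local copy of the deleted directory:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("pretrained/nomic-ai/nomic-embed-text-v1")
enc = tok("hello world")
print(enc["input_ids"])       # starts with 101 ([CLS]) and ends with 102 ([SEP])
print(tok.model_max_length)   # 8192
```
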
pretrained/nomic-ai/nomic-embed-text-v1/vocab.txt DELETED
The diff for this file is too large to render. See raw diff
 
pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/.gitattributes DELETED
@@ -1,35 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/README.md DELETED
@@ -1,181 +0,0 @@
1
- ---
2
- datasets:
3
- - c-s-ale/alpaca-gpt4-data
4
- - Open-Orca/OpenOrca
5
- - Intel/orca_dpo_pairs
6
- - allenai/ultrafeedback_binarized_cleaned
7
- language:
8
- - en
9
- license: cc-by-nc-4.0
10
- base_model:
11
- - upstage/SOLAR-10.7B-v1.0
12
- ---
13
-
14
- <p align="left">
15
- <img src="https://huggingface.co/upstage/SOLAR-10.7B-Instruct-v1.0/resolve/main/solar_logo.png" width="150"/>
16
- <p>
17
-
18
- # **Meet 10.7B Solar: Elevating Performance with Upstage Depth UP Scaling!**
19
-
20
- **(This model is [upstage/SOLAR-10.7B-v1.0](https://huggingface.co/upstage/SOLAR-10.7B-v1.0) fine-tuned version for single-turn conversation.)**
21
-
22
-
23
- # **Introduction**
24
- We introduce SOLAR-10.7B, an advanced large language model (LLM) with 10.7 billion parameters, demonstrating superior performance in various natural language processing (NLP) tasks. It's compact, yet remarkably powerful, and demonstrates unparalleled state-of-the-art performance in models with parameters under 30B.
25
-
26
- We present a methodology for scaling LLMs called depth up-scaling (DUS) , which encompasses architectural modifications and continued pretraining. In other words, we integrated Mistral 7B weights into the upscaled layers, and finally, continued pre-training for the entire model.
27
-
28
-
29
- SOLAR-10.7B has remarkable performance. It outperforms models with up to 30B parameters, even surpassing the recent Mixtral 8X7B model. For detailed information, please refer to the experimental table.
30
- Solar 10.7B is an ideal choice for fine-tuning. SOLAR-10.7B offers robustness and adaptability for your fine-tuning needs. Our simple instruction fine-tuning using the SOLAR-10.7B pre-trained model yields significant performance improvements.
31
-
32
- For full details of this model please read our [paper](https://arxiv.org/abs/2312.15166).
33
-
34
-
35
- # **Instruction Fine-Tuning Strategy**
36
-
37
- We utilize state-of-the-art instruction fine-tuning methods including supervised fine-tuning (SFT) and direct preference optimization (DPO) [1].
38
-
39
- We used a mixture of the following datasets
40
- - c-s-ale/alpaca-gpt4-data (SFT)
41
- - Open-Orca/OpenOrca (SFT)
42
- - in-house generated data utilizing Metamath [2] (SFT, DPO)
43
- - Intel/orca_dpo_pairs (DPO)
44
- - allenai/ultrafeedback_binarized_cleaned (DPO)
45
-
46
- where we were careful of data contamination by not using GSM8K samples when generating data and filtering tasks when applicable via the following list.
47
- ```python
48
- filtering_task_list = [
49
- 'task228_arc_answer_generation_easy',
50
- 'ai2_arc/ARC-Challenge:1.0.0',
51
- 'ai2_arc/ARC-Easy:1.0.0',
52
- 'task229_arc_answer_generation_hard',
53
- 'hellaswag:1.1.0',
54
- 'task1389_hellaswag_completion',
55
- 'cot_gsm8k',
56
- 'cot_gsm8k_ii',
57
- 'drop:2.0.0',
58
- 'winogrande:1.1.0'
59
- ]
60
- ```
61
-
62
- Using the datasets mentioned above, we applied SFT and iterative DPO training, a proprietary alignment strategy, to maximize the performance of our resulting model.
63
-
64
- [1] Rafailov, R., Sharma, A., Mitchell, E., Ermon, S., Manning, C.D. and Finn, C., 2023. Direct preference optimization: Your language model is secretly a reward model. NeurIPS.
65
-
66
- [2] Yu, L., Jiang, W., Shi, H., Yu, J., Liu, Z., Zhang, Y., Kwok, J.T., Li, Z., Weller, A. and Liu, W., 2023. Metamath: Bootstrap your own mathematical questions for large language models. arXiv preprint arXiv:2309.12284.
67
-
68
- # **Data Contamination Test Results**
69
-
70
- Recently, there have been contamination issues in some models on the LLM leaderboard.
71
- We note that we made every effort to exclude any benchmark-related datasets from training.
72
- We also ensured the integrity of our model by conducting a data contamination test [3] that is also used by the HuggingFace team [4, 5].
73
-
74
- Our results, with `result < 0.1, %:` being well below 0.9, indicate that our model is free from contamination.
75
-
76
- *The data contamination test results of HellaSwag and Winograde will be added once [3] supports them.*
77
-
78
- | Model | ARC | MMLU | TruthfulQA | GSM8K |
79
- |------------------------------|-------|-------|-------|-------|
80
- | **SOLAR-10.7B-Instruct-v1.0**| result < 0.1, %: 0.06 |result < 0.1, %: 0.15 | result < 0.1, %: 0.28 | result < 0.1, %: 0.70 |
81
-
82
- [3] https://github.com/swj0419/detect-pretrain-code-contamination
83
-
84
- [4] https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474#657f2245365456e362412a06
85
-
86
- [5] https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/265#657b6debf81f6b44b8966230
87
-
88
- # **Evaluation Results**
89
-
90
- | Model | H6 | Model Size |
91
- |----------------------------------------|-------|------------|
92
- | **SOLAR-10.7B-Instruct-v1.0** | **74.20** | **~ 11B** |
93
- | mistralai/Mixtral-8x7B-Instruct-v0.1 | 72.62 | ~ 46.7B |
94
- | 01-ai/Yi-34B-200K | 70.81 | ~ 34B |
95
- | 01-ai/Yi-34B | 69.42 | ~ 34B |
96
- | mistralai/Mixtral-8x7B-v0.1 | 68.42 | ~ 46.7B |
97
- | meta-llama/Llama-2-70b-hf | 67.87 | ~ 70B |
98
- | tiiuae/falcon-180B | 67.85 | ~ 180B |
99
- | **SOLAR-10.7B-v1.0** | **66.04** | **~11B** |
100
- | mistralai/Mistral-7B-Instruct-v0.2 | 65.71 | ~ 7B |
101
- | Qwen/Qwen-14B | 65.86 | ~ 14B |
102
- | 01-ai/Yi-34B-Chat | 65.32 | ~34B |
103
- | meta-llama/Llama-2-70b-chat-hf | 62.4 | ~ 70B |
104
- | mistralai/Mistral-7B-v0.1 | 60.97 | ~ 7B |
105
- | mistralai/Mistral-7B-Instruct-v0.1 | 54.96 | ~ 7B |
106
-
107
- # **Usage Instructions**
108
-
109
- This model has been fine-tuned primarily for single-turn conversation, making it less suitable for multi-turn conversations such as chat.
110
-
111
- ### **Version**
112
-
113
- Make sure you have the correct version of the transformers library installed:
114
-
115
- ```sh
116
- pip install transformers==4.35.2
117
- ```
118
-
119
- ### **Loading the Model**
120
-
121
- Use the following Python code to load the model:
122
-
123
- ```python
124
- import torch
125
- from transformers import AutoModelForCausalLM, AutoTokenizer
126
-
127
- tokenizer = AutoTokenizer.from_pretrained("Upstage/SOLAR-10.7B-Instruct-v1.0")
128
- model = AutoModelForCausalLM.from_pretrained(
129
- "Upstage/SOLAR-10.7B-Instruct-v1.0",
130
- device_map="auto",
131
- torch_dtype=torch.float16,
132
- )
133
- ```
134
-
135
- ### **Conducting Single-Turn Conversation**
136
-
137
- ```python
138
- conversation = [ {'role': 'user', 'content': 'Hello?'} ]
139
-
140
- prompt = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
141
-
142
- inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
143
- outputs = model.generate(**inputs, use_cache=True, max_length=4096)
144
- output_text = tokenizer.decode(outputs[0])
145
- print(output_text)
146
- ```
147
-
148
- Below is an example of the output.
149
- ```
150
- <s> ### User:
151
- Hello?
152
-
153
- ### Assistant:
154
- Hello, how can I assist you today? Please feel free to ask any questions or request help with a specific task.</s>
155
- ```
156
-
157
- ### **License**
158
- - [upstage/SOLAR-10.7B-v1.0](https://huggingface.co/upstage/SOLAR-10.7B-v1.0): apache-2.0
159
- - [upstage/SOLAR-10.7B-Instruct-v1.0](https://huggingface.co/upstage/SOLAR-10.7B-Instruct-v1.0): cc-by-nc-4.0
160
- - Since some non-commercial datasets such as Alpaca are used for fine-tuning, we release this model as cc-by-nc-4.0.
161
-
162
- ### **How to Cite**
163
-
164
- Please cite this model using this format.
165
-
166
- ```bibtex
167
- @misc{kim2023solar,
168
- title={SOLAR 10.7B: Scaling Large Language Models with Simple yet Effective Depth Up-Scaling},
169
- author={Dahyun Kim and Chanjun Park and Sanghoon Kim and Wonsung Lee and Wonho Song and Yunsu Kim and Hyeonwoo Kim and Yungi Kim and Hyeonju Lee and Jihoo Kim and Changbae Ahn and Seonghoon Yang and Sukyung Lee and Hyunbyung Park and Gyoungjin Gim and Mikyoung Cha and Hwalsuk Lee and Sunghun Kim},
170
- year={2023},
171
- eprint={2312.15166},
172
- archivePrefix={arXiv},
173
- primaryClass={cs.CL}
174
- }
175
- ```
176
-
177
- ### **The Upstage AI Team** ###
178
- Upstage is creating the best LLM and DocAI. Please find more information at https://upstage.ai
179
-
180
- ### **Contact Us** ###
181
- Any questions and suggestions, please use the discussion tab. If you want to contact us directly, drop an email to [contact@upstage.ai](mailto:contact@upstage.ai)
pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/config.json DELETED
@@ -1,28 +0,0 @@
1
- {
2
- "_name_or_path": "upstage/SOLAR-10.7B-Instruct-v1.0",
3
- "architectures": [
4
- "LlamaForCausalLM"
5
- ],
6
- "attention_bias": false,
7
- "bos_token_id": 1,
8
- "eos_token_id": 2,
9
- "pad_token_id": 2,
10
- "hidden_act": "silu",
11
- "hidden_size": 4096,
12
- "initializer_range": 0.02,
13
- "intermediate_size": 14336,
14
- "max_position_embeddings": 4096,
15
- "model_type": "llama",
16
- "num_attention_heads": 32,
17
- "num_hidden_layers": 48,
18
- "num_key_value_heads": 8,
19
- "pretraining_tp": 1,
20
- "rms_norm_eps": 1e-05,
21
- "rope_scaling": null,
22
- "rope_theta": 10000.0,
23
- "tie_word_embeddings": false,
24
- "torch_dtype": "float16",
25
- "transformers_version": "4.35.0",
26
- "use_cache": true,
27
- "vocab_size": 32000
28
- }
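
The deleted `config.json` spells out the depth-up-scaled architecture: a Llama-style decoder with 48 hidden layers, hidden size 4096, intermediate size 14336, 32 attention heads with 8 key-value heads (grouped-query attention), and untied 32000-token embeddings. Plugging those numbers into a rough parameter count reproduces the advertised 10.7B and matches the 21,463,048,192-byte float16 total recorded in `model.safetensors.index.json` further down:

```python
hidden, inter, layers, vocab = 4096, 14336, 48, 32000
n_heads, n_kv_heads = 32, 8
head_dim = hidden // n_heads                       # 128
kv_dim = n_kv_heads * head_dim                     # 1024 (grouped-query attention)

attn = 2 * hidden * hidden + 2 * hidden * kv_dim   # q/o projections + k/v projections
mlp = 3 * hidden * inter                           # gate, up, down projections
norms = 2 * hidden                                 # two RMSNorms per layer
per_layer = attn + mlp + norms

total = layers * per_layer + 2 * vocab * hidden + hidden  # + embeddings, lm_head, final norm
print(total)        # 10731524096  (~10.7B parameters)
print(total * 2)    # 21463048192  bytes in float16, as in the safetensors index
```
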
pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/generation_config.json DELETED
@@ -1,8 +0,0 @@
1
- {
2
- "_from_model_config": true,
3
- "bos_token_id": 1,
4
- "eos_token_id": 2,
5
- "pad_token_id": 2,
6
- "transformers_version": "4.35.2",
7
- "use_cache": false
8
- }
pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/model-00001-of-00005.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a236ffb3d7450f3525058c4a84379dbf7ec20e0cdc1786b7454e355a8899a3e7
3
- size 4943162240
pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/model-00002-of-00005.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f08fa4ecc0ad6d7d14cc00af9586991f1e9cb7d0c67edbf33c69bc6528f416f3
3
- size 4999819232
pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/model-00003-of-00005.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:6cfcaa469d97b4be11b1eeae0e8f3611f93299b4bb1b33578ac5e5fb866fb154
3
- size 4915916080
pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/model-00004-of-00005.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:18bd0920761731ba8b43bdc568d13798f30e04377d94c781d1f926cdcccce172
3
- size 4915916080
pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/model-00005-of-00005.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:46e67ee561f241973fcebca4e911c8515bcabda67e1f30fe2136ea95d400d22a
3
- size 1688284744
pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/model.safetensors.index.json DELETED
@@ -1,442 +0,0 @@
1
- {
2
- "metadata": {
3
- "total_size": 21463048192
4
- },
5
- "weight_map": {
6
- "lm_head.weight": "model-00005-of-00005.safetensors",
7
- "model.embed_tokens.weight": "model-00001-of-00005.safetensors",
8
- "model.layers.0.input_layernorm.weight": "model-00001-of-00005.safetensors",
9
- "model.layers.0.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
10
- "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
11
- "model.layers.0.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
12
- "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
13
- "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
14
- "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
15
- "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
16
- "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
17
- "model.layers.1.input_layernorm.weight": "model-00001-of-00005.safetensors",
18
- "model.layers.1.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
19
- "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
20
- "model.layers.1.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
21
- "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
22
- "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
23
- "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
24
- "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
25
- "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
26
- "model.layers.10.input_layernorm.weight": "model-00002-of-00005.safetensors",
27
- "model.layers.10.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
28
- "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
29
- "model.layers.10.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
30
- "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
31
- "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
32
- "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
33
- "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
34
- "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
35
- "model.layers.11.input_layernorm.weight": "model-00002-of-00005.safetensors",
36
- "model.layers.11.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
37
- "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
38
- "model.layers.11.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
39
- "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
40
- "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
41
- "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
42
- "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
43
- "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
44
- "model.layers.12.input_layernorm.weight": "model-00002-of-00005.safetensors",
45
- "model.layers.12.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
46
- "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
47
- "model.layers.12.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
48
- "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
49
- "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
50
- "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
51
- "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
52
- "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
53
- "model.layers.13.input_layernorm.weight": "model-00002-of-00005.safetensors",
54
- "model.layers.13.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
55
- "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
56
- "model.layers.13.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
57
- "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
58
- "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
59
- "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
60
- "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
61
- "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
62
- "model.layers.14.input_layernorm.weight": "model-00002-of-00005.safetensors",
63
- "model.layers.14.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
64
- "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
65
- "model.layers.14.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
66
- "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
67
- "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
68
- "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
69
- "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
70
- "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
71
- "model.layers.15.input_layernorm.weight": "model-00002-of-00005.safetensors",
72
- "model.layers.15.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
73
- "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
74
- "model.layers.15.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
75
- "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
76
- "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
77
- "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
78
- "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
79
- "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
80
- "model.layers.16.input_layernorm.weight": "model-00002-of-00005.safetensors",
81
- "model.layers.16.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
82
- "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
83
- "model.layers.16.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
84
- "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
85
- "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
86
- "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
87
- "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
88
- "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
89
- "model.layers.17.input_layernorm.weight": "model-00002-of-00005.safetensors",
90
- "model.layers.17.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
91
- "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
92
- "model.layers.17.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
93
- "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
94
- "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
95
- "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
96
- "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
97
- "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
98
- "model.layers.18.input_layernorm.weight": "model-00002-of-00005.safetensors",
99
- "model.layers.18.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
100
- "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
101
- "model.layers.18.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
102
- "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
103
- "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
104
- "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
105
- "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
106
- "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
107
- "model.layers.19.input_layernorm.weight": "model-00002-of-00005.safetensors",
108
- "model.layers.19.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
109
- "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
110
- "model.layers.19.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
111
- "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
112
- "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
113
- "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
114
- "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
115
- "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
116
- "model.layers.2.input_layernorm.weight": "model-00001-of-00005.safetensors",
117
- "model.layers.2.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
118
- "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
119
- "model.layers.2.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
120
- "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
121
- "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
122
- "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
123
- "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
124
- "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
125
- "model.layers.20.input_layernorm.weight": "model-00002-of-00005.safetensors",
126
- "model.layers.20.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
127
- "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
128
- "model.layers.20.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
129
- "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
130
- "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
131
- "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
132
- "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
133
- "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
134
- "model.layers.21.input_layernorm.weight": "model-00002-of-00005.safetensors",
135
- "model.layers.21.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
136
- "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
137
- "model.layers.21.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
138
- "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
139
- "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
140
- "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
141
- "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
142
- "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
143
- "model.layers.22.input_layernorm.weight": "model-00003-of-00005.safetensors",
144
- "model.layers.22.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
145
- "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
146
- "model.layers.22.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
147
- "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
148
- "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
149
- "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
150
- "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
151
- "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
152
- "model.layers.23.input_layernorm.weight": "model-00003-of-00005.safetensors",
153
- "model.layers.23.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
154
- "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
155
- "model.layers.23.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
156
- "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
157
- "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
158
- "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
159
- "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
160
- "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
161
- "model.layers.24.input_layernorm.weight": "model-00003-of-00005.safetensors",
162
- "model.layers.24.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
163
- "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
164
- "model.layers.24.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
165
- "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
166
- "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
167
- "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
168
- "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
169
- "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
170
- "model.layers.25.input_layernorm.weight": "model-00003-of-00005.safetensors",
171
- "model.layers.25.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
172
- "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
173
- "model.layers.25.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
174
- "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
175
- "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
176
- "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
177
- "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
178
- "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
179
- "model.layers.26.input_layernorm.weight": "model-00003-of-00005.safetensors",
180
- "model.layers.26.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
181
- "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
182
- "model.layers.26.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
183
- "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
184
- "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
185
- "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
186
- "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
187
- "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
188
- "model.layers.27.input_layernorm.weight": "model-00003-of-00005.safetensors",
189
- "model.layers.27.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
190
- "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
191
- "model.layers.27.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
192
- "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
193
- "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
194
- "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
195
- "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
196
- "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
197
- "model.layers.28.input_layernorm.weight": "model-00003-of-00005.safetensors",
198
- "model.layers.28.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
199
- "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
200
- "model.layers.28.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
201
- "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
202
- "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
203
- "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
204
- "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
205
- "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
206
- "model.layers.29.input_layernorm.weight": "model-00003-of-00005.safetensors",
207
- "model.layers.29.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
208
- "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
209
- "model.layers.29.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
210
- "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
211
- "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
212
- "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
213
- "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
214
- "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
215
- "model.layers.3.input_layernorm.weight": "model-00001-of-00005.safetensors",
216
- "model.layers.3.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
217
- "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
218
- "model.layers.3.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
219
- "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
220
- "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
221
- "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
222
- "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
223
- "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
224
- "model.layers.30.input_layernorm.weight": "model-00003-of-00005.safetensors",
225
- "model.layers.30.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
226
- "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
227
- "model.layers.30.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
228
- "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
229
- "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
230
- "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
231
- "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
232
- "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
233
- "model.layers.31.input_layernorm.weight": "model-00003-of-00005.safetensors",
234
- "model.layers.31.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
235
- "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
236
- "model.layers.31.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
237
- "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
238
- "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
239
- "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
240
- "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
241
- "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
242
- "model.layers.32.input_layernorm.weight": "model-00003-of-00005.safetensors",
243
- "model.layers.32.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
244
- "model.layers.32.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
245
- "model.layers.32.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
246
- "model.layers.32.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
247
- "model.layers.32.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
248
- "model.layers.32.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
249
- "model.layers.32.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
250
- "model.layers.32.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
251
- "model.layers.33.input_layernorm.weight": "model-00004-of-00005.safetensors",
252
- "model.layers.33.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
253
- "model.layers.33.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
254
- "model.layers.33.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
255
- "model.layers.33.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
256
- "model.layers.33.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
257
- "model.layers.33.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
258
- "model.layers.33.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
259
- "model.layers.33.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
260
- "model.layers.34.input_layernorm.weight": "model-00004-of-00005.safetensors",
261
- "model.layers.34.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
262
- "model.layers.34.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
263
- "model.layers.34.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
264
- "model.layers.34.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
265
- "model.layers.34.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
266
- "model.layers.34.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
267
- "model.layers.34.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
268
- "model.layers.34.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
269
- "model.layers.35.input_layernorm.weight": "model-00004-of-00005.safetensors",
270
- "model.layers.35.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
271
- "model.layers.35.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
272
- "model.layers.35.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
273
- "model.layers.35.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
274
- "model.layers.35.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
275
- "model.layers.35.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
276
- "model.layers.35.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
277
- "model.layers.35.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
278
- "model.layers.36.input_layernorm.weight": "model-00004-of-00005.safetensors",
279
- "model.layers.36.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
280
- "model.layers.36.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
281
- "model.layers.36.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
282
- "model.layers.36.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
283
- "model.layers.36.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
284
- "model.layers.36.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
285
- "model.layers.36.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
286
- "model.layers.36.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
287
- "model.layers.37.input_layernorm.weight": "model-00004-of-00005.safetensors",
288
- "model.layers.37.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
289
- "model.layers.37.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
290
- "model.layers.37.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
291
- "model.layers.37.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
292
- "model.layers.37.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
293
- "model.layers.37.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
294
- "model.layers.37.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
295
- "model.layers.37.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
296
- "model.layers.38.input_layernorm.weight": "model-00004-of-00005.safetensors",
297
- "model.layers.38.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
298
- "model.layers.38.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
299
- "model.layers.38.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
300
- "model.layers.38.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
301
- "model.layers.38.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
302
- "model.layers.38.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
303
- "model.layers.38.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
304
- "model.layers.38.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
305
- "model.layers.39.input_layernorm.weight": "model-00004-of-00005.safetensors",
306
- "model.layers.39.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
307
- "model.layers.39.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
308
- "model.layers.39.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
309
- "model.layers.39.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
310
- "model.layers.39.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
311
- "model.layers.39.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
312
- "model.layers.39.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
313
- "model.layers.39.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
314
- "model.layers.4.input_layernorm.weight": "model-00001-of-00005.safetensors",
315
- "model.layers.4.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
316
- "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
317
- "model.layers.4.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
318
- "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
319
- "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
320
- "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
321
- "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
322
- "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
323
- "model.layers.40.input_layernorm.weight": "model-00004-of-00005.safetensors",
324
- "model.layers.40.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
325
- "model.layers.40.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
326
- "model.layers.40.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
327
- "model.layers.40.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
328
- "model.layers.40.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
329
- "model.layers.40.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
330
- "model.layers.40.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
331
- "model.layers.40.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
332
- "model.layers.41.input_layernorm.weight": "model-00004-of-00005.safetensors",
333
- "model.layers.41.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
334
- "model.layers.41.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
335
- "model.layers.41.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
336
- "model.layers.41.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
337
- "model.layers.41.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
338
- "model.layers.41.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
339
- "model.layers.41.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
340
- "model.layers.41.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
341
- "model.layers.42.input_layernorm.weight": "model-00004-of-00005.safetensors",
342
- "model.layers.42.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
343
- "model.layers.42.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
344
- "model.layers.42.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
345
- "model.layers.42.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
346
- "model.layers.42.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
347
- "model.layers.42.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
348
- "model.layers.42.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
349
- "model.layers.42.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
350
- "model.layers.43.input_layernorm.weight": "model-00004-of-00005.safetensors",
351
- "model.layers.43.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
352
- "model.layers.43.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
353
- "model.layers.43.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
354
- "model.layers.43.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
355
- "model.layers.43.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
356
- "model.layers.43.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
357
- "model.layers.43.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
358
- "model.layers.43.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
359
- "model.layers.44.input_layernorm.weight": "model-00005-of-00005.safetensors",
360
- "model.layers.44.mlp.down_proj.weight": "model-00005-of-00005.safetensors",
361
- "model.layers.44.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
362
- "model.layers.44.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
363
- "model.layers.44.post_attention_layernorm.weight": "model-00005-of-00005.safetensors",
364
- "model.layers.44.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
365
- "model.layers.44.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
366
- "model.layers.44.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
367
- "model.layers.44.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
368
- "model.layers.45.input_layernorm.weight": "model-00005-of-00005.safetensors",
369
- "model.layers.45.mlp.down_proj.weight": "model-00005-of-00005.safetensors",
370
- "model.layers.45.mlp.gate_proj.weight": "model-00005-of-00005.safetensors",
371
- "model.layers.45.mlp.up_proj.weight": "model-00005-of-00005.safetensors",
372
- "model.layers.45.post_attention_layernorm.weight": "model-00005-of-00005.safetensors",
373
- "model.layers.45.self_attn.k_proj.weight": "model-00005-of-00005.safetensors",
374
- "model.layers.45.self_attn.o_proj.weight": "model-00005-of-00005.safetensors",
375
- "model.layers.45.self_attn.q_proj.weight": "model-00005-of-00005.safetensors",
376
- "model.layers.45.self_attn.v_proj.weight": "model-00005-of-00005.safetensors",
377
- "model.layers.46.input_layernorm.weight": "model-00005-of-00005.safetensors",
378
- "model.layers.46.mlp.down_proj.weight": "model-00005-of-00005.safetensors",
379
- "model.layers.46.mlp.gate_proj.weight": "model-00005-of-00005.safetensors",
380
- "model.layers.46.mlp.up_proj.weight": "model-00005-of-00005.safetensors",
381
- "model.layers.46.post_attention_layernorm.weight": "model-00005-of-00005.safetensors",
382
- "model.layers.46.self_attn.k_proj.weight": "model-00005-of-00005.safetensors",
383
- "model.layers.46.self_attn.o_proj.weight": "model-00005-of-00005.safetensors",
384
- "model.layers.46.self_attn.q_proj.weight": "model-00005-of-00005.safetensors",
385
- "model.layers.46.self_attn.v_proj.weight": "model-00005-of-00005.safetensors",
386
- "model.layers.47.input_layernorm.weight": "model-00005-of-00005.safetensors",
387
- "model.layers.47.mlp.down_proj.weight": "model-00005-of-00005.safetensors",
388
- "model.layers.47.mlp.gate_proj.weight": "model-00005-of-00005.safetensors",
389
- "model.layers.47.mlp.up_proj.weight": "model-00005-of-00005.safetensors",
390
- "model.layers.47.post_attention_layernorm.weight": "model-00005-of-00005.safetensors",
391
- "model.layers.47.self_attn.k_proj.weight": "model-00005-of-00005.safetensors",
392
- "model.layers.47.self_attn.o_proj.weight": "model-00005-of-00005.safetensors",
393
- "model.layers.47.self_attn.q_proj.weight": "model-00005-of-00005.safetensors",
394
- "model.layers.47.self_attn.v_proj.weight": "model-00005-of-00005.safetensors",
395
- "model.layers.5.input_layernorm.weight": "model-00001-of-00005.safetensors",
396
- "model.layers.5.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
397
- "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
398
- "model.layers.5.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
399
- "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
400
- "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
401
- "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
402
- "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
403
- "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
404
- "model.layers.6.input_layernorm.weight": "model-00001-of-00005.safetensors",
405
- "model.layers.6.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
406
- "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
407
- "model.layers.6.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
408
- "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
409
- "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
410
- "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
411
- "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
412
- "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
413
- "model.layers.7.input_layernorm.weight": "model-00001-of-00005.safetensors",
414
- "model.layers.7.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
415
- "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
416
- "model.layers.7.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
417
- "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
418
- "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
419
- "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
420
- "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
421
- "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
422
- "model.layers.8.input_layernorm.weight": "model-00001-of-00005.safetensors",
423
- "model.layers.8.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
424
- "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
425
- "model.layers.8.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
426
- "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
427
- "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
428
- "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
429
- "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
430
- "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
431
- "model.layers.9.input_layernorm.weight": "model-00001-of-00005.safetensors",
432
- "model.layers.9.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
433
- "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
434
- "model.layers.9.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
435
- "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
436
- "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
437
- "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
438
- "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
439
- "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
440
- "model.norm.weight": "model-00005-of-00005.safetensors"
441
- }
442
- }
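
For reference, the removed `model.safetensors.index.json` is the standard Hugging Face sharded-checkpoint index: its `weight_map` assigns every parameter name to one of the five `model-0000X-of-00005.safetensors` shards, so a loader only has to open the shards it actually needs. Below is a minimal sketch of resolving a tensor through such an index; the local path and the `shard_for`/`load_tensor` helpers are illustrative, not part of this repository.

```python
import json
import os

from safetensors import safe_open  # assumes the `safetensors` package is installed

# Hypothetical local path; this repository's copy is deleted by this commit.
INDEX_PATH = "SOLAR-10.7B-Instruct-v1.0/model.safetensors.index.json"


def shard_for(param_name: str, index_path: str = INDEX_PATH) -> str:
    """Look up which shard file stores a parameter, via the index's weight_map."""
    with open(index_path) as f:
        index = json.load(f)
    return index["weight_map"][param_name]


def load_tensor(param_name: str, index_path: str = INDEX_PATH):
    """Open only the shard that holds the requested tensor and read it."""
    shard = os.path.join(os.path.dirname(index_path), shard_for(param_name, index_path))
    with safe_open(shard, framework="pt") as f:
        return f.get_tensor(param_name)


# Per the weight_map above, the final norm lives in the last shard.
print(shard_for("model.norm.weight"))  # -> "model-00005-of-00005.safetensors"
```

In practice `transformers` performs this resolution automatically when loading a sharded checkpoint; the sketch only spells out what the deleted index encodes.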
 
pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/solar_logo.png DELETED
Binary file (77.1 kB)
 
pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/tokenizer.model DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
3
- size 493443
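
The three deleted lines above are not the tokenizer model itself but a Git LFS pointer: the actual ~493 kB blob is addressed by the SHA-256 `oid` and materialized by the LFS filter on checkout. A tiny, purely illustrative parser for this pointer format:

```python
# Minimal parser for a Git LFS pointer file ("key value" per line).
def parse_lfs_pointer(text: str) -> dict:
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields


pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
size 493443"""

info = parse_lfs_pointer(pointer)
print(info["oid"], int(info["size"]))  # sha256:dadf... 493443
```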
 
pretrained/upstage/SOLAR-10.7B-Instruct-v1.0/tokenizer_config.json DELETED
@@ -1,43 +0,0 @@
1
- {
2
- "add_bos_token": true,
3
- "add_eos_token": false,
4
- "added_tokens_decoder": {
5
- "0": {
6
- "content": "<unk>",
7
- "lstrip": false,
8
- "normalized": false,
9
- "rstrip": false,
10
- "single_word": false,
11
- "special": true
12
- },
13
- "1": {
14
- "content": "<s>",
15
- "lstrip": false,
16
- "normalized": false,
17
- "rstrip": false,
18
- "single_word": false,
19
- "special": true
20
- },
21
- "2": {
22
- "content": "</s>",
23
- "lstrip": false,
24
- "normalized": false,
25
- "rstrip": false,
26
- "single_word": false,
27
- "special": true
28
- }
29
- },
30
- "additional_special_tokens": [],
31
- "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{% if message['content']%}{{'### System:\n' + message['content']+'\n\n'}}{% endif %}{% elif message['role'] == 'user' %}{{'### User:\n' + message['content']+'\n\n'}}{% elif message['role'] == 'assistant' %}{{'### Assistant:\n' + message['content']}}{% endif %}{% if loop.last and add_generation_prompt %}{{ '### Assistant:\n' }}{% endif %}{% endfor %}",
32
- "bos_token": "<s>",
33
- "clean_up_tokenization_spaces": false,
34
- "eos_token": "</s>",
35
- "legacy": true,
36
- "model_max_length": 1000000000000000019884624838656,
37
- "pad_token": "</s>",
38
- "sp_model_kwargs": {},
39
- "spaces_between_special_tokens": false,
40
- "tokenizer_class": "LlamaTokenizer",
41
- "unk_token": "<unk>",
42
- "use_default_system_prompt": true
43
- }
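
The deleted `tokenizer_config.json` carries the model's chat template: a Jinja template that renders system, user, and assistant turns as `### System:`, `### User:`, and `### Assistant:` blocks and appends a trailing `### Assistant:\n` when a generation prompt is requested. A hedged sketch of applying it with `transformers` (it pulls the upstream `upstage/SOLAR-10.7B-Instruct-v1.0` hub repository, since the copy under `pretrained/` is removed by this commit):

```python
from transformers import AutoTokenizer

# Load from the upstream hub repository; the local pretrained/ copy no longer exists.
tokenizer = AutoTokenizer.from_pretrained("upstage/SOLAR-10.7B-Instruct-v1.0")

messages = [
    {"role": "system", "content": "You are a concise assistant."},
    {"role": "user", "content": "What does a safetensors index file do?"},
]

# Renders the conversation through the chat_template shown above and appends the
# final "### Assistant:\n" header so the model continues as the assistant.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
```

With the template above, the rendered prompt is the `### System:` and `### User:` blocks, each followed by a blank line, ending with `### Assistant:\n`.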