FINGU-AI committed on
Commit
e6213f7
1 Parent(s): 6abba5a

Upload folder using huggingface_hub

Browse files
1_Pooling/config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "word_embedding_dimension": 1536,
3
+ "pooling_mode_cls_token": false,
4
+ "pooling_mode_mean_tokens": true,
5
+ "pooling_mode_max_tokens": false,
6
+ "pooling_mode_mean_sqrt_len_tokens": false,
7
+ "pooling_mode_weightedmean_tokens": false,
8
+ "pooling_mode_lasttoken": false,
9
+ "include_prompt": true
10
+ }
2_Dense/config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"in_features": 1536, "out_features": 1024, "bias": true, "activation_function": "torch.nn.modules.linear.Identity"}
2_Dense/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f390995eeb0263a835f7154e90193bcbb447be8abf9687b21c72896edc74d6f
3
+ size 6295712
README.md CHANGED
@@ -1,3 +1,655 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: dunzhang/stella_en_1.5B_v5
3
+ datasets: []
4
+ language: []
5
+ library_name: sentence-transformers
6
+ metrics:
7
+ - cosine_accuracy@1
8
+ - cosine_accuracy@3
9
+ - cosine_accuracy@5
10
+ - cosine_accuracy@10
11
+ - cosine_precision@1
12
+ - cosine_precision@3
13
+ - cosine_precision@5
14
+ - cosine_precision@10
15
+ - cosine_recall@1
16
+ - cosine_recall@3
17
+ - cosine_recall@5
18
+ - cosine_recall@10
19
+ - cosine_ndcg@10
20
+ - cosine_mrr@10
21
+ - cosine_map@100
22
+ pipeline_tag: sentence-similarity
23
+ tags:
24
+ - sentence-transformers
25
+ - sentence-similarity
26
+ - feature-extraction
27
+ - generated_from_trainer
28
+ - dataset_size:693000
29
+ - loss:MatryoshkaLoss
30
+ - loss:MultipleNegativesRankingLoss
31
+ widget:
32
+ - source_sentence: Paracrystalline materials are defined as having short and medium
33
+ range ordering in their lattice (similar to the liquid crystal phases) but lacking
34
+ crystal-like long-range ordering at least in one direction.
35
+ sentences:
36
+ - 'Instruct: Given a web search query, retrieve relevant passages that answer the
37
+ query.
38
+
39
+ Query: Paracrystalline'
40
+ - 'Instruct: Given a web search query, retrieve relevant passages that answer the
41
+ query.
42
+
43
+ Query: Øystein Dahle'
44
+ - 'Instruct: Given a web search query, retrieve relevant passages that answer the
45
+ query.
46
+
47
+ Query: Makis Belevonis'
48
+ - source_sentence: 'Hạ Trạch is a commune ( xã ) and village in Bố Trạch District
49
+ , Quảng Bình Province , in Vietnam . Category : Populated places in Quang Binh
50
+ Province Category : Communes of Quang Binh Province'
51
+ sentences:
52
+ - 'Instruct: Given a web search query, retrieve relevant passages that answer the
53
+ query.
54
+
55
+ Query: The Taill of how this forsaid Tod maid his Confessioun to Freir Wolf Waitskaith'
56
+ - 'Instruct: Given a web search query, retrieve relevant passages that answer the
57
+ query.
58
+
59
+ Query: Hạ Trạch'
60
+ - 'Instruct: Given a web search query, retrieve relevant passages that answer the
61
+ query.
62
+
63
+ Query: Tadaxa'
64
+ - source_sentence: The Golden Mosque (سنهرى مسجد, Sunehri Masjid) is a mosque in Old
65
+ Delhi. It is located outside the southwestern corner of Delhi Gate of the Red
66
+ Fort, opposite the Netaji Subhash Park.
67
+ sentences:
68
+ - 'Instruct: Given a web search query, retrieve relevant passages that answer the
69
+ query.
70
+
71
+ Query: Algorithm'
72
+ - 'Instruct: Given a web search query, retrieve relevant passages that answer the
73
+ query.
74
+
75
+ Query: Golden Mosque (Red Fort)'
76
+ - 'Instruct: Given a web search query, retrieve relevant passages that answer the
77
+ query.
78
+
79
+ Query: Parnaso Español'
80
+ - source_sentence: Unibank, S.A. is one of Haiti's two largest private commercial
81
+ banks. The bank was founded in 1993 by a group of Haitian investors and is the
82
+ main company of "Groupe Financier National (GFN)". It opened its first office
83
+ in July 1993 in downtown Port-au-Prince and has 50 branches throughout the country
84
+ as of the end of 2016.
85
+ sentences:
86
+ - 'Instruct: Given a web search query, retrieve relevant passages that answer the
87
+ query.
88
+
89
+ Query: Sky TG24'
90
+ - 'Instruct: Given a web search query, retrieve relevant passages that answer the
91
+ query.
92
+
93
+ Query: Ghomijeh'
94
+ - 'Instruct: Given a web search query, retrieve relevant passages that answer the
95
+ query.
96
+
97
+ Query: Unibank (Haiti)'
98
+ - source_sentence: The Tchaikovsky Symphony Orchestra is a Russian classical music
99
+ orchestra established in 1930. It was founded as the Moscow Radio Symphony Orchestra,
100
+ and served as the official symphony for the Soviet All-Union Radio network. Following
101
+ the dissolution of the Soviet Union in 1991, the orchestra was renamed in 1993
102
+ by the Russian Ministry of Culture in recognition of the central role the music
103
+ of Tchaikovsky plays in its repertoire. The current music director is Vladimir
104
+ Fedoseyev, who has been in that position since 1974.
105
+ sentences:
106
+ - 'Instruct: Given a web search query, retrieve relevant passages that answer the
107
+ query.
108
+
109
+ Query: Harald J.W. Mueller-Kirsten'
110
+ - 'Instruct: Given a web search query, retrieve relevant passages that answer the
111
+ query.
112
+
113
+ Query: Sierra del Lacandón'
114
+ - 'Instruct: Given a web search query, retrieve relevant passages that answer the
115
+ query.
116
+
117
+ Query: Tchaikovsky Symphony Orchestra'
118
+ model-index:
119
+ - name: SentenceTransformer based on dunzhang/stella_en_1.5B_v5
120
+ results:
121
+ - task:
122
+ type: information-retrieval
123
+ name: Information Retrieval
124
+ dataset:
125
+ name: Unknown
126
+ type: unknown
127
+ metrics:
128
+ - type: cosine_accuracy@1
129
+ value: 0.9447811447811448
130
+ name: Cosine Accuracy@1
131
+ - type: cosine_accuracy@3
132
+ value: 0.9686868686868687
133
+ name: Cosine Accuracy@3
134
+ - type: cosine_accuracy@5
135
+ value: 0.9764309764309764
136
+ name: Cosine Accuracy@5
137
+ - type: cosine_accuracy@10
138
+ value: 0.9811447811447811
139
+ name: Cosine Accuracy@10
140
+ - type: cosine_precision@1
141
+ value: 0.9447811447811448
142
+ name: Cosine Precision@1
143
+ - type: cosine_precision@3
144
+ value: 0.3228956228956229
145
+ name: Cosine Precision@3
146
+ - type: cosine_precision@5
147
+ value: 0.19528619528619526
148
+ name: Cosine Precision@5
149
+ - type: cosine_precision@10
150
+ value: 0.09811447811447811
151
+ name: Cosine Precision@10
152
+ - type: cosine_recall@1
153
+ value: 0.9447811447811448
154
+ name: Cosine Recall@1
155
+ - type: cosine_recall@3
156
+ value: 0.9686868686868687
157
+ name: Cosine Recall@3
158
+ - type: cosine_recall@5
159
+ value: 0.9764309764309764
160
+ name: Cosine Recall@5
161
+ - type: cosine_recall@10
162
+ value: 0.9811447811447811
163
+ name: Cosine Recall@10
164
+ - type: cosine_ndcg@10
165
+ value: 0.9636993273003078
166
+ name: Cosine Ndcg@10
167
+ - type: cosine_mrr@10
168
+ value: 0.9580071882849661
169
+ name: Cosine Mrr@10
170
+ - type: cosine_map@100
171
+ value: 0.9586207391258978
172
+ name: Cosine Map@100
173
+ - type: cosine_accuracy@1
174
+ value: 0.9444444444444444
175
+ name: Cosine Accuracy@1
176
+ - type: cosine_accuracy@3
177
+ value: 0.97003367003367
178
+ name: Cosine Accuracy@3
179
+ - type: cosine_accuracy@5
180
+ value: 0.9764309764309764
181
+ name: Cosine Accuracy@5
182
+ - type: cosine_accuracy@10
183
+ value: 0.9824915824915825
184
+ name: Cosine Accuracy@10
185
+ - type: cosine_precision@1
186
+ value: 0.9444444444444444
187
+ name: Cosine Precision@1
188
+ - type: cosine_precision@3
189
+ value: 0.32334455667789
190
+ name: Cosine Precision@3
191
+ - type: cosine_precision@5
192
+ value: 0.19528619528619529
193
+ name: Cosine Precision@5
194
+ - type: cosine_precision@10
195
+ value: 0.09824915824915824
196
+ name: Cosine Precision@10
197
+ - type: cosine_recall@1
198
+ value: 0.9444444444444444
199
+ name: Cosine Recall@1
200
+ - type: cosine_recall@3
201
+ value: 0.97003367003367
202
+ name: Cosine Recall@3
203
+ - type: cosine_recall@5
204
+ value: 0.9764309764309764
205
+ name: Cosine Recall@5
206
+ - type: cosine_recall@10
207
+ value: 0.9824915824915825
208
+ name: Cosine Recall@10
209
+ - type: cosine_ndcg@10
210
+ value: 0.9639446842698776
211
+ name: Cosine Ndcg@10
212
+ - type: cosine_mrr@10
213
+ value: 0.9579490673935119
214
+ name: Cosine Mrr@10
215
+ - type: cosine_map@100
216
+ value: 0.9584482053349265
217
+ name: Cosine Map@100
218
+ - type: cosine_accuracy@1
219
+ value: 0.9437710437710438
220
+ name: Cosine Accuracy@1
221
+ - type: cosine_accuracy@3
222
+ value: 0.967003367003367
223
+ name: Cosine Accuracy@3
224
+ - type: cosine_accuracy@5
225
+ value: 0.9723905723905724
226
+ name: Cosine Accuracy@5
227
+ - type: cosine_accuracy@10
228
+ value: 0.9801346801346801
229
+ name: Cosine Accuracy@10
230
+ - type: cosine_precision@1
231
+ value: 0.9437710437710438
232
+ name: Cosine Precision@1
233
+ - type: cosine_precision@3
234
+ value: 0.322334455667789
235
+ name: Cosine Precision@3
236
+ - type: cosine_precision@5
237
+ value: 0.19447811447811444
238
+ name: Cosine Precision@5
239
+ - type: cosine_precision@10
240
+ value: 0.09801346801346802
241
+ name: Cosine Precision@10
242
+ - type: cosine_recall@1
243
+ value: 0.9437710437710438
244
+ name: Cosine Recall@1
245
+ - type: cosine_recall@3
246
+ value: 0.967003367003367
247
+ name: Cosine Recall@3
248
+ - type: cosine_recall@5
249
+ value: 0.9723905723905724
250
+ name: Cosine Recall@5
251
+ - type: cosine_recall@10
252
+ value: 0.9801346801346801
253
+ name: Cosine Recall@10
254
+ - type: cosine_ndcg@10
255
+ value: 0.9623908732460177
256
+ name: Cosine Ndcg@10
257
+ - type: cosine_mrr@10
258
+ value: 0.9566718775052107
259
+ name: Cosine Mrr@10
260
+ - type: cosine_map@100
261
+ value: 0.9572829070357247
262
+ name: Cosine Map@100
263
+ ---
264
+
265
+ # SentenceTransformer based on dunzhang/stella_en_1.5B_v5
266
+
267
+ This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [dunzhang/stella_en_1.5B_v5](https://huggingface.co/dunzhang/stella_en_1.5B_v5). It maps sentences & paragraphs to a 1024-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.
268
+
269
+ ## Model Details
270
+
271
+ ### Model Description
272
+ - **Model Type:** Sentence Transformer
273
+ - **Base model:** [dunzhang/stella_en_1.5B_v5](https://huggingface.co/dunzhang/stella_en_1.5B_v5) <!-- at revision 129dc50d3ca5f0f5ee0ce8944f65a8553c0f26e0 -->
274
+ - **Maximum Sequence Length:** 8096 tokens
275
+ - **Output Dimensionality:** 1024 dimensions
276
+ - **Similarity Function:** Cosine Similarity
277
+ <!-- - **Training Dataset:** Unknown -->
278
+ <!-- - **Language:** Unknown -->
279
+ <!-- - **License:** Unknown -->
280
+
281
+ ### Model Sources
282
+
283
+ - **Documentation:** [Sentence Transformers Documentation](https://sbert.net)
284
+ - **Repository:** [Sentence Transformers on GitHub](https://github.com/UKPLab/sentence-transformers)
285
+ - **Hugging Face:** [Sentence Transformers on Hugging Face](https://huggingface.co/models?library=sentence-transformers)
286
+
287
+ ### Full Model Architecture
288
+
289
+ ```
290
+ SentenceTransformer(
291
+ (0): Transformer({'max_seq_length': 8096, 'do_lower_case': False}) with Transformer model: Qwen2Model
292
+ (1): Pooling({'word_embedding_dimension': 1536, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
293
+ (2): Dense({'in_features': 1536, 'out_features': 1024, 'bias': True, 'activation_function': 'torch.nn.modules.linear.Identity'})
294
+ )
295
+ ```
296
+
297
+ ## Usage
298
+
299
+ ### Direct Usage (Sentence Transformers)
300
+
301
+ First install the Sentence Transformers library:
302
+
303
+ ```bash
304
+ pip install -U sentence-transformers
305
+ ```
306
+
307
+ Then you can load this model and run inference.
308
+ ```python
309
+ from sentence_transformers import SentenceTransformer
310
+
311
+ # Download from the 🤗 Hub
312
+ model = SentenceTransformer("sentence_transformers_model_id")
313
+ # Run inference
314
+ sentences = [
315
+ 'The Tchaikovsky Symphony Orchestra is a Russian classical music orchestra established in 1930. It was founded as the Moscow Radio Symphony Orchestra, and served as the official symphony for the Soviet All-Union Radio network. Following the dissolution of the Soviet Union in 1991, the orchestra was renamed in 1993 by the Russian Ministry of Culture in recognition of the central role the music of Tchaikovsky plays in its repertoire. The current music director is Vladimir Fedoseyev, who has been in that position since 1974.',
316
+ 'Instruct: Given a web search query, retrieve relevant passages that answer the query.\nQuery: Tchaikovsky Symphony Orchestra',
317
+ 'Instruct: Given a web search query, retrieve relevant passages that answer the query.\nQuery: Sierra del Lacandón',
318
+ ]
319
+ embeddings = model.encode(sentences)
320
+ print(embeddings.shape)
321
+ # [3, 1024]
322
+
323
+ # Get the similarity scores for the embeddings
324
+ similarities = model.similarity(embeddings, embeddings)
325
+ print(similarities.shape)
326
+ # [3, 3]
327
+ ```
328
+
329
+ <!--
330
+ ### Direct Usage (Transformers)
331
+
332
+ <details><summary>Click to see the direct usage in Transformers</summary>
333
+
334
+ </details>
335
+ -->
336
+
337
+ <!--
338
+ ### Downstream Usage (Sentence Transformers)
339
+
340
+ You can finetune this model on your own dataset.
341
+
342
+ <details><summary>Click to expand</summary>
343
+
344
+ </details>
345
+ -->
346
+
347
+ <!--
348
+ ### Out-of-Scope Use
349
+
350
+ *List how the model may foreseeably be misused and address what users ought not to do with the model.*
351
+ -->
352
+
353
+ ## Evaluation
354
+
355
+ ### Metrics
356
+
357
+ #### Information Retrieval
358
+
359
+ * Evaluated with [<code>InformationRetrievalEvaluator</code>](https://sbert.net/docs/package_reference/sentence_transformer/evaluation.html#sentence_transformers.evaluation.InformationRetrievalEvaluator)
360
+
361
+ | Metric | Value |
362
+ |:--------------------|:-----------|
363
+ | cosine_accuracy@1 | 0.9448 |
364
+ | cosine_accuracy@3 | 0.9687 |
365
+ | cosine_accuracy@5 | 0.9764 |
366
+ | cosine_accuracy@10 | 0.9811 |
367
+ | cosine_precision@1 | 0.9448 |
368
+ | cosine_precision@3 | 0.3229 |
369
+ | cosine_precision@5 | 0.1953 |
370
+ | cosine_precision@10 | 0.0981 |
371
+ | cosine_recall@1 | 0.9448 |
372
+ | cosine_recall@3 | 0.9687 |
373
+ | cosine_recall@5 | 0.9764 |
374
+ | cosine_recall@10 | 0.9811 |
375
+ | cosine_ndcg@10 | 0.9637 |
376
+ | cosine_mrr@10 | 0.958 |
377
+ | **cosine_map@100** | **0.9586** |
378
+
379
+ #### Information Retrieval
380
+
381
+ * Evaluated with [<code>InformationRetrievalEvaluator</code>](https://sbert.net/docs/package_reference/sentence_transformer/evaluation.html#sentence_transformers.evaluation.InformationRetrievalEvaluator)
382
+
383
+ | Metric | Value |
384
+ |:--------------------|:-----------|
385
+ | cosine_accuracy@1 | 0.9444 |
386
+ | cosine_accuracy@3 | 0.97 |
387
+ | cosine_accuracy@5 | 0.9764 |
388
+ | cosine_accuracy@10 | 0.9825 |
389
+ | cosine_precision@1 | 0.9444 |
390
+ | cosine_precision@3 | 0.3233 |
391
+ | cosine_precision@5 | 0.1953 |
392
+ | cosine_precision@10 | 0.0982 |
393
+ | cosine_recall@1 | 0.9444 |
394
+ | cosine_recall@3 | 0.97 |
395
+ | cosine_recall@5 | 0.9764 |
396
+ | cosine_recall@10 | 0.9825 |
397
+ | cosine_ndcg@10 | 0.9639 |
398
+ | cosine_mrr@10 | 0.9579 |
399
+ | **cosine_map@100** | **0.9584** |
400
+
401
+ #### Information Retrieval
402
+
403
+ * Evaluated with [<code>InformationRetrievalEvaluator</code>](https://sbert.net/docs/package_reference/sentence_transformer/evaluation.html#sentence_transformers.evaluation.InformationRetrievalEvaluator)
404
+
405
+ | Metric | Value |
406
+ |:--------------------|:-----------|
407
+ | cosine_accuracy@1 | 0.9438 |
408
+ | cosine_accuracy@3 | 0.967 |
409
+ | cosine_accuracy@5 | 0.9724 |
410
+ | cosine_accuracy@10 | 0.9801 |
411
+ | cosine_precision@1 | 0.9438 |
412
+ | cosine_precision@3 | 0.3223 |
413
+ | cosine_precision@5 | 0.1945 |
414
+ | cosine_precision@10 | 0.098 |
415
+ | cosine_recall@1 | 0.9438 |
416
+ | cosine_recall@3 | 0.967 |
417
+ | cosine_recall@5 | 0.9724 |
418
+ | cosine_recall@10 | 0.9801 |
419
+ | cosine_ndcg@10 | 0.9624 |
420
+ | cosine_mrr@10 | 0.9567 |
421
+ | **cosine_map@100** | **0.9573** |
422
+
423
+ <!--
424
+ ## Bias, Risks and Limitations
425
+
426
+ *What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
427
+ -->
428
+
429
+ <!--
430
+ ### Recommendations
431
+
432
+ *What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
433
+ -->
434
+
435
+ ## Training Details
436
+
437
+ ### Training Hyperparameters
438
+ #### Non-Default Hyperparameters
439
+
440
+ - `eval_strategy`: steps
441
+ - `per_device_eval_batch_size`: 4
442
+ - `gradient_accumulation_steps`: 4
443
+ - `learning_rate`: 2e-05
444
+ - `max_steps`: 1500
445
+ - `lr_scheduler_type`: cosine
446
+ - `warmup_ratio`: 0.1
447
+ - `warmup_steps`: 5
448
+ - `bf16`: True
449
+ - `tf32`: True
450
+ - `optim`: adamw_torch_fused
451
+ - `gradient_checkpointing`: True
452
+ - `gradient_checkpointing_kwargs`: {'use_reentrant': False}
453
+ - `batch_sampler`: no_duplicates
454
+
455
+ #### All Hyperparameters
456
+ <details><summary>Click to expand</summary>
457
+
458
+ - `overwrite_output_dir`: False
459
+ - `do_predict`: False
460
+ - `eval_strategy`: steps
461
+ - `prediction_loss_only`: True
462
+ - `per_device_train_batch_size`: 8
463
+ - `per_device_eval_batch_size`: 4
464
+ - `per_gpu_train_batch_size`: None
465
+ - `per_gpu_eval_batch_size`: None
466
+ - `gradient_accumulation_steps`: 4
467
+ - `eval_accumulation_steps`: None
468
+ - `learning_rate`: 2e-05
469
+ - `weight_decay`: 0.0
470
+ - `adam_beta1`: 0.9
471
+ - `adam_beta2`: 0.999
472
+ - `adam_epsilon`: 1e-08
473
+ - `max_grad_norm`: 1.0
474
+ - `num_train_epochs`: 3.0
475
+ - `max_steps`: 1500
476
+ - `lr_scheduler_type`: cosine
477
+ - `lr_scheduler_kwargs`: {}
478
+ - `warmup_ratio`: 0.1
479
+ - `warmup_steps`: 5
480
+ - `log_level`: passive
481
+ - `log_level_replica`: warning
482
+ - `log_on_each_node`: True
483
+ - `logging_nan_inf_filter`: True
484
+ - `save_safetensors`: True
485
+ - `save_on_each_node`: False
486
+ - `save_only_model`: False
487
+ - `restore_callback_states_from_checkpoint`: False
488
+ - `no_cuda`: False
489
+ - `use_cpu`: False
490
+ - `use_mps_device`: False
491
+ - `seed`: 42
492
+ - `data_seed`: None
493
+ - `jit_mode_eval`: False
494
+ - `use_ipex`: False
495
+ - `bf16`: True
496
+ - `fp16`: False
497
+ - `fp16_opt_level`: O1
498
+ - `half_precision_backend`: auto
499
+ - `bf16_full_eval`: False
500
+ - `fp16_full_eval`: False
501
+ - `tf32`: True
502
+ - `local_rank`: 0
503
+ - `ddp_backend`: None
504
+ - `tpu_num_cores`: None
505
+ - `tpu_metrics_debug`: False
506
+ - `debug`: []
507
+ - `dataloader_drop_last`: True
508
+ - `dataloader_num_workers`: 0
509
+ - `dataloader_prefetch_factor`: None
510
+ - `past_index`: -1
511
+ - `disable_tqdm`: False
512
+ - `remove_unused_columns`: True
513
+ - `label_names`: None
514
+ - `load_best_model_at_end`: False
515
+ - `ignore_data_skip`: False
516
+ - `fsdp`: []
517
+ - `fsdp_min_num_params`: 0
518
+ - `fsdp_config`: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
519
+ - `fsdp_transformer_layer_cls_to_wrap`: None
520
+ - `accelerator_config`: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
521
+ - `deepspeed`: None
522
+ - `label_smoothing_factor`: 0.0
523
+ - `optim`: adamw_torch_fused
524
+ - `optim_args`: None
525
+ - `adafactor`: False
526
+ - `group_by_length`: False
527
+ - `length_column_name`: length
528
+ - `ddp_find_unused_parameters`: None
529
+ - `ddp_bucket_cap_mb`: None
530
+ - `ddp_broadcast_buffers`: False
531
+ - `dataloader_pin_memory`: True
532
+ - `dataloader_persistent_workers`: False
533
+ - `skip_memory_metrics`: True
534
+ - `use_legacy_prediction_loop`: False
535
+ - `push_to_hub`: False
536
+ - `resume_from_checkpoint`: None
537
+ - `hub_model_id`: None
538
+ - `hub_strategy`: every_save
539
+ - `hub_private_repo`: False
540
+ - `hub_always_push`: False
541
+ - `gradient_checkpointing`: True
542
+ - `gradient_checkpointing_kwargs`: {'use_reentrant': False}
543
+ - `include_inputs_for_metrics`: False
544
+ - `eval_do_concat_batches`: True
545
+ - `fp16_backend`: auto
546
+ - `push_to_hub_model_id`: None
547
+ - `push_to_hub_organization`: None
548
+ - `mp_parameters`:
549
+ - `auto_find_batch_size`: False
550
+ - `full_determinism`: False
551
+ - `torchdynamo`: None
552
+ - `ray_scope`: last
553
+ - `ddp_timeout`: 1800
554
+ - `torch_compile`: False
555
+ - `torch_compile_backend`: None
556
+ - `torch_compile_mode`: None
557
+ - `dispatch_batches`: None
558
+ - `split_batches`: None
559
+ - `include_tokens_per_second`: False
560
+ - `include_num_input_tokens_seen`: False
561
+ - `neftune_noise_alpha`: None
562
+ - `optim_target_modules`: None
563
+ - `batch_eval_metrics`: False
564
+ - `batch_sampler`: no_duplicates
565
+ - `multi_dataset_batch_sampler`: proportional
566
+
567
+ </details>
568
+
569
+ ### Training Logs
570
+ | Epoch | Step | Training Loss | loss | cosine_map@100 |
571
+ |:------:|:----:|:-------------:|:------:|:--------------:|
572
+ | 0.0185 | 100 | 0.4835 | 0.0751 | 0.9138 |
573
+ | 0.0369 | 200 | 0.0646 | 0.0590 | 0.9384 |
574
+ | 0.0554 | 300 | 0.0594 | 0.0519 | 0.9462 |
575
+ | 0.0739 | 400 | 0.0471 | 0.0483 | 0.9514 |
576
+ | 0.0924 | 500 | 0.0524 | 0.0455 | 0.9531 |
577
+ | 0.1108 | 600 | 0.0435 | 0.0397 | 0.9546 |
578
+ | 0.1293 | 700 | 0.0336 | 0.0394 | 0.9549 |
579
+ | 0.1478 | 800 | 0.0344 | 0.0374 | 0.9565 |
580
+ | 0.1662 | 900 | 0.0393 | 0.0361 | 0.9568 |
581
+ | 0.1847 | 1000 | 0.0451 | 0.0361 | 0.9578 |
582
+ | 0.2032 | 1100 | 0.0278 | 0.0358 | 0.9568 |
583
+ | 0.2216 | 1200 | 0.0332 | 0.0356 | 0.9572 |
584
+ | 0.2401 | 1300 | 0.0317 | 0.0354 | 0.9575 |
585
+ | 0.2586 | 1400 | 0.026 | 0.0355 | 0.9574 |
586
+ | 0.2771 | 1500 | 0.0442 | 0.0355 | 0.9573 |
587
+
588
+
589
+ ### Framework Versions
590
+ - Python: 3.10.12
591
+ - Sentence Transformers: 3.0.1
592
+ - Transformers: 4.41.2
593
+ - PyTorch: 2.2.0+cu121
594
+ - Accelerate: 0.33.0
595
+ - Datasets: 2.20.0
596
+ - Tokenizers: 0.19.1
597
+
598
+ ## Citation
599
+
600
+ ### BibTeX
601
+
602
+ #### Sentence Transformers
603
+ ```bibtex
604
+ @inproceedings{reimers-2019-sentence-bert,
605
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
606
+ author = "Reimers, Nils and Gurevych, Iryna",
607
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
608
+ month = "11",
609
+ year = "2019",
610
+ publisher = "Association for Computational Linguistics",
611
+ url = "https://arxiv.org/abs/1908.10084",
612
+ }
613
+ ```
614
+
615
+ #### MatryoshkaLoss
616
+ ```bibtex
617
+ @misc{kusupati2024matryoshka,
618
+ title={Matryoshka Representation Learning},
619
+ author={Aditya Kusupati and Gantavya Bhatt and Aniket Rege and Matthew Wallingford and Aditya Sinha and Vivek Ramanujan and William Howard-Snyder and Kaifeng Chen and Sham Kakade and Prateek Jain and Ali Farhadi},
620
+ year={2024},
621
+ eprint={2205.13147},
622
+ archivePrefix={arXiv},
623
+ primaryClass={cs.LG}
624
+ }
625
+ ```
626
+
627
+ #### MultipleNegativesRankingLoss
628
+ ```bibtex
629
+ @misc{henderson2017efficient,
630
+ title={Efficient Natural Language Response Suggestion for Smart Reply},
631
+ author={Matthew Henderson and Rami Al-Rfou and Brian Strope and Yun-hsuan Sung and Laszlo Lukacs and Ruiqi Guo and Sanjiv Kumar and Balint Miklos and Ray Kurzweil},
632
+ year={2017},
633
+ eprint={1705.00652},
634
+ archivePrefix={arXiv},
635
+ primaryClass={cs.CL}
636
+ }
637
+ ```
638
+
639
+ <!--
640
+ ## Glossary
641
+
642
+ *Clearly define terms in order to be accessible across audiences.*
643
+ -->
644
+
645
+ <!--
646
+ ## Model Card Authors
647
+
648
+ *Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.*
649
+ -->
650
+
651
+ <!--
652
+ ## Model Card Contact
653
+
654
+ *Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.*
655
+ -->
added_tokens.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "<|endoftext|>": 151643,
3
+ "<|im_end|>": 151645,
4
+ "<|im_start|>": 151644
5
+ }
config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "dunzhang/stella_en_1.5B_v5",
3
+ "architectures": [
4
+ "Qwen2Model"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "auto_map": {
8
+ "AutoModel": "dunzhang/stella_en_1.5B_v5--modeling_qwen.Qwen2Model",
9
+ "AutoModelForCausalLM": "dunzhang/stella_en_1.5B_v5--modeling_qwen.Qwen2ForCausalLM",
10
+ "AutoModelForSequenceClassification": "dunzhang/stella_en_1.5B_v5--modeling_qwen.Qwen2ForSequenceClassification"
11
+ },
12
+ "bos_token_id": 151643,
13
+ "eos_token_id": 151643,
14
+ "hidden_act": "silu",
15
+ "hidden_size": 1536,
16
+ "initializer_range": 0.02,
17
+ "intermediate_size": 8960,
18
+ "max_position_embeddings": 131072,
19
+ "max_window_layers": 21,
20
+ "model_type": "qwen2",
21
+ "num_attention_heads": 12,
22
+ "num_hidden_layers": 28,
23
+ "num_key_value_heads": 2,
24
+ "rms_norm_eps": 1e-06,
25
+ "rope_theta": 1000000.0,
26
+ "sliding_window": 131072,
27
+ "tie_word_embeddings": false,
28
+ "torch_dtype": "bfloat16",
29
+ "transformers_version": "4.41.2",
30
+ "use_cache": true,
31
+ "use_sliding_window": false,
32
+ "vocab_size": 151646
33
+ }
config_sentence_transformers.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "__version__": {
3
+ "sentence_transformers": "3.0.1",
4
+ "transformers": "4.41.2",
5
+ "pytorch": "2.2.0+cu121"
6
+ },
7
+ "prompts": {
8
+ "s2p_query": "Instruct: Given a web search query, retrieve relevant passages that answer the query.\nQuery: ",
9
+ "s2s_query": "Instruct: Retrieve semantically similar text.\nQuery: "
10
+ },
11
+ "default_prompt_name": null,
12
+ "similarity_fn_name": "cosine"
13
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a52192f0852cc0691c2ef24b099c6efe7cd7592d62268bfc93f3547ee328096
3
+ size 3086574240
modules.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "sentence_transformers.models.Transformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1",
11
+ "path": "1_Pooling",
12
+ "type": "sentence_transformers.models.Pooling"
13
+ },
14
+ {
15
+ "idx": 2,
16
+ "name": "2",
17
+ "path": "2_Dense",
18
+ "type": "sentence_transformers.models.Dense"
19
+ }
20
+ ]
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e6b9b073e7a6423f9839842837728552088566f99d11150e0c91b87923b6aa5
3
+ size 6185963010
rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f736aa6e4e965dceac6ed30e7eca2a0d3949aa430fda151600400f98440a428d
3
+ size 14960
rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3596920ed3585caad941f1fc8754fd023cdf76a121d76d9cd4c99e537e37c8e5
3
+ size 14960
rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2881081ac69f7dc403f8ea7d3e928119b5e3e1ab5a23741db37ffd676eef240f
3
+ size 14960
rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ded76f169c057a604c65ee15c3000965b11ee4b535dc49dabaf76581b4bf7969
3
+ size 14960
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aeef384dca819c9b44bc575058a43da9e90e28ca7bdc6079aaf024944abda117
3
+ size 1064
sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "max_seq_length": 8096,
3
+ "do_lower_case": false
4
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>"
5
+ ],
6
+ "eos_token": {
7
+ "content": "<|endoftext|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false
12
+ },
13
+ "pad_token": {
14
+ "content": "<|endoftext|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false
19
+ }
20
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_eos_token": true,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ }
29
+ },
30
+ "additional_special_tokens": [
31
+ "<|im_start|>",
32
+ "<|im_end|>"
33
+ ],
34
+ "auto_map": {
35
+ "AutoTokenizer": [
36
+ "dunzhang/stella_en_1.5B_v5--tokenization_qwen.Qwen2Tokenizer",
37
+ "dunzhang/stella_en_1.5B_v5--tokenization_qwen.Qwen2TokenizerFast"
38
+ ]
39
+ },
40
+ "bos_token": null,
41
+ "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
42
+ "clean_up_tokenization_spaces": false,
43
+ "eos_token": "<|endoftext|>",
44
+ "errors": "replace",
45
+ "model_max_length": 512,
46
+ "pad_token": "<|endoftext|>",
47
+ "split_special_tokens": false,
48
+ "tokenizer_class": "Qwen2Tokenizer",
49
+ "unk_token": null
50
+ }
trainer_state.json ADDED
@@ -0,0 +1,498 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.27705947543405984,
5
+ "eval_steps": 100,
6
+ "global_step": 1500,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.01847063169560399,
13
+ "grad_norm": 17.696088790893555,
14
+ "learning_rate": 1.980139427847242e-05,
15
+ "loss": 0.4835,
16
+ "step": 100
17
+ },
18
+ {
19
+ "epoch": 0.01847063169560399,
20
+ "eval_cosine_accuracy@1": 0.8868686868686869,
21
+ "eval_cosine_accuracy@10": 0.9579124579124579,
22
+ "eval_cosine_accuracy@3": 0.9343434343434344,
23
+ "eval_cosine_accuracy@5": 0.9454545454545454,
24
+ "eval_cosine_map@100": 0.9137674463447905,
25
+ "eval_cosine_mrr@10": 0.9124690021912245,
26
+ "eval_cosine_ndcg@10": 0.9236137131767355,
27
+ "eval_cosine_precision@1": 0.8868686868686869,
28
+ "eval_cosine_precision@10": 0.09579124579124577,
29
+ "eval_cosine_precision@3": 0.3114478114478115,
30
+ "eval_cosine_precision@5": 0.18909090909090906,
31
+ "eval_cosine_recall@1": 0.8868686868686869,
32
+ "eval_cosine_recall@10": 0.9579124579124579,
33
+ "eval_cosine_recall@3": 0.9343434343434344,
34
+ "eval_cosine_recall@5": 0.9454545454545454,
35
+ "eval_loss": 0.07506837695837021,
36
+ "eval_runtime": 49.7303,
37
+ "eval_samples_per_second": 140.759,
38
+ "eval_sequential_score": 0.9137674463447905,
39
+ "eval_steps_per_second": 8.808,
40
+ "step": 100
41
+ },
42
+ {
43
+ "epoch": 0.03694126339120798,
44
+ "grad_norm": 2.3271963596343994,
45
+ "learning_rate": 1.917211301505453e-05,
46
+ "loss": 0.0646,
47
+ "step": 200
48
+ },
49
+ {
50
+ "epoch": 0.03694126339120798,
51
+ "eval_cosine_accuracy@1": 0.9195286195286195,
52
+ "eval_cosine_accuracy@10": 0.967003367003367,
53
+ "eval_cosine_accuracy@3": 0.9518518518518518,
54
+ "eval_cosine_accuracy@5": 0.9612794612794613,
55
+ "eval_cosine_map@100": 0.9383862806127067,
56
+ "eval_cosine_mrr@10": 0.937248810859922,
57
+ "eval_cosine_ndcg@10": 0.9445895366693552,
58
+ "eval_cosine_precision@1": 0.9195286195286195,
59
+ "eval_cosine_precision@10": 0.0967003367003367,
60
+ "eval_cosine_precision@3": 0.317283950617284,
61
+ "eval_cosine_precision@5": 0.19225589225589226,
62
+ "eval_cosine_recall@1": 0.9195286195286195,
63
+ "eval_cosine_recall@10": 0.967003367003367,
64
+ "eval_cosine_recall@3": 0.9518518518518518,
65
+ "eval_cosine_recall@5": 0.9612794612794613,
66
+ "eval_loss": 0.05896875262260437,
67
+ "eval_runtime": 51.3489,
68
+ "eval_samples_per_second": 136.322,
69
+ "eval_sequential_score": 0.9383862806127067,
70
+ "eval_steps_per_second": 8.53,
71
+ "step": 200
72
+ },
73
+ {
74
+ "epoch": 0.05541189508681197,
75
+ "grad_norm": 3.7376925945281982,
76
+ "learning_rate": 1.8139290433532415e-05,
77
+ "loss": 0.0594,
78
+ "step": 300
79
+ },
80
+ {
81
+ "epoch": 0.05541189508681197,
82
+ "eval_cosine_accuracy@1": 0.9303030303030303,
83
+ "eval_cosine_accuracy@10": 0.9737373737373738,
84
+ "eval_cosine_accuracy@3": 0.9579124579124579,
85
+ "eval_cosine_accuracy@5": 0.9656565656565657,
86
+ "eval_cosine_map@100": 0.9462389210939364,
87
+ "eval_cosine_mrr@10": 0.9454570840681954,
88
+ "eval_cosine_ndcg@10": 0.9523521034308455,
89
+ "eval_cosine_precision@1": 0.9303030303030303,
90
+ "eval_cosine_precision@10": 0.09737373737373735,
91
+ "eval_cosine_precision@3": 0.31930415263748596,
92
+ "eval_cosine_precision@5": 0.1931313131313131,
93
+ "eval_cosine_recall@1": 0.9303030303030303,
94
+ "eval_cosine_recall@10": 0.9737373737373738,
95
+ "eval_cosine_recall@3": 0.9579124579124579,
96
+ "eval_cosine_recall@5": 0.9656565656565657,
97
+ "eval_loss": 0.051894593983888626,
98
+ "eval_runtime": 49.496,
99
+ "eval_samples_per_second": 141.426,
100
+ "eval_sequential_score": 0.9462389210939364,
101
+ "eval_steps_per_second": 8.849,
102
+ "step": 300
103
+ },
104
+ {
105
+ "epoch": 0.07388252678241596,
106
+ "grad_norm": 0.3877984583377838,
107
+ "learning_rate": 1.6748367163042577e-05,
108
+ "loss": 0.0471,
109
+ "step": 400
110
+ },
111
+ {
112
+ "epoch": 0.07388252678241596,
113
+ "eval_cosine_accuracy@1": 0.9367003367003367,
114
+ "eval_cosine_accuracy@10": 0.9750841750841751,
115
+ "eval_cosine_accuracy@3": 0.9612794612794613,
116
+ "eval_cosine_accuracy@5": 0.969023569023569,
117
+ "eval_cosine_map@100": 0.9513700816079773,
118
+ "eval_cosine_mrr@10": 0.9505351130351131,
119
+ "eval_cosine_ndcg@10": 0.9565510675566292,
120
+ "eval_cosine_precision@1": 0.9367003367003367,
121
+ "eval_cosine_precision@10": 0.09750841750841752,
122
+ "eval_cosine_precision@3": 0.3204264870931538,
123
+ "eval_cosine_precision@5": 0.19380471380471379,
124
+ "eval_cosine_recall@1": 0.9367003367003367,
125
+ "eval_cosine_recall@10": 0.9750841750841751,
126
+ "eval_cosine_recall@3": 0.9612794612794613,
127
+ "eval_cosine_recall@5": 0.969023569023569,
128
+ "eval_loss": 0.04832224175333977,
129
+ "eval_runtime": 49.2695,
130
+ "eval_samples_per_second": 142.076,
131
+ "eval_sequential_score": 0.9513700816079773,
132
+ "eval_steps_per_second": 8.89,
133
+ "step": 400
134
+ },
135
+ {
136
+ "epoch": 0.09235315847801995,
137
+ "grad_norm": 0.9424126744270325,
138
+ "learning_rate": 1.5060539027168317e-05,
139
+ "loss": 0.0524,
140
+ "step": 500
141
+ },
142
+ {
143
+ "epoch": 0.09235315847801995,
144
+ "eval_cosine_accuracy@1": 0.9387205387205387,
145
+ "eval_cosine_accuracy@10": 0.9787878787878788,
146
+ "eval_cosine_accuracy@3": 0.9622895622895623,
147
+ "eval_cosine_accuracy@5": 0.9703703703703703,
148
+ "eval_cosine_map@100": 0.9530933506069861,
149
+ "eval_cosine_mrr@10": 0.9525124258457593,
150
+ "eval_cosine_ndcg@10": 0.9588799906525647,
151
+ "eval_cosine_precision@1": 0.9387205387205387,
152
+ "eval_cosine_precision@10": 0.09787878787878787,
153
+ "eval_cosine_precision@3": 0.3207631874298541,
154
+ "eval_cosine_precision@5": 0.19407407407407404,
155
+ "eval_cosine_recall@1": 0.9387205387205387,
156
+ "eval_cosine_recall@10": 0.9787878787878788,
157
+ "eval_cosine_recall@3": 0.9622895622895623,
158
+ "eval_cosine_recall@5": 0.9703703703703703,
159
+ "eval_loss": 0.04548173025250435,
160
+ "eval_runtime": 49.8784,
161
+ "eval_samples_per_second": 140.341,
162
+ "eval_sequential_score": 0.9530933506069861,
163
+ "eval_steps_per_second": 8.781,
164
+ "step": 500
165
+ },
166
+ {
167
+ "epoch": 0.11082379017362394,
168
+ "grad_norm": 1.4559208154678345,
169
+ "learning_rate": 1.315006463889948e-05,
170
+ "loss": 0.0435,
171
+ "step": 600
172
+ },
173
+ {
174
+ "epoch": 0.11082379017362394,
175
+ "eval_cosine_accuracy@1": 0.94006734006734,
176
+ "eval_cosine_accuracy@10": 0.9784511784511785,
177
+ "eval_cosine_accuracy@3": 0.965993265993266,
178
+ "eval_cosine_accuracy@5": 0.9713804713804713,
179
+ "eval_cosine_map@100": 0.9545577685553948,
180
+ "eval_cosine_mrr@10": 0.953849339960451,
181
+ "eval_cosine_ndcg@10": 0.9598872198067653,
182
+ "eval_cosine_precision@1": 0.94006734006734,
183
+ "eval_cosine_precision@10": 0.09784511784511785,
184
+ "eval_cosine_precision@3": 0.32199775533108865,
185
+ "eval_cosine_precision@5": 0.19427609427609427,
186
+ "eval_cosine_recall@1": 0.94006734006734,
187
+ "eval_cosine_recall@10": 0.9784511784511785,
188
+ "eval_cosine_recall@3": 0.965993265993266,
189
+ "eval_cosine_recall@5": 0.9713804713804713,
190
+ "eval_loss": 0.03971650078892708,
191
+ "eval_runtime": 50.0185,
192
+ "eval_samples_per_second": 139.948,
193
+ "eval_sequential_score": 0.9545577685553948,
194
+ "eval_steps_per_second": 8.757,
195
+ "step": 600
196
+ },
197
+ {
198
+ "epoch": 0.12929442186922793,
199
+ "grad_norm": 4.4629669189453125,
200
+ "learning_rate": 1.1100998277940316e-05,
201
+ "loss": 0.0336,
202
+ "step": 700
203
+ },
204
+ {
205
+ "epoch": 0.12929442186922793,
206
+ "eval_cosine_accuracy@1": 0.9404040404040404,
207
+ "eval_cosine_accuracy@10": 0.9804713804713805,
208
+ "eval_cosine_accuracy@3": 0.9646464646464646,
209
+ "eval_cosine_accuracy@5": 0.9717171717171718,
210
+ "eval_cosine_map@100": 0.9549470882860054,
211
+ "eval_cosine_mrr@10": 0.9544308160974829,
212
+ "eval_cosine_ndcg@10": 0.9607541640834583,
213
+ "eval_cosine_precision@1": 0.9404040404040404,
214
+ "eval_cosine_precision@10": 0.09804713804713802,
215
+ "eval_cosine_precision@3": 0.32154882154882153,
216
+ "eval_cosine_precision@5": 0.19434343434343435,
217
+ "eval_cosine_recall@1": 0.9404040404040404,
218
+ "eval_cosine_recall@10": 0.9804713804713805,
219
+ "eval_cosine_recall@3": 0.9646464646464646,
220
+ "eval_cosine_recall@5": 0.9717171717171718,
221
+ "eval_loss": 0.03936752304434776,
222
+ "eval_runtime": 49.3604,
223
+ "eval_samples_per_second": 141.814,
224
+ "eval_sequential_score": 0.9549470882860054,
225
+ "eval_steps_per_second": 8.874,
226
+ "step": 700
227
+ },
228
+ {
229
+ "epoch": 0.1477650535648319,
230
+ "grad_norm": 18.81490135192871,
231
+ "learning_rate": 9.003491792488438e-06,
232
+ "loss": 0.0344,
233
+ "step": 800
234
+ },
235
+ {
236
+ "epoch": 0.1477650535648319,
237
+ "eval_cosine_accuracy@1": 0.9424242424242424,
238
+ "eval_cosine_accuracy@10": 0.9791245791245792,
239
+ "eval_cosine_accuracy@3": 0.9673400673400674,
240
+ "eval_cosine_accuracy@5": 0.9730639730639731,
241
+ "eval_cosine_map@100": 0.9565285707123049,
242
+ "eval_cosine_mrr@10": 0.9558751536529315,
243
+ "eval_cosine_ndcg@10": 0.9615848253233655,
244
+ "eval_cosine_precision@1": 0.9424242424242424,
245
+ "eval_cosine_precision@10": 0.0979124579124579,
246
+ "eval_cosine_precision@3": 0.32244668911335583,
247
+ "eval_cosine_precision@5": 0.19461279461279463,
248
+ "eval_cosine_recall@1": 0.9424242424242424,
249
+ "eval_cosine_recall@10": 0.9791245791245792,
250
+ "eval_cosine_recall@3": 0.9673400673400674,
251
+ "eval_cosine_recall@5": 0.9730639730639731,
252
+ "eval_loss": 0.03738004341721535,
253
+ "eval_runtime": 50.7853,
254
+ "eval_samples_per_second": 137.835,
255
+ "eval_sequential_score": 0.9565285707123049,
256
+ "eval_steps_per_second": 8.625,
257
+ "step": 800
258
+ },
259
+ {
260
+ "epoch": 0.16623568526043592,
261
+ "grad_norm": 40.96598434448242,
262
+ "learning_rate": 6.9498282290438235e-06,
263
+ "loss": 0.0393,
264
+ "step": 900
265
+ },
266
+ {
267
+ "epoch": 0.16623568526043592,
268
+ "eval_cosine_accuracy@1": 0.9430976430976431,
269
+ "eval_cosine_accuracy@10": 0.9801346801346801,
270
+ "eval_cosine_accuracy@3": 0.9666666666666667,
271
+ "eval_cosine_accuracy@5": 0.9723905723905724,
272
+ "eval_cosine_map@100": 0.9567946373289521,
273
+ "eval_cosine_mrr@10": 0.9561974239752019,
274
+ "eval_cosine_ndcg@10": 0.9620315668852821,
275
+ "eval_cosine_precision@1": 0.9430976430976431,
276
+ "eval_cosine_precision@10": 0.098013468013468,
277
+ "eval_cosine_precision@3": 0.32222222222222224,
278
+ "eval_cosine_precision@5": 0.19447811447811444,
279
+ "eval_cosine_recall@1": 0.9430976430976431,
280
+ "eval_cosine_recall@10": 0.9801346801346801,
281
+ "eval_cosine_recall@3": 0.9666666666666667,
282
+ "eval_cosine_recall@5": 0.9723905723905724,
283
+ "eval_loss": 0.036062091588974,
284
+ "eval_runtime": 50.1457,
285
+ "eval_samples_per_second": 139.593,
286
+ "eval_sequential_score": 0.9567946373289521,
287
+ "eval_steps_per_second": 8.735,
288
+ "step": 900
289
+ },
290
+ {
291
+ "epoch": 0.1847063169560399,
292
+ "grad_norm": 7.633666038513184,
293
+ "learning_rate": 5.030361696847706e-06,
294
+ "loss": 0.0451,
295
+ "step": 1000
296
+ },
297
+ {
298
+ "epoch": 0.1847063169560399,
299
+ "eval_cosine_accuracy@1": 0.9447811447811448,
300
+ "eval_cosine_accuracy@10": 0.9804713804713805,
301
+ "eval_cosine_accuracy@3": 0.9673400673400674,
302
+ "eval_cosine_accuracy@5": 0.9720538720538721,
303
+ "eval_cosine_map@100": 0.9577987764578036,
304
+ "eval_cosine_mrr@10": 0.9572219549997326,
305
+ "eval_cosine_ndcg@10": 0.9628692157043424,
306
+ "eval_cosine_precision@1": 0.9447811447811448,
307
+ "eval_cosine_precision@10": 0.09804713804713805,
308
+ "eval_cosine_precision@3": 0.32244668911335583,
309
+ "eval_cosine_precision@5": 0.19441077441077437,
310
+ "eval_cosine_recall@1": 0.9447811447811448,
311
+ "eval_cosine_recall@10": 0.9804713804713805,
312
+ "eval_cosine_recall@3": 0.9673400673400674,
313
+ "eval_cosine_recall@5": 0.9720538720538721,
314
+ "eval_loss": 0.03610473498702049,
315
+ "eval_runtime": 49.6014,
316
+ "eval_samples_per_second": 141.125,
317
+ "eval_sequential_score": 0.9577987764578036,
318
+ "eval_steps_per_second": 8.83,
319
+ "step": 1000
320
+ },
321
+ {
322
+ "epoch": 0.20317694865164387,
323
+ "grad_norm": 4.241499900817871,
324
+ "learning_rate": 3.329542098903674e-06,
325
+ "loss": 0.0278,
326
+ "step": 1100
327
+ },
328
+ {
329
+ "epoch": 0.20317694865164387,
330
+ "eval_cosine_accuracy@1": 0.9430976430976431,
331
+ "eval_cosine_accuracy@10": 0.9804713804713805,
332
+ "eval_cosine_accuracy@3": 0.9666666666666667,
333
+ "eval_cosine_accuracy@5": 0.9717171717171718,
334
+ "eval_cosine_map@100": 0.9568101043018846,
335
+ "eval_cosine_mrr@10": 0.9562193362193362,
336
+ "eval_cosine_ndcg@10": 0.9621111011663721,
337
+ "eval_cosine_precision@1": 0.9430976430976431,
338
+ "eval_cosine_precision@10": 0.09804713804713805,
339
+ "eval_cosine_precision@3": 0.32222222222222224,
340
+ "eval_cosine_precision@5": 0.19434343434343435,
341
+ "eval_cosine_recall@1": 0.9430976430976431,
342
+ "eval_cosine_recall@10": 0.9804713804713805,
343
+ "eval_cosine_recall@3": 0.9666666666666667,
344
+ "eval_cosine_recall@5": 0.9717171717171718,
345
+ "eval_loss": 0.035753391683101654,
346
+ "eval_runtime": 50.7364,
347
+ "eval_samples_per_second": 137.968,
348
+ "eval_sequential_score": 0.9568101043018846,
349
+ "eval_steps_per_second": 8.633,
350
+ "step": 1100
351
+ },
352
+ {
353
+ "epoch": 0.22164758034724788,
354
+ "grad_norm": 10.087759971618652,
355
+ "learning_rate": 1.9221996276968523e-06,
356
+ "loss": 0.0332,
357
+ "step": 1200
358
+ },
359
+ {
360
+ "epoch": 0.22164758034724788,
361
+ "eval_cosine_accuracy@1": 0.9437710437710438,
362
+ "eval_cosine_accuracy@10": 0.9801346801346801,
363
+ "eval_cosine_accuracy@3": 0.9666666666666667,
364
+ "eval_cosine_accuracy@5": 0.9720538720538721,
365
+ "eval_cosine_map@100": 0.9572367070457485,
366
+ "eval_cosine_mrr@10": 0.9566237774571107,
367
+ "eval_cosine_ndcg@10": 0.9623501597917694,
368
+ "eval_cosine_precision@1": 0.9437710437710438,
369
+ "eval_cosine_precision@10": 0.09801346801346802,
370
+ "eval_cosine_precision@3": 0.32222222222222224,
371
+ "eval_cosine_precision@5": 0.19441077441077437,
372
+ "eval_cosine_recall@1": 0.9437710437710438,
373
+ "eval_cosine_recall@10": 0.9801346801346801,
374
+ "eval_cosine_recall@3": 0.9666666666666667,
375
+ "eval_cosine_recall@5": 0.9720538720538721,
376
+ "eval_loss": 0.03558042645454407,
377
+ "eval_runtime": 50.0835,
378
+ "eval_samples_per_second": 139.767,
379
+ "eval_sequential_score": 0.9572367070457485,
380
+ "eval_steps_per_second": 8.745,
381
+ "step": 1200
382
+ },
383
+ {
384
+ "epoch": 0.24011821204285186,
385
+ "grad_norm": 17.43052864074707,
386
+ "learning_rate": 8.702524949765645e-07,
387
+ "loss": 0.0317,
388
+ "step": 1300
389
+ },
390
+ {
391
+ "epoch": 0.24011821204285186,
392
+ "eval_cosine_accuracy@1": 0.9441077441077441,
393
+ "eval_cosine_accuracy@10": 0.9797979797979798,
394
+ "eval_cosine_accuracy@3": 0.9673400673400674,
395
+ "eval_cosine_accuracy@5": 0.9720538720538721,
396
+ "eval_cosine_map@100": 0.9574821374909939,
397
+ "eval_cosine_mrr@10": 0.9568467746245525,
398
+ "eval_cosine_ndcg@10": 0.9624426218971004,
399
+ "eval_cosine_precision@1": 0.9441077441077441,
400
+ "eval_cosine_precision@10": 0.09797979797979799,
401
+ "eval_cosine_precision@3": 0.32244668911335583,
402
+ "eval_cosine_precision@5": 0.19441077441077437,
403
+ "eval_cosine_recall@1": 0.9441077441077441,
404
+ "eval_cosine_recall@10": 0.9797979797979798,
405
+ "eval_cosine_recall@3": 0.9673400673400674,
406
+ "eval_cosine_recall@5": 0.9720538720538721,
407
+ "eval_loss": 0.03541325777769089,
408
+ "eval_runtime": 49.4999,
409
+ "eval_samples_per_second": 141.414,
410
+ "eval_sequential_score": 0.9574821374909939,
411
+ "eval_steps_per_second": 8.848,
412
+ "step": 1300
413
+ },
414
+ {
415
+ "epoch": 0.25858884373845586,
416
+ "grad_norm": 0.8455698490142822,
417
+ "learning_rate": 2.199827441298863e-07,
418
+ "loss": 0.026,
419
+ "step": 1400
420
+ },
421
+ {
422
+ "epoch": 0.25858884373845586,
423
+ "eval_cosine_accuracy@1": 0.9441077441077441,
424
+ "eval_cosine_accuracy@10": 0.9797979797979798,
425
+ "eval_cosine_accuracy@3": 0.9673400673400674,
426
+ "eval_cosine_accuracy@5": 0.9720538720538721,
427
+ "eval_cosine_map@100": 0.9574340839544937,
428
+ "eval_cosine_mrr@10": 0.9567861151194483,
429
+ "eval_cosine_ndcg@10": 0.9624031906320126,
430
+ "eval_cosine_precision@1": 0.9441077441077441,
431
+ "eval_cosine_precision@10": 0.09797979797979799,
432
+ "eval_cosine_precision@3": 0.32244668911335583,
433
+ "eval_cosine_precision@5": 0.19441077441077437,
434
+ "eval_cosine_recall@1": 0.9441077441077441,
435
+ "eval_cosine_recall@10": 0.9797979797979798,
436
+ "eval_cosine_recall@3": 0.9673400673400674,
437
+ "eval_cosine_recall@5": 0.9720538720538721,
438
+ "eval_loss": 0.035498470067977905,
439
+ "eval_runtime": 50.8752,
440
+ "eval_samples_per_second": 137.592,
441
+ "eval_sequential_score": 0.9574340839544937,
442
+ "eval_steps_per_second": 8.609,
443
+ "step": 1400
444
+ },
445
+ {
446
+ "epoch": 0.27705947543405984,
447
+ "grad_norm": 3.3353021144866943,
448
+ "learning_rate": 0.0,
449
+ "loss": 0.0442,
450
+ "step": 1500
451
+ },
452
+ {
453
+ "epoch": 0.27705947543405984,
454
+ "eval_cosine_accuracy@1": 0.9437710437710438,
455
+ "eval_cosine_accuracy@10": 0.9801346801346801,
456
+ "eval_cosine_accuracy@3": 0.967003367003367,
457
+ "eval_cosine_accuracy@5": 0.9723905723905724,
458
+ "eval_cosine_map@100": 0.9572829070357247,
459
+ "eval_cosine_mrr@10": 0.9566718775052107,
460
+ "eval_cosine_ndcg@10": 0.9623908732460177,
461
+ "eval_cosine_precision@1": 0.9437710437710438,
462
+ "eval_cosine_precision@10": 0.09801346801346802,
463
+ "eval_cosine_precision@3": 0.322334455667789,
464
+ "eval_cosine_precision@5": 0.19447811447811444,
465
+ "eval_cosine_recall@1": 0.9437710437710438,
466
+ "eval_cosine_recall@10": 0.9801346801346801,
467
+ "eval_cosine_recall@3": 0.967003367003367,
468
+ "eval_cosine_recall@5": 0.9723905723905724,
469
+ "eval_loss": 0.035526029765605927,
470
+ "eval_runtime": 45.9521,
471
+ "eval_samples_per_second": 152.333,
472
+ "eval_sequential_score": 0.9572829070357247,
473
+ "eval_steps_per_second": 9.532,
474
+ "step": 1500
475
+ }
476
+ ],
477
+ "logging_steps": 100,
478
+ "max_steps": 1500,
479
+ "num_input_tokens_seen": 0,
480
+ "num_train_epochs": 1,
481
+ "save_steps": 500,
482
+ "stateful_callbacks": {
483
+ "TrainerControl": {
484
+ "args": {
485
+ "should_epoch_stop": false,
486
+ "should_evaluate": false,
487
+ "should_log": false,
488
+ "should_save": true,
489
+ "should_training_stop": true
490
+ },
491
+ "attributes": {}
492
+ }
493
+ },
494
+ "total_flos": 0.0,
495
+ "train_batch_size": 8,
496
+ "trial_name": null,
497
+ "trial_params": null
498
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc48324821e670334bde14afaceaab851215544086086930c4482698c24359fd
3
+ size 5368
vocab.json ADDED
The diff for this file is too large to render. See raw diff