Upload folder using huggingface_hub
Browse files
- 1_Pooling/config.json +10 -0
- 2_Dense/config.json +1 -0
- 2_Dense/model.safetensors +3 -0
- README.md +650 -3
- added_tokens.json +5 -0
- config.json +33 -0
- config_sentence_transformers.json +13 -0
- merges.txt +0 -0
- model.safetensors +3 -0
- modules.json +20 -0
- optimizer.pt +3 -0
- rng_state_0.pth +3 -0
- rng_state_1.pth +3 -0
- rng_state_2.pth +3 -0
- rng_state_3.pth +3 -0
- scheduler.pt +3 -0
- sentence_bert_config.json +4 -0
- special_tokens_map.json +20 -0
- tokenizer.json +0 -0
- tokenizer_config.json +50 -0
- trainer_state.json +343 -0
- training_args.bin +3 -0
- vocab.json +0 -0
1_Pooling/config.json
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"word_embedding_dimension": 1536,
|
3 |
+
"pooling_mode_cls_token": false,
|
4 |
+
"pooling_mode_mean_tokens": true,
|
5 |
+
"pooling_mode_max_tokens": false,
|
6 |
+
"pooling_mode_mean_sqrt_len_tokens": false,
|
7 |
+
"pooling_mode_weightedmean_tokens": false,
|
8 |
+
"pooling_mode_lasttoken": false,
|
9 |
+
"include_prompt": true
|
10 |
+
}
|
2_Dense/config.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"in_features": 1536, "out_features": 1024, "bias": true, "activation_function": "torch.nn.modules.linear.Identity"}
|
2_Dense/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a78eda2e316c4d8d4191ea359e5be6768097394c5b2b6676c7913d38887b929e
|
3 |
+
size 6295712
|
README.md
CHANGED
@@ -1,3 +1,650 @@
|
|
1 |
-
---
|
2 |
-
|
3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
base_model: dunzhang/stella_en_1.5B_v5
|
3 |
+
datasets: []
|
4 |
+
language: []
|
5 |
+
library_name: sentence-transformers
|
6 |
+
metrics:
|
7 |
+
- cosine_accuracy@1
|
8 |
+
- cosine_accuracy@3
|
9 |
+
- cosine_accuracy@5
|
10 |
+
- cosine_accuracy@10
|
11 |
+
- cosine_precision@1
|
12 |
+
- cosine_precision@3
|
13 |
+
- cosine_precision@5
|
14 |
+
- cosine_precision@10
|
15 |
+
- cosine_recall@1
|
16 |
+
- cosine_recall@3
|
17 |
+
- cosine_recall@5
|
18 |
+
- cosine_recall@10
|
19 |
+
- cosine_ndcg@10
|
20 |
+
- cosine_mrr@10
|
21 |
+
- cosine_map@100
|
22 |
+
pipeline_tag: sentence-similarity
|
23 |
+
tags:
|
24 |
+
- sentence-transformers
|
25 |
+
- sentence-similarity
|
26 |
+
- feature-extraction
|
27 |
+
- generated_from_trainer
|
28 |
+
- dataset_size:693000
|
29 |
+
- loss:MatryoshkaLoss
|
30 |
+
- loss:MultipleNegativesRankingLoss
|
31 |
+
widget:
|
32 |
+
- source_sentence: Paracrystalline materials are defined as having short and medium
|
33 |
+
range ordering in their lattice (similar to the liquid crystal phases) but lacking
|
34 |
+
crystal-like long-range ordering at least in one direction.
|
35 |
+
sentences:
|
36 |
+
- 'Instruct: Given a web search query, retrieve relevant passages that answer the
|
37 |
+
query.
|
38 |
+
|
39 |
+
Query: Paracrystalline'
|
40 |
+
- 'Instruct: Given a web search query, retrieve relevant passages that answer the
|
41 |
+
query.
|
42 |
+
|
43 |
+
Query: Øystein Dahle'
|
44 |
+
- 'Instruct: Given a web search query, retrieve relevant passages that answer the
|
45 |
+
query.
|
46 |
+
|
47 |
+
Query: Makis Belevonis'
|
48 |
+
- source_sentence: 'Hạ Trạch is a commune ( xã ) and village in Bố Trạch District
|
49 |
+
, Quảng Bình Province , in Vietnam . Category : Populated places in Quang Binh
|
50 |
+
Province Category : Communes of Quang Binh Province'
|
51 |
+
sentences:
|
52 |
+
- 'Instruct: Given a web search query, retrieve relevant passages that answer the
|
53 |
+
query.
|
54 |
+
|
55 |
+
Query: The Taill of how this forsaid Tod maid his Confessioun to Freir Wolf Waitskaith'
|
56 |
+
- 'Instruct: Given a web search query, retrieve relevant passages that answer the
|
57 |
+
query.
|
58 |
+
|
59 |
+
Query: Hạ Trạch'
|
60 |
+
- 'Instruct: Given a web search query, retrieve relevant passages that answer the
|
61 |
+
query.
|
62 |
+
|
63 |
+
Query: Tadaxa'
|
64 |
+
- source_sentence: The Golden Mosque (سنهرى مسجد, Sunehri Masjid) is a mosque in Old
|
65 |
+
Delhi. It is located outside the southwestern corner of Delhi Gate of the Red
|
66 |
+
Fort, opposite the Netaji Subhash Park.
|
67 |
+
sentences:
|
68 |
+
- 'Instruct: Given a web search query, retrieve relevant passages that answer the
|
69 |
+
query.
|
70 |
+
|
71 |
+
Query: Algorithm'
|
72 |
+
- 'Instruct: Given a web search query, retrieve relevant passages that answer the
|
73 |
+
query.
|
74 |
+
|
75 |
+
Query: Golden Mosque (Red Fort)'
|
76 |
+
- 'Instruct: Given a web search query, retrieve relevant passages that answer the
|
77 |
+
query.
|
78 |
+
|
79 |
+
Query: Parnaso Español'
|
80 |
+
- source_sentence: Unibank, S.A. is one of Haiti's two largest private commercial
|
81 |
+
banks. The bank was founded in 1993 by a group of Haitian investors and is the
|
82 |
+
main company of "Groupe Financier National (GFN)". It opened its first office
|
83 |
+
in July 1993 in downtown Port-au-Prince and has 50 branches throughout the country
|
84 |
+
as of the end of 2016.
|
85 |
+
sentences:
|
86 |
+
- 'Instruct: Given a web search query, retrieve relevant passages that answer the
|
87 |
+
query.
|
88 |
+
|
89 |
+
Query: Sky TG24'
|
90 |
+
- 'Instruct: Given a web search query, retrieve relevant passages that answer the
|
91 |
+
query.
|
92 |
+
|
93 |
+
Query: Ghomijeh'
|
94 |
+
- 'Instruct: Given a web search query, retrieve relevant passages that answer the
|
95 |
+
query.
|
96 |
+
|
97 |
+
Query: Unibank (Haiti)'
|
98 |
+
- source_sentence: The Tchaikovsky Symphony Orchestra is a Russian classical music
|
99 |
+
orchestra established in 1930. It was founded as the Moscow Radio Symphony Orchestra,
|
100 |
+
and served as the official symphony for the Soviet All-Union Radio network. Following
|
101 |
+
the dissolution of the Soviet Union in 1991, the orchestra was renamed in 1993
|
102 |
+
by the Russian Ministry of Culture in recognition of the central role the music
|
103 |
+
of Tchaikovsky plays in its repertoire. The current music director is Vladimir
|
104 |
+
Fedoseyev, who has been in that position since 1974.
|
105 |
+
sentences:
|
106 |
+
- 'Instruct: Given a web search query, retrieve relevant passages that answer the
|
107 |
+
query.
|
108 |
+
|
109 |
+
Query: Harald J.W. Mueller-Kirsten'
|
110 |
+
- 'Instruct: Given a web search query, retrieve relevant passages that answer the
|
111 |
+
query.
|
112 |
+
|
113 |
+
Query: Sierra del Lacandón'
|
114 |
+
- 'Instruct: Given a web search query, retrieve relevant passages that answer the
|
115 |
+
query.
|
116 |
+
|
117 |
+
Query: Tchaikovsky Symphony Orchestra'
|
118 |
+
model-index:
|
119 |
+
- name: SentenceTransformer based on dunzhang/stella_en_1.5B_v5
|
120 |
+
results:
|
121 |
+
- task:
|
122 |
+
type: information-retrieval
|
123 |
+
name: Information Retrieval
|
124 |
+
dataset:
|
125 |
+
name: Unknown
|
126 |
+
type: unknown
|
127 |
+
metrics:
|
128 |
+
- type: cosine_accuracy@1
|
129 |
+
value: 0.9457912457912457
|
130 |
+
name: Cosine Accuracy@1
|
131 |
+
- type: cosine_accuracy@3
|
132 |
+
value: 0.9686868686868687
|
133 |
+
name: Cosine Accuracy@3
|
134 |
+
- type: cosine_accuracy@5
|
135 |
+
value: 0.9750841750841751
|
136 |
+
name: Cosine Accuracy@5
|
137 |
+
- type: cosine_accuracy@10
|
138 |
+
value: 0.9818181818181818
|
139 |
+
name: Cosine Accuracy@10
|
140 |
+
- type: cosine_precision@1
|
141 |
+
value: 0.9457912457912457
|
142 |
+
name: Cosine Precision@1
|
143 |
+
- type: cosine_precision@3
|
144 |
+
value: 0.3228956228956229
|
145 |
+
name: Cosine Precision@3
|
146 |
+
- type: cosine_precision@5
|
147 |
+
value: 0.195016835016835
|
148 |
+
name: Cosine Precision@5
|
149 |
+
- type: cosine_precision@10
|
150 |
+
value: 0.09818181818181818
|
151 |
+
name: Cosine Precision@10
|
152 |
+
- type: cosine_recall@1
|
153 |
+
value: 0.9457912457912457
|
154 |
+
name: Cosine Recall@1
|
155 |
+
- type: cosine_recall@3
|
156 |
+
value: 0.9686868686868687
|
157 |
+
name: Cosine Recall@3
|
158 |
+
- type: cosine_recall@5
|
159 |
+
value: 0.9750841750841751
|
160 |
+
name: Cosine Recall@5
|
161 |
+
- type: cosine_recall@10
|
162 |
+
value: 0.9818181818181818
|
163 |
+
name: Cosine Recall@10
|
164 |
+
- type: cosine_ndcg@10
|
165 |
+
value: 0.9641837379281919
|
166 |
+
name: Cosine Ndcg@10
|
167 |
+
- type: cosine_mrr@10
|
168 |
+
value: 0.9584885895997006
|
169 |
+
name: Cosine Mrr@10
|
170 |
+
- type: cosine_map@100
|
171 |
+
value: 0.9590455638710143
|
172 |
+
name: Cosine Map@100
|
173 |
+
- type: cosine_accuracy@1
|
174 |
+
value: 0.9447811447811448
|
175 |
+
name: Cosine Accuracy@1
|
176 |
+
- type: cosine_accuracy@3
|
177 |
+
value: 0.9696969696969697
|
178 |
+
name: Cosine Accuracy@3
|
179 |
+
- type: cosine_accuracy@5
|
180 |
+
value: 0.9754208754208754
|
181 |
+
name: Cosine Accuracy@5
|
182 |
+
- type: cosine_accuracy@10
|
183 |
+
value: 0.9824915824915825
|
184 |
+
name: Cosine Accuracy@10
|
185 |
+
- type: cosine_precision@1
|
186 |
+
value: 0.9447811447811448
|
187 |
+
name: Cosine Precision@1
|
188 |
+
- type: cosine_precision@3
|
189 |
+
value: 0.32323232323232326
|
190 |
+
name: Cosine Precision@3
|
191 |
+
- type: cosine_precision@5
|
192 |
+
value: 0.19508417508417508
|
193 |
+
name: Cosine Precision@5
|
194 |
+
- type: cosine_precision@10
|
195 |
+
value: 0.09824915824915824
|
196 |
+
name: Cosine Precision@10
|
197 |
+
- type: cosine_recall@1
|
198 |
+
value: 0.9447811447811448
|
199 |
+
name: Cosine Recall@1
|
200 |
+
- type: cosine_recall@3
|
201 |
+
value: 0.9696969696969697
|
202 |
+
name: Cosine Recall@3
|
203 |
+
- type: cosine_recall@5
|
204 |
+
value: 0.9754208754208754
|
205 |
+
name: Cosine Recall@5
|
206 |
+
- type: cosine_recall@10
|
207 |
+
value: 0.9824915824915825
|
208 |
+
name: Cosine Recall@10
|
209 |
+
- type: cosine_ndcg@10
|
210 |
+
value: 0.9641053714591453
|
211 |
+
name: Cosine Ndcg@10
|
212 |
+
- type: cosine_mrr@10
|
213 |
+
value: 0.9581715301159749
|
214 |
+
name: Cosine Mrr@10
|
215 |
+
- type: cosine_map@100
|
216 |
+
value: 0.9586773165340671
|
217 |
+
name: Cosine Map@100
|
218 |
+
- type: cosine_accuracy@1
|
219 |
+
value: 0.9447811447811448
|
220 |
+
name: Cosine Accuracy@1
|
221 |
+
- type: cosine_accuracy@3
|
222 |
+
value: 0.9673400673400674
|
223 |
+
name: Cosine Accuracy@3
|
224 |
+
- type: cosine_accuracy@5
|
225 |
+
value: 0.9720538720538721
|
226 |
+
name: Cosine Accuracy@5
|
227 |
+
- type: cosine_accuracy@10
|
228 |
+
value: 0.9804713804713805
|
229 |
+
name: Cosine Accuracy@10
|
230 |
+
- type: cosine_precision@1
|
231 |
+
value: 0.9447811447811448
|
232 |
+
name: Cosine Precision@1
|
233 |
+
- type: cosine_precision@3
|
234 |
+
value: 0.32244668911335583
|
235 |
+
name: Cosine Precision@3
|
236 |
+
- type: cosine_precision@5
|
237 |
+
value: 0.19441077441077437
|
238 |
+
name: Cosine Precision@5
|
239 |
+
- type: cosine_precision@10
|
240 |
+
value: 0.09804713804713805
|
241 |
+
name: Cosine Precision@10
|
242 |
+
- type: cosine_recall@1
|
243 |
+
value: 0.9447811447811448
|
244 |
+
name: Cosine Recall@1
|
245 |
+
- type: cosine_recall@3
|
246 |
+
value: 0.9673400673400674
|
247 |
+
name: Cosine Recall@3
|
248 |
+
- type: cosine_recall@5
|
249 |
+
value: 0.9720538720538721
|
250 |
+
name: Cosine Recall@5
|
251 |
+
- type: cosine_recall@10
|
252 |
+
value: 0.9804713804713805
|
253 |
+
name: Cosine Recall@10
|
254 |
+
- type: cosine_ndcg@10
|
255 |
+
value: 0.9628692157043424
|
256 |
+
name: Cosine Ndcg@10
|
257 |
+
- type: cosine_mrr@10
|
258 |
+
value: 0.9572219549997326
|
259 |
+
name: Cosine Mrr@10
|
260 |
+
- type: cosine_map@100
|
261 |
+
value: 0.9577987764578036
|
262 |
+
name: Cosine Map@100
|
263 |
+
---
|
264 |
+
|
265 |
+
# SentenceTransformer based on dunzhang/stella_en_1.5B_v5
|
266 |
+
|
267 |
+
This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [dunzhang/stella_en_1.5B_v5](https://huggingface.co/dunzhang/stella_en_1.5B_v5). It maps sentences & paragraphs to a 1024-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.
|
268 |
+
|
269 |
+
## Model Details
|
270 |
+
|
271 |
+
### Model Description
|
272 |
+
- **Model Type:** Sentence Transformer
|
273 |
+
- **Base model:** [dunzhang/stella_en_1.5B_v5](https://huggingface.co/dunzhang/stella_en_1.5B_v5) <!-- at revision 129dc50d3ca5f0f5ee0ce8944f65a8553c0f26e0 -->
|
274 |
+
- **Maximum Sequence Length:** 8096 tokens
|
275 |
+
- **Output Dimensionality:** 1024 dimensions
|
276 |
+
- **Similarity Function:** Cosine Similarity
|
277 |
+
<!-- - **Training Dataset:** Unknown -->
|
278 |
+
<!-- - **Language:** Unknown -->
|
279 |
+
<!-- - **License:** Unknown -->
|
280 |
+
|
281 |
+
### Model Sources
|
282 |
+
|
283 |
+
- **Documentation:** [Sentence Transformers Documentation](https://sbert.net)
|
284 |
+
- **Repository:** [Sentence Transformers on GitHub](https://github.com/UKPLab/sentence-transformers)
|
285 |
+
- **Hugging Face:** [Sentence Transformers on Hugging Face](https://huggingface.co/models?library=sentence-transformers)
|
286 |
+
|
287 |
+
### Full Model Architecture
|
288 |
+
|
289 |
+
```
|
290 |
+
SentenceTransformer(
|
291 |
+
(0): Transformer({'max_seq_length': 8096, 'do_lower_case': False}) with Transformer model: Qwen2Model
|
292 |
+
(1): Pooling({'word_embedding_dimension': 1536, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
|
293 |
+
(2): Dense({'in_features': 1536, 'out_features': 1024, 'bias': True, 'activation_function': 'torch.nn.modules.linear.Identity'})
|
294 |
+
)
|
295 |
+
```
|
296 |
+
|
297 |
+
## Usage
|
298 |
+
|
299 |
+
### Direct Usage (Sentence Transformers)
|
300 |
+
|
301 |
+
First install the Sentence Transformers library:
|
302 |
+
|
303 |
+
```bash
|
304 |
+
pip install -U sentence-transformers
|
305 |
+
```
|
306 |
+
|
307 |
+
Then you can load this model and run inference.
|
308 |
+
```python
|
309 |
+
from sentence_transformers import SentenceTransformer
|
310 |
+
|
311 |
+
# Download from the 🤗 Hub
|
312 |
+
model = SentenceTransformer("sentence_transformers_model_id")
|
313 |
+
# Run inference
|
314 |
+
sentences = [
|
315 |
+
'The Tchaikovsky Symphony Orchestra is a Russian classical music orchestra established in 1930. It was founded as the Moscow Radio Symphony Orchestra, and served as the official symphony for the Soviet All-Union Radio network. Following the dissolution of the, Soviet Union in 1991, the orchestra was renamed in 1993 by the Russian Ministry of Culture in recognition of the central role the music of Tchaikovsky plays in its repertoire. The current music director is Vladimir Fedoseyev, who has been in that position since 1974.',
|
316 |
+
'Instruct: Given a web search query, retrieve relevant passages that answer the query.\nQuery: Tchaikovsky Symphony Orchestra',
|
317 |
+
'Instruct: Given a web search query, retrieve relevant passages that answer the query.\nQuery: Sierra del Lacandón',
|
318 |
+
]
|
319 |
+
embeddings = model.encode(sentences)
|
320 |
+
print(embeddings.shape)
|
321 |
+
# [3, 1024]
|
322 |
+
|
323 |
+
# Get the similarity scores for the embeddings
|
324 |
+
similarities = model.similarity(embeddings, embeddings)
|
325 |
+
print(similarities.shape)
|
326 |
+
# [3, 3]
|
327 |
+
```
|
328 |
+
|
329 |
+
<!--
|
330 |
+
### Direct Usage (Transformers)
|
331 |
+
|
332 |
+
<details><summary>Click to see the direct usage in Transformers</summary>
|
333 |
+
|
334 |
+
</details>
|
335 |
+
-->
|
336 |
+
|
337 |
+
<!--
|
338 |
+
### Downstream Usage (Sentence Transformers)
|
339 |
+
|
340 |
+
You can finetune this model on your own dataset.
|
341 |
+
|
342 |
+
<details><summary>Click to expand</summary>
|
343 |
+
|
344 |
+
</details>
|
345 |
+
-->
|
346 |
+
|
347 |
+
<!--
|
348 |
+
### Out-of-Scope Use
|
349 |
+
|
350 |
+
*List how the model may foreseeably be misused and address what users ought not to do with the model.*
|
351 |
+
-->
|
352 |
+
|
353 |
+
## Evaluation
|
354 |
+
|
355 |
+
### Metrics
|
356 |
+
|
357 |
+
#### Information Retrieval
|
358 |
+
|
359 |
+
* Evaluated with [<code>InformationRetrievalEvaluator</code>](https://sbert.net/docs/package_reference/sentence_transformer/evaluation.html#sentence_transformers.evaluation.InformationRetrievalEvaluator)
|
360 |
+
|
361 |
+
| Metric | Value |
|
362 |
+
|:--------------------|:----------|
|
363 |
+
| cosine_accuracy@1 | 0.9458 |
|
364 |
+
| cosine_accuracy@3 | 0.9687 |
|
365 |
+
| cosine_accuracy@5 | 0.9751 |
|
366 |
+
| cosine_accuracy@10 | 0.9818 |
|
367 |
+
| cosine_precision@1 | 0.9458 |
|
368 |
+
| cosine_precision@3 | 0.3229 |
|
369 |
+
| cosine_precision@5 | 0.195 |
|
370 |
+
| cosine_precision@10 | 0.0982 |
|
371 |
+
| cosine_recall@1 | 0.9458 |
|
372 |
+
| cosine_recall@3 | 0.9687 |
|
373 |
+
| cosine_recall@5 | 0.9751 |
|
374 |
+
| cosine_recall@10 | 0.9818 |
|
375 |
+
| cosine_ndcg@10 | 0.9642 |
|
376 |
+
| cosine_mrr@10 | 0.9585 |
|
377 |
+
| **cosine_map@100** | **0.959** |
|
378 |
+
|
379 |
+
#### Information Retrieval
|
380 |
+
|
381 |
+
* Evaluated with [<code>InformationRetrievalEvaluator</code>](https://sbert.net/docs/package_reference/sentence_transformer/evaluation.html#sentence_transformers.evaluation.InformationRetrievalEvaluator)
|
382 |
+
|
383 |
+
| Metric | Value |
|
384 |
+
|:--------------------|:-----------|
|
385 |
+
| cosine_accuracy@1 | 0.9448 |
|
386 |
+
| cosine_accuracy@3 | 0.9697 |
|
387 |
+
| cosine_accuracy@5 | 0.9754 |
|
388 |
+
| cosine_accuracy@10 | 0.9825 |
|
389 |
+
| cosine_precision@1 | 0.9448 |
|
390 |
+
| cosine_precision@3 | 0.3232 |
|
391 |
+
| cosine_precision@5 | 0.1951 |
|
392 |
+
| cosine_precision@10 | 0.0982 |
|
393 |
+
| cosine_recall@1 | 0.9448 |
|
394 |
+
| cosine_recall@3 | 0.9697 |
|
395 |
+
| cosine_recall@5 | 0.9754 |
|
396 |
+
| cosine_recall@10 | 0.9825 |
|
397 |
+
| cosine_ndcg@10 | 0.9641 |
|
398 |
+
| cosine_mrr@10 | 0.9582 |
|
399 |
+
| **cosine_map@100** | **0.9587** |
|
400 |
+
|
401 |
+
#### Information Retrieval
|
402 |
+
|
403 |
+
* Evaluated with [<code>InformationRetrievalEvaluator</code>](https://sbert.net/docs/package_reference/sentence_transformer/evaluation.html#sentence_transformers.evaluation.InformationRetrievalEvaluator)
|
404 |
+
|
405 |
+
| Metric | Value |
|
406 |
+
|:--------------------|:-----------|
|
407 |
+
| cosine_accuracy@1 | 0.9448 |
|
408 |
+
| cosine_accuracy@3 | 0.9673 |
|
409 |
+
| cosine_accuracy@5 | 0.9721 |
|
410 |
+
| cosine_accuracy@10 | 0.9805 |
|
411 |
+
| cosine_precision@1 | 0.9448 |
|
412 |
+
| cosine_precision@3 | 0.3224 |
|
413 |
+
| cosine_precision@5 | 0.1944 |
|
414 |
+
| cosine_precision@10 | 0.098 |
|
415 |
+
| cosine_recall@1 | 0.9448 |
|
416 |
+
| cosine_recall@3 | 0.9673 |
|
417 |
+
| cosine_recall@5 | 0.9721 |
|
418 |
+
| cosine_recall@10 | 0.9805 |
|
419 |
+
| cosine_ndcg@10 | 0.9629 |
|
420 |
+
| cosine_mrr@10 | 0.9572 |
|
421 |
+
| **cosine_map@100** | **0.9578** |
|
422 |
+
|
423 |
+
<!--
|
424 |
+
## Bias, Risks and Limitations
|
425 |
+
|
426 |
+
*What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
|
427 |
+
-->
|
428 |
+
|
429 |
+
<!--
|
430 |
+
### Recommendations
|
431 |
+
|
432 |
+
*What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
|
433 |
+
-->
|
434 |
+
|
435 |
+
## Training Details
|
436 |
+
|
437 |
+
### Training Hyperparameters
|
438 |
+
#### Non-Default Hyperparameters
|
439 |
+
|
440 |
+
- `eval_strategy`: steps
|
441 |
+
- `per_device_eval_batch_size`: 4
|
442 |
+
- `gradient_accumulation_steps`: 4
|
443 |
+
- `learning_rate`: 2e-05
|
444 |
+
- `max_steps`: 1500
|
445 |
+
- `lr_scheduler_type`: cosine
|
446 |
+
- `warmup_ratio`: 0.1
|
447 |
+
- `warmup_steps`: 5
|
448 |
+
- `bf16`: True
|
449 |
+
- `tf32`: True
|
450 |
+
- `optim`: adamw_torch_fused
|
451 |
+
- `gradient_checkpointing`: True
|
452 |
+
- `gradient_checkpointing_kwargs`: {'use_reentrant': False}
|
453 |
+
- `batch_sampler`: no_duplicates
|
454 |
+
|
455 |
+
#### All Hyperparameters
|
456 |
+
<details><summary>Click to expand</summary>
|
457 |
+
|
458 |
+
- `overwrite_output_dir`: False
|
459 |
+
- `do_predict`: False
|
460 |
+
- `eval_strategy`: steps
|
461 |
+
- `prediction_loss_only`: True
|
462 |
+
- `per_device_train_batch_size`: 8
|
463 |
+
- `per_device_eval_batch_size`: 4
|
464 |
+
- `per_gpu_train_batch_size`: None
|
465 |
+
- `per_gpu_eval_batch_size`: None
|
466 |
+
- `gradient_accumulation_steps`: 4
|
467 |
+
- `eval_accumulation_steps`: None
|
468 |
+
- `learning_rate`: 2e-05
|
469 |
+
- `weight_decay`: 0.0
|
470 |
+
- `adam_beta1`: 0.9
|
471 |
+
- `adam_beta2`: 0.999
|
472 |
+
- `adam_epsilon`: 1e-08
|
473 |
+
- `max_grad_norm`: 1.0
|
474 |
+
- `num_train_epochs`: 3.0
|
475 |
+
- `max_steps`: 1500
|
476 |
+
- `lr_scheduler_type`: cosine
|
477 |
+
- `lr_scheduler_kwargs`: {}
|
478 |
+
- `warmup_ratio`: 0.1
|
479 |
+
- `warmup_steps`: 5
|
480 |
+
- `log_level`: passive
|
481 |
+
- `log_level_replica`: warning
|
482 |
+
- `log_on_each_node`: True
|
483 |
+
- `logging_nan_inf_filter`: True
|
484 |
+
- `save_safetensors`: True
|
485 |
+
- `save_on_each_node`: False
|
486 |
+
- `save_only_model`: False
|
487 |
+
- `restore_callback_states_from_checkpoint`: False
|
488 |
+
- `no_cuda`: False
|
489 |
+
- `use_cpu`: False
|
490 |
+
- `use_mps_device`: False
|
491 |
+
- `seed`: 42
|
492 |
+
- `data_seed`: None
|
493 |
+
- `jit_mode_eval`: False
|
494 |
+
- `use_ipex`: False
|
495 |
+
- `bf16`: True
|
496 |
+
- `fp16`: False
|
497 |
+
- `fp16_opt_level`: O1
|
498 |
+
- `half_precision_backend`: auto
|
499 |
+
- `bf16_full_eval`: False
|
500 |
+
- `fp16_full_eval`: False
|
501 |
+
- `tf32`: True
|
502 |
+
- `local_rank`: 0
|
503 |
+
- `ddp_backend`: None
|
504 |
+
- `tpu_num_cores`: None
|
505 |
+
- `tpu_metrics_debug`: False
|
506 |
+
- `debug`: []
|
507 |
+
- `dataloader_drop_last`: True
|
508 |
+
- `dataloader_num_workers`: 0
|
509 |
+
- `dataloader_prefetch_factor`: None
|
510 |
+
- `past_index`: -1
|
511 |
+
- `disable_tqdm`: False
|
512 |
+
- `remove_unused_columns`: True
|
513 |
+
- `label_names`: None
|
514 |
+
- `load_best_model_at_end`: False
|
515 |
+
- `ignore_data_skip`: False
|
516 |
+
- `fsdp`: []
|
517 |
+
- `fsdp_min_num_params`: 0
|
518 |
+
- `fsdp_config`: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
|
519 |
+
- `fsdp_transformer_layer_cls_to_wrap`: None
|
520 |
+
- `accelerator_config`: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
|
521 |
+
- `deepspeed`: None
|
522 |
+
- `label_smoothing_factor`: 0.0
|
523 |
+
- `optim`: adamw_torch_fused
|
524 |
+
- `optim_args`: None
|
525 |
+
- `adafactor`: False
|
526 |
+
- `group_by_length`: False
|
527 |
+
- `length_column_name`: length
|
528 |
+
- `ddp_find_unused_parameters`: None
|
529 |
+
- `ddp_bucket_cap_mb`: None
|
530 |
+
- `ddp_broadcast_buffers`: False
|
531 |
+
- `dataloader_pin_memory`: True
|
532 |
+
- `dataloader_persistent_workers`: False
|
533 |
+
- `skip_memory_metrics`: True
|
534 |
+
- `use_legacy_prediction_loop`: False
|
535 |
+
- `push_to_hub`: False
|
536 |
+
- `resume_from_checkpoint`: None
|
537 |
+
- `hub_model_id`: None
|
538 |
+
- `hub_strategy`: every_save
|
539 |
+
- `hub_private_repo`: False
|
540 |
+
- `hub_always_push`: False
|
541 |
+
- `gradient_checkpointing`: True
|
542 |
+
- `gradient_checkpointing_kwargs`: {'use_reentrant': False}
|
543 |
+
- `include_inputs_for_metrics`: False
|
544 |
+
- `eval_do_concat_batches`: True
|
545 |
+
- `fp16_backend`: auto
|
546 |
+
- `push_to_hub_model_id`: None
|
547 |
+
- `push_to_hub_organization`: None
|
548 |
+
- `mp_parameters`:
|
549 |
+
- `auto_find_batch_size`: False
|
550 |
+
- `full_determinism`: False
|
551 |
+
- `torchdynamo`: None
|
552 |
+
- `ray_scope`: last
|
553 |
+
- `ddp_timeout`: 1800
|
554 |
+
- `torch_compile`: False
|
555 |
+
- `torch_compile_backend`: None
|
556 |
+
- `torch_compile_mode`: None
|
557 |
+
- `dispatch_batches`: None
|
558 |
+
- `split_batches`: None
|
559 |
+
- `include_tokens_per_second`: False
|
560 |
+
- `include_num_input_tokens_seen`: False
|
561 |
+
- `neftune_noise_alpha`: None
|
562 |
+
- `optim_target_modules`: None
|
563 |
+
- `batch_eval_metrics`: False
|
564 |
+
- `batch_sampler`: no_duplicates
|
565 |
+
- `multi_dataset_batch_sampler`: proportional
|
566 |
+
|
567 |
+
</details>
|
568 |
+
|
569 |
+
### Training Logs
|
570 |
+
| Epoch  | Step | Training Loss | Validation Loss | cosine_map@100 |
|
571 |
+
|:------:|:----:|:-------------:|:------:|:--------------:|
|
572 |
+
| 0.0185 | 100 | 0.4835 | 0.0751 | 0.9138 |
|
573 |
+
| 0.0369 | 200 | 0.0646 | 0.0590 | 0.9384 |
|
574 |
+
| 0.0554 | 300 | 0.0594 | 0.0519 | 0.9462 |
|
575 |
+
| 0.0739 | 400 | 0.0471 | 0.0483 | 0.9514 |
|
576 |
+
| 0.0924 | 500 | 0.0524 | 0.0455 | 0.9531 |
|
577 |
+
| 0.1108 | 600 | 0.0435 | 0.0397 | 0.9546 |
|
578 |
+
| 0.1293 | 700 | 0.0336 | 0.0394 | 0.9549 |
|
579 |
+
| 0.1478 | 800 | 0.0344 | 0.0374 | 0.9565 |
|
580 |
+
| 0.1662 | 900 | 0.0393 | 0.0361 | 0.9568 |
|
581 |
+
| 0.1847 | 1000 | 0.0451 | 0.0361 | 0.9578 |
|
582 |
+
|
583 |
+
|
584 |
+
### Framework Versions
|
585 |
+
- Python: 3.10.12
|
586 |
+
- Sentence Transformers: 3.0.1
|
587 |
+
- Transformers: 4.41.2
|
588 |
+
- PyTorch: 2.2.0+cu121
|
589 |
+
- Accelerate: 0.33.0
|
590 |
+
- Datasets: 2.20.0
|
591 |
+
- Tokenizers: 0.19.1
|
592 |
+
|
593 |
+
## Citation
|
594 |
+
|
595 |
+
### BibTeX
|
596 |
+
|
597 |
+
#### Sentence Transformers
|
598 |
+
```bibtex
|
599 |
+
@inproceedings{reimers-2019-sentence-bert,
|
600 |
+
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
|
601 |
+
author = "Reimers, Nils and Gurevych, Iryna",
|
602 |
+
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
|
603 |
+
month = "11",
|
604 |
+
year = "2019",
|
605 |
+
publisher = "Association for Computational Linguistics",
|
606 |
+
url = "https://arxiv.org/abs/1908.10084",
|
607 |
+
}
|
608 |
+
```
|
609 |
+
|
610 |
+
#### MatryoshkaLoss
|
611 |
+
```bibtex
|
612 |
+
@misc{kusupati2024matryoshka,
|
613 |
+
title={Matryoshka Representation Learning},
|
614 |
+
author={Aditya Kusupati and Gantavya Bhatt and Aniket Rege and Matthew Wallingford and Aditya Sinha and Vivek Ramanujan and William Howard-Snyder and Kaifeng Chen and Sham Kakade and Prateek Jain and Ali Farhadi},
|
615 |
+
year={2024},
|
616 |
+
eprint={2205.13147},
|
617 |
+
archivePrefix={arXiv},
|
618 |
+
primaryClass={cs.LG}
|
619 |
+
}
|
620 |
+
```
|
621 |
+
|
622 |
+
#### MultipleNegativesRankingLoss
|
623 |
+
```bibtex
|
624 |
+
@misc{henderson2017efficient,
|
625 |
+
title={Efficient Natural Language Response Suggestion for Smart Reply},
|
626 |
+
author={Matthew Henderson and Rami Al-Rfou and Brian Strope and Yun-hsuan Sung and Laszlo Lukacs and Ruiqi Guo and Sanjiv Kumar and Balint Miklos and Ray Kurzweil},
|
627 |
+
year={2017},
|
628 |
+
eprint={1705.00652},
|
629 |
+
archivePrefix={arXiv},
|
630 |
+
primaryClass={cs.CL}
|
631 |
+
}
|
632 |
+
```
|
633 |
+
|
634 |
+
<!--
|
635 |
+
## Glossary
|
636 |
+
|
637 |
+
*Clearly define terms in order to be accessible across audiences.*
|
638 |
+
-->
|
639 |
+
|
640 |
+
<!--
|
641 |
+
## Model Card Authors
|
642 |
+
|
643 |
+
*Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.*
|
644 |
+
-->
|
645 |
+
|
646 |
+
<!--
|
647 |
+
## Model Card Contact
|
648 |
+
|
649 |
+
*Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.*
|
650 |
+
-->
|
added_tokens.json
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"<|endoftext|>": 151643,
|
3 |
+
"<|im_end|>": 151645,
|
4 |
+
"<|im_start|>": 151644
|
5 |
+
}
|
config.json
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "dunzhang/stella_en_1.5B_v5",
|
3 |
+
"architectures": [
|
4 |
+
"Qwen2Model"
|
5 |
+
],
|
6 |
+
"attention_dropout": 0.0,
|
7 |
+
"auto_map": {
|
8 |
+
"AutoModel": "dunzhang/stella_en_1.5B_v5--modeling_qwen.Qwen2Model",
|
9 |
+
"AutoModelForCausalLM": "dunzhang/stella_en_1.5B_v5--modeling_qwen.Qwen2ForCausalLM",
|
10 |
+
"AutoModelForSequenceClassification": "dunzhang/stella_en_1.5B_v5--modeling_qwen.Qwen2ForSequenceClassification"
|
11 |
+
},
|
12 |
+
"bos_token_id": 151643,
|
13 |
+
"eos_token_id": 151643,
|
14 |
+
"hidden_act": "silu",
|
15 |
+
"hidden_size": 1536,
|
16 |
+
"initializer_range": 0.02,
|
17 |
+
"intermediate_size": 8960,
|
18 |
+
"max_position_embeddings": 131072,
|
19 |
+
"max_window_layers": 21,
|
20 |
+
"model_type": "qwen2",
|
21 |
+
"num_attention_heads": 12,
|
22 |
+
"num_hidden_layers": 28,
|
23 |
+
"num_key_value_heads": 2,
|
24 |
+
"rms_norm_eps": 1e-06,
|
25 |
+
"rope_theta": 1000000.0,
|
26 |
+
"sliding_window": 131072,
|
27 |
+
"tie_word_embeddings": false,
|
28 |
+
"torch_dtype": "bfloat16",
|
29 |
+
"transformers_version": "4.41.2",
|
30 |
+
"use_cache": true,
|
31 |
+
"use_sliding_window": false,
|
32 |
+
"vocab_size": 151646
|
33 |
+
}
|
config_sentence_transformers.json
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"__version__": {
|
3 |
+
"sentence_transformers": "3.0.1",
|
4 |
+
"transformers": "4.41.2",
|
5 |
+
"pytorch": "2.2.0+cu121"
|
6 |
+
},
|
7 |
+
"prompts": {
|
8 |
+
"s2p_query": "Instruct: Given a web search query, retrieve relevant passages that answer the query.\nQuery: ",
|
9 |
+
"s2s_query": "Instruct: Retrieve semantically similar text.\nQuery: "
|
10 |
+
},
|
11 |
+
"default_prompt_name": null,
|
12 |
+
"similarity_fn_name": "cosine"
|
13 |
+
}
|
merges.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:55f69fe89e083f5959e1f1e3a4f5e647962b025d51ad040c5b2229ceea43feb9
|
3 |
+
size 3086574240
|
modules.json
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"idx": 0,
|
4 |
+
"name": "0",
|
5 |
+
"path": "",
|
6 |
+
"type": "sentence_transformers.models.Transformer"
|
7 |
+
},
|
8 |
+
{
|
9 |
+
"idx": 1,
|
10 |
+
"name": "1",
|
11 |
+
"path": "1_Pooling",
|
12 |
+
"type": "sentence_transformers.models.Pooling"
|
13 |
+
},
|
14 |
+
{
|
15 |
+
"idx": 2,
|
16 |
+
"name": "2",
|
17 |
+
"path": "2_Dense",
|
18 |
+
"type": "sentence_transformers.models.Dense"
|
19 |
+
}
|
20 |
+
]
|
optimizer.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ad4c6ab7fce986ed76182f793b8df42d46f680b7dc1460eff9a4a979de9aeb0a
|
3 |
+
size 6185963010
|
rng_state_0.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5f0f81634b51f39c2bc64bb11062369f8b6db19ac7363008351335057d5f9e5f
|
3 |
+
size 14960
|
rng_state_1.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a0181c05fc64b348a99a4516e7a0d2696640444de3abe5a11baa153f5553cc8d
|
3 |
+
size 14960
|
rng_state_2.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:265ccfdc335679ae7ee260cd5346a12dd735197485a755aa2431d45405b28cf0
|
3 |
+
size 14960
|
rng_state_3.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d650712bb5285a056c794df33ab1cc908c9976375fcd81a004493656c3ac1f0d
|
3 |
+
size 14960
|
scheduler.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c1c966d855305dbdbe3e7516b03da5b72d250a16a08641463cf1ff44c6809016
|
3 |
+
size 1064
|
sentence_bert_config.json
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"max_seq_length": 8096,
|
3 |
+
"do_lower_case": false
|
4 |
+
}
|
special_tokens_map.json
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"additional_special_tokens": [
|
3 |
+
"<|im_start|>",
|
4 |
+
"<|im_end|>"
|
5 |
+
],
|
6 |
+
"eos_token": {
|
7 |
+
"content": "<|endoftext|>",
|
8 |
+
"lstrip": false,
|
9 |
+
"normalized": false,
|
10 |
+
"rstrip": false,
|
11 |
+
"single_word": false
|
12 |
+
},
|
13 |
+
"pad_token": {
|
14 |
+
"content": "<|endoftext|>",
|
15 |
+
"lstrip": false,
|
16 |
+
"normalized": false,
|
17 |
+
"rstrip": false,
|
18 |
+
"single_word": false
|
19 |
+
}
|
20 |
+
}
|
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
tokenizer_config.json
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"add_eos_token": true,
|
3 |
+
"add_prefix_space": false,
|
4 |
+
"added_tokens_decoder": {
|
5 |
+
"151643": {
|
6 |
+
"content": "<|endoftext|>",
|
7 |
+
"lstrip": false,
|
8 |
+
"normalized": false,
|
9 |
+
"rstrip": false,
|
10 |
+
"single_word": false,
|
11 |
+
"special": true
|
12 |
+
},
|
13 |
+
"151644": {
|
14 |
+
"content": "<|im_start|>",
|
15 |
+
"lstrip": false,
|
16 |
+
"normalized": false,
|
17 |
+
"rstrip": false,
|
18 |
+
"single_word": false,
|
19 |
+
"special": true
|
20 |
+
},
|
21 |
+
"151645": {
|
22 |
+
"content": "<|im_end|>",
|
23 |
+
"lstrip": false,
|
24 |
+
"normalized": false,
|
25 |
+
"rstrip": false,
|
26 |
+
"single_word": false,
|
27 |
+
"special": true
|
28 |
+
}
|
29 |
+
},
|
30 |
+
"additional_special_tokens": [
|
31 |
+
"<|im_start|>",
|
32 |
+
"<|im_end|>"
|
33 |
+
],
|
34 |
+
"auto_map": {
|
35 |
+
"AutoTokenizer": [
|
36 |
+
"dunzhang/stella_en_1.5B_v5--tokenization_qwen.Qwen2Tokenizer",
|
37 |
+
"dunzhang/stella_en_1.5B_v5--tokenization_qwen.Qwen2TokenizerFast"
|
38 |
+
]
|
39 |
+
},
|
40 |
+
"bos_token": null,
|
41 |
+
"chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
|
42 |
+
"clean_up_tokenization_spaces": false,
|
43 |
+
"eos_token": "<|endoftext|>",
|
44 |
+
"errors": "replace",
|
45 |
+
"model_max_length": 512,
|
46 |
+
"pad_token": "<|endoftext|>",
|
47 |
+
"split_special_tokens": false,
|
48 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
49 |
+
"unk_token": null
|
50 |
+
}
|
trainer_state.json
ADDED
@@ -0,0 +1,343 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"best_metric": null,
|
3 |
+
"best_model_checkpoint": null,
|
4 |
+
"epoch": 0.1847063169560399,
|
5 |
+
"eval_steps": 100,
|
6 |
+
"global_step": 1000,
|
7 |
+
"is_hyper_param_search": false,
|
8 |
+
"is_local_process_zero": true,
|
9 |
+
"is_world_process_zero": true,
|
10 |
+
"log_history": [
|
11 |
+
{
|
12 |
+
"epoch": 0.01847063169560399,
|
13 |
+
"grad_norm": 17.696088790893555,
|
14 |
+
"learning_rate": 1.980139427847242e-05,
|
15 |
+
"loss": 0.4835,
|
16 |
+
"step": 100
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"epoch": 0.01847063169560399,
|
20 |
+
"eval_cosine_accuracy@1": 0.8868686868686869,
|
21 |
+
"eval_cosine_accuracy@10": 0.9579124579124579,
|
22 |
+
"eval_cosine_accuracy@3": 0.9343434343434344,
|
23 |
+
"eval_cosine_accuracy@5": 0.9454545454545454,
|
24 |
+
"eval_cosine_map@100": 0.9137674463447905,
|
25 |
+
"eval_cosine_mrr@10": 0.9124690021912245,
|
26 |
+
"eval_cosine_ndcg@10": 0.9236137131767355,
|
27 |
+
"eval_cosine_precision@1": 0.8868686868686869,
|
28 |
+
"eval_cosine_precision@10": 0.09579124579124577,
|
29 |
+
"eval_cosine_precision@3": 0.3114478114478115,
|
30 |
+
"eval_cosine_precision@5": 0.18909090909090906,
|
31 |
+
"eval_cosine_recall@1": 0.8868686868686869,
|
32 |
+
"eval_cosine_recall@10": 0.9579124579124579,
|
33 |
+
"eval_cosine_recall@3": 0.9343434343434344,
|
34 |
+
"eval_cosine_recall@5": 0.9454545454545454,
|
35 |
+
"eval_loss": 0.07506837695837021,
|
36 |
+
"eval_runtime": 49.7303,
|
37 |
+
"eval_samples_per_second": 140.759,
|
38 |
+
"eval_sequential_score": 0.9137674463447905,
|
39 |
+
"eval_steps_per_second": 8.808,
|
40 |
+
"step": 100
|
41 |
+
},
|
42 |
+
{
|
43 |
+
"epoch": 0.03694126339120798,
|
44 |
+
"grad_norm": 2.3271963596343994,
|
45 |
+
"learning_rate": 1.917211301505453e-05,
|
46 |
+
"loss": 0.0646,
|
47 |
+
"step": 200
|
48 |
+
},
|
49 |
+
{
|
50 |
+
"epoch": 0.03694126339120798,
|
51 |
+
"eval_cosine_accuracy@1": 0.9195286195286195,
|
52 |
+
"eval_cosine_accuracy@10": 0.967003367003367,
|
53 |
+
"eval_cosine_accuracy@3": 0.9518518518518518,
|
54 |
+
"eval_cosine_accuracy@5": 0.9612794612794613,
|
55 |
+
"eval_cosine_map@100": 0.9383862806127067,
|
56 |
+
"eval_cosine_mrr@10": 0.937248810859922,
|
57 |
+
"eval_cosine_ndcg@10": 0.9445895366693552,
|
58 |
+
"eval_cosine_precision@1": 0.9195286195286195,
|
59 |
+
"eval_cosine_precision@10": 0.0967003367003367,
|
60 |
+
"eval_cosine_precision@3": 0.317283950617284,
|
61 |
+
"eval_cosine_precision@5": 0.19225589225589226,
|
62 |
+
"eval_cosine_recall@1": 0.9195286195286195,
|
63 |
+
"eval_cosine_recall@10": 0.967003367003367,
|
64 |
+
"eval_cosine_recall@3": 0.9518518518518518,
|
65 |
+
"eval_cosine_recall@5": 0.9612794612794613,
|
66 |
+
"eval_loss": 0.05896875262260437,
|
67 |
+
"eval_runtime": 51.3489,
|
68 |
+
"eval_samples_per_second": 136.322,
|
69 |
+
"eval_sequential_score": 0.9383862806127067,
|
70 |
+
"eval_steps_per_second": 8.53,
|
71 |
+
"step": 200
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"epoch": 0.05541189508681197,
|
75 |
+
"grad_norm": 3.7376925945281982,
|
76 |
+
"learning_rate": 1.8139290433532415e-05,
|
77 |
+
"loss": 0.0594,
|
78 |
+
"step": 300
|
79 |
+
},
|
80 |
+
{
|
81 |
+
"epoch": 0.05541189508681197,
|
82 |
+
"eval_cosine_accuracy@1": 0.9303030303030303,
|
83 |
+
"eval_cosine_accuracy@10": 0.9737373737373738,
|
84 |
+
"eval_cosine_accuracy@3": 0.9579124579124579,
|
85 |
+
"eval_cosine_accuracy@5": 0.9656565656565657,
|
86 |
+
"eval_cosine_map@100": 0.9462389210939364,
|
87 |
+
"eval_cosine_mrr@10": 0.9454570840681954,
|
88 |
+
"eval_cosine_ndcg@10": 0.9523521034308455,
|
89 |
+
"eval_cosine_precision@1": 0.9303030303030303,
|
90 |
+
"eval_cosine_precision@10": 0.09737373737373735,
|
91 |
+
"eval_cosine_precision@3": 0.31930415263748596,
|
92 |
+
"eval_cosine_precision@5": 0.1931313131313131,
|
93 |
+
"eval_cosine_recall@1": 0.9303030303030303,
|
94 |
+
"eval_cosine_recall@10": 0.9737373737373738,
|
95 |
+
"eval_cosine_recall@3": 0.9579124579124579,
|
96 |
+
"eval_cosine_recall@5": 0.9656565656565657,
|
97 |
+
"eval_loss": 0.051894593983888626,
|
98 |
+
"eval_runtime": 49.496,
|
99 |
+
"eval_samples_per_second": 141.426,
|
100 |
+
"eval_sequential_score": 0.9462389210939364,
|
101 |
+
"eval_steps_per_second": 8.849,
|
102 |
+
"step": 300
|
103 |
+
},
|
104 |
+
{
|
105 |
+
"epoch": 0.07388252678241596,
|
106 |
+
"grad_norm": 0.3877984583377838,
|
107 |
+
"learning_rate": 1.6748367163042577e-05,
|
108 |
+
"loss": 0.0471,
|
109 |
+
"step": 400
|
110 |
+
},
|
111 |
+
{
|
112 |
+
"epoch": 0.07388252678241596,
|
113 |
+
"eval_cosine_accuracy@1": 0.9367003367003367,
|
114 |
+
"eval_cosine_accuracy@10": 0.9750841750841751,
|
115 |
+
"eval_cosine_accuracy@3": 0.9612794612794613,
|
116 |
+
"eval_cosine_accuracy@5": 0.969023569023569,
|
117 |
+
"eval_cosine_map@100": 0.9513700816079773,
|
118 |
+
"eval_cosine_mrr@10": 0.9505351130351131,
|
119 |
+
"eval_cosine_ndcg@10": 0.9565510675566292,
|
120 |
+
"eval_cosine_precision@1": 0.9367003367003367,
|
121 |
+
"eval_cosine_precision@10": 0.09750841750841752,
|
122 |
+
"eval_cosine_precision@3": 0.3204264870931538,
|
123 |
+
"eval_cosine_precision@5": 0.19380471380471379,
|
124 |
+
"eval_cosine_recall@1": 0.9367003367003367,
|
125 |
+
"eval_cosine_recall@10": 0.9750841750841751,
|
126 |
+
"eval_cosine_recall@3": 0.9612794612794613,
|
127 |
+
"eval_cosine_recall@5": 0.969023569023569,
|
128 |
+
"eval_loss": 0.04832224175333977,
|
129 |
+
"eval_runtime": 49.2695,
|
130 |
+
"eval_samples_per_second": 142.076,
|
131 |
+
"eval_sequential_score": 0.9513700816079773,
|
132 |
+
"eval_steps_per_second": 8.89,
|
133 |
+
"step": 400
|
134 |
+
},
|
135 |
+
{
|
136 |
+
"epoch": 0.09235315847801995,
|
137 |
+
"grad_norm": 0.9424126744270325,
|
138 |
+
"learning_rate": 1.5060539027168317e-05,
|
139 |
+
"loss": 0.0524,
|
140 |
+
"step": 500
|
141 |
+
},
|
142 |
+
{
|
143 |
+
"epoch": 0.09235315847801995,
|
144 |
+
"eval_cosine_accuracy@1": 0.9387205387205387,
|
145 |
+
"eval_cosine_accuracy@10": 0.9787878787878788,
|
146 |
+
"eval_cosine_accuracy@3": 0.9622895622895623,
|
147 |
+
"eval_cosine_accuracy@5": 0.9703703703703703,
|
148 |
+
"eval_cosine_map@100": 0.9530933506069861,
|
149 |
+
"eval_cosine_mrr@10": 0.9525124258457593,
|
150 |
+
"eval_cosine_ndcg@10": 0.9588799906525647,
|
151 |
+
"eval_cosine_precision@1": 0.9387205387205387,
|
152 |
+
"eval_cosine_precision@10": 0.09787878787878787,
|
153 |
+
"eval_cosine_precision@3": 0.3207631874298541,
|
154 |
+
"eval_cosine_precision@5": 0.19407407407407404,
|
155 |
+
"eval_cosine_recall@1": 0.9387205387205387,
|
156 |
+
"eval_cosine_recall@10": 0.9787878787878788,
|
157 |
+
"eval_cosine_recall@3": 0.9622895622895623,
|
158 |
+
"eval_cosine_recall@5": 0.9703703703703703,
|
159 |
+
"eval_loss": 0.04548173025250435,
|
160 |
+
"eval_runtime": 49.8784,
|
161 |
+
"eval_samples_per_second": 140.341,
|
162 |
+
"eval_sequential_score": 0.9530933506069861,
|
163 |
+
"eval_steps_per_second": 8.781,
|
164 |
+
"step": 500
|
165 |
+
},
|
166 |
+
{
|
167 |
+
"epoch": 0.11082379017362394,
|
168 |
+
"grad_norm": 1.4559208154678345,
|
169 |
+
"learning_rate": 1.315006463889948e-05,
|
170 |
+
"loss": 0.0435,
|
171 |
+
"step": 600
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"epoch": 0.11082379017362394,
|
175 |
+
"eval_cosine_accuracy@1": 0.94006734006734,
|
176 |
+
"eval_cosine_accuracy@10": 0.9784511784511785,
|
177 |
+
"eval_cosine_accuracy@3": 0.965993265993266,
|
178 |
+
"eval_cosine_accuracy@5": 0.9713804713804713,
|
179 |
+
"eval_cosine_map@100": 0.9545577685553948,
|
180 |
+
"eval_cosine_mrr@10": 0.953849339960451,
|
181 |
+
"eval_cosine_ndcg@10": 0.9598872198067653,
|
182 |
+
"eval_cosine_precision@1": 0.94006734006734,
|
183 |
+
"eval_cosine_precision@10": 0.09784511784511785,
|
184 |
+
"eval_cosine_precision@3": 0.32199775533108865,
|
185 |
+
"eval_cosine_precision@5": 0.19427609427609427,
|
186 |
+
"eval_cosine_recall@1": 0.94006734006734,
|
187 |
+
"eval_cosine_recall@10": 0.9784511784511785,
|
188 |
+
"eval_cosine_recall@3": 0.965993265993266,
|
189 |
+
"eval_cosine_recall@5": 0.9713804713804713,
|
190 |
+
"eval_loss": 0.03971650078892708,
|
191 |
+
"eval_runtime": 50.0185,
|
192 |
+
"eval_samples_per_second": 139.948,
|
193 |
+
"eval_sequential_score": 0.9545577685553948,
|
194 |
+
"eval_steps_per_second": 8.757,
|
195 |
+
"step": 600
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"epoch": 0.12929442186922793,
|
199 |
+
"grad_norm": 4.4629669189453125,
|
200 |
+
"learning_rate": 1.1100998277940316e-05,
|
201 |
+
"loss": 0.0336,
|
202 |
+
"step": 700
|
203 |
+
},
|
204 |
+
{
|
205 |
+
"epoch": 0.12929442186922793,
|
206 |
+
"eval_cosine_accuracy@1": 0.9404040404040404,
|
207 |
+
"eval_cosine_accuracy@10": 0.9804713804713805,
|
208 |
+
"eval_cosine_accuracy@3": 0.9646464646464646,
|
209 |
+
"eval_cosine_accuracy@5": 0.9717171717171718,
|
210 |
+
"eval_cosine_map@100": 0.9549470882860054,
|
211 |
+
"eval_cosine_mrr@10": 0.9544308160974829,
|
212 |
+
"eval_cosine_ndcg@10": 0.9607541640834583,
|
213 |
+
"eval_cosine_precision@1": 0.9404040404040404,
|
214 |
+
"eval_cosine_precision@10": 0.09804713804713802,
|
215 |
+
"eval_cosine_precision@3": 0.32154882154882153,
|
216 |
+
"eval_cosine_precision@5": 0.19434343434343435,
|
217 |
+
"eval_cosine_recall@1": 0.9404040404040404,
|
218 |
+
"eval_cosine_recall@10": 0.9804713804713805,
|
219 |
+
"eval_cosine_recall@3": 0.9646464646464646,
|
220 |
+
"eval_cosine_recall@5": 0.9717171717171718,
|
221 |
+
"eval_loss": 0.03936752304434776,
|
222 |
+
"eval_runtime": 49.3604,
|
223 |
+
"eval_samples_per_second": 141.814,
|
224 |
+
"eval_sequential_score": 0.9549470882860054,
|
225 |
+
"eval_steps_per_second": 8.874,
|
226 |
+
"step": 700
|
227 |
+
},
|
228 |
+
{
|
229 |
+
"epoch": 0.1477650535648319,
|
230 |
+
"grad_norm": 18.81490135192871,
|
231 |
+
"learning_rate": 9.003491792488438e-06,
|
232 |
+
"loss": 0.0344,
|
233 |
+
"step": 800
|
234 |
+
},
|
235 |
+
{
|
236 |
+
"epoch": 0.1477650535648319,
|
237 |
+
"eval_cosine_accuracy@1": 0.9424242424242424,
|
238 |
+
"eval_cosine_accuracy@10": 0.9791245791245792,
|
239 |
+
"eval_cosine_accuracy@3": 0.9673400673400674,
|
240 |
+
"eval_cosine_accuracy@5": 0.9730639730639731,
|
241 |
+
"eval_cosine_map@100": 0.9565285707123049,
|
242 |
+
"eval_cosine_mrr@10": 0.9558751536529315,
|
243 |
+
"eval_cosine_ndcg@10": 0.9615848253233655,
|
244 |
+
"eval_cosine_precision@1": 0.9424242424242424,
|
245 |
+
"eval_cosine_precision@10": 0.0979124579124579,
|
246 |
+
"eval_cosine_precision@3": 0.32244668911335583,
|
247 |
+
"eval_cosine_precision@5": 0.19461279461279463,
|
248 |
+
"eval_cosine_recall@1": 0.9424242424242424,
|
249 |
+
"eval_cosine_recall@10": 0.9791245791245792,
|
250 |
+
"eval_cosine_recall@3": 0.9673400673400674,
|
251 |
+
"eval_cosine_recall@5": 0.9730639730639731,
|
252 |
+
"eval_loss": 0.03738004341721535,
|
253 |
+
"eval_runtime": 50.7853,
|
254 |
+
"eval_samples_per_second": 137.835,
|
255 |
+
"eval_sequential_score": 0.9565285707123049,
|
256 |
+
"eval_steps_per_second": 8.625,
|
257 |
+
"step": 800
|
258 |
+
},
|
259 |
+
{
|
260 |
+
"epoch": 0.16623568526043592,
|
261 |
+
"grad_norm": 40.96598434448242,
|
262 |
+
"learning_rate": 6.9498282290438235e-06,
|
263 |
+
"loss": 0.0393,
|
264 |
+
"step": 900
|
265 |
+
},
|
266 |
+
{
|
267 |
+
"epoch": 0.16623568526043592,
|
268 |
+
"eval_cosine_accuracy@1": 0.9430976430976431,
|
269 |
+
"eval_cosine_accuracy@10": 0.9801346801346801,
|
270 |
+
"eval_cosine_accuracy@3": 0.9666666666666667,
|
271 |
+
"eval_cosine_accuracy@5": 0.9723905723905724,
|
272 |
+
"eval_cosine_map@100": 0.9567946373289521,
|
273 |
+
"eval_cosine_mrr@10": 0.9561974239752019,
|
274 |
+
"eval_cosine_ndcg@10": 0.9620315668852821,
|
275 |
+
"eval_cosine_precision@1": 0.9430976430976431,
|
276 |
+
"eval_cosine_precision@10": 0.098013468013468,
|
277 |
+
"eval_cosine_precision@3": 0.32222222222222224,
|
278 |
+
"eval_cosine_precision@5": 0.19447811447811444,
|
279 |
+
"eval_cosine_recall@1": 0.9430976430976431,
|
280 |
+
"eval_cosine_recall@10": 0.9801346801346801,
|
281 |
+
"eval_cosine_recall@3": 0.9666666666666667,
|
282 |
+
"eval_cosine_recall@5": 0.9723905723905724,
|
283 |
+
"eval_loss": 0.036062091588974,
|
284 |
+
"eval_runtime": 50.1457,
|
285 |
+
"eval_samples_per_second": 139.593,
|
286 |
+
"eval_sequential_score": 0.9567946373289521,
|
287 |
+
"eval_steps_per_second": 8.735,
|
288 |
+
"step": 900
|
289 |
+
},
|
290 |
+
{
|
291 |
+
"epoch": 0.1847063169560399,
|
292 |
+
"grad_norm": 7.633666038513184,
|
293 |
+
"learning_rate": 5.030361696847706e-06,
|
294 |
+
"loss": 0.0451,
|
295 |
+
"step": 1000
|
296 |
+
},
|
297 |
+
{
|
298 |
+
"epoch": 0.1847063169560399,
|
299 |
+
"eval_cosine_accuracy@1": 0.9447811447811448,
|
300 |
+
"eval_cosine_accuracy@10": 0.9804713804713805,
|
301 |
+
"eval_cosine_accuracy@3": 0.9673400673400674,
|
302 |
+
"eval_cosine_accuracy@5": 0.9720538720538721,
|
303 |
+
"eval_cosine_map@100": 0.9577987764578036,
|
304 |
+
"eval_cosine_mrr@10": 0.9572219549997326,
|
305 |
+
"eval_cosine_ndcg@10": 0.9628692157043424,
|
306 |
+
"eval_cosine_precision@1": 0.9447811447811448,
|
307 |
+
"eval_cosine_precision@10": 0.09804713804713805,
|
308 |
+
"eval_cosine_precision@3": 0.32244668911335583,
|
309 |
+
"eval_cosine_precision@5": 0.19441077441077437,
|
310 |
+
"eval_cosine_recall@1": 0.9447811447811448,
|
311 |
+
"eval_cosine_recall@10": 0.9804713804713805,
|
312 |
+
"eval_cosine_recall@3": 0.9673400673400674,
|
313 |
+
"eval_cosine_recall@5": 0.9720538720538721,
|
314 |
+
"eval_loss": 0.03610473498702049,
|
315 |
+
"eval_runtime": 49.6014,
|
316 |
+
"eval_samples_per_second": 141.125,
|
317 |
+
"eval_sequential_score": 0.9577987764578036,
|
318 |
+
"eval_steps_per_second": 8.83,
|
319 |
+
"step": 1000
|
320 |
+
}
|
321 |
+
],
|
322 |
+
"logging_steps": 100,
|
323 |
+
"max_steps": 1500,
|
324 |
+
"num_input_tokens_seen": 0,
|
325 |
+
"num_train_epochs": 1,
|
326 |
+
"save_steps": 500,
|
327 |
+
"stateful_callbacks": {
|
328 |
+
"TrainerControl": {
|
329 |
+
"args": {
|
330 |
+
"should_epoch_stop": false,
|
331 |
+
"should_evaluate": false,
|
332 |
+
"should_log": false,
|
333 |
+
"should_save": true,
|
334 |
+
"should_training_stop": false
|
335 |
+
},
|
336 |
+
"attributes": {}
|
337 |
+
}
|
338 |
+
},
|
339 |
+
"total_flos": 0.0,
|
340 |
+
"train_batch_size": 8,
|
341 |
+
"trial_name": null,
|
342 |
+
"trial_params": null
|
343 |
+
}
|
training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fc48324821e670334bde14afaceaab851215544086086930c4482698c24359fd
|
3 |
+
size 5368
|
vocab.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|