Add new SentenceTransformer model.
- .gitattributes +1 -0
- 1_Pooling/config.json +10 -0
- README.md +472 -0
- config.json +67 -0
- config_sentence_transformers.json +16 -0
- custom_st.py +229 -0
- model.safetensors +3 -0
- modules.json +20 -0
- sentence_bert_config.json +4 -0
- special_tokens_map.json +51 -0
- tokenizer.json +3 -0
- tokenizer_config.json +54 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
1_Pooling/config.json
ADDED
@@ -0,0 +1,10 @@
{
    "word_embedding_dimension": 1024,
    "pooling_mode_cls_token": false,
    "pooling_mode_mean_tokens": true,
    "pooling_mode_max_tokens": false,
    "pooling_mode_mean_sqrt_len_tokens": false,
    "pooling_mode_weightedmean_tokens": false,
    "pooling_mode_lasttoken": false,
    "include_prompt": true
}
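This pooling configuration enables masked mean pooling over the token embeddings (all other modes are off). As a minimal illustrative sketch, not part of the uploaded files, mean pooling with an attention mask can be reproduced like this; the function and tensor names are assumptions for the example:

```python
import torch

def mean_pool(token_embeddings: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    # token_embeddings: (batch, seq_len, 1024); attention_mask: (batch, seq_len) of 0/1
    mask = attention_mask.unsqueeze(-1).to(token_embeddings.dtype)  # (batch, seq_len, 1)
    summed = (token_embeddings * mask).sum(dim=1)                   # sum over non-padding tokens
    counts = mask.sum(dim=1).clamp(min=1e-9)                        # number of non-padding tokens
    return summed / counts                                          # (batch, 1024)
```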
README.md
ADDED
@@ -0,0 +1,472 @@
---
tags:
- sentence-transformers
- sentence-similarity
- feature-extraction
- generated_from_trainer
- dataset_size:50000
- loss:OnlineContrastiveLoss
base_model: jinaai/jina-embeddings-v3
widget:
- source_sentence: i be try to picture the pitch for dark angel . i be think matrix
    and i be think bladerunner and i be think that chick that play faith in angel
    and wear shiny black leather or some chick just like her and leave that one with
    u . only get this . we will do it without any plot and dialogue and character
    and decent action or budget and just some loud bang and a hot chick in shiny black
    leather straddle a big throbbing bike . fanboys dig loud bang and hot chick in
    shiny black leather straddle big throbbing bike and right . flashy and shallow
    and dreary and formulaic and passionless and tedious and dull and dumb and humourless
    and desultory and barely competent . live action anime without any action and
    or indeed any life . sf just the way joe fanboy like it and in fact . negative
    .
  sentences:
  - This is a semantically positive review.
  - This is a semantically negative review.
  - This is a semantically positive review.
- source_sentence: despite the high rating give to this film by imdb user and this
    be nothing more than your typical girl with a bad childhood obsessively stalks
    married man film . the attractive justine priestly brief nude scene may attract
    voyeur and but the film be hackneyed tripe . half out of .
  sentences:
  - This is a semantically positive review.
  - This is a semantically positive review.
  - This is a semantically positive review.
- source_sentence: this movie portray ruth a a womanizing and hard drinking and gambling
    and overeat sport figure with a little baseball thrown in . babe ruth early life
    be quite interesting and this be for all intent and purpose be omit in this film
    . also and lou gehrig be barely cover and this be a well know relationship and
    good bad or indifferent and it should have be cover well than it be . his life
    be more than all bad . he be an american hero and an icon that a lot of baseball
    great pattern their life after . i feel that i be be fair to the memory of a great
    baseball player that this film completely ignore . shame on the maker of this
    film for capitalize on his fault and not his greatness .
  sentences:
  - This is a semantically positive review.
  - This is a semantically negative review.
  - This is a semantically positive review.
- source_sentence: the silent one panel cartoon henry come to fleischer studio and
    bill a the world funny human in this dull little cartoon . betty and long past
    her prime and thanks to the production code and be run a pet shop and leave henry
    in charge for far too long five minute . a bore .
  sentences:
  - This is a semantically positive review.
  - This is a semantically negative review.
  - This is a semantically negative review.
- source_sentence: zu warrior most definitely should have be an animated series because
    a a movie it like watch an old anime on acid . the movie just start out of nowhere
    and people just fly around fight with metal wing and other stupid weapon until
    this princess sacrifice herself for her lover on a cloud or something . whether
    this princess be a god or an angel be beyond me but soon enough this fly wind
    bad guy come in and kill her while the guy with the razor wing fight some other
    mystical god or demon or wizard thing . the plot line be either not there or extremely
    hard to follow you need to be insanely intelligent to get this movie . the plot
    soon follow this chinese mortal who be call upon by this god to fight the evil
    flying and princess kill bad guy and soon we have a very badly choreograph uwe
    boll like fight scene complete with terrible martial art on a mountain or something
    . even the visuals be weird some might say they be stun and colorful but i be
    go to say they be blurry and acid trip like ( yes that a word . ) . i watch it
    both dub and with subtitle and both be equally bad and hard to understand . who
    be i kidding i do not understand it at all . it felt like i be watch episode 30
    of some 1980 anime and completely miss how the story begin or like i start read
    a comic series of 5 at number 4 because i have no clue how this thing start where
    it be go or how it would end i be lose the entire time . i can honestly say this
    be one of the bad film experience ever it be like watch inu yasha at episode 134
    drunk . yeah that right you do not know what the hell be go on . don not waste
    your brain try to figure this out .
  sentences:
  - This is a semantically positive review.
  - This is a semantically negative review.
  - This is a semantically positive review.
pipeline_tag: sentence-similarity
library_name: sentence-transformers
---

# SentenceTransformer based on jinaai/jina-embeddings-v3

This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [jinaai/jina-embeddings-v3](https://huggingface.co/jinaai/jina-embeddings-v3). It maps sentences & paragraphs to a 1024-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.

## Model Details

### Model Description
- **Model Type:** Sentence Transformer
- **Base model:** [jinaai/jina-embeddings-v3](https://huggingface.co/jinaai/jina-embeddings-v3) <!-- at revision 30996fea06f69ecd8382ee4f11e29acaf6b5405e -->
- **Maximum Sequence Length:** 8194 tokens
- **Output Dimensionality:** 1024 dimensions
- **Similarity Function:** Cosine Similarity
<!-- - **Training Dataset:** Unknown -->
<!-- - **Language:** Unknown -->
<!-- - **License:** Unknown -->

### Model Sources

- **Documentation:** [Sentence Transformers Documentation](https://sbert.net)
- **Repository:** [Sentence Transformers on GitHub](https://github.com/UKPLab/sentence-transformers)
- **Hugging Face:** [Sentence Transformers on Hugging Face](https://huggingface.co/models?library=sentence-transformers)

### Full Model Architecture

```
SentenceTransformer(
  (transformer): Transformer(
    (auto_model): XLMRobertaLoRA(
      (roberta): XLMRobertaModel(
        (embeddings): XLMRobertaEmbeddings(
          (word_embeddings): ParametrizedEmbedding(
            250002, 1024, padding_idx=1
            (parametrizations): ModuleDict(
              (weight): ParametrizationList(
                (0): LoRAParametrization()
              )
            )
          )
          (token_type_embeddings): ParametrizedEmbedding(
            1, 1024
            (parametrizations): ModuleDict(
              (weight): ParametrizationList(
                (0): LoRAParametrization()
              )
            )
          )
        )
        (emb_drop): Dropout(p=0.1, inplace=False)
        (emb_ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (encoder): XLMRobertaEncoder(
          (layers): ModuleList(
            (0-23): 24 x Block(
              (mixer): MHA(
                (rotary_emb): RotaryEmbedding()
                (Wqkv): ParametrizedLinearResidual(
                  in_features=1024, out_features=3072, bias=True
                  (parametrizations): ModuleDict(
                    (weight): ParametrizationList(
                      (0): LoRAParametrization()
                    )
                  )
                )
                (inner_attn): FlashSelfAttention(
                  (drop): Dropout(p=0.1, inplace=False)
                )
                (inner_cross_attn): FlashCrossAttention(
                  (drop): Dropout(p=0.1, inplace=False)
                )
                (out_proj): ParametrizedLinear(
                  in_features=1024, out_features=1024, bias=True
                  (parametrizations): ModuleDict(
                    (weight): ParametrizationList(
                      (0): LoRAParametrization()
                    )
                  )
                )
              )
              (dropout1): Dropout(p=0.1, inplace=False)
              (drop_path1): StochasticDepth(p=0.0, mode=row)
              (norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
              (mlp): Mlp(
                (fc1): ParametrizedLinear(
                  in_features=1024, out_features=4096, bias=True
                  (parametrizations): ModuleDict(
                    (weight): ParametrizationList(
                      (0): LoRAParametrization()
                    )
                  )
                )
                (fc2): ParametrizedLinear(
                  in_features=4096, out_features=1024, bias=True
                  (parametrizations): ModuleDict(
                    (weight): ParametrizationList(
                      (0): LoRAParametrization()
                    )
                  )
                )
              )
              (dropout2): Dropout(p=0.1, inplace=False)
              (drop_path2): StochasticDepth(p=0.0, mode=row)
              (norm2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            )
          )
        )
        (pooler): XLMRobertaPooler(
          (dense): ParametrizedLinear(
            in_features=1024, out_features=1024, bias=True
            (parametrizations): ModuleDict(
              (weight): ParametrizationList(
                (0): LoRAParametrization()
              )
            )
          )
          (activation): Tanh()
        )
      )
    )
  )
  (pooler): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (normalizer): Normalize()
)
```

## Usage

### Direct Usage (Sentence Transformers)

First install the Sentence Transformers library:

```bash
pip install -U sentence-transformers
```

Then you can load this model and run inference.
```python
from sentence_transformers import SentenceTransformer

# Download from the 🤗 Hub
model = SentenceTransformer("ELVISIO/jina_embeddings_v3_finetuned_online_contrastive_01")
# Run inference
sentences = [
    'zu warrior most definitely should have be an animated series because a a movie it like watch an old anime on acid . the movie just start out of nowhere and people just fly around fight with metal wing and other stupid weapon until this princess sacrifice herself for her lover on a cloud or something . whether this princess be a god or an angel be beyond me but soon enough this fly wind bad guy come in and kill her while the guy with the razor wing fight some other mystical god or demon or wizard thing . the plot line be either not there or extremely hard to follow you need to be insanely intelligent to get this movie . the plot soon follow this chinese mortal who be call upon by this god to fight the evil flying and princess kill bad guy and soon we have a very badly choreograph uwe boll like fight scene complete with terrible martial art on a mountain or something . even the visuals be weird some might say they be stun and colorful but i be go to say they be blurry and acid trip like ( yes that a word . ) . i watch it both dub and with subtitle and both be equally bad and hard to understand . who be i kidding i do not understand it at all . it felt like i be watch episode 30 of some 1980 anime and completely miss how the story begin or like i start read a comic series of 5 at number 4 because i have no clue how this thing start where it be go or how it would end i be lose the entire time . i can honestly say this be one of the bad film experience ever it be like watch inu yasha at episode 134 drunk . yeah that right you do not know what the hell be go on . don not waste your brain try to figure this out .',
    'This is a semantically negative review.',
    'This is a semantically positive review.',
]
embeddings = model.encode(sentences)
print(embeddings.shape)
# [3, 1024]

# Get the similarity scores for the embeddings
similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)
# [3, 3]
```

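The base model also defines task-specific prompts (see `config_sentence_transformers.json` later in this commit: `retrieval.query`, `retrieval.passage`, `separation`, `classification`, `text-matching`). As a hedged illustration that is not part of the generated card, a prompt can be selected by name at encode time, reusing `model` from the snippet above; the example queries are made up:

```python
# Illustrative only: prompt names come from config_sentence_transformers.json in this repository.
query_emb = model.encode(
    ["which movie review is negative?"],
    prompt_name="retrieval.query",    # prepends "Represent the query for retrieving evidence documents: "
)
doc_emb = model.encode(
    ["the film be hackneyed tripe ."],
    prompt_name="retrieval.passage",  # prepends "Represent the document for retrieval: "
)
print(model.similarity(query_emb, doc_emb))
```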
<!--
### Direct Usage (Transformers)

<details><summary>Click to see the direct usage in Transformers</summary>

</details>
-->

<!--
### Downstream Usage (Sentence Transformers)

You can finetune this model on your own dataset.

<details><summary>Click to expand</summary>

</details>
-->

<!--
### Out-of-Scope Use

*List how the model may foreseeably be misused and address what users ought not to do with the model.*
-->

<!--
## Bias, Risks and Limitations

*What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
-->

<!--
### Recommendations

*What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
-->

## Training Details

### Training Dataset

#### Unnamed Dataset


* Size: 50,000 training samples
* Columns: <code>sentence1</code>, <code>sentence2</code>, and <code>label</code>
* Approximate statistics based on the first 1000 samples:
  |         | sentence1 | sentence2 | label |
  |:--------|:----------|:----------|:------|
  | type    | string    | string    | float |
  | details | <ul><li>min: 19 tokens</li><li>mean: 300.92 tokens</li><li>max: 1415 tokens</li></ul> | <ul><li>min: 11 tokens</li><li>mean: 11.0 tokens</li><li>max: 11 tokens</li></ul> | <ul><li>min: 0.0</li><li>mean: 0.5</li><li>max: 1.0</li></ul> |
* Samples:
  | sentence1 | sentence2 | label |
  |:----------|:----------|:------|
  | <code>i rent i be curious yellow from my video store because of all the controversy that surround it when it be first release in 1967. i also hear that at first it be seize by u. s. custom if it ever try to enter this country and therefore be a fan of film consider controversial i really have to see this for myself . the plot be center around a young swedish drama student name lena who want to learn everything she can about life . in particular she want to focus her attention to make some sort of documentary on what the average swede think about certain political issue such a the vietnam war and race issue in the united state . in between ask politician and ordinary denizen of stockholm about their opinion on politics and she have sex with her drama teacher and classmate and and marry men . what kill me about i be curious yellow be that 40 year ago and this be consider pornographic . really and the sex and nudity scene be few and far between and even then it not shot like some cheaply make porno . while my countryman mind find it shock and in reality sex and nudity be a major staple in swedish cinema . even ingmar bergman and arguably their answer to good old boy john ford and have sex scene in his film . i do commend the filmmaker for the fact that any sex show in the film be show for artistic purpose rather than just to shock people and make money to be show in pornographic theater in america . i be curious yellow be a good film for anyone want to study the meat and potato ( no pun intend ) of swedish cinema . but really and this film doesn not have much of a plot .</code> | <code>This is a semantically negative review.</code> | <code>1.0</code> |
  | <code>i rent i be curious yellow from my video store because of all the controversy that surround it when it be first release in 1967. i also hear that at first it be seize by u. s. custom if it ever try to enter this country and therefore be a fan of film consider controversial i really have to see this for myself . the plot be center around a young swedish drama student name lena who want to learn everything she can about life . in particular she want to focus her attention to make some sort of documentary on what the average swede think about certain political issue such a the vietnam war and race issue in the united state . in between ask politician and ordinary denizen of stockholm about their opinion on politics and she have sex with her drama teacher and classmate and and marry men . what kill me about i be curious yellow be that 40 year ago and this be consider pornographic . really and the sex and nudity scene be few and far between and even then it not shot like some cheaply make porno . while my countryman mind find it shock and in reality sex and nudity be a major staple in swedish cinema . even ingmar bergman and arguably their answer to good old boy john ford and have sex scene in his film . i do commend the filmmaker for the fact that any sex show in the film be show for artistic purpose rather than just to shock people and make money to be show in pornographic theater in america . i be curious yellow be a good film for anyone want to study the meat and potato ( no pun intend ) of swedish cinema . but really and this film doesn not have much of a plot .</code> | <code>This is a semantically positive review.</code> | <code>0.0</code> |
  | <code>i be curious represent yellow be a risible and pretentious steam pile . it doesn not matter what one political view be because this film can hardly be take seriously on any level . a for the claim that frontal male nudity be an automatic nc 17 and that isn not true . i have see r rat film with male nudity . grant and they only offer some fleeting view and but where be the r rat film with gap vulva and flap labium . nowhere and because they do not exist . the same go for those crappy cable show represent schlongs swing in the breeze but not a clitoris in sight . and those pretentious indie movie like the brown bunny and in which be treat to the site of vincent gallo throb johnson and but not a trace of pink visible on chloe sevigny . before cry ( or imply ) double standard in matter of nudity and the mentally obtuse should take into account one unavoidably obvious anatomical difference between men and woman represent there be no genitals on display when actresses appear nude and and the same can not be say for a man . in fact and you generally would not see female genitals in an american film in anything short of porn or explicit erotica . this allege double standard be less a double standard than an admittedly depressing ability to come to term culturally with the inside of woman body .</code> | <code>This is a semantically negative review.</code> | <code>1.0</code> |
* Loss: [<code>OnlineContrastiveLoss</code>](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#onlinecontrastiveloss)
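For readers who want to reproduce a comparable setup, the following is a minimal, illustrative sketch of fine-tuning with `OnlineContrastiveLoss` on a dataset with these three columns. It is not the actual training script: the toy rows, the output directory, and the `trust_remote_code` flag for the jina base model are assumptions.

```python
from datasets import Dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
)
from sentence_transformers.losses import OnlineContrastiveLoss

model = SentenceTransformer("jinaai/jina-embeddings-v3", trust_remote_code=True)

# Toy stand-in for the 50,000-sample dataset: (sentence1, sentence2, label), label 1.0 for matching pairs.
train_dataset = Dataset.from_dict({
    "sentence1": ["a bore .", "a bore ."],
    "sentence2": ["This is a semantically negative review.", "This is a semantically positive review."],
    "label": [1.0, 0.0],
})

loss = OnlineContrastiveLoss(model)          # only mines hard positives/negatives within each batch
args = SentenceTransformerTrainingArguments(
    output_dir="outputs",                    # assumption: any local directory
    num_train_epochs=3,
    per_device_train_batch_size=64,
)
trainer = SentenceTransformerTrainer(model=model, args=args, train_dataset=train_dataset, loss=loss)
trainer.train()
```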

### Training Hyperparameters
#### Non-Default Hyperparameters

- `per_device_train_batch_size`: 64
- `per_device_eval_batch_size`: 64

#### All Hyperparameters
<details><summary>Click to expand</summary>

- `overwrite_output_dir`: False
- `do_predict`: False
- `eval_strategy`: no
- `prediction_loss_only`: True
- `per_device_train_batch_size`: 64
- `per_device_eval_batch_size`: 64
- `per_gpu_train_batch_size`: None
- `per_gpu_eval_batch_size`: None
- `gradient_accumulation_steps`: 1
- `eval_accumulation_steps`: None
- `torch_empty_cache_steps`: None
- `learning_rate`: 5e-05
- `weight_decay`: 0.0
- `adam_beta1`: 0.9
- `adam_beta2`: 0.999
- `adam_epsilon`: 1e-08
- `max_grad_norm`: 1.0
- `num_train_epochs`: 3.0
- `max_steps`: -1
- `lr_scheduler_type`: linear
- `lr_scheduler_kwargs`: {}
- `warmup_ratio`: 0.0
- `warmup_steps`: 0
- `log_level`: passive
- `log_level_replica`: warning
- `log_on_each_node`: True
- `logging_nan_inf_filter`: True
- `save_safetensors`: True
- `save_on_each_node`: False
- `save_only_model`: False
- `restore_callback_states_from_checkpoint`: False
- `no_cuda`: False
- `use_cpu`: False
- `use_mps_device`: False
- `seed`: 42
- `data_seed`: None
- `jit_mode_eval`: False
- `use_ipex`: False
- `bf16`: False
- `fp16`: False
- `fp16_opt_level`: O1
- `half_precision_backend`: auto
- `bf16_full_eval`: False
- `fp16_full_eval`: False
- `tf32`: None
- `local_rank`: 0
- `ddp_backend`: None
- `tpu_num_cores`: None
- `tpu_metrics_debug`: False
- `debug`: []
- `dataloader_drop_last`: False
- `dataloader_num_workers`: 0
- `dataloader_prefetch_factor`: None
- `past_index`: -1
- `disable_tqdm`: False
- `remove_unused_columns`: True
- `label_names`: None
- `load_best_model_at_end`: False
- `ignore_data_skip`: False
- `fsdp`: []
- `fsdp_min_num_params`: 0
- `fsdp_config`: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
- `fsdp_transformer_layer_cls_to_wrap`: None
- `accelerator_config`: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
- `deepspeed`: None
- `label_smoothing_factor`: 0.0
- `optim`: adamw_torch
- `optim_args`: None
- `adafactor`: False
- `group_by_length`: False
- `length_column_name`: length
- `ddp_find_unused_parameters`: None
- `ddp_bucket_cap_mb`: None
- `ddp_broadcast_buffers`: False
- `dataloader_pin_memory`: True
- `dataloader_persistent_workers`: False
- `skip_memory_metrics`: True
- `use_legacy_prediction_loop`: False
- `push_to_hub`: False
- `resume_from_checkpoint`: None
- `hub_model_id`: None
- `hub_strategy`: every_save
- `hub_private_repo`: False
- `hub_always_push`: False
- `gradient_checkpointing`: False
- `gradient_checkpointing_kwargs`: None
- `include_inputs_for_metrics`: False
- `eval_do_concat_batches`: True
- `fp16_backend`: auto
- `push_to_hub_model_id`: None
- `push_to_hub_organization`: None
- `mp_parameters`: 
- `auto_find_batch_size`: False
- `full_determinism`: False
- `torchdynamo`: None
- `ray_scope`: last
- `ddp_timeout`: 1800
- `torch_compile`: False
- `torch_compile_backend`: None
- `torch_compile_mode`: None
- `dispatch_batches`: None
- `split_batches`: None
- `include_tokens_per_second`: False
- `include_num_input_tokens_seen`: False
- `neftune_noise_alpha`: None
- `optim_target_modules`: None
- `batch_eval_metrics`: False
- `eval_on_start`: False
- `use_liger_kernel`: False
- `eval_use_gather_object`: False
- `batch_sampler`: batch_sampler
- `multi_dataset_batch_sampler`: proportional

</details>

### Training Logs
| Epoch  | Step | Training Loss |
|:------:|:----:|:-------------:|
| 0.6394 | 500  | 0.9485        |
| 1.2788 | 1000 | 0.6908        |
| 1.9182 | 1500 | 0.7048        |
| 2.5575 | 2000 | 0.6892        |


### Framework Versions
- Python: 3.10.12
- Sentence Transformers: 3.1.1
- Transformers: 4.45.2
- PyTorch: 2.5.1+cu121
- Accelerate: 1.1.1
- Datasets: 3.1.0
- Tokenizers: 0.20.3

## Citation

### BibTeX

#### Sentence Transformers
```bibtex
@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084",
}
```

<!--
## Glossary

*Clearly define terms in order to be accessible across audiences.*
-->

<!--
## Model Card Authors

*Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.*
-->

<!--
## Model Card Contact

*Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.*
-->
config.json
ADDED
@@ -0,0 +1,67 @@
{
    "_name_or_path": "jinaai/jina-embeddings-v3",
    "architectures": [
        "XLMRobertaLoRA"
    ],
    "attention_probs_dropout_prob": 0.1,
    "auto_map": {
        "AutoConfig": "jinaai/xlm-roberta-flash-implementation--configuration_xlm_roberta.XLMRobertaFlashConfig",
        "AutoModel": "jinaai/xlm-roberta-flash-implementation--modeling_lora.XLMRobertaLoRA",
        "AutoModelForMaskedLM": "jinaai/xlm-roberta-flash-implementation--modeling_xlm_roberta.XLMRobertaForMaskedLM",
        "AutoModelForPreTraining": "jinaai/xlm-roberta-flash-implementation--modeling_xlm_roberta.XLMRobertaForPreTraining"
    },
    "bos_token_id": 0,
    "classifier_dropout": null,
    "emb_pooler": null,
    "eos_token_id": 2,
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.1,
    "hidden_size": 1024,
    "initializer_range": 0.02,
    "intermediate_size": 4096,
    "layer_norm_eps": 1e-05,
    "load_trained_adapters": true,
    "lora_adaptations": [
        "retrieval.query",
        "retrieval.passage",
        "separation",
        "classification",
        "text-matching"
    ],
    "lora_alpha": 1,
    "lora_dropout_p": 0.0,
    "lora_main_params_trainable": false,
    "lora_rank": 4,
    "matryoshka_dimensions": [
        32,
        64,
        128,
        256,
        512,
        768,
        1024
    ],
    "max_position_embeddings": 8194,
    "model_type": "xlm-roberta",
    "num_attention_heads": 16,
    "num_hidden_layers": 24,
    "output_past": true,
    "pad_token_id": 1,
    "position_embedding_type": "rotary",
    "rotary_emb_base": 20000.0,
    "task_instructions": {
        "classification": "",
        "retrieval.passage": "Represent the document for retrieval: ",
        "retrieval.query": "Represent the query for retrieving evidence documents: ",
        "separation": "",
        "text-matching": ""
    },
    "torch_dtype": "bfloat16",
    "transformers_version": "4.45.2",
    "truncate_dim": null,
    "type_vocab_size": 1,
    "use_cache": true,
    "use_flash_attn": true,
    "use_reentrant": false,
    "vocab_size": 250002
}
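The `lora_adaptations` and `task_instructions` entries above come from the jina-embeddings-v3 base model: each task name selects a LoRA adapter and, for the retrieval tasks, an instruction prefix. As a hedged sketch of pinning a default task when loading through Sentence Transformers; the routing of `model_kwargs` into this module's `model_args` is an assumption based on `custom_st.py` in this commit, which pops `default_task` from them:

```python
from sentence_transformers import SentenceTransformer

# Assumption: model_kwargs is forwarded to custom_st.Transformer as model_args,
# where 'default_task' selects the LoRA adapter used for every encode() call.
model = SentenceTransformer(
    "ELVISIO/jina_embeddings_v3_finetuned_online_contrastive_01",
    trust_remote_code=True,                       # the repo ships custom module code (custom_st.py)
    model_kwargs={"default_task": "classification"},
)
embeddings = model.encode(["a bore ."])
print(embeddings.shape)  # (1, 1024)
```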
config_sentence_transformers.json
ADDED
@@ -0,0 +1,16 @@
{
    "__version__": {
        "sentence_transformers": "3.1.1",
        "transformers": "4.45.2",
        "pytorch": "2.5.1+cu121"
    },
    "prompts": {
        "retrieval.query": "Represent the query for retrieving evidence documents: ",
        "retrieval.passage": "Represent the document for retrieval: ",
        "separation": "",
        "classification": "",
        "text-matching": ""
    },
    "default_prompt_name": null,
    "similarity_fn_name": "cosine"
}
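`similarity_fn_name` is cosine, and the module list ends with a `Normalize` step, so the returned embeddings are unit-length and cosine similarity reduces to a dot product. A small illustrative check, assuming `model` is loaded as in the README usage example:

```python
import numpy as np

emb = model.encode(["a bore .", "This is a semantically negative review."])
print(np.linalg.norm(emb, axis=1))  # ~1.0 per row, because of the Normalize module
print(emb @ emb.T)                  # dot products ...
print(model.similarity(emb, emb))   # ... match the model's cosine similarity (up to float precision)
```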
custom_st.py
ADDED
@@ -0,0 +1,229 @@
import json
import logging
import os
from io import BytesIO
from typing import Any, Dict, List, Optional, Tuple, Union

import torch
from torch import nn
from transformers import AutoConfig, AutoModel, AutoTokenizer

logger = logging.getLogger(__name__)


class Transformer(nn.Module):
    """Huggingface AutoModel to generate token embeddings.
    Loads the correct class, e.g. BERT / RoBERTa etc.

    Args:
        model_name_or_path: Huggingface models name
            (https://huggingface.co/models)
        max_seq_length: Truncate any inputs longer than max_seq_length
        model_args: Keyword arguments passed to the Huggingface
            Transformers model
        tokenizer_args: Keyword arguments passed to the Huggingface
            Transformers tokenizer
        config_args: Keyword arguments passed to the Huggingface
            Transformers config
        cache_dir: Cache dir for Huggingface Transformers to store/load
            models
        do_lower_case: If true, lowercases the input (independent if the
            model is cased or not)
        tokenizer_name_or_path: Name or path of the tokenizer. When
            None, then model_name_or_path is used
    """

    save_in_root: bool = True

    def __init__(
        self,
        model_name_or_path: str,
        max_seq_length: int = None,
        model_args: Dict[str, Any] = None,
        tokenizer_args: Dict[str, Any] = None,
        config_args: Dict[str, Any] = None,
        cache_dir: str = None,
        do_lower_case: bool = False,
        tokenizer_name_or_path: str = None,
        **kwargs,
    ) -> None:
        super().__init__()
        self.config_keys = ["max_seq_length", "do_lower_case"]
        self.do_lower_case = do_lower_case
        if model_args is None:
            model_args = {}
        if tokenizer_args is None:
            tokenizer_args = {}
        if config_args is None:
            config_args = {}

        if kwargs.get("backend", "torch") != "torch":
            logger.warning(
                f'"jinaai/jina-embeddings-v3" is currently not compatible with the {kwargs["backend"]} backend. '
                'Continuing with the "torch" backend.'
            )

        self.config = AutoConfig.from_pretrained(model_name_or_path, **config_args, cache_dir=cache_dir)

        self._lora_adaptations = self.config.lora_adaptations
        if (
            not isinstance(self._lora_adaptations, list)
            or len(self._lora_adaptations) < 1
        ):
            raise ValueError(
                f"`lora_adaptations` must be a list and contain at least one element"
            )
        self._adaptation_map = {
            name: idx for idx, name in enumerate(self._lora_adaptations)
        }

        self.default_task = model_args.pop('default_task', None)

        self.auto_model = AutoModel.from_pretrained(model_name_or_path, config=self.config, cache_dir=cache_dir, **model_args)

        if max_seq_length is not None and "model_max_length" not in tokenizer_args:
            tokenizer_args["model_max_length"] = max_seq_length
        self.tokenizer = AutoTokenizer.from_pretrained(
            tokenizer_name_or_path if tokenizer_name_or_path is not None else model_name_or_path,
            cache_dir=cache_dir,
            **tokenizer_args,
        )

        # No max_seq_length set. Try to infer from model
        if max_seq_length is None:
            if (
                hasattr(self.auto_model, "config")
                and hasattr(self.auto_model.config, "max_position_embeddings")
                and hasattr(self.tokenizer, "model_max_length")
            ):
                max_seq_length = min(self.auto_model.config.max_position_embeddings, self.tokenizer.model_max_length)

        self.max_seq_length = max_seq_length

        if tokenizer_name_or_path is not None:
            self.auto_model.config.tokenizer_class = self.tokenizer.__class__.__name__

    @property
    def default_task(self):
        return self._default_task

    @default_task.setter
    def default_task(self, task: Union[None, str]):
        self._validate_task(task)
        self._default_task = task

    def _validate_task(self, task: str):
        if task and task not in self._lora_adaptations:
            raise ValueError(
                f"Unsupported task '{task}'. "
                f"Supported tasks are: {', '.join(self.config.lora_adaptations)}. "
                f"Alternatively, don't pass the `task` argument to disable LoRA."
            )

    def forward(
        self, features: Dict[str, torch.Tensor], task: Optional[str] = None
    ) -> Dict[str, torch.Tensor]:
        """Returns token_embeddings, cls_token"""
        self._validate_task(task)
        task = task or self.default_task
        adapter_mask = None
        if task:
            task_id = self._adaptation_map[task]
            num_examples = features['input_ids'].size(0)
            adapter_mask = torch.full(
                (num_examples,), task_id, dtype=torch.int32, device=features['input_ids'].device
            )

        lora_arguments = (
            {"adapter_mask": adapter_mask} if adapter_mask is not None else {}
        )
        features.pop('prompt_length', None)
        output_states = self.auto_model.forward(**features, **lora_arguments, return_dict=False)
        output_tokens = output_states[0]
        features.update({"token_embeddings": output_tokens, "attention_mask": features["attention_mask"]})
        return features

    def get_word_embedding_dimension(self) -> int:
        return self.auto_model.config.hidden_size

    def tokenize(
        self,
        texts: Union[List[str], List[dict], List[Tuple[str, str]]],
        padding: Union[str, bool] = True
    ) -> Dict[str, torch.Tensor]:
        """Tokenizes a text and maps tokens to token-ids"""
        output = {}
        if isinstance(texts[0], str):
            to_tokenize = [texts]
        elif isinstance(texts[0], dict):
            to_tokenize = []
            output["text_keys"] = []
            for lookup in texts:
                text_key, text = next(iter(lookup.items()))
                to_tokenize.append(text)
                output["text_keys"].append(text_key)
            to_tokenize = [to_tokenize]
        else:
            batch1, batch2 = [], []
            for text_tuple in texts:
                batch1.append(text_tuple[0])
                batch2.append(text_tuple[1])
            to_tokenize = [batch1, batch2]

        # strip
        to_tokenize = [[str(s).strip() for s in col] for col in to_tokenize]

        # Lowercase
        if self.do_lower_case:
            to_tokenize = [[s.lower() for s in col] for col in to_tokenize]

        output.update(
            self.tokenizer(
                *to_tokenize,
                padding=padding,
                truncation="longest_first",
                return_tensors="pt",
                max_length=self.max_seq_length,
            )
        )
        return output

    def get_config_dict(self) -> Dict[str, Any]:
        return {key: self.__dict__[key] for key in self.config_keys}

    def save(self, output_path: str, safe_serialization: bool = True) -> None:
        self.auto_model.save_pretrained(output_path, safe_serialization=safe_serialization)
        self.tokenizer.save_pretrained(output_path)

        with open(os.path.join(output_path, "sentence_bert_config.json"), "w") as fOut:
            json.dump(self.get_config_dict(), fOut, indent=2)

    @classmethod
    def load(cls, input_path: str) -> "Transformer":
        # Old classes used other config names than 'sentence_bert_config.json'
        for config_name in [
            "sentence_bert_config.json",
            "sentence_roberta_config.json",
            "sentence_distilbert_config.json",
            "sentence_camembert_config.json",
            "sentence_albert_config.json",
            "sentence_xlm-roberta_config.json",
            "sentence_xlnet_config.json",
        ]:
            sbert_config_path = os.path.join(input_path, config_name)
            if os.path.exists(sbert_config_path):
                break

        with open(sbert_config_path) as fIn:
            config = json.load(fIn)
        # Don't allow configs to set trust_remote_code
        if "model_args" in config and "trust_remote_code" in config["model_args"]:
            config["model_args"].pop("trust_remote_code")
        if "tokenizer_args" in config and "trust_remote_code" in config["tokenizer_args"]:
            config["tokenizer_args"].pop("trust_remote_code")
        if "config_args" in config and "trust_remote_code" in config["config_args"]:
            config["config_args"].pop("trust_remote_code")
        return cls(model_name_or_path=input_path, **config)
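The `forward` above turns the selected task into an `adapter_mask` that routes every example in the batch through one LoRA adapter. At the `SentenceTransformer` level this surfaces as an extra `task` keyword on `encode`; a hedged usage sketch, assuming `model` is already loaded with `trust_remote_code=True` and that extra encode kwargs are forwarded to this module's `forward` as in the jina-embeddings-v3 base model:

```python
# Hedged sketch: `task` selects one of config.json's lora_adaptations;
# the "text-matching" prompt is empty, so prompt_name here only keeps prompt and adapter consistent.
emb_match = model.encode(
    ["the film be hackneyed tripe ."],
    task="text-matching",
    prompt_name="text-matching",
)
print(emb_match.shape)  # (1, 1024)
```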
model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a48c0e7f02f023e8ba9391589e44367b1f80ead3daa92c55a0300124b28779ba
size 1144685320
modules.json
ADDED
@@ -0,0 +1,20 @@
[
  {
    "idx": 0,
    "name": "transformer",
    "path": "",
    "type": "custom_st.Transformer"
  },
  {
    "idx": 1,
    "name": "pooler",
    "path": "1_Pooling",
    "type": "sentence_transformers.models.Pooling"
  },
  {
    "idx": 2,
    "name": "normalizer",
    "path": "2_Normalize",
    "type": "sentence_transformers.models.Normalize"
  }
]
sentence_bert_config.json
ADDED
@@ -0,0 +1,4 @@
{
  "max_seq_length": 8194,
  "do_lower_case": false
}
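`max_seq_length` caps inputs at 8194 tokens, matching the base model's `max_position_embeddings`; longer inputs are truncated. A hedged aside: the limit can be lowered after loading to trade context for speed and memory (the value 512 below is only an example, and `model` is assumed to be loaded as in the README):

```python
print(model.max_seq_length)   # 8194, read from sentence_bert_config.json
model.max_seq_length = 512    # example: truncate long reviews earlier to save memory
```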
special_tokens_map.json
ADDED
@@ -0,0 +1,51 @@
{
  "bos_token": {
    "content": "<s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "cls_token": {
    "content": "<s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "</s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "mask_token": {
    "content": "<mask>",
    "lstrip": true,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<pad>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "sep_token": {
    "content": "</s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "<unk>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer.json
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3e19cd8c08f528b481e909f73dbd1fd62b1e8b1117579ba205e477801237f9e0
size 17082988
tokenizer_config.json
ADDED
@@ -0,0 +1,54 @@
{
  "added_tokens_decoder": {
    "0": {
      "content": "<s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "<pad>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "2": {
      "content": "</s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "3": {
      "content": "<unk>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "250001": {
      "content": "<mask>",
      "lstrip": true,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "bos_token": "<s>",
  "clean_up_tokenization_spaces": true,
  "cls_token": "<s>",
  "eos_token": "</s>",
  "mask_token": "<mask>",
  "model_max_length": 8194,
  "pad_token": "<pad>",
  "sep_token": "</s>",
  "tokenizer_class": "XLMRobertaTokenizer",
  "unk_token": "<unk>"
}