upskyy committed
Commit 9d2ff40
1 Parent(s): 69e0a0c

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
1_Pooling/config.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "word_embedding_dimension": 768,
+   "pooling_mode_cls_token": false,
+   "pooling_mode_mean_tokens": true,
+   "pooling_mode_max_tokens": false,
+   "pooling_mode_mean_sqrt_len_tokens": false,
+   "pooling_mode_weightedmean_tokens": false,
+   "pooling_mode_lasttoken": false,
+   "include_prompt": true
+ }
README.md CHANGED
@@ -1,3 +1,341 @@
- ---
- license: apache-2.0
- ---
+ ---
+ tags:
+ - korean
+ - sentence-transformers
+ - transformers
+ - multilingual
+ - sentence-similarity
+ - feature-extraction
+ license: apache-2.0
+ language:
+ - af
+ - ar
+ - az
+ - be
+ - bg
+ - bn
+ - ca
+ - ceb
+ - cs
+ - cy
+ - da
+ - de
+ - el
+ - en
+ - es
+ - et
+ - eu
+ - fa
+ - fi
+ - fr
+ - gl
+ - gu
+ - he
+ - hi
+ - hr
+ - ht
+ - hu
+ - hy
+ - id
+ - is
+ - it
+ - ja
+ - jv
+ - ka
+ - kk
+ - km
+ - kn
+ - ko
+ - ky
+ - lo
+ - lt
+ - lv
+ - mk
+ - ml
+ - mn
+ - mr
+ - ms
+ - my
+ - ne
+ - nl
+ - 'no'
+ - pa
+ - pl
+ - pt
+ - qu
+ - ro
+ - ru
+ - si
+ - sk
+ - sl
+ - so
+ - sq
+ - sr
+ - sv
+ - sw
+ - ta
+ - te
+ - th
+ - tl
+ - tr
+ - uk
+ - ur
+ - vi
+ - yo
+ - zh
+ library_name: sentence-transformers
+ base_model: Alibaba-NLP/gte-multilingual-base
+ datasets: []
+ metrics:
+ - pearson_cosine
+ - spearman_cosine
+ - pearson_manhattan
+ - spearman_manhattan
+ - pearson_euclidean
+ - spearman_euclidean
+ - pearson_dot
+ - spearman_dot
+ - pearson_max
+ - spearman_max
+ widget:
+ - source_sentence: 이집트 군대가 형제애를 단속하다
+   sentences:
+   - 이집트의 군대가 무슬림 형제애를 단속하다
+   - 아르헨티나의 기예르모 코리아와 네덜란드의 마틴 버커크의 또 다른 준결승전도 매력적이다.
+   - 그것이 사실일 수도 있다고 생각하는 것은 재미있다.
+ - source_sentence: 오, 그리고 다시 결혼은 근본적인 인권이라고 주장한다.
+   sentences:
+   - 특히 결혼은 근본적인 인권이라고 말한 후에.
+   - 해변에 있는 흑인과 그의 개...
+   - 이란은 핵 프로그램이 평화적인 목적을 위한 것이라고 주장한다
+ - source_sentence: 두 남자가 난간에 상자를 올려놓고 있다.
+   sentences:
+   - 심장 박동이 빨라졌다.
+   - 두 남자가 집에 있고, 깊이 잠들어 있다.
+   - 두 남자가 난간에 상자를 놓고 있다.
+ - source_sentence: 조지 샤힌은 안데르센 컨설팅 사업부에서 일했다.
+   sentences:
+   - 안데르센 컨설팅은 여전히 번창하는 사업이다.
+   - 공개 전시 중에 총이 경례한다.
+   - 이것은 내가 영국의 아서 안데르센 사업부의 파트너인 짐 와디아를 아서 안데르센 경영진이 선택한 것보다 래리 웨인바흐를 안데르센 월드와이드의
+     경영 파트너로 승계하기 위해 안데르센 컨설팅 사업부(현재의 엑센츄어라고 알려져 있음)의 전 관리 파트너인 조지 샤힌에 대한 지지를 표명했을
+     때 가장 명백했다.
+ - source_sentence: 아이를 가진 엄마가 해변을 걷는다.
+   sentences:
+   - 국립공원에서 가장 큰 마을인 케스윅의 인구는 매년 여름 등산객, 뱃사람, 관광객이 도착함에 따라 증가한다.
+   - 한 남자가 해변에서 개를 산책시킨다.
+   - 두 사람이 해변을 걷는다.
+ pipeline_tag: sentence-similarity
+ model-index:
+ - name: upskyy/gte-korean-base
+   results:
+   - task:
+       type: semantic-similarity
+       name: Semantic Similarity
+     dataset:
+       name: sts dev
+       type: sts-dev
+     metrics:
+     - type: pearson_cosine
+       value: 0.8681402442523579
+       name: Pearson Cosine
+     - type: spearman_cosine
+       value: 0.8689161244129222
+       name: Spearman Cosine
+     - type: pearson_manhattan
+       value: 0.7793706671294577
+       name: Pearson Manhattan
+     - type: spearman_manhattan
+       value: 0.7816816816264681
+       name: Spearman Manhattan
+     - type: pearson_euclidean
+       value: 0.7810210343196274
+       name: Pearson Euclidean
+     - type: spearman_euclidean
+       value: 0.7835693502057339
+       name: Spearman Euclidean
+     - type: pearson_dot
+       value: 0.71802928588865
+       name: Pearson Dot
+     - type: spearman_dot
+       value: 0.7552957785734216
+       name: Spearman Dot
+     - type: pearson_max
+       value: 0.8681402442523579
+       name: Pearson Max
+     - type: spearman_max
+       value: 0.8689161244129222
+       name: Spearman Max
+ ---
+ 
+ # SentenceTransformer based on Alibaba-NLP/gte-multilingual-base
+ 
+ This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [Alibaba-NLP/gte-multilingual-base](https://huggingface.co/Alibaba-NLP/gte-multilingual-base). It maps sentences & paragraphs to a 768-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.
+ 
+ ## Model Details
+ 
+ ### Model Description
+ - **Model Type:** Sentence Transformer
+ - **Base model:** [Alibaba-NLP/gte-multilingual-base](https://huggingface.co/Alibaba-NLP/gte-multilingual-base) <!-- at revision 2098722cb5e9d7f96f46df0496f4d34b7338f79c -->
+ - **Maximum Sequence Length:** 8192 tokens
+ - **Output Dimensionality:** 768 dimensions
+ - **Similarity Function:** Cosine Similarity
+ <!-- - **Training Dataset:** Unknown -->
+ <!-- - **Language:** Unknown -->
+ <!-- - **License:** Unknown -->
+ 
+ ### Full Model Architecture
+ 
+ ```
+ SentenceTransformer(
+   (0): Transformer({'max_seq_length': 8192, 'do_lower_case': False}) with Transformer model: NewModel
+   (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
+ )
+ ```
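+ 
+ The pooling module (1) is plain masked mean pooling, per `1_Pooling/config.json` (`pooling_mode_mean_tokens: true`). As a minimal sketch of what it computes (hypothetical tensors; not a snippet from the original card):
+ 
+ ```python
+ import torch
+ 
+ def mean_pool(token_embeddings: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+     # Zero out padding positions, then average the surviving token embeddings
+     mask = attention_mask.unsqueeze(-1).type_as(token_embeddings)  # (batch, seq, 1)
+     summed = (token_embeddings * mask).sum(dim=1)                  # (batch, 768)
+     counts = mask.sum(dim=1).clamp(min=1e-9)                       # avoid division by zero
+     return summed / counts
+ ```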
+ 
+ ## Usage
+ 
+ ### Direct Usage (Sentence Transformers)
+ 
+ First install the Sentence Transformers library:
+ 
+ ```bash
+ pip install -U sentence-transformers
+ ```
+ 
+ Then you can load this model and run inference.
+ ```python
+ from sentence_transformers import SentenceTransformer
+ 
+ # Download from the 🤗 Hub (the custom modeling code requires trust_remote_code)
+ model = SentenceTransformer("upskyy/gte-korean-base", trust_remote_code=True)
+ 
+ # Run inference
+ sentences = [
+     '아이를 가진 엄마가 해변을 걷는다.',
+     '두 사람이 해변을 걷는다.',
+     '한 남자가 해변에서 개를 산책시킨다.',
+ ]
+ embeddings = model.encode(sentences)
+ print(embeddings.shape)
+ # [3, 768]
+ 
+ # Get the similarity scores for the embeddings
+ similarities = model.similarity(embeddings, embeddings)
+ print(similarities.shape)
+ # [3, 3]
+ ```
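+ 
+ If you would rather call 🤗 Transformers directly, the following is a hedged sketch (not from the original card): the checkpoint ships custom modeling code (`"NewModel"` via `auto_map`), so `trust_remote_code=True` is required, and the mean pooling from `1_Pooling/config.json` is applied by hand.
+ 
+ ```python
+ import torch
+ from transformers import AutoModel, AutoTokenizer
+ 
+ tokenizer = AutoTokenizer.from_pretrained("upskyy/gte-korean-base")
+ model = AutoModel.from_pretrained("upskyy/gte-korean-base", trust_remote_code=True)
+ 
+ inputs = tokenizer(["아이를 가진 엄마가 해변을 걷는다."], padding=True, return_tensors="pt")
+ with torch.no_grad():
+     token_embeddings = model(**inputs).last_hidden_state
+ 
+ # Masked mean pooling, matching the Pooling module configuration
+ mask = inputs["attention_mask"].unsqueeze(-1).float()
+ embeddings = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
+ print(embeddings.shape)  # torch.Size([1, 768])
+ ```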
+ 
+ <!--
+ ### Direct Usage (Transformers)
+ 
+ <details><summary>Click to see the direct usage in Transformers</summary>
+ 
+ </details>
+ -->
+ 
+ <!--
+ ### Downstream Usage (Sentence Transformers)
+ 
+ You can finetune this model on your own dataset.
+ 
+ <details><summary>Click to expand</summary>
+ 
+ </details>
+ -->
+ 
+ <!--
+ ### Out-of-Scope Use
+ 
+ *List how the model may foreseeably be misused and address what users ought not to do with the model.*
+ -->
+ 
+ ## Evaluation
+ 
+ ### Metrics
+ 
+ #### Semantic Similarity
+ * Dataset: `sts-dev`
+ * Evaluated with [<code>EmbeddingSimilarityEvaluator</code>](https://sbert.net/docs/package_reference/sentence_transformer/evaluation.html#sentence_transformers.evaluation.EmbeddingSimilarityEvaluator)
+ 
+ | Metric             | Value      |
+ |:-------------------|:-----------|
+ | pearson_cosine     | 0.8681     |
+ | spearman_cosine    | 0.8689     |
+ | pearson_manhattan  | 0.7794     |
+ | spearman_manhattan | 0.7817     |
+ | pearson_euclidean  | 0.7810     |
+ | spearman_euclidean | 0.7836     |
+ | pearson_dot        | 0.7180     |
+ | spearman_dot       | 0.7553     |
+ | pearson_max        | 0.8681     |
+ | **spearman_max**   | **0.8689** |
+ 
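+ The card does not name the exact dev split, but scores of this shape come from the evaluator linked above; a hedged sketch with placeholder Korean STS pairs (the pairs and gold scores below are hypothetical, normalized to [0, 1]):
+ 
+ ```python
+ from sentence_transformers import SentenceTransformer
+ from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
+ 
+ model = SentenceTransformer("upskyy/gte-korean-base", trust_remote_code=True)
+ 
+ # Placeholder sentence pairs with hypothetical gold similarity scores
+ sentences1 = ["두 남자가 난간에 상자를 올려놓고 있다.", "아이를 가진 엄마가 해변을 걷는다."]
+ sentences2 = ["두 남자가 난간에 상자를 놓고 있다.", "한 남자가 해변에서 개를 산책시킨다."]
+ gold_scores = [0.9, 0.3]
+ 
+ evaluator = EmbeddingSimilarityEvaluator(sentences1, sentences2, gold_scores, name="sts-dev")
+ print(evaluator(model))  # e.g. {'sts-dev_pearson_cosine': ..., 'sts-dev_spearman_cosine': ..., ...}
+ ```
+ 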
+ <!--
+ ## Bias, Risks and Limitations
+ 
+ *What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
+ -->
+ 
+ <!--
+ ### Recommendations
+ 
+ *What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
+ -->
+ 
+ ### Framework Versions
+ - Python: 3.10.13
+ - Sentence Transformers: 3.0.1
+ - Transformers: 4.42.4
+ - PyTorch: 2.3.0+cu121
+ - Accelerate: 0.30.1
+ - Datasets: 2.16.1
+ - Tokenizers: 0.19.1
+ 
+ ## Citation
+ 
+ ### BibTeX
+ 
+ ```bibtex
+ @misc{zhang2024mgte,
+   title={mGTE: Generalized Long-Context Text Representation and Reranking Models for Multilingual Text Retrieval},
+   author={Xin Zhang and Yanzhao Zhang and Dingkun Long and Wen Xie and Ziqi Dai and Jialong Tang and Huan Lin and Baosong Yang and Pengjun Xie and Fei Huang and Meishan Zhang and Wenjie Li and Min Zhang},
+   year={2024},
+   eprint={2407.19669},
+   archivePrefix={arXiv},
+   primaryClass={cs.CL},
+   url={https://arxiv.org/abs/2407.19669},
+ }
+ ```
+ 
+ ```bibtex
+ @inproceedings{reimers-2019-sentence-bert,
+   title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
+   author = "Reimers, Nils and Gurevych, Iryna",
+   booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
+   month = "11",
+   year = "2019",
+   publisher = "Association for Computational Linguistics",
+   url = "https://arxiv.org/abs/1908.10084",
+ }
+ ```
+ 
+ <!--
+ ## Glossary
+ 
+ *Clearly define terms in order to be accessible across audiences.*
+ -->
+ 
+ <!--
+ ## Model Card Authors
+ 
+ *Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.*
+ -->
+ 
+ <!--
+ ## Model Card Contact
+ 
+ *Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.*
+ -->
config.json ADDED
@@ -0,0 +1,50 @@
+ {
+   "_name_or_path": "Alibaba-NLP/gte-multilingual-base",
+   "architectures": [
+     "NewModel"
+   ],
+   "attention_probs_dropout_prob": 0.0,
+   "auto_map": {
+     "AutoConfig": "Alibaba-NLP/new-impl--configuration.NewConfig",
+     "AutoModel": "Alibaba-NLP/new-impl--modeling.NewModel",
+     "AutoModelForMaskedLM": "Alibaba-NLP/new-impl--modeling.NewForMaskedLM",
+     "AutoModelForMultipleChoice": "Alibaba-NLP/new-impl--modeling.NewForMultipleChoice",
+     "AutoModelForQuestionAnswering": "Alibaba-NLP/new-impl--modeling.NewForQuestionAnswering",
+     "AutoModelForSequenceClassification": "Alibaba-NLP/new-impl--modeling.NewForSequenceClassification",
+     "AutoModelForTokenClassification": "Alibaba-NLP/new-impl--modeling.NewForTokenClassification"
+   },
+   "classifier_dropout": 0.0,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "id2label": {
+     "0": "LABEL_0"
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "label2id": {
+     "LABEL_0": 0
+   },
+   "layer_norm_eps": 1e-12,
+   "layer_norm_type": "layer_norm",
+   "logn_attention_clip1": false,
+   "logn_attention_scale": false,
+   "max_position_embeddings": 8192,
+   "model_type": "new",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pack_qkv": true,
+   "pad_token_id": 1,
+   "position_embedding_type": "rope",
+   "rope_scaling": {
+     "factor": 8.0,
+     "type": "ntk"
+   },
+   "rope_theta": 20000,
+   "torch_dtype": "float32",
+   "transformers_version": "4.42.4",
+   "type_vocab_size": 1,
+   "unpad_inputs": false,
+   "use_memory_efficient_attention": false,
+   "vocab_size": 250048
+ }
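Since `model_type` is `"new"` and `auto_map` points at the Alibaba-NLP/new-impl repository, plain 🤗 Transformers needs `trust_remote_code=True` even just to parse this config. A minimal sanity check (hypothetical usage, not part of the upload):

```python
from transformers import AutoConfig

# The custom NewConfig class is fetched from Alibaba-NLP/new-impl at load time
cfg = AutoConfig.from_pretrained("upskyy/gte-korean-base", trust_remote_code=True)
print(cfg.max_position_embeddings)  # 8192
print(cfg.rope_scaling)             # {'factor': 8.0, 'type': 'ntk'}
```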
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0bf4dfd202a39a084bcf63f8d00d8f47ab5d0afd04367bbfd24a44c0e66ecd55
+ size 1221487872
modules.json ADDED
@@ -0,0 +1,14 @@
+ [
+   {
+     "idx": 0,
+     "name": "0",
+     "path": "",
+     "type": "sentence_transformers.models.Transformer"
+   },
+   {
+     "idx": 1,
+     "name": "1",
+     "path": "1_Pooling",
+     "type": "sentence_transformers.models.Pooling"
+   }
+ ]
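These two entries are what `SentenceTransformer` assembles at load time: module 0 is the Transformer backbone, module 1 the mean-pooling head stored in `1_Pooling/`. A hedged sketch of building the same pipeline by hand (argument names per sentence-transformers 3.x; values taken from the configs in this upload):

```python
from sentence_transformers import SentenceTransformer, models

# Module 0: Transformer backbone (custom modeling code, hence trust_remote_code)
word = models.Transformer(
    "upskyy/gte-korean-base",
    max_seq_length=8192,
    model_args={"trust_remote_code": True},
    config_args={"trust_remote_code": True},
)
# Module 1: mean pooling, as in 1_Pooling/config.json
pool = models.Pooling(word.get_word_embedding_dimension(), pooling_mode="mean")

model = SentenceTransformer(modules=[word, pool])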
sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "max_seq_length": 8192,
+   "do_lower_case": false
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "cls_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "<mask>",
+     "lstrip": true,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aa7a6ad87a7ce8fe196787355f6af7d03aee94d19c54a5eb1392ed18c8ef451a
+ size 17082988
tokenizer_config.json ADDED
@@ -0,0 +1,54 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "250001": {
+       "content": "<mask>",
+       "lstrip": true,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "<s>",
+   "eos_token": "</s>",
+   "mask_token": "<mask>",
+   "model_max_length": 8192,
+   "pad_token": "<pad>",
+   "sep_token": "</s>",
+   "tokenizer_class": "XLMRobertaTokenizer",
+   "unk_token": "<unk>"
+ }
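The tokenizer is the stock XLM-RoBERTa tokenizer (250k-entry vocabulary per `config.json`) with the model max length raised to 8192. A quick hedged sanity check (hypothetical usage, not part of the upload):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("upskyy/gte-korean-base")
print(type(tok).__name__, tok.model_max_length)      # XLMRobertaTokenizerFast 8192
print(tok.cls_token, tok.sep_token, tok.mask_token)  # <s> </s> <mask>
```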