thenlper committed
Commit 167d84d · verified · 1 Parent(s): 087a024

Update README.md

Files changed (1)
  1. README.md +46 -11
README.md CHANGED
@@ -4662,6 +4662,8 @@ refer to [enable-unpadding-and-xformers](https://huggingface.co/Alibaba-NLP/new-
```
# Requires transformers>=4.36.0

+ # Requires transformers>=4.36.0
+
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer

@@ -4672,9 +4674,9 @@ input_texts = [
"快排算法介绍"
]

- model_path = 'Alibaba-NLP/gte-multilingual-base'
- tokenizer = AutoTokenizer.from_pretrained(model_path)
- model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
+ model_name_or_path = 'Alibaba-NLP/gte-multilingual-base'
+ tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
+ model = AutoModel.from_pretrained(model_name_or_path, trust_remote_code=True)

# Tokenize the input texts
batch_dict = tokenizer(input_texts, max_length=8192, padding=True, truncation=True, return_tensors='pt')
@@ -4687,12 +4689,17 @@ embeddings = outputs.last_hidden_state[:, 0][:dimension]
embeddings = F.normalize(embeddings, p=2, dim=1)
scores = (embeddings[:1] @ embeddings[1:].T) * 100
print(scores.tolist())
+
+ # [[0.3016996383666992, 0.7503870129585266, 0.3203084468841553]]
```

### Use with sentence-transformers
```
+ # Requires sentence-transformers>=3.0.0
+
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
+ import numpy as np

input_texts = [
"what is the capital of China?",
@@ -4701,17 +4708,31 @@ input_texts = [
"快排算法介绍"
]

- model = SentenceTransformer('Alibaba-NLP/gte-multilingual-base', trust_remote_code=True)
- embeddings = model.encode(input_texts)
+ model_name_or_path="Alibaba-NLP/gte-multilingual-base"
+ model = SentenceTransformer(model_name_or_path, trust_remote_code=True)
+ embeddings = model.encode(input_texts) # embeddings.shape (4, 768)
+
+ # normalized embeddings
+ norms = np.linalg.norm(embeddings, ord=2, axis=1, keepdims=True)
+ norms[norms == 0] = 1
+ embeddings = embeddings / norms
+
+ # sim scores
+ scores = (embeddings[:1] @ embeddings[1:].T)
+
+ print(scores.tolist())
+ # [[0.301699697971344, 0.7503870129585266, 0.32030850648880005]]
+
```

### Use with custom code to get dense embeddings and sparse token weights
```
- # You can find the gte_embeddings.py in https://huggingface.co/Alibaba-NLP/gte-multilingual-base/blob/main/scripts/gte_embedding.py
- from gte_embeddings import GTEEmbeddidng
+ # You can find the script gte_embedding.py in https://huggingface.co/Alibaba-NLP/gte-multilingual-base/blob/main/scripts/gte_embedding.py

- model_path = 'Alibaba-NLP/gte-multilingual-base'
- model = GTEEmbeddidng(model_path)
+ from gte_embedding import GTEEmbeddidng
+
+ model_name_or_path = 'Alibaba-NLP/gte-multilingual-base'
+ model = GTEEmbeddidng(model_name_or_path)
query = "中国的首都在哪儿"

docs = [
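
The sentence-transformers snippet added in the hunk above imports `cos_sim` from `sentence_transformers.util` but then normalizes the embeddings by hand with numpy. A minimal sketch (not part of the commit) of the same comparison going through `cos_sim`, which L2-normalizes internally; it uses the query and one document visible in the diff plus one placeholder document, so the printed scores will not match the example output above.

```
# Sketch: cos_sim instead of manual numpy normalization (not part of the commit).
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim

model = SentenceTransformer("Alibaba-NLP/gte-multilingual-base", trust_remote_code=True)

query = ["what is the capital of China?"]            # from the example above
docs = ["Beijing is the capital of China.",          # placeholder document, not from the README
        "快排算法介绍"]                                 # from the example above

query_emb = model.encode(query)                      # numpy array, shape (1, 768)
doc_emb = model.encode(docs)                         # numpy array, shape (2, 768)

# cos_sim normalizes both sides and returns a torch.Tensor of pairwise cosine similarities
print(cos_sim(query_emb, doc_emb).tolist())
```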
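
The next hunk below adds example outputs for `dense_scores`, `sparse_scores`, and `hybrid_scores`. As a quick sanity check (a sketch, not part of the commit, and assuming `compute_scores` combines the two signals linearly), the reported hybrid values are consistent with `dense_weight * dense + sparse_weight * sparse` at the weights used in the example:

```
# Recombine the dense and sparse scores reported in the hunk below
# (dense_weight=1.0, sparse_weight=0.3).
dense = [0.85302734375, 0.257568359375, 0.76953125, 0.325439453125]
sparse = [0.0, 0.0, 4.600879669189453, 1.570279598236084]

hybrid = [d + 0.3 * s for d, s in zip(dense, sparse)]
print(hybrid)
# Matches the reported hybrid_scores up to floating-point rounding:
# [0.853..., 0.2576..., 2.1498..., 0.7965...]
```
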
@@ -4727,12 +4748,26 @@ print('token_weights', embs['token_weights'])
pairs = [(query, doc) for doc in docs]
dense_scores = model.compute_scores(pairs, dense_weight=1.0, sparse_weight=0.0)
sparse_scores = model.compute_scores(pairs, dense_weight=0.0, sparse_weight=1.0)
- hybird_scores = model.compute_scores(pairs, dense_weight=1.0, sparse_weight=0.3)
+ hybrid_scores = model.compute_scores(pairs, dense_weight=1.0, sparse_weight=0.3)
+
print('dense_scores', dense_scores)
print('sparse_scores', sparse_scores)
- print('hybird_scores', hybird_scores)
+ print('hybrid_scores', hybrid_scores)
+
+ # dense_scores [0.85302734375, 0.257568359375, 0.76953125, 0.325439453125]
+ # sparse_scores [0.0, 0.0, 4.600879669189453, 1.570279598236084]
+ # hybrid_scores [0.85302734375, 0.257568359375, 2.1497951507568356, 0.7965233325958252]
+
```

+ ## Evaluation
+
+ We validated the performance of the **gte-multilingual-base** model on multiple downstream tasks, including multilingual retrieval, cross-lingual retrieval, long text retrieval, and general text representation evaluation on the [MTEB Leaderboard](https://huggingface.co/spaces/mteb/leaderboard), among others.
+
+ ### Retrieval Task
+
+ ![avatar](./images/mgte-retrieval.pdf)
+
## Citation
```
@misc{zhang2024mgte,
 