thenlper committed
Commit 167d84d · verified · 1 Parent(s): 087a024

Update README.md

Files changed (1)
  1. README.md +46 -11
README.md CHANGED
@@ -4662,6 +4662,8 @@ refer to [enable-unpadding-and-xformers](https://huggingface.co/Alibaba-NLP/new-
```
# Requires transformers>=4.36.0

+ # Requires transformers>=4.36.0
+
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer

@@ -4672,9 +4674,9 @@ input_texts = [
"快排算法介绍"
]

- model_path = 'Alibaba-NLP/gte-multilingual-base'
- tokenizer = AutoTokenizer.from_pretrained(model_path)
- model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
+ model_name_or_path = 'Alibaba-NLP/gte-multilingual-base'
+ tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
+ model = AutoModel.from_pretrained(model_name_or_path, trust_remote_code=True)

# Tokenize the input texts
batch_dict = tokenizer(input_texts, max_length=8192, padding=True, truncation=True, return_tensors='pt')
@@ -4687,12 +4689,17 @@ embeddings = outputs.last_hidden_state[:, 0][:dimension]
embeddings = F.normalize(embeddings, p=2, dim=1)
scores = (embeddings[:1] @ embeddings[1:].T) * 100
print(scores.tolist())
+
+ # [[0.3016996383666992, 0.7503870129585266, 0.3203084468841553]]
```

### Use with sentence-transformers
```
+ # Requires sentence-transformers>=3.0.0
+
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
+ import numpy as np

input_texts = [
"what is the capital of China?",
@@ -4701,17 +4708,31 @@ input_texts = [
"快排算法介绍"
]

- model = SentenceTransformer('Alibaba-NLP/gte-multilingual-base', trust_remote_code=True)
- embeddings = model.encode(input_texts)
+ model_name_or_path="Alibaba-NLP/gte-multilingual-base"
+ model = SentenceTransformer(model_name_or_path, trust_remote_code=True)
+ embeddings = model.encode(input_texts) # embeddings.shape (4, 768)
+
+ # normalized embeddings
+ norms = np.linalg.norm(embeddings, ord=2, axis=1, keepdims=True)
+ norms[norms == 0] = 1
+ embeddings = embeddings / norms
+
+ # sim scores
+ scores = (embeddings[:1] @ embeddings[1:].T)
+
+ print(scores.tolist())
+ # [[0.301699697971344, 0.7503870129585266, 0.32030850648880005]]
+
```

### Use with custom code to get dense embeddings and sparse token weights
```
- # You can find the gte_embeddings.py in https://huggingface.co/Alibaba-NLP/gte-multilingual-base/blob/main/scripts/gte_embedding.py
- from gte_embeddings import GTEEmbeddidng
+ # You can find the script gte_embedding.py in https://huggingface.co/Alibaba-NLP/gte-multilingual-base/blob/main/scripts/gte_embedding.py

- model_path = 'Alibaba-NLP/gte-multilingual-base'
- model = GTEEmbeddidng(model_path)
+ from gte_embedding import GTEEmbeddidng
+
+ model_name_or_path = 'Alibaba-NLP/gte-multilingual-base'
+ model = GTEEmbeddidng(model_name_or_path)
query = "中国的首都在哪儿"

docs = [
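
The sentence-transformers snippet added in the hunk above imports `cos_sim` from `sentence_transformers.util` but then normalizes the embeddings by hand with numpy. A minimal sketch (not part of the commit) of the same comparison going through `cos_sim`, which L2-normalizes internally; it uses the query and one document visible in the diff plus one placeholder document, so the printed scores will not match the example output above.

```
# Sketch: cos_sim instead of manual numpy normalization (not part of the commit).
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim

model = SentenceTransformer("Alibaba-NLP/gte-multilingual-base", trust_remote_code=True)

query = ["what is the capital of China?"]            # from the example above
docs = ["Beijing is the capital of China.",          # placeholder document, not from the README
        "快排算法介绍"]                                 # from the example above

query_emb = model.encode(query)                      # numpy array, shape (1, 768)
doc_emb = model.encode(docs)                         # numpy array, shape (2, 768)

# cos_sim normalizes both sides and returns a torch.Tensor of pairwise cosine similarities
print(cos_sim(query_emb, doc_emb).tolist())
```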
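
The next hunk below adds example outputs for `dense_scores`, `sparse_scores`, and `hybrid_scores`. As a quick sanity check (a sketch, not part of the commit, and assuming `compute_scores` combines the two signals linearly), the reported hybrid values are consistent with `dense_weight * dense + sparse_weight * sparse` at the weights used in the example:

```
# Recombine the dense and sparse scores reported in the hunk below
# (dense_weight=1.0, sparse_weight=0.3).
dense = [0.85302734375, 0.257568359375, 0.76953125, 0.325439453125]
sparse = [0.0, 0.0, 4.600879669189453, 1.570279598236084]

hybrid = [d + 0.3 * s for d, s in zip(dense, sparse)]
print(hybrid)
# Matches the reported hybrid_scores up to floating-point rounding:
# [0.853..., 0.2576..., 2.1498..., 0.7965...]
```
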
@@ -4727,12 +4748,26 @@ print('token_weights', embs['token_weights'])
pairs = [(query, doc) for doc in docs]
dense_scores = model.compute_scores(pairs, dense_weight=1.0, sparse_weight=0.0)
sparse_scores = model.compute_scores(pairs, dense_weight=0.0, sparse_weight=1.0)
- hybird_scores = model.compute_scores(pairs, dense_weight=1.0, sparse_weight=0.3)
+ hybrid_scores = model.compute_scores(pairs, dense_weight=1.0, sparse_weight=0.3)
+
print('dense_scores', dense_scores)
print('sparse_scores', sparse_scores)
- print('hybird_scores', hybird_scores)
+ print('hybrid_scores', hybrid_scores)
+
+ # dense_scores [0.85302734375, 0.257568359375, 0.76953125, 0.325439453125]
+ # sparse_scores [0.0, 0.0, 4.600879669189453, 1.570279598236084]
+ # hybrid_scores [0.85302734375, 0.257568359375, 2.1497951507568356, 0.7965233325958252]
+
```

+ ## Evaluation
+
+ We validated the performance of the **gte-multilingual-base** model on multiple downstream tasks, including multilingual retrieval, cross-lingual retrieval, long text retrieval, and general text representation evaluation on the [MTEB Leaderboard](https://huggingface.co/spaces/mteb/leaderboard), among others.
+
+ ### Retrieval Task
+
+ ![avatar](./images/mgte-retrieval.pdf)
+
## Citation
```
@misc{zhang2024mgte,
 