update README.md

README.md
@@ -2614,7 +2614,6 @@ import torch.nn.functional as F
 
 from torch import Tensor
 from transformers import AutoTokenizer, AutoModel
-from transformers.modeling_outputs import BaseModelOutput
 
 
 def average_pool(last_hidden_states: Tensor,
@@ -2636,7 +2635,7 @@ model = AutoModel.from_pretrained('intfloat/e5-base')
 # Tokenize the input texts
 batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt')
 
-outputs: BaseModelOutput = model(**batch_dict)
+outputs = model(**batch_dict)
 embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
 
 # (Optionally) normalize embeddings
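For context, here is a minimal, self-contained sketch of the usage snippet this hunk edits, reassembled from the surrounding hunks. The body of `average_pool`, the example `input_texts`, the `tokenizer` line, and the final `F.normalize` call are assumptions filled in for illustration; they are not part of this diff:

```
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel


def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    # Mask out padding positions, then mean-pool the remaining token embeddings
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]


# Example inputs (assumed); E5 models expect "query: " / "passage: " prefixes
input_texts = ['query: how much protein should a female eat',
               'passage: As a general guideline, adult women need roughly 46 grams of protein per day.']

tokenizer = AutoTokenizer.from_pretrained('intfloat/e5-base')
model = AutoModel.from_pretrained('intfloat/e5-base')

# Tokenize the input texts
batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt')

outputs = model(**batch_dict)
embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

# (Optionally) normalize embeddings
embeddings = F.normalize(embeddings, p=2, dim=1)
```

The dropped type annotation (and its `BaseModelOutput` import in the first hunk) was purely cosmetic; `model(**batch_dict)` returns the same output object either way.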
@@ -2654,3 +2653,20 @@ Please refer to our paper at [https://arxiv.org/pdf/2212.03533.pdf](https://arxiv.org/pdf/2212.03533.pdf)
 Check out [unilm/e5](https://github.com/microsoft/unilm/tree/master/e5) to reproduce evaluation results
 on the [BEIR](https://arxiv.org/abs/2104.08663) and [MTEB benchmark](https://arxiv.org/abs/2210.07316).
 
+## Citation
+
+If you find our paper or models helpful, please consider citing it as follows:
+
+```
+@article{wang2022text,
+  title={Text Embeddings by Weakly-Supervised Contrastive Pre-training},
+  author={Wang, Liang and Yang, Nan and Huang, Xiaolong and Jiao, Binxing and Yang, Linjun and Jiang, Daxin and Majumder, Rangan and Wei, Furu},
+  journal={arXiv preprint arXiv:2212.03533},
+  year={2022}
+}
+```
+
+## Limitations
+
+This model only works for English texts. Long texts will be truncated to at most 512 tokens.
+
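To illustrate the truncation behaviour mentioned in the new Limitations section, a toy check (assumed example input, reusing the `tokenizer` from the usage sketch above):

```
# Texts longer than 512 tokens are silently clipped by the tokenizer call used above
long_text = 'passage: ' + 'protein ' * 2000
ids = tokenizer(long_text, max_length=512, truncation=True, return_tensors='pt')['input_ids']
print(ids.shape)  # torch.Size([1, 512]) -- everything past 512 tokens is discarded
```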