Spaces:
Sleeping
Sleeping
File size: 1,750 Bytes
bb3407a e250f84 bb3407a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 |
from torch import Tensor
import tiktoken
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("intfloat/e5-large-v2")
model = AutoModel.from_pretrained("intfloat/e5-large-v2")
EMBEDDING_CHAR_LIMIT = 512
def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
def strings_to_vectors(strings):
passage_batch = tokenizer(
strings,
max_length=EMBEDDING_CHAR_LIMIT,
padding=True,
truncation=True,
return_tensors="pt",
)
passage_outputs = model(**passage_batch)
return average_pool(
passage_outputs.last_hidden_state, passage_batch["attention_mask"]
)
def num_tokens_from_str(string, model="gpt-3.5-turbo"):
"""Returns the number of tokens used by a list of messages."""
try:
encoding = tiktoken.encoding_for_model(model)
except KeyError:
encoding = tiktoken.get_encoding("cl100k_base")
if model == "gpt-3.5-turbo": # note: future models may deviate from this
num_tokens = 0
num_tokens += (
4 # every message follows <im_start>{role/name}\n{content}<im_end>\n
)
num_tokens += len(encoding.encode(string))
num_tokens += 2 # every reply is primed with <im_start>assistant
return num_tokens
else:
raise NotImplementedError(
f"""num_tokens_from_messages() is not presently implemented for model {model}.
See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens."""
)
|