rajammanabrolu
committed on
Commit
•
eeec945
1
Parent(s):
cd5239e
Update tiktoken.py
Browse files - tiktoken.py +1 -18
tiktoken.py
CHANGED
@@ -3,7 +3,6 @@
|
|
3 |
from functools import lru_cache
|
4 |
from typing import Any, Dict, List, Optional, Tuple
|
5 |
|
6 |
-
import torch
|
7 |
from transformers import PreTrainedTokenizer
|
8 |
|
9 |
DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible."""
|
@@ -200,8 +199,6 @@ class TiktokenTokenizerWrapper(PreTrainedTokenizer):
|
|
200 |
'{% endif %}'
|
201 |
'{% if (add_generation_prompt == true and loop.last) %}'
|
202 |
"{{ '\n' + '<|im_start|>' + 'assistant' + '\n' }}"
|
203 |
-
"{% elif (message['role'] == 'assistant') %}"
|
204 |
-
'{{ eos_token }}'
|
205 |
'{% endif %}'
|
206 |
'{% endfor %}')
|
207 |
template = template.replace(
|
@@ -358,19 +355,5 @@ class TiktokenTokenizerWrapper(PreTrainedTokenizer):
|
|
358 |
|
359 |
return self.add_tokens(actual_new_tokens, special_tokens=True)
|
360 |
|
361 |
-
def construct_logit_tensor(self, logprobs: Dict[str,
|
362 |
-
float]) -> torch.Tensor:
|
363 |
-
"""Construct tensor of shape (vocab_size,) mapping words to logprobs.
|
364 |
|
365 |
-
|
366 |
-
logprobs (Dict[str, float]): Dictionary mapping tokens to log probabilities assigned to them by the model.
|
367 |
-
"""
|
368 |
-
tensor = torch.tensor([min(logprobs.values()) - 1] * (self.vocab_size))
|
369 |
-
for k in logprobs:
|
370 |
-
encoding = self(k)['input_ids']
|
371 |
-
idx = encoding[0]
|
372 |
-
tensor[idx] = logprobs[k]
|
373 |
-
return tensor
|
374 |
-
|
375 |
-
|
376 |
-
TiktokenTokenizerWrapper.register_for_auto_class()
|
|
|
3 |
from functools import lru_cache
|
4 |
from typing import Any, Dict, List, Optional, Tuple
|
5 |
|
|
|
6 |
from transformers import PreTrainedTokenizer
|
7 |
|
8 |
DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible."""
|
|
|
199 |
'{% endif %}'
|
200 |
'{% if (add_generation_prompt == true and loop.last) %}'
|
201 |
"{{ '\n' + '<|im_start|>' + 'assistant' + '\n' }}"
|
|
|
|
|
202 |
'{% endif %}'
|
203 |
'{% endfor %}')
|
204 |
template = template.replace(
|
|
|
355 |
|
356 |
return self.add_tokens(actual_new_tokens, special_tokens=True)
|
357 |
|
|
|
|
|
|
|
358 |
|
359 |
+
TiktokenTokenizerWrapper.register_for_auto_class()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|