update
- images/README.md +0 -0
- images/info.svg +1 -0
- vocab/code_davinci_002/__init__.py +0 -0
- vocab/gpt_35_turbo/aaa.py +38 -0
- vocab/gpt_4/__init__.py +1 -0
images/README.md
ADDED
File without changes

images/info.svg
ADDED

vocab/code_davinci_002/__init__.py
ADDED
File without changes
vocab/gpt_35_turbo/aaa.py
ADDED
@@ -0,0 +1,38 @@
+"""
+gpt_35_turbo decode UnicodeDecodeError 99413 b' \xe6\xb5'
+gpt_35_turbo decode UnicodeDecodeError 99742 b'\x8c\xa8'
+gpt_35_turbo decode UnicodeDecodeError 99834 b'\xad\x90'
+gpt_35_turbo decode UnicodeDecodeError 100112 b'\xe0\xae\xbf\xe0\xae'
+gpt_35_turbo decode KeyError 100256
+gpt_35_turbo decode KeyError 100261
+gpt_35_turbo decode KeyError 100262
+gpt_35_turbo decode KeyError 100263
+"""
+
+import tiktoken
+
+tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
+
+# Probe two of the problematic ids. special_tokens: 100257-100260, 100276;
+# the unassigned ids in between (100256, 100261-100263) raise KeyError,
+# and ids whose bytes form a partial UTF-8 sequence account for the
+# UnicodeDecodeError entries recorded in the docstring above.
+for token_id in [100263, 99834]:
+    try:
+        tokenizer.decode_tokens_bytes([token_id])
+    except Exception:
+        pass
+
+    try:
+        tokenizer.decode_single_token_bytes(token_id)
+    except Exception:
+        pass
+
+    try:
+        tokenizer.decode_bytes([token_id])
+    except Exception:
+        pass
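For reference, a minimal sketch (not part of the commit) of how a log like the docstring above could be produced, using only tiktoken's public Encoding API. Note that a full scan would also flag the byte-fallback tokens whose bytes are not standalone UTF-8, so the docstring presumably records only a subset:

import tiktoken

tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')

for token_id in range(tokenizer.n_vocab):
    try:
        # Raises KeyError for ids that map to neither a mergeable rank
        # nor a special token (e.g. 100256, 100261-100263).
        raw = tokenizer.decode_single_token_bytes(token_id)
    except KeyError:
        print('gpt_35_turbo decode KeyError', token_id)
        continue
    try:
        # Strict UTF-8 decoding fails for partial multi-byte sequences
        # such as b'\xad\x90' (id 99834).
        raw.decode('utf-8')
    except UnicodeDecodeError:
        print('gpt_35_turbo decode UnicodeDecodeError', token_id, raw)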
vocab/gpt_4/__init__.py
CHANGED
@@ -2,6 +2,7 @@
 
 import tiktoken
 from tiktoken import Encoding
+from utils.log_util import logger
 
 tokenizer = tiktoken.encoding_for_model('gpt-4')
 tokenizer.vocab_size = tokenizer.n_vocab