eson committed on
Commit
614012d
1 Parent(s): 8e0e4e9
images/README.md ADDED
File without changes
images/info.svg ADDED
vocab/code_davinci_002/__init__.py ADDED
File without changes
vocab/gpt_35_turbo/aaa.py ADDED
@@ -0,0 +1,38 @@
+ """
+ gpt_35_turbo decode UnicodeDecodeError 99413 b' \xe6\xb5'
+ gpt_35_turbo decode UnicodeDecodeError 99742 b'\x8c\xa8'
+ gpt_35_turbo decode UnicodeDecodeError 99834 b'\xad\x90'
+ gpt_35_turbo decode UnicodeDecodeError 100112 b'\xe0\xae\xbf\xe0\xae'
+ gpt_35_turbo decode KeyError 100256
+ gpt_35_turbo decode KeyError 100261
+ gpt_35_turbo decode KeyError 100262
+ gpt_35_turbo decode KeyError 100263
+ """
+
+
+
+ import json
+ import tiktoken
+
+
+ tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
+
+
+ for token_id in [100263, 99834]:  # special_tokens: 100257-100260, 100276
+     try:
+         tokenizer.decode_tokens_bytes([token_id])
+     except:
+         pass
+
+     try:
+         tokenizer.decode_single_token_bytes(token_id)
+     except:
+         pass
+
+     try:
+         tokenizer.decode_bytes([token_id])
+     except:
+         pass
+
+
+
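Note on the script above: the token IDs recorded in its docstring fall into two groups, tokens whose raw bytes are an incomplete UTF-8 sequence (99413, 99742, 99834, 100112) and IDs that map to no bytes at all in cl100k_base (100256, 100261-100263). A minimal sketch of a loop that reports both cases without raising, using only tiktoken's public Encoding methods, is shown here; the ID list is copied from the docstring, everything else is an illustration and not part of this commit:

import tiktoken

tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')

for token_id in [99413, 99742, 99834, 100112, 100256, 100263]:
    try:
        # Raw bytes of a single token; raises KeyError when the ID is neither a
        # mergeable rank nor a registered special token (e.g. 100256, 100261-100263).
        raw = tokenizer.decode_single_token_bytes(token_id)
    except KeyError:
        print(token_id, '-> not mapped to any bytes')
        continue
    # A single token's bytes can be an incomplete UTF-8 sequence, so decode leniently
    # instead of letting bytes.decode() raise UnicodeDecodeError.
    print(token_id, '->', raw.decode('utf-8', errors='backslashreplace'))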
vocab/gpt_4/__init__.py CHANGED
@@ -2,6 +2,7 @@
 
  import tiktoken
  from tiktoken import Encoding
+ from utils.log_util import logger
 
  tokenizer = tiktoken.encoding_for_model('gpt-4')
  tokenizer.vocab_size = tokenizer.n_vocab
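The change shown for vocab/gpt_4/__init__.py is the added logger import. utils/log_util.py itself is not part of this diff; purely as an assumption, a module exporting a logger name conventionally looks like the sketch below, where every detail beyond the exported name `logger` is hypothetical:

# Hypothetical sketch of utils/log_util.py; only the exported name `logger`
# is confirmed by the import in this diff.
import logging
import sys

logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    format='%(asctime)s %(levelname)s %(name)s: %(message)s',
)
logger = logging.getLogger('vocab')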