eson committed
Commit
e6543ac
1 Parent(s): c75633b

fix tokenize

Files changed (2)
  1. util.py +5 -3
  2. vocab/gpt_35_turbo/__init__.py +3 -2
util.py CHANGED
@@ -46,8 +46,10 @@ def tokenize(text, tokenizer_type, color_num=5):
             token_bytes = bytes(token_str, "utf-8")
             # json_dumps = json.dumps(token_str)
         else:
-            logger.error(f"typeError for token {token_id} with {type(token)} " + json.dumps({"text": text, "tokenizer_type": tokenizer_type}, ensure_ascii=False))
-            continue
+            logger.error(f"{idx}: wrong type for token {token_id} {type(token)} " + json.dumps({"text": text, "tokenizer_type": tokenizer_type}, ensure_ascii=False))
+            token_str = token
+            token_bytes = token
+            # continue
 
         # ⭐
         table.append(
@@ -61,7 +63,7 @@ def tokenize(text, tokenizer_type, color_num=5):
         )
 
     table_df = pd.DataFrame(table)
-    logger.info(f"Tokens={table[:2]}")
+    logger.info(f"tokenizer_type={tokenizer_type}, Tokens={table[:4]}")
     # print(table_df)
 
     return gr.update(value=pos_tokens, label=f"Tokens: {len(encoding)}"), table_df
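
For context, the util.py change replaces the old `continue` with a pass-through: a token of an unexpected type is now logged and kept as-is, so the result table stays aligned with the token ids instead of silently dropping rows. Below is a minimal sketch of what the surrounding loop might look like; the loop structure and the `build_token_table` helper are assumptions for illustration, only the else-branch mirrors the diff.

# Sketch only: assumed loop around the changed else-branch.
import json
import logging

logger = logging.getLogger(__name__)

def build_token_table(token_ids, tokens, text, tokenizer_type):
    """Hypothetical helper: pairs decoded tokens with their ids for display."""
    table = []
    for idx, (token_id, token) in enumerate(zip(token_ids, tokens)):
        if isinstance(token, str):
            token_str = token
            token_bytes = bytes(token_str, "utf-8")
        elif isinstance(token, bytes):
            token_str = token.decode("utf-8", errors="replace")
            token_bytes = token
        else:
            # New behavior from this commit: log the oddity and keep the raw
            # token instead of `continue`, so no row goes missing.
            logger.error(f"{idx}: wrong type for token {token_id} {type(token)} "
                         + json.dumps({"text": text, "tokenizer_type": tokenizer_type},
                                      ensure_ascii=False))
            token_str = token
            token_bytes = token
        table.append({"token_id": token_id, "token": token_str, "bytes": str(token_bytes)})
    return table
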
vocab/gpt_35_turbo/__init__.py CHANGED
@@ -31,10 +31,11 @@ def convert_ids_to_tokens(self, tokens, skip_special_tokens=False):
     """
     try:
         return self.decode_tokens_bytes(tokens)
-    except:
+    except Exception as e:
         # Why return None? See zh_util.py
         # 16 unused ids: 100256, 100261-100275
-        return [None for token in tokens]
+        logger.error(e)
+        return [None for _ in tokens]
 
 
 def get_vocab(self, token_type="str"):
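
The None fallback exists because tiktoken's decode_tokens_bytes raises when an id has no token bytes, e.g. the unused cl100k_base ids noted in the comment (100256, 100261-100275). A hedged sketch of how the patched method behaves, assuming it is attached to tiktoken.Encoding; the monkey-patching lines and example ids are illustrative, not part of the commit.

# Sketch only: assumed wiring around the changed except-branch.
import logging
import tiktoken

logger = logging.getLogger(__name__)

def convert_ids_to_tokens(self, tokens, skip_special_tokens=False):
    """Return token bytes for each id, or None placeholders if any id cannot be decoded."""
    try:
        return self.decode_tokens_bytes(tokens)
    except Exception as e:
        # Keep the output the same length as the input so callers can still
        # zip ids with tokens; a failing batch just shows up as None entries.
        logger.error(e)
        return [None for _ in tokens]

# Assumed: attach the helper to tiktoken's Encoding class.
tiktoken.Encoding.convert_ids_to_tokens = convert_ids_to_tokens

enc = tiktoken.get_encoding("cl100k_base")
print(enc.convert_ids_to_tokens([9906]))          # valid ids decode to bytes
print(enc.convert_ids_to_tokens([9906, 100256]))  # 100256 is unused, so the batch becomes [None, None]
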