Annorita committed on
Commit
62a5026
1 Parent(s): 1d128e9

Update utils.py

Browse files

Use a workaround method to solve the missing-space issue for llama-based tokenizers

Files changed (1) hide show
  1. utils.py +16 -1
utils.py CHANGED
@@ -16,7 +16,22 @@ def get_res(model_name, input_sentence, single_print=True):
16
  out = tokenizer.encode(input_sentence, add_special_tokens=False)
17
  token_num = len(out)
18
 
19
- w = [ f'<span style="font-size:1.25em;background-color:{next(color_iterator)}">{tokenizer.decode(x)}</span>' for x in out ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  res = ''.join(w)
21
  if single_print:
22
  print(res + str(token_num))
 
16
  out = tokenizer.encode(input_sentence, add_special_tokens=False)
17
  token_num = len(out)
18
 
19
+ work_around = False
20
+ if work_around:
21
+ w = []
22
+ pre = ""
23
+ for i in range(len(out)):
24
+ res = tokenizer.decode(out[:i+1])
25
+ if w == []:
26
+ w.append(res)
27
+ else:
28
+ pre_len = len(pre) #0
29
+ w.append(res[pre_len:])
30
+ pre = res
31
+
32
+ w = [ f'<span style="font-size:1.25em;background-color:{next(color_iterator)}">{x}</span>' for x in out ]
33
+ else:
34
+ w = [ f'<span style="font-size:1.25em;background-color:{next(color_iterator)}">{tokenizer.decode(x)}</span>' for x in out ]
35
  res = ''.join(w)
36
  if single_print:
37
  print(res + str(token_num))