Annorita committed on
Commit
62a5026
1 Parent(s): 1d128e9

Update utils.py

Browse files

Use a workaround method to solve the missing-space issue for llama-based tokenizers

Files changed (1) hide show
  1. utils.py +16 -1
utils.py CHANGED
@@ -16,7 +16,22 @@ def get_res(model_name, input_sentence, single_print=True):
16
  out = tokenizer.encode(input_sentence, add_special_tokens=False)
17
  token_num = len(out)
18
 
19
- w = [ f'<span style="font-size:1.25em;background-color:{next(color_iterator)}">{tokenizer.decode(x)}</span>' for x in out ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  res = ''.join(w)
21
  if single_print:
22
  print(res + str(token_num))
 
16
  out = tokenizer.encode(input_sentence, add_special_tokens=False)
17
  token_num = len(out)
18
 
19
+ work_around = False
20
+ if work_around:
21
+ w = []
22
+ pre = ""
23
+ for i in range(len(out)):
24
+ res = tokenizer.decode(out[:i+1])
25
+ if w == []:
26
+ w.append(res)
27
+ else:
28
+ pre_len = len(pre) #0
29
+ w.append(res[pre_len:])
30
+ pre = res
31
+
32
+ w = [ f'<span style="font-size:1.25em;background-color:{next(color_iterator)}">{x}</span>' for x in out ]
33
+ else:
34
+ w = [ f'<span style="font-size:1.25em;background-color:{next(color_iterator)}">{tokenizer.decode(x)}</span>' for x in out ]
35
  res = ''.join(w)
36
  if single_print:
37
  print(res + str(token_num))