File size: 1,470 Bytes
35996ec
 
 
 
 
 
 
8c31c63
ab98424
35996ec
 
 
 
 
 
 
 
 
bfc00c8
62a5026
 
 
 
 
 
 
 
 
 
 
a56d29b
 
 
 
aa0404d
a56d29b
aa0404d
62a5026
a56d29b
 
 
aa0404d
a56d29b
aa0404d
a56d29b
 
35996ec
224f5e0
35996ec
224f5e0
35996ec
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52

from transformers import AutoTokenizer
import itertools



def get_color():
    """Return an endless iterator cycling through the fixed token palette."""
    palette = (
        '#df7b55',
        '#2c7482',
        '#2c8234',
        '#5581df',
        '#822c63',
        '#b355df',
    )
    return itertools.cycle(palette)

def get_res(model_name, input_sentence, single_print=True):
    """Tokenize *input_sentence* and render each token as a colored HTML span.

    Newline tokens are emitted as a Markdown hard line break ("  \\n") so the
    output displays correctly in Markdown renderers; spaces inside tokens are
    replaced with ``&nbsp;`` so they stay visible inside the spans.

    Parameters
    ----------
    model_name : str
        Hugging Face model id or local path passed to ``AutoTokenizer``.
    input_sentence : str
        Text to tokenize.
    single_print : bool
        If True (default), print the HTML followed by the token count and
        return None; otherwise return ``(html, token_count)``.

    Returns
    -------
    tuple[str, int] | None
        ``(html, token_count)`` when ``single_print`` is False, else None.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    color_iterator = get_color()
    token_ids = tokenizer.encode(input_sentence, add_special_tokens=False)
    token_num = len(token_ids)

    # Decode incrementally: decode the prefix [0..i] and keep only the text
    # added by token i.  Decoding ids one at a time breaks tokens whose bytes
    # span multiple ids (e.g. multi-byte UTF-8 characters).  The original
    # also had a dead alternative branch that compared the *integer* token id
    # against '\n' (always False); it has been removed.
    pieces = []
    prev_text = ""
    for i in range(len(token_ids)):
        text = tokenizer.decode(token_ids[: i + 1])
        pieces.append(text[len(prev_text):])
        prev_text = text

    rendered = []
    for piece in pieces:
        if piece == '\n':
            # Markdown hard line break instead of an invisible colored span.
            rendered.append('  \n')
        else:
            rendered.append(
                f'<span style="font-size:1.25em;background-color:{next(color_iterator)}">'
                f'{piece.replace(" ", "&nbsp;")}</span>'
            )

    res = ''.join(rendered)
    if single_print:
        print(res + str(token_num))
    else:
        return res, token_num