File size: 4,085 Bytes
98985f3
 
2a7a3b6
 
 
98985f3
ce96772
24c122a
 
bbc0512
ce96772
f3369dd
24c122a
d35776c
 
 
ce96772
 
 
 
 
 
 
d35776c
ce96772
 
 
 
 
d35776c
a17b6c0
 
d35776c
e3dfd55
 
 
 
 
6475fdc
c89cc71
e3dfd55
 
6475fdc
 
ce96772
 
 
 
 
a17b6c0
 
d35776c
 
238547c
d35776c
98985f3
2789d18
 
ce96772
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6475fdc
 
ce96772
6475fdc
24c122a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
from transformers import AutoTokenizer
import gradio as gr
import os
# Startup diagnostic: print a banner line, then the value of os.cpu_count().
print("Check CPU count...", os.cpu_count(), sep="\n")


def formatarr(input):
    """Render an iterable as a bracketed, comma-separated string.

    Every element is converted with ``str`` and the pieces are joined with
    a bare comma (no spaces), e.g. ``[1, 2, 3]`` -> ``"[1,2,3]"`` and an
    empty iterable -> ``"[]"``.
    """
    # NOTE: the parameter name shadows the builtin ``input``; it is kept
    # unchanged because it is part of this function's signature.
    joined = ",".join(map(str, input))
    return f"[{joined}]"


def tokenize(input_text):
    """Tokenize ``input_text`` with every tokenizer and return a text report.

    Each tokenizer is called as ``tokenizer(input_text,
    add_special_tokens=True)`` and its ``"input_ids"`` entry is reported.

    The tokenizers are read from module-level globals (``llama_tokenizer``,
    ``llama3_tokenizer``, ...). They must already be defined when this
    function is called; otherwise a ``NameError`` is raised.

    Args:
        input_text: The text to tokenize.

    Returns:
        A string with one entry per model, each starting with a newline, in
        the form ``"\\n<model> gets <count> tokens: [id,id,...]"``. Models
        appear in the order of the table below.
    """
    # Display label -> tokenizer. Keeping both in one table means a model
    # can be added or removed in a single place, and the report order is
    # the table order.
    tokenizers = {
        "LLaMa-1/LLaMa-2": llama_tokenizer,
        "LLaMa-3": llama3_tokenizer,
        "Mistral": mistral_tokenizer,
        "GPT-2/GPT-J": gpt2_tokenizer,
        "GPT-NeoX": gpt_neox_tokenizer,
        "Falcon": falcon_tokenizer,
        "Phi-1/Phi-2": phi2_tokenizer,
        "Phi-3": phi3_tokenizer,
        "T5": t5_tokenizer,
        "Gemma": gemma_tokenizer,
        "Qwen/Qwen1.5": qwen_tokenizer,
        "CodeQwen": codeqwen_tokenizer,
        "RWKV-v4": rwkv4_tokenizer,
        "RWKV-v5/RWKV-v6": rwkv5_tokenizer,
        "DeepSeek": deepseek_tokenizer,
        "InternLM": internlm_tokenizer,
        "InternLM2": internlm2_tokenizer,
    }

    report = []
    for model, tokenizer in tokenizers.items():
        tokens = tokenizer(input_text, add_special_tokens=True)["input_ids"]
        report.append(
            f"\n{model} gets {len(tokens)} tokens: {formatarr(tokens)}"
        )
    # Join once instead of growing a string with += inside the loop.
    return "".join(report)


if __name__ == "__main__":
    # tokenize() looks these tokenizers up as module-level globals, so every
    # variable below must keep its exact name.
    load = AutoTokenizer.from_pretrained

    # Tokenizers loaded with default options.
    llama_tokenizer = load("TheBloke/Llama-2-7B-fp16")
    llama3_tokenizer = load("unsloth/llama-3-8b")
    mistral_tokenizer = load("mistral-community/Mistral-7B-v0.2")
    gpt2_tokenizer = load("openai-community/gpt2")
    gpt_neox_tokenizer = load("EleutherAI/gpt-neox-20b")
    falcon_tokenizer = load("tiiuae/falcon-7b")
    phi2_tokenizer = load("microsoft/phi-2")
    phi3_tokenizer = load("microsoft/Phi-3-mini-4k-instruct")
    t5_tokenizer = load("google/flan-t5-xxl")
    gemma_tokenizer = load("alpindale/gemma-2b")
    qwen_tokenizer = load("Qwen/Qwen1.5-7B")
    codeqwen_tokenizer = load("Qwen/CodeQwen1.5-7B")

    # Tokenizers loaded with trust_remote_code=True.
    # NOTE(review): this presumably allows code shipped in the model
    # repository to run locally -- confirm these repositories are trusted.
    rwkv4_tokenizer = load("RWKV/rwkv-4-14b-pile", trust_remote_code=True)
    rwkv5_tokenizer = load("RWKV/v5-EagleX-v2-7B-HF", trust_remote_code=True)
    deepseek_tokenizer = load("deepseek-ai/DeepSeek-V2", trust_remote_code=True)
    internlm_tokenizer = load("internlm/internlm-20b", trust_remote_code=True)
    internlm2_tokenizer = load("internlm/internlm2-20b", trust_remote_code=True)

    # UI: a 19-line input textbox whose content is passed to tokenize();
    # the returned report is shown as plain text.
    text_input = gr.Textbox(label="Input Text", lines=19)
    iface = gr.Interface(fn=tokenize, inputs=text_input, outputs="text")
    iface.launch()