import os
from typing import List, Tuple

import gradio as gr
import tiktoken
from transformers import AutoTokenizer

# Optional Hugging Face token, read from the environment; required for gated
# repos such as meta-llama/Llama-2-13b-hf.
HF_TOKEN = os.getenv("HF_TOKEN")
# (model repo id, whether to load the fast Rust-backed tokenizer)
hf_tokenizer_list = [
    ("tugstugi/bert-large-mongolian-cased", False),
    ("tugstugi/bert-large-mongolian-uncased", False),
    ("bayartsogt/mongolian-roberta-large", True),
    ("meta-llama/Llama-2-13b-hf", True),
    ("tiiuae/falcon-7b", True),
    ("bigscience/bloom", True),
]

# OpenAI models, resolved to tiktoken encodings below
openai_tokenizer_list = [
    "text-davinci-003",
    "gpt-4",
]
# load all tokenizers once at startup
hf_tokenizers = [
    AutoTokenizer.from_pretrained(model_name_or_id, use_fast=use_fast, trust_remote_code=True, token=HF_TOKEN)
    for model_name_or_id, use_fast in hf_tokenizer_list
]
openai_tokenizers = [
    tiktoken.encoding_for_model(name)
    for name in openai_tokenizer_list
]
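# Note: tiktoken.encoding_for_model maps a model name to its encoding (e.g.
# "gpt-4" resolves to cl100k_base), so both lists end up holding objects that
# expose compatible encode()/decode() methods.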

def do_tokenize(tokenizer, text: str) -> List[Tuple[str, str]]:
    """Encode `text` and pair each decoded token with its position index,
    the (token, label) format gr.HighlightedText expects. Works with both
    Hugging Face tokenizers and tiktoken encodings."""
    return [(tokenizer.decode([token_id]), str(i)) for i, token_id in enumerate(tokenizer.encode(text))]
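
# Illustrative example (not executed by the app): with the gpt-4 encoding,
# do_tokenize(openai_tokenizers[1], "hello world") returns pairs like
# [("hello", "0"), (" world", "1")]. Hugging Face tokenizers may additionally
# emit special tokens such as [CLS]/[SEP], since encode() adds them by default.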

def do_simple_split(text: str) -> List[Tuple[str, str]]:
    """Whitespace split, shown as a naive baseline next to the real tokenizers."""
    return [(x, str(i)) for i, x in enumerate(text.split())]

def do_function(text: str):
    """Return one value per output component, in the same order as `outputs` below."""
    return (
        text,
        len(text),
        do_simple_split(text),
        *[do_tokenize(tokenizer, text) for tokenizer in hf_tokenizers],
        *[do_tokenize(tokenizer, text) for tokenizer in openai_tokenizers],
    )
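
# Sanity check (a minimal sketch, safe to remove): the tuple returned above
# must line up one-to-one with the outputs list — 3 fixed components plus one
# HighlightedText per tokenizer.
assert len(do_function("")) == 3 + len(hf_tokenizers) + len(openai_tokenizers)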

demo = gr.Interface(
    do_function,
    [
        # Mongolian placeholder text: "Let me know once you've deposited the money."
        gr.Text("", placeholder="Мөнгөө тушаачихсаныхаа дараа мэдэгдээрэй")
    ],
    [
        gr.Text("", label="input"),
        gr.Number(0, label="Character Count"),
        gr.HighlightedText("", label="Simple Split"),
        *[gr.HighlightedText("", label=tokenizer_name) for tokenizer_name, _ in hf_tokenizer_list],
        *[gr.HighlightedText("", label="openai/" + tokenizer_name) for tokenizer_name in openai_tokenizer_list],
    ],
    live=True,  # re-run on every keystroke
    allow_flagging="never",
    title="Real-Time Tokenizer",
    description=(
        "**Tokenizers:**\n"
        + "\n".join(
            [
                f"🤗 [{x}](https://huggingface.co/{x})"
                for x, _ in hf_tokenizer_list
            ]
            + [
                f"⏳ [{x}](https://github.com/openai/tiktoken)"
                for x in openai_tokenizer_list
            ]
        )
    ),
)

if __name__ == "__main__":
    demo.launch()