import os
from typing import List, Tuple

import gradio as gr
import tiktoken
from transformers import AutoTokenizer

# Hugging Face token, required for gated models such as meta-llama/Llama-2-13b-hf.
HF_TOKEN = os.getenv("HF_TOKEN")

# (model name or id, use_fast) pairs for Hugging Face tokenizers.
hf_tokenizer_list = [
    ("tugstugi/bert-large-mongolian-cased", False),
    ("tugstugi/bert-large-mongolian-uncased", False),
    ("bayartsogt/mongolian-roberta-large", True),
    ("meta-llama/Llama-2-13b-hf", True),
    ("tiiuae/falcon-7b", True),
    ("bigscience/bloom", True),
]
openai_tokenizer_list = [
    "text-davinci-003",
    "gpt-4",
]

# Load all tokenizers once at startup.
hf_tokenizers = [
    AutoTokenizer.from_pretrained(
        model_name_or_id,
        use_fast=use_fast,
        trust_remote_code=True,
        token=HF_TOKEN,
    )
    for model_name_or_id, use_fast in hf_tokenizer_list
]
openai_tokenizers = [
    tiktoken.encoding_for_model(name) for name in openai_tokenizer_list
]


def do_tokenize(tokenizer, text: str) -> List[Tuple[str, str]]:
    """Encode `text` and return (token_text, position_label) pairs for gr.HighlightedText.

    Works for both Hugging Face tokenizers and tiktoken encodings, since both
    expose `encode` and `decode`.
    """
    return [
        (tokenizer.decode([token_id]), str(i))
        for i, token_id in enumerate(tokenizer.encode(text))
    ]


def do_simple_split(text: str) -> List[Tuple[str, str]]:
    """Whitespace split as a baseline, in the same (text, label) format."""
    return [(x, str(i)) for i, x in enumerate(text.split())]


def do_function(text: str):
    # One return value per output component below, in the same order.
    return (
        text,
        len(text),
        do_simple_split(text),
        *[do_tokenize(tokenizer, text) for tokenizer in hf_tokenizers],
        *[do_tokenize(tokenizer, text) for tokenizer in openai_tokenizers],
    )


demo = gr.Interface(
    do_function,
    [
        # Placeholder is Mongolian for "Let us know after you have deposited the money."
        gr.Text("", placeholder="Мөнгөө тушаачихсаныхаа дараа мэдэгдээрэй")
    ],
    [
        gr.Text("", label="input"),
        gr.Number(0, label="Character Count"),
        gr.HighlightedText("", label="Simple Split"),
        *[
            gr.HighlightedText("", label=tokenizer_name)
            for tokenizer_name, _ in hf_tokenizer_list
        ],
        *[
            gr.HighlightedText("", label="openai/" + tokenizer_name)
            for tokenizer_name in openai_tokenizer_list
        ],
    ],
    live=True,
    allow_flagging="never",
    title="Real-Time Tokenizer",
    description=(
        "**Tokenizers:**\n"
        + "\n".join(
            [f"🤗 [{x}](https://huggingface.co/{x})" for x, _ in hf_tokenizer_list]
            + [f"⏳ [{x}](https://github.com/openai/tiktoken)" for x in openai_tokenizer_list]
        )
    ),
)

if __name__ == "__main__":
    demo.launch()