bayartsogt commited on
Commit
ce44288
1 Parent(s): 61c1783

initial push, hf + openai

Browse files
Files changed (3) hide show
  1. .gitignore +1 -0
  2. app.py +76 -0
  3. requirements.txt +1 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ venv/
app.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from typing import List, Tuple

import gradio as gr
import tiktoken
from transformers import AutoTokenizer
# Hugging Face tokenizers to load, as (model id, use_fast) pairs: the
# boolean selects the fast Rust implementation where the checkpoint has one.
hf_tokenizer_list = [
    ("tugstugi/bert-large-mongolian-cased", False),
    ("tugstugi/bert-large-mongolian-uncased", False),
    ("bayartsogt/mongolian-roberta-large", True),
    ("bigscience/bloom", True),
]

# OpenAI model names whose tiktoken encodings are shown alongside the HF ones.
openai_tokenizer_list = [
    "text-davinci-003",
    "gpt-4",
]

# Instantiate every tokenizer once at import time so each keystroke in the
# live UI only pays for tokenization, not model-hub downloads.
hf_tokenizers = [
    AutoTokenizer.from_pretrained(model_id, use_fast=fast)
    for model_id, fast in hf_tokenizer_list
]

openai_tokenizers = [tiktoken.encoding_for_model(m) for m in openai_tokenizer_list]
def do_tokenize(tokenizer, text: str) -> List[Tuple[str, str]]:
    """Tokenize *text* and pair each token's decoded text with its position.

    The original hints were wrong on both ends: the parameter was annotated
    ``AutoTokenizer`` although tiktoken ``Encoding`` objects are passed in
    too, and the return was annotated ``List[str]`` although tuples are
    returned. Any object with ``encode(str) -> list[int]`` and
    ``decode(list[int]) -> str`` works.

    Args:
        tokenizer: HF tokenizer or tiktoken encoding (duck-typed, see above).
        text: Raw input string.

    Returns:
        One ``(token_text, str(index))`` pair per token id, in order —
        the stringified index doubles as the category label that makes
        ``gr.HighlightedText`` color adjacent tokens differently.
    """
    token_ids = tokenizer.encode(text)
    # Decode ids one at a time so each token's own surface form is shown.
    return [(tokenizer.decode([token_id]), str(i)) for i, token_id in enumerate(token_ids)]
def do_simple_split(text: str):
    """Whitespace-split *text* into ``(word, str(index))`` pairs.

    The stringified index serves as the highlight category for
    ``gr.HighlightedText``, mirroring the tokenizer outputs.
    """
    words = text.split()
    return list(zip(words, map(str, range(len(words)))))
def do_function(text: str):
    """Build every output column for one input string.

    Order must match the output components passed to ``gr.Interface``:
    echoed text, character count, naive whitespace split, then one
    highlighted-token list per HF tokenizer and per OpenAI encoding.
    """
    columns = [text, len(text), do_simple_split(text)]
    for tokenizer in [*hf_tokenizers, *openai_tokenizers]:
        columns.append(do_tokenize(tokenizer, text))
    return tuple(columns)
# --- UI wiring -------------------------------------------------------------

input_components = [
    gr.Text("", placeholder="Мөнгөө тушаачихсаныхаа дараа мэдэгдээрэй")
]

# One output per value returned by do_function, in the same order.
output_components = [
    gr.Text("", label="input"),
    gr.Number(0, label="Character Count"),
    gr.HighlightedText("", label="Simple Split"),
]
output_components += [
    gr.HighlightedText("", label=name) for name, _ in hf_tokenizer_list
]
output_components += [
    gr.HighlightedText("", label="openai/" + name) for name in openai_tokenizer_list
]

# Markdown link list shown under the title.
tokenizer_links = [
    f"🤗 [{x}](https://huggingface.co/{x})" for x, _ in hf_tokenizer_list
]
tokenizer_links += [
    f"⏳ [{x}](https://github.com/openai/tiktoken)" for x in openai_tokenizer_list
]

demo = gr.Interface(
    do_function,
    input_components,
    output_components,
    live=True,  # re-tokenize on every keystroke
    allow_flagging="never",
    title="Real-Time Tokenizer",
    description="**Tokenizers:**\n" + "\n".join(tokenizer_links),
)

if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ tiktoken