mikesapi committed on
Commit
c2e2189
·
1 Parent(s): 145530a

cleaning up

Browse files
Files changed (1) hide show
  1. app.py +44 -38
app.py CHANGED
@@ -1,29 +1,27 @@
1
  import logging
2
- import time
3
 
4
- import gradio as gr
5
- from transformers import AutoTokenizer
6
  import tiktoken
 
 
 
7
 
8
  logger = logging.getLogger(__name__) # noqa
9
 
 
10
  def load_test_phrases(filename):
11
  with open(f"./data/{filename}", "r", encoding="utf-8") as file:
12
- texts = file.read().splitlines()
13
- return texts
14
 
15
- # Initialize clients
16
- models = [
17
- "meta-llama/Llama-2-7b-chat-hf", # LLAMA-2
18
  "beomi/llama-2-ko-7b", # LLAMA-2-ko
19
  "openaccess-ai-collective/tiny-mistral", # Mistral
20
  "gpt-3.5-turbo", # GPT3.5
21
  "meta-llama/Meta-Llama-3-8B-Instruct", # LLAMA-3
22
  "CohereForAI/aya-23-8B", # AYA
23
- "google/gemma-1.1-2b-it", # GEMMA //# requires log in to HF huggingface-cli
24
  "gpt-4o", # GPT4o
25
- "TWO/sutra-alpha", # SUTRA
26
- ]
27
 
28
  test_phrase_set = [
29
  "நாங்கள் சந்திரனுக்கு ராக்கெட் பயணத்தில் இருக்கிறோம்",
@@ -50,6 +48,7 @@ test_phrase_set = [
50
  test_phrase_set_long_1 = load_test_phrases('multilingualphrases01.txt')
51
  test_phrase_set_long_2 = load_test_phrases('multilingualphrases02.txt')
52
 
 
53
  def generate_tokens_as_table(text):
54
  table = []
55
  for model in models:
@@ -63,7 +62,11 @@ def generate_tokens_as_table(text):
63
  table.append([model] + decoded)
64
  return table
65
 
66
- def generate_tokenizer_table(input_text):
 
 
 
 
67
  token_counts = {model: 0 for model in models}
68
  vocab_size = {model: 0 for model in models}
69
 
@@ -75,35 +78,38 @@ def generate_tokenizer_table(input_text):
75
  tokenizer = tiktoken.encoding_for_model(model)
76
  vocab_size[model] = tokenizer.n_vocab
77
 
78
- token_counts[model] += len(tokenizer.encode(input_text))
79
 
80
- word_count = len(input_text.split(' '))
81
 
82
  output = []
83
  for m in models:
84
- row = [m, vocab_size[m], word_count, token_counts[m], token_counts[m]/word_count]
85
  output.append(row)
86
 
87
  return output
88
 
 
89
  def generate_split_token_table(text):
 
 
 
90
  table = generate_tokenizer_table(text)
91
- records = gr.Dataframe(
92
- table,
93
- headers=['tokenizer', 'v size', '#word', '#token', '#tokens/word'],
94
- datatype=["str", "number", "str"],
95
- row_count=len(models),
96
- col_count=(5, "fixed"),
97
- )
98
 
99
- return records
100
 
101
  with gr.Blocks() as sutra_token_count:
102
  gr.Markdown(
103
- """
104
- # SUTRA Multilingual Tokenizer Specs & Stats.
105
- ## Tokenize paragraphs in multiple languages and inspect how many tokens it takes to represent the multilingual paragraph.
106
- """)
107
  textbox = gr.Textbox(label="Input Text")
108
  submit_button = gr.Button("Submit")
109
  output = gr.Dataframe()
@@ -114,24 +120,24 @@ with gr.Blocks() as sutra_token_count:
114
  gr.Examples(examples=examples, inputs=[textbox])
115
  submit_button.click(generate_split_token_table, inputs=[textbox], outputs=[output])
116
 
 
117
  def generate_tokens_table(text):
118
  table = generate_tokens_as_table(text)
119
  cols = len(table[0])
120
- records = gr.Dataframe(
121
- table,
122
- headers=['model'] + [str(i) for i in range(cols - 1)],
123
- row_count=2,
124
- col_count=(cols, "fixed"),
125
- )
126
 
127
- return records
128
 
129
  with gr.Blocks() as sutra_tokenize:
130
  gr.Markdown(
131
- """
132
- # SUTRA Multilingual Tokenizer Sentence Inspector.
133
- ## Tokenize a sentence with various tokenizers and inspect how it's broken down.
134
- """)
135
  textbox = gr.Textbox(label="Input Text")
136
  submit_button = gr.Button("Submit")
137
  output = gr.Dataframe()
 
1
  import logging
 
2
 
 
 
3
  import tiktoken
4
+ from transformers import AutoTokenizer
5
+
6
+ import gradio as gr
7
 
8
  logger = logging.getLogger(__name__) # noqa
9
 
10
+
11
  def load_test_phrases(filename):
12
  with open(f"./data/{filename}", "r", encoding="utf-8") as file:
13
+ return file.read().splitlines()
14
+
15
 
16
+ models = ["meta-llama/Llama-2-7b-chat-hf", # LLAMA-2
 
 
17
  "beomi/llama-2-ko-7b", # LLAMA-2-ko
18
  "openaccess-ai-collective/tiny-mistral", # Mistral
19
  "gpt-3.5-turbo", # GPT3.5
20
  "meta-llama/Meta-Llama-3-8B-Instruct", # LLAMA-3
21
  "CohereForAI/aya-23-8B", # AYA
22
+ "google/gemma-1.1-2b-it", # GEMMA
23
  "gpt-4o", # GPT4o
24
+ "TWO/sutra-alpha"] # SUTRA
 
25
 
26
  test_phrase_set = [
27
  "நாங்கள் சந்திரனுக்கு ராக்கெட் பயணத்தில் இருக்கிறோம்",
 
48
  test_phrase_set_long_1 = load_test_phrases('multilingualphrases01.txt')
49
  test_phrase_set_long_2 = load_test_phrases('multilingualphrases02.txt')
50
 
51
+
52
  def generate_tokens_as_table(text):
53
  table = []
54
  for model in models:
 
62
  table.append([model] + decoded)
63
  return table
64
 
65
+
66
+ def generate_tokenizer_table(text):
67
+ if not text:
68
+ return []
69
+
70
  token_counts = {model: 0 for model in models}
71
  vocab_size = {model: 0 for model in models}
72
 
 
78
  tokenizer = tiktoken.encoding_for_model(model)
79
  vocab_size[model] = tokenizer.n_vocab
80
 
81
+ token_counts[model] += len(tokenizer.encode(text))
82
 
83
+ word_count = len(text.split(' '))
84
 
85
  output = []
86
  for m in models:
87
+ row = [m, vocab_size[m], word_count, token_counts[m], token_counts[m] / word_count]
88
  output.append(row)
89
 
90
  return output
91
 
92
+
93
  def generate_split_token_table(text):
94
+ if not text:
95
+ return gr.Dataframe()
96
+
97
  table = generate_tokenizer_table(text)
98
+ return gr.Dataframe(
99
+ table,
100
+ headers=['tokenizer', 'v size', '#word', '#token', '#tokens/word'],
101
+ datatype=["str", "number", "str"],
102
+ row_count=len(models),
103
+ col_count=(5, "fixed"),
104
+ )
105
 
 
106
 
107
  with gr.Blocks() as sutra_token_count:
108
  gr.Markdown(
109
+ """
110
+ # SUTRA Multilingual Tokenizer Specs & Stats.
111
+ ## Tokenize paragraphs in multiple languages and compare token counts.
112
+ """)
113
  textbox = gr.Textbox(label="Input Text")
114
  submit_button = gr.Button("Submit")
115
  output = gr.Dataframe()
 
120
  gr.Examples(examples=examples, inputs=[textbox])
121
  submit_button.click(generate_split_token_table, inputs=[textbox], outputs=[output])
122
 
123
+
124
  def generate_tokens_table(text):
125
  table = generate_tokens_as_table(text)
126
  cols = len(table[0])
127
+ return gr.Dataframe(
128
+ table,
129
+ headers=['model'] + [str(i) for i in range(cols - 1)],
130
+ row_count=2,
131
+ col_count=(cols, "fixed"),
132
+ )
133
 
 
134
 
135
  with gr.Blocks() as sutra_tokenize:
136
  gr.Markdown(
137
+ """
138
+ # SUTRA Multilingual Tokenizer Sentence Inspector.
139
+ ## Tokenize a sentence with various tokenizers and inspect how it's broken down.
140
+ """)
141
  textbox = gr.Textbox(label="Input Text")
142
  submit_button = gr.Button("Submit")
143
  output = gr.Dataframe()