Update app.py
Browse files
app.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
import gradio as gr
|
2 |
-
import tiktoken
|
3 |
import random
|
4 |
|
5 |
# License Information
|
@@ -10,15 +10,13 @@ import random
|
|
10 |
# - Copyright: 2020-2023, Gradio contributors
|
11 |
# - Full License: http://www.apache.org/licenses/LICENSE-2.0
|
12 |
#
|
13 |
-
# 2.
|
14 |
-
# - License:
|
15 |
-
# - Copyright:
|
16 |
-
# - Full License:
|
17 |
-
|
18 |
|
19 |
-
# Load the
|
20 |
-
|
21 |
-
enc_gpt3_5turbo = tiktoken.encoding_for_model("gpt-3.5-turbo")
|
22 |
|
23 |
def get_color_mapping(tokens):
|
24 |
unique_tokens = list(set(tokens))
|
@@ -26,9 +24,9 @@ def get_color_mapping(tokens):
|
|
26 |
color_mapping = dict(zip(unique_tokens, colors))
|
27 |
return color_mapping
|
28 |
|
29 |
-
def process_model(text, encoder, model_name):
|
30 |
-
token_ids = encoder.encode(text)
|
31 |
-
tokens = [
|
32 |
num_tokens = len(tokens)
|
33 |
|
34 |
color_mapping = get_color_mapping(tokens)
|
@@ -45,24 +43,21 @@ def process_model(text, encoder, model_name):
|
|
45 |
return modelname_html + num_tokens_html + tokens_html + token_ids_html
|
46 |
|
47 |
def tokenize_input(text):
|
48 |
-
|
49 |
-
gpt35turbo_result = process_model(text, enc_gpt3_5turbo, "GPT-3.5-turbo")
|
50 |
num_chars = len(text)
|
51 |
num_chars_html = f'<h2>Number of Characters: <span style="font-size: 20px; font-weight: bold;">{num_chars}</span></h2>'
|
52 |
-
return num_chars_html,
|
53 |
-
|
54 |
|
55 |
with gr.Blocks() as demo:
|
56 |
-
gr.Markdown("##
|
57 |
with gr.Row():
|
58 |
-
input_text = gr.Textbox(lines=2, placeholder="Enter text here...", label="Enter text to tokenize
|
59 |
num_chars_output = gr.HTML()
|
60 |
with gr.Row():
|
61 |
-
|
62 |
-
gpt35turbo_output = gr.HTML(label="GPT-3.5-turbo")
|
63 |
|
64 |
-
input_text.change(tokenize_input, inputs=[input_text], outputs=[num_chars_output,
|
65 |
-
input_text.submit(tokenize_input, inputs=[input_text], outputs=[num_chars_output,
|
66 |
|
67 |
gr.Markdown("""
|
68 |
<hr>
|
@@ -75,16 +70,12 @@ with gr.Blocks() as demo:
|
|
75 |
- Copyright: 2020-2023, Gradio contributors
|
76 |
- Full License: [Apache License 2.0](http://www.apache.org/licenses/LICENSE-2.0)
|
77 |
- Repository: [Gradio GitHub](https://github.com/gradio-app/gradio/)
|
78 |
-
|
79 |
-
|
80 |
-
-
|
81 |
-
-
|
82 |
-
-
|
83 |
-
- Repository: [tiktoken GitHub](https://github.com/openai/tiktoken)
|
84 |
""")
|
85 |
|
86 |
-
|
87 |
# Launch the app
|
88 |
-
demo.launch()
|
89 |
-
|
90 |
-
|
|
|
1 |
import gradio as gr
|
2 |
+
from sentencepiece import SentencePieceProcessor
|
3 |
import random
|
4 |
|
5 |
# License Information
|
|
|
10 |
# - Copyright: 2020-2023, Gradio contributors
|
11 |
# - Full License: http://www.apache.org/licenses/LICENSE-2.0
|
12 |
#
|
13 |
+
# 2. SentencePiece:
|
14 |
+
# - License: Apache License 2.0
|
15 |
+
# - Copyright: 2018 Google Inc.
|
16 |
+
# - Full License: http://www.apache.org/licenses/LICENSE-2.0
|
|
|
17 |
|
18 |
+
# Load the tokenizer
|
19 |
+
sp = SentencePieceProcessor("models/ver3.0/llm-jp-tokenizer-100k.ver3.0b1.model")
|
|
|
20 |
|
21 |
def get_color_mapping(tokens):
|
22 |
unique_tokens = list(set(tokens))
|
|
|
24 |
color_mapping = dict(zip(unique_tokens, colors))
|
25 |
return color_mapping
|
26 |
|
27 |
+
def process_model(text, model_name):
|
28 |
+
token_ids = sp.encode(text)
|
29 |
+
tokens = [sp.id_to_piece(id) for id in token_ids]
|
30 |
num_tokens = len(tokens)
|
31 |
|
32 |
color_mapping = get_color_mapping(tokens)
|
|
|
43 |
return modelname_html + num_tokens_html + tokens_html + token_ids_html
|
44 |
|
45 |
def tokenize_input(text):
|
46 |
+
result = process_model(text, "SentencePiece Tokenizer")
|
|
|
47 |
num_chars = len(text)
|
48 |
num_chars_html = f'<h2>Number of Characters: <span style="font-size: 20px; font-weight: bold;">{num_chars}</span></h2>'
|
49 |
+
return num_chars_html, result
|
|
|
50 |
|
51 |
with gr.Blocks() as demo:
|
52 |
+
gr.Markdown("## SentencePiece Tokenizer App")
|
53 |
with gr.Row():
|
54 |
+
input_text = gr.Textbox(lines=2, placeholder="Enter text here...", label="Enter text to tokenize using SentencePiece tokenizer.")
|
55 |
num_chars_output = gr.HTML()
|
56 |
with gr.Row():
|
57 |
+
tokenizer_output = gr.HTML(label="SentencePiece Tokenizer")
|
|
|
58 |
|
59 |
+
input_text.change(tokenize_input, inputs=[input_text], outputs=[num_chars_output, tokenizer_output])
|
60 |
+
input_text.submit(tokenize_input, inputs=[input_text], outputs=[num_chars_output, tokenizer_output])
|
61 |
|
62 |
gr.Markdown("""
|
63 |
<hr>
|
|
|
70 |
- Copyright: 2020-2023, Gradio contributors
|
71 |
- Full License: [Apache License 2.0](http://www.apache.org/licenses/LICENSE-2.0)
|
72 |
- Repository: [Gradio GitHub](https://github.com/gradio-app/gradio/)
|
73 |
+
2. **SentencePiece**:
|
74 |
+
- License: Apache License 2.0
|
75 |
+
- Copyright: 2018 Google Inc.
|
76 |
+
- Full License: [Apache License 2.0](http://www.apache.org/licenses/LICENSE-2.0)
|
77 |
+
- Repository: [SentencePiece GitHub](https://github.com/google/sentencepiece)
|
|
|
78 |
""")
|
79 |
|
|
|
80 |
# Launch the app
|
81 |
+
demo.launch()
|
|
|
|