pedrogengo
committed
Commit • 5010b2c
1 Parent(s): ef672b2
Update app.py
app.py
CHANGED
@@ -245,17 +245,22 @@ def get_results(tokenizer_name, base_lang, comp_lang, HF_token=""):
         adverb = "more"
     token_ratio = (token_ratio - 1.) * 100
 
-    output = f"You need {round(token_ratio, 3)}% {adverb} tokens to represent your text in {comp_lang} than in {base_lang}"
+    output = f"**You need {round(token_ratio, 3)}% {adverb} tokens to represent your text in {comp_lang} than in {base_lang}.**"
     return output
 
 
 with gr.Blocks() as demo:
-    with gr.
+    with gr.Row():
+        gr.Markdown("""<h1>Language tokenization comparison</h1>
+        This tool helps you calculate how many more or fewer tokens you need to tokenize text in different languages.
+        To perform this comparison we use the [FLORES](https://github.com/facebookresearch/flores/tree/main) dataset, developed by Meta, which provides translations between English and low-resource languages.
+        We first tokenize around 1000 texts in the base language and in the language we want to compare. After that, we take the average of the input_ids lengths.""")
+
+    with gr.Row():
+        with gr.Column():
+            tokenizer = gr.Textbox(label="Tokenizer name", value="bert-base-cased")
         with gr.Column():
-
-            tokenizer = gr.Textbox(label="Tokenizer name", value="bert-base-cased")
-    with gr.Row():
-        HF_token = gr.Textbox(label="your HF Token")
+            HF_token = gr.Textbox(label="your HF Token")
 
     with gr.Row():
         with gr.Column():
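For reference, the procedure the new description outlines (tokenize roughly 1000 parallel FLORES sentences in each language, average the input_ids lengths, and report the ratio as a percentage) could be sketched as below. This is a hypothetical illustration, not the Space's actual get_results(): the names average_token_count and token_ratio_percent are invented here, passing hf_token through to AutoTokenizer is assumed, and taking abs() for the "fewer tokens" case is a guess, since the diff only shows the "more" branch.

# Hypothetical sketch of the comparison described above -- not the actual
# get_results() from app.py. Assumes base_texts and comp_texts are parallel
# lists of ~1000 translated sentences (e.g. loaded from FLORES).
from transformers import AutoTokenizer

def average_token_count(tokenizer, texts):
    # Mean number of input_ids the tokenizer produces per text.
    return sum(len(tokenizer(t)["input_ids"]) for t in texts) / len(texts)

def token_ratio_percent(tokenizer_name, base_texts, comp_texts, hf_token=None):
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, token=hf_token)
    ratio = (average_token_count(tokenizer, comp_texts)
             / average_token_count(tokenizer, base_texts))
    adverb = "more" if ratio >= 1.0 else "less"
    # Mirrors the diff's final step, (token_ratio - 1.) * 100; abs() keeps the
    # percentage positive when the comparison language needs fewer tokens.
    return abs(ratio - 1.0) * 100, adverb

The returned pair is what the f-string in get_results formats into the bold Markdown message shown by the Space.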