pedrogengo
committed
Commit • 5010b2c
1 Parent(s): ef672b2
Update app.py
app.py
CHANGED
@@ -245,17 +245,22 @@ def get_results(tokenizer_name, base_lang, comp_lang, HF_token=""):
         adverb = "more"
     token_ratio = (token_ratio - 1.) * 100
 
-    output = f"You need {round(token_ratio, 3)}% {adverb} tokens to represent your text in {comp_lang} than in {base_lang}"
+    output = f"**You need {round(token_ratio, 3)}% {adverb} tokens to represent your text in {comp_lang} than in {base_lang}.**"
     return output
 
 
 with gr.Blocks() as demo:
-    with gr.
+    with gr.Row():
+        gr.Markdown("""<h1>Language tokenization comparison</h1>
+        This tool helps you calculate how many more or fewer tokens you need to tokenize text in different languages.
+        To perform this comparison we use the [FLORES](https://github.com/facebookresearch/flores/tree/main) dataset, developed by Meta, which provides translations between English and low-resource languages.
+        We first tokenize around 1000 texts in the base language and in the language we want to compare. After that, we take the average of the input_ids lengths.""")
+
+    with gr.Row():
+        with gr.Column():
+            tokenizer = gr.Textbox(label="Tokenizer name", value="bert-base-cased")
         with gr.Column():
-
-            tokenizer = gr.Textbox(label="Tokenizer name", value="bert-base-cased")
-    with gr.Row():
-        HF_token = gr.Textbox(label="your HF Token")
+            HF_token = gr.Textbox(label="your HF Token")
 
     with gr.Row():
         with gr.Column():
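For reference, the procedure the new description outlines (tokenize roughly 1000 parallel FLORES sentences in each language, average the input_ids lengths, and report the ratio as a percentage) could be sketched as below. This is a hypothetical illustration, not the Space's actual get_results(): the names average_token_count and token_ratio_percent are invented here, passing hf_token through to AutoTokenizer is assumed, and taking abs() for the "fewer tokens" case is a guess, since the diff only shows the "more" branch.

# Hypothetical sketch of the comparison described above -- not the actual
# get_results() from app.py. Assumes base_texts and comp_texts are parallel
# lists of ~1000 translated sentences (e.g. loaded from FLORES).
from transformers import AutoTokenizer

def average_token_count(tokenizer, texts):
    # Mean number of input_ids the tokenizer produces per text.
    return sum(len(tokenizer(t)["input_ids"]) for t in texts) / len(texts)

def token_ratio_percent(tokenizer_name, base_texts, comp_texts, hf_token=None):
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, token=hf_token)
    ratio = (average_token_count(tokenizer, comp_texts)
             / average_token_count(tokenizer, base_texts))
    adverb = "more" if ratio >= 1.0 else "less"
    # Mirrors the diff's final step, (token_ratio - 1.) * 100; abs() keeps the
    # percentage positive when the comparison language needs fewer tokens.
    return abs(ratio - 1.0) * 100, adverb

The returned pair is what the f-string in get_results formats into the bold Markdown message shown by the Space.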