pedrogengo committed
Commit 5010b2c
Parent: ef672b2

Update app.py

Files changed (1): app.py (+11, -6)
app.py CHANGED
@@ -245,17 +245,22 @@ def get_results(tokenizer_name, base_lang, comp_lang, HF_token=""):
         adverb = "more"
     token_ratio = (token_ratio - 1.) * 100
 
-    output = f"You need {round(token_ratio, 3)}% {adverb} tokens to represent your text in {comp_lang} than in {base_lang}."
+    output = f"**You need {round(token_ratio, 3)}% {adverb} tokens to represent your text in {comp_lang} than in {base_lang}.**"
     return output
 
 
 with gr.Blocks() as demo:
-    with gr.Column():
+    with gr.Row():
+        gr.Markdown("""<h1>Language tokenization comparison</h1>
+        This tool helps you calculate how many more or fewer tokens you need to tokenize text in different languages.
+        To perform the comparison we use the [FLORES](https://github.com/facebookresearch/flores/tree/main) dataset, developed by Meta, which provides translations between English and low-resource languages.
+        We first tokenize around 1,000 texts in both the base language and the comparison language, then take the average input_ids length.""")
+
+    with gr.Row():
+        with gr.Column():
+            tokenizer = gr.Textbox(label="Tokenizer name", value="bert-base-cased")
         with gr.Column():
-            with gr.Row():
-                tokenizer = gr.Textbox(label="Tokenizer name", value="bert-base-cased")
-            with gr.Row():
-                HF_token = gr.Textbox(label="your HF Token")
+            HF_token = gr.Textbox(label="your HF Token")
 
     with gr.Row():
         with gr.Column():
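
For reference, here is a minimal sketch of the full `get_results` this hunk lands in, reconstructed from the description added in the commit: tokenize the FLORES dev texts in both languages and compare average `input_ids` lengths. The `facebook/flores` dataset config codes, the `sentence` field, and the `average_token_length` helper are assumptions, not part of the commit; only the final few lines appear in the diff above.

```python
# Hedged sketch of the surrounding function; the diff only shows its tail.
from datasets import load_dataset
from transformers import AutoTokenizer


def average_token_length(tokenizer, lang_code):
    # Hypothetical helper: mean input_ids length over the FLORES dev split
    # (997 sentences, matching the "around 1,000 texts" in the description).
    texts = load_dataset("facebook/flores", lang_code, split="dev")["sentence"]
    return sum(len(ids) for ids in tokenizer(texts)["input_ids"]) / len(texts)


def get_results(tokenizer_name, base_lang, comp_lang, HF_token=""):
    # "token" kwarg is the current transformers auth argument; older
    # versions used use_auth_token instead.
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, token=HF_token or None)
    token_ratio = average_token_length(tokenizer, comp_lang) / average_token_length(
        tokenizer, base_lang
    )
    if token_ratio < 1.0:  # comparison language needs fewer tokens
        token_ratio = 1.0 / token_ratio
        adverb = "fewer"
    else:
        adverb = "more"
    token_ratio = (token_ratio - 1.) * 100
    output = f"**You need {round(token_ratio, 3)}% {adverb} tokens to represent your text in {comp_lang} than in {base_lang}.**"
    return output
```

The switch from a plain f-string to a `**bold**` one suggests the result is rendered through a Markdown component rather than a plain textbox.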