concedo committed on
Commit
0571571
1 Parent(s): 8377b08

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +6 -24
app.py CHANGED
@@ -1,12 +1,11 @@
1
  from transformers import AutoTokenizer
2
  import gradio as gr
3
 
4
- def formatarr(input):
5
- return "["+",".join(str(x) for x in input)+"]"
6
 
7
  def tokenize(input_text):
8
- llama_tokens = llama_tokenizer(input_text, add_special_tokens=True)["input_ids"]
9
-
 
10
  llama3_tokens = len(
11
  llama3_tokenizer(input_text, add_special_tokens=True)["input_ids"]
12
  )
@@ -42,22 +41,7 @@ def tokenize(input_text):
42
  )
43
 
44
  results = {
45
- "LLaMa-1/LLaMa-2": len(llama_tokens),
46
- "LLaMa-3": llama3_tokens,
47
- "Mistral": mistral_tokens,
48
- "GPT-2/GPT-J": gpt2_tokens,
49
- "GPT-NeoX": gpt_neox_tokens,
50
- "Falcon": falcon_tokens,
51
- "Phi-1/Phi-2": phi2_tokens,
52
- "T5": t5_tokens,
53
- "Gemma": gemma_tokens,
54
- "Command-R": command_r_tokens,
55
- "Qwen/Qwen1.5": qwen_tokens,
56
- "CodeQwen": codeqwen_tokens,
57
- }
58
-
59
- results2 = {
60
- "LLaMa-1/LLaMa-2": formatarr(llama_tokens),
61
  "LLaMa-3": llama3_tokens,
62
  "Mistral": mistral_tokens,
63
  "GPT-2/GPT-J": gpt2_tokens,
@@ -74,9 +58,7 @@ def tokenize(input_text):
74
  # Sort the results in descending order based on token length
75
  sorted_results = sorted(results.items(), key=lambda x: x[1], reverse=True)
76
 
77
- lens = "\n".join([f"{model}: {tokens}" for model, tokens in sorted_results])
78
- toks = "\n".join([f"{model}: {tokens}" for model, tokens in results2])
79
- return lens + "\n" + toks
80
 
81
 
82
  if __name__ == "__main__":
@@ -120,4 +102,4 @@ if __name__ == "__main__":
120
  iface = gr.Interface(
121
  fn=tokenize, inputs=gr.Textbox(label="Input Text", lines=12), outputs="text"
122
  )
123
- iface.launch()
 
1
  from transformers import AutoTokenizer
2
  import gradio as gr
3
 
 
 
4
 
5
  def tokenize(input_text):
6
+ llama_tokens = len(
7
+ llama_tokenizer(input_text, add_special_tokens=True)["input_ids"]
8
+ )
9
  llama3_tokens = len(
10
  llama3_tokenizer(input_text, add_special_tokens=True)["input_ids"]
11
  )
 
41
  )
42
 
43
  results = {
44
+ "LLaMa-1/LLaMa-2": llama_tokens,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  "LLaMa-3": llama3_tokens,
46
  "Mistral": mistral_tokens,
47
  "GPT-2/GPT-J": gpt2_tokens,
 
58
  # Sort the results in descending order based on token length
59
  sorted_results = sorted(results.items(), key=lambda x: x[1], reverse=True)
60
 
61
+ return "\n".join([f"{model}: {tokens}" for model, tokens in sorted_results])
 
 
62
 
63
 
64
  if __name__ == "__main__":
 
102
  iface = gr.Interface(
103
  fn=tokenize, inputs=gr.Textbox(label="Input Text", lines=12), outputs="text"
104
  )
105
+ iface.launch()