MohamedRashad committed
Commit 36b7eaa · verified · 1 Parent(s): 810f57c

Update app.py

Files changed (1)
  1. app.py +11 -6
app.py CHANGED
@@ -7,19 +7,24 @@ import random
 from pathlib import Path
 
 initial_list_of_models = [
+    "riotu-lab/Aranizer-PBE-86k",
+    "riotu-lab/Aranizer-PBE-64k",
+    "riotu-lab/Aranizer-PBE-32k",
+    "riotu-lab/Aranizer-SP-86k",
+    "riotu-lab/Aranizer-SP-64k",
+    "riotu-lab/Aranizer-SP-32k",
     "asafaya/bert-base-arabic",
+    "inceptionai/jais-family-30b-16k",
     "Xenova/gpt-4o",
     "FreedomIntelligence/AceGPT-v1.5-13B-Chat",
     "FreedomIntelligence/AceGPT-13B",
-    "Qwen/Qwen1.5-7B-Chat",
-    "Qwen/Qwen1.5-110B-Chat",
+    "Qwen/Qwen2.5-72B-Instruct",
     "microsoft/Phi-3-mini-128k-instruct",
     "unsloth/gemma-2b-bnb-4bit",
-    "NousResearch/Meta-Llama-3-8B",
+    "unsloth/Llama-3.3-70B-Instruct",
     "CohereForAI/c4ai-command-r-v01",
     "CohereForAI/c4ai-command-r-plus",
-    "core42/jais-13b",
-    "core42/jais-30b-chat-v3",
+    "CohereForAI/aya-101",
 ]
 
 dataframe_path = Path(__file__).parent / "arabic_tokenizers_leaderboard.jsonl"
@@ -38,7 +43,7 @@ else:
 )
 
 # Datasets used for calculating the number of tokens
-arabic_dataset1 = load_dataset("MohamedRashad/rasaif-translations", split="train")["arabic"]
+arabic_dataset1 = load_dataset("ImruQays/Rasaif-Classical-Arabic-English-Parallel-texts", split="train")["ar"]
 arabic_dataset2 = load_dataset("HeshamHaroon/arabic-quotes", split="train")["quote"]
 arabic_dataset3 = load_dataset("SaiedAlshahrani/Moroccan_Arabic_Wikipedia_20230101_nobots", split="train")["text"]
 all_data = arabic_dataset1 + arabic_dataset2 + arabic_dataset3
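
For reference, a minimal sketch (not part of this commit) of how one of the newly added tokenizers could be scored against the same three datasets the app loads. The choice of model and the encode-based counting are assumptions about the leaderboard's logic, which this diff does not show.

# Minimal sketch, assuming the leaderboard compares total token counts
# over the combined Arabic corpus referenced in the diff above.
from datasets import load_dataset
from transformers import AutoTokenizer

# Same datasets as in the updated app.py.
arabic_dataset1 = load_dataset("ImruQays/Rasaif-Classical-Arabic-English-Parallel-texts", split="train")["ar"]
arabic_dataset2 = load_dataset("HeshamHaroon/arabic-quotes", split="train")["quote"]
arabic_dataset3 = load_dataset("SaiedAlshahrani/Moroccan_Arabic_Wikipedia_20230101_nobots", split="train")["text"]
all_data = arabic_dataset1 + arabic_dataset2 + arabic_dataset3

# One of the tokenizers added in this commit; any entry from
# initial_list_of_models would work the same way.
model_id = "riotu-lab/Aranizer-PBE-86k"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Total token count over the combined corpus: fewer tokens for the same
# text generally indicates a more efficient Arabic tokenizer.
total_tokens = sum(
    len(tokenizer.encode(text, add_special_tokens=False)) for text in all_data
)
print(f"{model_id}: vocab_size={tokenizer.vocab_size}, total_tokens={total_tokens:,}")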