MohamedRashad
committed on
Update app.py
app.py
CHANGED
@@ -7,19 +7,24 @@ import random
 from pathlib import Path
 
 initial_list_of_models = [
+    "riotu-lab/Aranizer-PBE-86k",
+    "riotu-lab/Aranizer-PBE-64k",
+    "riotu-lab/Aranizer-PBE-32k",
+    "riotu-lab/Aranizer-SP-86k",
+    "riotu-lab/Aranizer-SP-64k",
+    "riotu-lab/Aranizer-SP-32k",
     "asafaya/bert-base-arabic",
+    "inceptionai/jais-family-30b-16k",
     "Xenova/gpt-4o",
     "FreedomIntelligence/AceGPT-v1.5-13B-Chat",
     "FreedomIntelligence/AceGPT-13B",
-    "Qwen/
-    "Qwen/Qwen1.5-110B-Chat",
+    "Qwen/Qwen2.5-72B-Instruct",
     "microsoft/Phi-3-mini-128k-instruct",
     "unsloth/gemma-2b-bnb-4bit",
-    "
+    "unsloth/Llama-3.3-70B-Instruct",
     "CohereForAI/c4ai-command-r-v01",
     "CohereForAI/c4ai-command-r-plus",
-    "
-    "core42/jais-30b-chat-v3",
+    "CohereForAI/aya-101",
 ]
 
 dataframe_path = Path(__file__).parent / "arabic_tokenizers_leaderboard.jsonl"
@@ -38,7 +43,7 @@ else:
 )
 
 # Datasets used for calculating the number of tokens
-arabic_dataset1 = load_dataset("
+arabic_dataset1 = load_dataset("ImruQays/Rasaif-Classical-Arabic-English-Parallel-texts", split="train")["ar"]
 arabic_dataset2 = load_dataset("HeshamHaroon/arabic-quotes", split="train")["quote"]
 arabic_dataset3 = load_dataset("SaiedAlshahrani/Moroccan_Arabic_Wikipedia_20230101_nobots", split="train")["text"]
 all_data = arabic_dataset1 + arabic_dataset2 + arabic_dataset3
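For context, here is a minimal sketch of how a tokenizer from the updated list could be benchmarked against the concatenated Arabic datasets shown in the diff. This is an illustration only, not the app's actual code: the helper name benchmark_tokenizer and the reported fields are assumptions; only the model IDs and dataset names come from the commit.

# Sketch: count tokens produced by one tokenizer over the leaderboard corpus.
# benchmark_tokenizer and its return fields are hypothetical, not from app.py.
from datasets import load_dataset
from transformers import AutoTokenizer


def benchmark_tokenizer(model_id: str, texts: list[str]) -> dict:
    # Load the tokenizer only; no model weights are downloaded.
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # Total tokens over all texts; fewer tokens for the same corpus generally
    # indicates a vocabulary with better Arabic coverage.
    total_tokens = sum(len(tokenizer.encode(text)) for text in texts)
    return {
        "model": model_id,
        "vocab_size": tokenizer.vocab_size,
        "total_tokens": total_tokens,
    }


if __name__ == "__main__":
    # Same datasets as in the diff above, concatenated into one list of strings.
    arabic_dataset1 = load_dataset(
        "ImruQays/Rasaif-Classical-Arabic-English-Parallel-texts", split="train"
    )["ar"]
    arabic_dataset2 = load_dataset("HeshamHaroon/arabic-quotes", split="train")["quote"]
    arabic_dataset3 = load_dataset(
        "SaiedAlshahrani/Moroccan_Arabic_Wikipedia_20230101_nobots", split="train"
    )["text"]
    all_data = arabic_dataset1 + arabic_dataset2 + arabic_dataset3

    # One of the tokenizers newly added by this commit.
    print(benchmark_tokenizer("riotu-lab/Aranizer-PBE-86k", all_data))

The metric choice here (total token count plus vocabulary size) is an assumption about what the leaderboard ranks; the commit itself only changes the model list and the first dataset used for counting.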