MohamedRashad committed
Commit 36b7eaa · verified · 1 Parent(s): 810f57c

Update app.py

Files changed (1)
  1. app.py +11 -6
app.py CHANGED
@@ -7,19 +7,24 @@ import random
 from pathlib import Path
 
 initial_list_of_models = [
+    "riotu-lab/Aranizer-PBE-86k",
+    "riotu-lab/Aranizer-PBE-64k",
+    "riotu-lab/Aranizer-PBE-32k",
+    "riotu-lab/Aranizer-SP-86k",
+    "riotu-lab/Aranizer-SP-64k",
+    "riotu-lab/Aranizer-SP-32k",
     "asafaya/bert-base-arabic",
+    "inceptionai/jais-family-30b-16k",
     "Xenova/gpt-4o",
     "FreedomIntelligence/AceGPT-v1.5-13B-Chat",
     "FreedomIntelligence/AceGPT-13B",
-    "Qwen/Qwen1.5-7B-Chat",
-    "Qwen/Qwen1.5-110B-Chat",
+    "Qwen/Qwen2.5-72B-Instruct",
     "microsoft/Phi-3-mini-128k-instruct",
     "unsloth/gemma-2b-bnb-4bit",
-    "NousResearch/Meta-Llama-3-8B",
+    "unsloth/Llama-3.3-70B-Instruct",
     "CohereForAI/c4ai-command-r-v01",
     "CohereForAI/c4ai-command-r-plus",
-    "core42/jais-13b",
-    "core42/jais-30b-chat-v3",
+    "CohereForAI/aya-101",
 ]
 
 dataframe_path = Path(__file__).parent / "arabic_tokenizers_leaderboard.jsonl"
@@ -38,7 +43,7 @@ else:
 )
 
 # Datasets used for calculating the number of tokens
-arabic_dataset1 = load_dataset("MohamedRashad/rasaif-translations", split="train")["arabic"]
+arabic_dataset1 = load_dataset("ImruQays/Rasaif-Classical-Arabic-English-Parallel-texts", split="train")["ar"]
 arabic_dataset2 = load_dataset("HeshamHaroon/arabic-quotes", split="train")["quote"]
 arabic_dataset3 = load_dataset("SaiedAlshahrani/Moroccan_Arabic_Wikipedia_20230101_nobots", split="train")["text"]
 all_data = arabic_dataset1 + arabic_dataset2 + arabic_dataset3
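
For reference, a minimal sketch (not part of this commit) of how one of the newly added tokenizers could be scored against the same three datasets the app loads. The choice of model and the encode-based counting are assumptions about the leaderboard's logic, which this diff does not show.

# Minimal sketch, assuming the leaderboard compares total token counts
# over the combined Arabic corpus referenced in the diff above.
from datasets import load_dataset
from transformers import AutoTokenizer

# Same datasets as in the updated app.py.
arabic_dataset1 = load_dataset("ImruQays/Rasaif-Classical-Arabic-English-Parallel-texts", split="train")["ar"]
arabic_dataset2 = load_dataset("HeshamHaroon/arabic-quotes", split="train")["quote"]
arabic_dataset3 = load_dataset("SaiedAlshahrani/Moroccan_Arabic_Wikipedia_20230101_nobots", split="train")["text"]
all_data = arabic_dataset1 + arabic_dataset2 + arabic_dataset3

# One of the tokenizers added in this commit; any entry from
# initial_list_of_models would work the same way.
model_id = "riotu-lab/Aranizer-PBE-86k"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Total token count over the combined corpus: fewer tokens for the same
# text generally indicates a more efficient Arabic tokenizer.
total_tokens = sum(
    len(tokenizer.encode(text, add_special_tokens=False)) for text in all_data
)
print(f"{model_id}: vocab_size={tokenizer.vocab_size}, total_tokens={total_tokens:,}")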