TTimur committed
Commit 6c91170 · 1 Parent(s): 602eb7d

code update

Files changed (1):
  1. src/display/about.py +105 -0

src/display/about.py CHANGED
@@ -1,6 +1,111 @@
  from dataclasses import dataclass
  from enum import Enum

+
+ @dataclass
+ class Task:
+     benchmark: str
+     metric: str
+     col_name: str
+
+
+ class Tasks(Enum):
+     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+     task0 = Task("KyrgyzMMLU", "metric_name", "KyrgyzMMLU")
+     task1 = Task("KyrgyzRC", "metric_name", "KyrgyzRC")
+     task2 = Task("WinoGrande", "metric_name", "WinoGrande")
+     task3 = Task("BoolQ", "metric_name", "BoolQ")
+     task4 = Task("HellaSwag", "metric_name", "HellaSwag")
+     task5 = Task("GSM8K", "metric_name", "GSM8K")
+     task6 = Task("TruthfulQA", "metric_name", "TruthfulQA")
+
+
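
A minimal sketch of how the `Task` entries above could map a results file onto leaderboard columns. The results layout (a top-level `"results"` mapping keyed by benchmark name) and the `extract_scores` helper are illustrative assumptions, not part of this commit.

```python
# Hypothetical helper: pull per-task scores out of a results JSON via Tasks.
# Assumed layout: {"results": {"KyrgyzMMLU": {"metric_name": 0.42}, ...}}.
import json

def extract_scores(results_path: str) -> dict:
    with open(results_path) as f:
        results = json.load(f)["results"]
    scores = {}
    for task in Tasks:  # the Tasks enum defined above
        t = task.value
        per_task = results.get(t.benchmark, {})
        if t.metric in per_task:
            scores[t.col_name] = per_task[t.metric]
    return scores
```
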
+ TITLE = """<h1 align="center" id="space-title">OpenLLM Kyrgyz Leaderboard v0.1</h1>"""
+
+
+ INTRODUCTION_TEXT = """
+ Welcome to the Kyrgyz LLM Leaderboard, a dedicated platform for evaluating Large Language Models on Kyrgyz benchmarks. This space highlights models that perform well in Kyrgyz and helps advance research and tooling for low-resource languages.
+
+ Benchmarks include native Kyrgyz tasks (KyrgyzMMLU, KyrgyzRC) and carefully translated sets (WinoGrande, BoolQ, HellaSwag, GSM8K, TruthfulQA). Scores are reported under consistent settings for every model, so they are directly comparable.
+
+ 🚀 Submit Your Model 🚀
+
+ Have a Kyrgyz-capable model? Submit it for evaluation (currently manual) and help build a stronger Kyrgyz NLP ecosystem. See the About tab for details.
+ """
+
+
+ LLM_BENCHMARKS_TEXT = f"""
+ ## How it works
+
+ Models are evaluated on the seven Kyrgyz benchmarks listed below, and their per-task scores are shown side by side on the leaderboard.
+
+ ## Reproducibility
+
+ This leaderboard aggregates results from Kyrgyz benchmarks. Datasets are hosted on the Hugging Face Hub under `TTimur`:
+ - KyrgyzMMLU: `TTimur/kyrgyzMMLU`
+ - KyrgyzRC: `TTimur/kyrgyzRC`
+ - WinoGrande (KY): `TTimur/winogrande_kg`
+ - BoolQ (KY): `TTimur/boolq_kg`
+ - HellaSwag (KY): `TTimur/hellaswag_kg`
+ - GSM8K (KY): `TTimur/gsm8k_kg`
+ - TruthfulQA (KY): `TTimur/truthfulqa_kg`
+
+ You can evaluate with your preferred evaluation harness (e.g., Lighteval or EleutherAI's lm-evaluation-harness) with Kyrgyz tasks enabled, then upload the resulting JSON to the results dataset for this Space (a minimal example follows after this block).
+
+ Notes:
+ - Metrics reported are accuracy (or quasi-exact match, QEM, for GSM8K), aligned with the dataset conventions.
+ - Use consistent few-shot settings when comparing models.
+ """
+
+
+ EVALUATION_QUEUE_TEXT = """
+ ## Some good practices before submitting a model
+
+ ### 1) Make sure you can load your model and tokenizer using AutoClasses:
+ ```python
+ from transformers import AutoConfig, AutoModel, AutoTokenizer
+ config = AutoConfig.from_pretrained("your model name", revision=revision)
+ model = AutoModel.from_pretrained("your model name", revision=revision)
+ tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
+ ```
+ If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
+
+ Note: make sure your model is public!
+ Note: if your model needs `trust_remote_code=True`, we do not support this option yet, but we are working on adding it. Stay posted!
+
+ ### 2) Convert your model weights to safetensors
+ It's a newer format for storing weights that is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the Extended Viewer! (A conversion sketch follows after this block.)
+
+ ### 3) Make sure your model has an open license!
+ This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
+
+ ### 4) Fill out your model card
+ When we add extra information about models to the leaderboard, it will be taken automatically from the model card.
+
+ ## In case of model failure
+ If your model is displayed in the FAILED category, its execution stopped.
+ Make sure you have followed the above steps first.
+ If everything is done, check that you can launch your evaluation locally (you can add `--limit` to cap the number of examples per task).
+ """
+
+
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+ CITATION_BUTTON_TEXT = r"""
+ @article{KyrgyzLLM-Bench,
+   title={Bridging the Gap in Less-Resourced Languages: Building a Benchmark for Kyrgyz Language Models},
+   author={Timur Turatali and Aida Turdubaeva and Islam Zhenishbekov and Zhoomart Suranbaev and Anton Alekseev and Rustem Izmailov},
+   year={2025},
+   url={https://huggingface.co/datasets/TTimur/kyrgyzMMLU,
+        https://huggingface.co/datasets/TTimur/kyrgyzRC,
+        https://huggingface.co/datasets/TTimur/winogrande_kg,
+        https://huggingface.co/datasets/TTimur/boolq_kg,
+        https://huggingface.co/datasets/TTimur/truthfulqa_kg,
+        https://huggingface.co/datasets/TTimur/gsm8k_kg,
+        https://huggingface.co/datasets/TTimur/hellaswag_kg}
+ }
+ """
+
+ from dataclasses import dataclass
+ from enum import Enum
+
  @dataclass
  class Task:
      benchmark: str