TTimur committed
Commit 6c91170 · 1 Parent(s): 602eb7d

code update

Files changed (1):
  1. src/display/about.py +105 -0

src/display/about.py CHANGED
@@ -1,6 +1,111 @@
  from dataclasses import dataclass
  from enum import Enum

+
+ @dataclass
+ class Task:
+     benchmark: str
+     metric: str
+     col_name: str
+
+
+ class Tasks(Enum):
+     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+     task0 = Task("KyrgyzMMLU", "metric_name", "KyrgyzMMLU")
+     task1 = Task("KyrgyzRC", "metric_name", "KyrgyzRC")
+     task2 = Task("WinoGrande", "metric_name", "WinoGrande")
+     task3 = Task("BoolQ", "metric_name", "BoolQ")
+     task4 = Task("HellaSwag", "metric_name", "HellaSwag")
+     task5 = Task("GSM8K", "metric_name", "GSM8K")
+     task6 = Task("TruthfulQA", "metric_name", "TruthfulQA")
+
+
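
A minimal sketch of how the `Task` entries above could map a results file onto leaderboard columns. The results layout (a top-level `"results"` mapping keyed by benchmark name) and the `extract_scores` helper are illustrative assumptions, not part of this commit.

```python
# Hypothetical helper: pull per-task scores out of a results JSON via Tasks.
# Assumed layout: {"results": {"KyrgyzMMLU": {"metric_name": 0.42}, ...}}.
import json

def extract_scores(results_path: str) -> dict:
    with open(results_path) as f:
        results = json.load(f)["results"]
    scores = {}
    for task in Tasks:  # the Tasks enum defined above
        t = task.value
        per_task = results.get(t.benchmark, {})
        if t.metric in per_task:
            scores[t.col_name] = per_task[t.metric]
    return scores
```
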
+ TITLE = """<h1 align="center" id="space-title">OpenLLM Kyrgyz Leaderboard v0.1</h1>"""
+
+
+ INTRODUCTION_TEXT = """
+ Welcome to the Kyrgyz LLM Leaderboard, a dedicated platform for evaluating Large Language Models on Kyrgyz benchmarks. This space highlights models that perform well in Kyrgyz and helps advance research and tooling for low-resource languages.
+
+ Benchmarks include native Kyrgyz tasks (KyrgyzMMLU, KyrgyzRC) and carefully translated sets (WinoGrande, BoolQ, HellaSwag, GSM8K, TruthfulQA). Scores are reported under consistent settings for every model, so they are directly comparable.
+
+ 🚀 Submit Your Model 🚀
+
+ Have a Kyrgyz-capable model? Submit it for evaluation (currently manual) and help build a stronger Kyrgyz NLP ecosystem. See the About tab for details.
+ """
+
+
+ LLM_BENCHMARKS_TEXT = f"""
+ ## How it works
+
+ Models are evaluated on the seven Kyrgyz benchmarks listed below, and their per-task scores are shown side by side on the leaderboard.
+
+ ## Reproducibility
+
+ This leaderboard aggregates results from Kyrgyz benchmarks. Datasets are hosted on the Hugging Face Hub under `TTimur`:
+ - KyrgyzMMLU: `TTimur/kyrgyzMMLU`
+ - KyrgyzRC: `TTimur/kyrgyzRC`
+ - WinoGrande (KY): `TTimur/winogrande_kg`
+ - BoolQ (KY): `TTimur/boolq_kg`
+ - HellaSwag (KY): `TTimur/hellaswag_kg`
+ - GSM8K (KY): `TTimur/gsm8k_kg`
+ - TruthfulQA (KY): `TTimur/truthfulqa_kg`
+
+ You can evaluate with your preferred evaluation harness (e.g., Lighteval or EleutherAI's lm-evaluation-harness) with Kyrgyz tasks enabled, then upload the resulting JSON to the results dataset for this Space (a minimal example follows after this block).
+
+ Notes:
+ - Metrics reported are accuracy (or quasi-exact match, QEM, for GSM8K), aligned with the dataset conventions.
+ - Use consistent few-shot settings when comparing models.
+ """
+
+
+ EVALUATION_QUEUE_TEXT = """
+ ## Some good practices before submitting a model
+
+ ### 1) Make sure you can load your model and tokenizer using AutoClasses:
+ ```python
+ from transformers import AutoConfig, AutoModel, AutoTokenizer
+ config = AutoConfig.from_pretrained("your model name", revision=revision)
+ model = AutoModel.from_pretrained("your model name", revision=revision)
+ tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
+ ```
+ If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
+
+ Note: make sure your model is public!
+ Note: if your model needs `trust_remote_code=True`, we do not support this option yet, but we are working on adding it. Stay posted!
+
+ ### 2) Convert your model weights to safetensors
+ It's a newer format for storing weights that is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the Extended Viewer! (A conversion sketch follows after this block.)
+
+ ### 3) Make sure your model has an open license!
+ This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
+
+ ### 4) Fill out your model card
+ When we add extra information about models to the leaderboard, it will be taken automatically from the model card.
+
+ ## In case of model failure
+ If your model is displayed in the FAILED category, its execution stopped.
+ Make sure you have followed the above steps first.
+ If everything is done, check that you can launch your evaluation locally (you can add `--limit` to cap the number of examples per task).
+ """
+
+
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+ CITATION_BUTTON_TEXT = r"""
+ @article{KyrgyzLLM-Bench,
+   title={Bridging the Gap in Less-Resourced Languages: Building a Benchmark for Kyrgyz Language Models},
+   author={Timur Turatali and Aida Turdubaeva and Islam Zhenishbekov and Zhoomart Suranbaev and Anton Alekseev and Rustem Izmailov},
+   year={2025},
+   url={https://huggingface.co/datasets/TTimur/kyrgyzMMLU,
+        https://huggingface.co/datasets/TTimur/kyrgyzRC,
+        https://huggingface.co/datasets/TTimur/winogrande_kg,
+        https://huggingface.co/datasets/TTimur/boolq_kg,
+        https://huggingface.co/datasets/TTimur/truthfulqa_kg,
+        https://huggingface.co/datasets/TTimur/gsm8k_kg,
+        https://huggingface.co/datasets/TTimur/hellaswag_kg}
+ }
+ """
+
+ from dataclasses import dataclass
+ from enum import Enum
+
  @dataclass
  class Task:
      benchmark: str