# NOTE(review): the three lines below are Hugging Face Space page residue
# (author/commit info) accidentally pasted into the source; commented out
# so the module parses.
# AdithyaSK's picture
# updated language order list - Adithya S K
# 8972e24
# Standard library
import os
import re
from io import StringIO

# Third-party
import streamlit as st
import requests
import pandas as pd
import plotly.graph_objs as go
from dotenv import load_dotenv
from huggingface_hub import HfApi
from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError

# Load .env so SERVER_URL can be configured without hard-coding secrets.
load_dotenv()
# Backend endpoint serving leaderboard results; None when the env var is
# unset (every request would then fail — fail loudly rather than silently).
SERVER_URL = os.getenv("SERVER_URL")
# @st.cache_data
def get_data():
    """Fetch the full leaderboard payload from the backend server.

    Returns:
        The decoded JSON body — a list of per-model result dicts, as
        consumed by ``main``.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on connection failure or timeout.
    """
    # Generous timeout: the backend runs on a serverless instance with a
    # known cold-start delay (see FAQ tab), but we must not hang the UI
    # forever if it is down.
    response = requests.get(SERVER_URL, timeout=60)
    # Surface HTTP errors explicitly instead of failing obscurely inside
    # .json() on an error page body.
    response.raise_for_status()
    return response.json()
# @st.cache_data
def get_model_info(df):
    """Annotate *df* in place with Hugging Face Hub like-counts.

    Adds/overwrites a 'Likes' column: for each row, looks up the repo named
    in the 'Model' column on the Hub and stores a formatted like-count, or
    None when the repo (or revision) does not exist.

    Args:
        df: DataFrame with a 'Model' column of Hub repo ids.

    Returns:
        The same DataFrame object, mutated with the 'Likes' column.
    """
    api = HfApi()
    # Default every row to None so rows that fail lookup stay well-defined.
    df['Likes'] = None
    for index, row in df.iterrows():
        # str() guards against non-string cells (e.g. None/NaN) before strip.
        model = str(row['Model']).strip()
        # Keep the try body minimal: only the Hub call can raise these.
        try:
            model_info = api.model_info(repo_id=model)
        except (RepositoryNotFoundError, RevisionNotFoundError):
            # Model not on the Hub: leave the default None in place.
            continue
        df.loc[index, 'Likes'] = f"{model_info.likes}🧑"
    return df
# @st.cache_data
def main() -> None:
    """Render the Indic LLM Leaderboard Streamlit app.

    Builds the page header (title + refresh button), then five tabs:
    the leaderboard table (with search, language/benchmark filters and a
    model-comparison picker), release notes, an about page, an FAQ and
    submission instructions.
    """
    st.set_page_config(page_title="Indic LLM Leaderboard", layout="wide")
    # Header row: wide title column + narrow refresh-button column.
    title_column, refresh_column = st.columns([.92, 0.08])
    with title_column:
        # NOTE(review): "Ξ±" and the emoji-looking sequences below look like
        # mojibake (UTF-8 decoded as Latin-1; intended "(α)", "🏅", …).
        # Left byte-identical here since they are runtime strings.
        st.title("Indic LLM Leaderboard (Ξ±)")
        st.markdown("The Indic LLM Leaderboard utilizes the [indic_eval](https://github.com/adithya-s-k/indic_eval) evaluation framework , incorporating SOTA translated benchmarks like ARC, Hellaswag, MMLU, among others. Supporting 7 Indic languages, it offers a comprehensive platform for assessing model performance and comparing results within the Indic language modeling landscape.")
    with refresh_column:
        if st.button("Refresh", type="primary"):
            # NOTE(review): this fetch is discarded — `data` is re-fetched
            # unconditionally inside the Leaderboard tab below, so with
            # st.cache_data disabled the button only triggers a rerun.
            data = get_data()
    Leaderboard_tab, Release_tab, About_tab, FAQ_tab, Submit_tab = st.tabs(["πŸ… Leaderboard", "(Ξ±) Release", "πŸ“ About", "❗FAQ", "πŸš€ Submit"])
    with Leaderboard_tab:
        data = get_data()
        table_data = []   # one dict per model; becomes the leaderboard DataFrame
        all_models = []   # model names only (collected but not used afterwards)
        try:
            for item in data:
                model_name = item.get("name")
                language = item.get("language")
                is_verified = item.get("is_verified")
                # Every benchmark score is optional in the payload; a missing
                # key becomes None so the table still renders for that model.
                try:
                    ALL = item["result"]["all"]["acc_norm"]
                except KeyError:
                    ALL = None
                try:
                    ARC_Easy = item["result"]["ARC-Easy"]["acc_norm"]
                except KeyError:
                    ARC_Easy = None
                try:
                    ARC_Challenge = item["result"]["ARC-Challenge"]["acc_norm"]
                except KeyError:
                    ARC_Challenge = None
                try:
                    Hellaswag = item["result"]["Hellaswag"]["acc_norm"]
                except KeyError:
                    Hellaswag = None
                try:
                    Boolq = item["result"]["Boolq"]["acc_norm"]
                except KeyError:
                    Boolq = None
                try:
                    MMLU = item["result"]["MMLU"]["acc_norm"]
                except KeyError:
                    MMLU = None
                try:
                    Translation = item["result"]["Translation"]["acc_norm"]
                except KeyError:
                    Translation = None
                all_models.append(model_name)
                table_data.append({
                    "Model": model_name,
                    "Language": language,
                    # NOTE(review): "Avergae" is a typo, but it matches the
                    # fallback `columns` list in the except branch below; the
                    # column shown to users is the recomputed 'Average', so
                    # this one is dropped by the intersection filters anyway.
                    "Avergae": ALL,
                    "ARC-Easy": ARC_Easy,
                    "ARC-Challenge": ARC_Challenge,
                    "Hellaswag": Hellaswag,
                    "Boolq": Boolq,
                    "MMLU": MMLU,
                    "Translation": Translation,
                    "Verified": is_verified,
                })
            df = pd.DataFrame(table_data)
        # NOTE(review): bare except silently swallows ANY failure (server down,
        # payload shape change, …) and renders an empty table — consider
        # `except Exception` with logging/st.error instead.
        except:
            columns = ["Model", "Language", "Avergae", "ARC-Easy", "ARC-Challenge", "Hellaswag", "Boolq", "MMLU", "Translation"]
            # Build a single all-None row so downstream filtering code still
            # sees the expected columns.
            table_data = []
            table_data.append({col: None for col in columns})
            df = pd.DataFrame(table_data)
        # Free-text model search; `;` separates multiple exact-name queries.
        title = st.text_input('Model', placeholder=" πŸ” Search for your model (separate multiple queries with `;`) and press ENTER...")
        option_column1, option_column2 = st.columns(2)
        with option_column1:
            # When on: render one sub-table per selected language.
            on = st.checkbox('Sort by Language')
        with option_column2:
            # Filters rows on the boolean 'Verified' column (True by default).
            is_verified = st.checkbox('Verified', value=True)
        col1, col2 = st.columns(2)
        with col1:
            benchmark_options = st.multiselect(
                'Pick Benchmark',
                ['ARC-Easy', 'ARC-Challenge', 'Hellaswag', 'Boolq', 'MMLU', 'Translation'], ['ARC-Easy', 'ARC-Challenge', 'Hellaswag'])
        with col2:
            language_options = st.multiselect(
                'Pick Languages',
                ["english", 'hindi', 'kannada', 'tamil', 'telugu', 'gujarati', 'marathi', 'malayalam'], ["english", 'hindi', 'kannada', 'tamil', 'telugu', 'gujarati', 'marathi', 'malayalam'])
        if on:
            # Grouped view: one table per language, in the selection order.
            for language in language_options:
                filtered_df = df[(df['Language'] == language) & (df['Verified'] == is_verified)]
                if not filtered_df.empty:
                    # Capitalise only the first letter of the language name.
                    st.subheader(f"{language.capitalize()[0]}{language[1:]}")
                    filtered_df.reset_index(drop=True, inplace=True)
                    # Enrich with Hub like-counts (one API call per row).
                    filtered_df = get_model_info(filtered_df)
                    if title:
                        # Search overrides the pre-filtered frame: rebuild from
                        # the full df, then re-apply language/verified filters.
                        if ';' in title:
                            model_names = [name.strip() for name in title.split(';')]
                            filtered_df = df[df['Model'].isin(model_names)]
                        else:
                            filtered_df = df[df['Model'].str.contains(title, case=False, na=False)]
                        filtered_df = filtered_df[filtered_df['Language'] == language]
                        filtered_df = filtered_df[filtered_df['Verified'] == is_verified]
                        # Keep only identity columns + chosen benchmarks
                        # (df.columns is safe here: both frames share columns).
                        filtered_df = filtered_df[df.columns.intersection(['Model', 'Language'] + benchmark_options)]
                        # Row-wise mean over the selected benchmarks only.
                        filtered_df['Average'] = filtered_df[benchmark_options].mean(axis=1)
                        # 1-based ranking display.
                        filtered_df.index += 1
                        st.dataframe(filtered_df, use_container_width=True)
                    elif benchmark_options or language_options:
                        filtered_df = filtered_df[df.columns.intersection(['Model', 'Language'] + benchmark_options)]
                        filtered_df['Average'] = filtered_df[benchmark_options].mean(axis=1)
                        st.dataframe(filtered_df, use_container_width=True)
        else:
            # Flat view: a single table across all selected languages.
            if title:
                if ';' in title:
                    model_names = [name.strip() for name in title.split(';')]
                    filtered_df = df[df['Model'].isin(model_names)]
                else:
                    filtered_df = df[df['Model'].str.contains(title, case=False, na=False)]
                filtered_df = filtered_df[filtered_df['Language'].isin(language_options)]
                filtered_df = filtered_df[filtered_df['Verified'] == is_verified]
                filtered_df = filtered_df[df.columns.intersection(['Model', 'Language'] + benchmark_options)]
                filtered_df['Average'] = filtered_df[benchmark_options].mean(axis=1)
                filtered_df.index += 1
                st.dataframe(filtered_df, use_container_width=True)
            elif benchmark_options or language_options:
                filtered_df = df[df['Language'].isin(language_options)]
                filtered_df = filtered_df[filtered_df['Verified'] == is_verified]
                filtered_df = filtered_df[df.columns.intersection(['Model', 'Language'] + benchmark_options)]
                filtered_df['Average'] = filtered_df[benchmark_options].mean(axis=1)
                st.dataframe(filtered_df, use_container_width=True)
        # Side-by-side comparison of hand-picked models (ignores filters above).
        compare_models = st.multiselect(
            'Pick Models to compare them',
            df['Model'].unique()
        )
        if compare_models:
            compare_data = []
            for model in compare_models:
                model_data = df[df['Model'] == model]
                compare_data.append(model_data)
            if compare_data:
                compare_df = pd.concat(compare_data)
                # Average over the currently selected benchmarks.
                compare_df['Average'] = compare_df[benchmark_options].mean(axis=1)
                compare_df.index += 1
                st.dataframe(compare_df, use_container_width=True)
    # Static content tabs below: markdown bodies are runtime strings and are
    # reproduced byte-for-byte.
    with Release_tab:
        st.markdown(
            """
**Date: April 5th, 2024**
the alpha release of the **Indic LLM Leaderboard** and **Indic Eval**.
The Indic LLM Leaderboard is an evolving platform, aiming to streamline evaluations for Language Model (LLM) models tailored to Indic languages. While this **alpha release is far from perfect**, it signifies a crucial initial step towards establishing evaluation standards within the community.
### Features:
As of this release, the following base models have been evaluated in using the different datasets and benchmarks integrated into the platform:
- `meta meta-llama/Llama-2-7b-hf`
- `google/gemma-7b`
Tasks incorporated into the platform:
- `ARC-Easy:{language}`
- `ARC-Challenge:{language}`
- `Hellaswag:{language}`
For evaluation purposes, each task includes 5-shot prompting. Further experimentation will determine the most optimal balance between evaluation time and accuracy.
### Datasets:
Datasets utilized for evaluation are accessible via the following link: [Indic LLM Leaderboard Eval Suite](https://huggingface.co/collections/Cognitive-Lab/indic-llm-leaderboard-eval-suite-660ac4818695a785edee4e6f)
### Rationale for Alpha Release:
The decision to label this release as alpha stems from the realization that extensive testing and experimentation are necessary. Key considerations include:
- Selection of appropriate metrics for evaluation
- Determination of the optimal few-shot learning parameters
- Establishment of the ideal number of evaluation samples within the dataset
### Collaborative Effort:
To foster collaboration and discussion surrounding evaluations, a [WhatsApp group](https://chat.whatsapp.com/CUb6eS50lX2JHX2D4j13d1) is being established.
and we can also connect on Hugging faces discord [indic_llm channel](https://discord.com/channels/879548962464493619/1189605147068858408)
### Roadmap for Next Release:
Anticipate the following enhancements in the upcoming release:
- Enhanced testing and accountability mechanisms
- A refined version of the leaderboard
- Defined benchmarks and standardized datasets
- Bilingual evaluation support
- Expansion of supported models
- Implementation of more secure interaction mechanisms
- Addition of support for additional languages
### Benchmarks to be added/tested
- [ ] Boolq
- [ ] MMLU
- [ ] Translation - [IN22-Gen](https://huggingface.co/datasets/ai4bharat/IN22-Gen), [Flores](https://huggingface.co/datasets/facebook/flores)
- [ ] Generation - [ai4bharat/IndicSentiment](https://huggingface.co/datasets/ai4bharat/IndicSentiment), etc..
Upcoming Implementations
- [ ] Support to add VLLM for faster evaluation and inference
- [ ] Add support for onboard evaluation just like OpenLLM Leaderboard
## Conclusion:
The alpha release of the Indic LLM Leaderboard and Indic Eval signifies a significant milestone in the pursuit of standardized evaluations for Indic language models. We invite contributions and feedback from the community to further enhance and refine these tools.
For more information and updates, visit [Indic LLM Leaderboard](https://huggingface.co/spaces/Cognitive-Lab/indic_llm_leaderboard) and [Indic Eval](https://github.com/adithya-s-k/indic_eval).
Thank you for your interest and support.
"""
        )
    # About tab
    with About_tab:
        st.markdown('''
## **Why a Indic LLM Leaderboard is Required ?**
In recent months, there has been considerable progress in the Indic large language model (LLM) space. Major startups like Sarvam and Krutrim are building LLMs in this area.
Simultaneously, the open-source community is also adapting pretrained models, such as Llama, Mistral, and Gemma, for Indic languages.
Despite the influx of new models, there is a lack of a unified method to evaluate and compare them. This makes it challenging to track progress and determine what is working and what is not.
> This is the alpha release of the Indic LLM Leaderboard, and modifications will be made to the leaderboard in the future.
>
## **Who We Are**
I'm [Adithya S K](https://linktr.ee/adithyaskolavi), the founder of [CognitiveLab](https://www.cognitivelab.in/). We provide AI solutions at scale and undertake research-based tasks.
One initiative we have taken is to create a unified platform where Indic LLMs can be compared using specially crafted datasets. Although initially developed for internal use, we are now open-sourcing this framework to further aid the Indic LLM ecosystem.
After releasing [Amabri, a 7b parameter English-Kannada bilingual LLM](https://www.cognitivelab.in/blog/introducing-ambari), we wanted to compare it with other open-source LLMs to identify areas for improvement. As there wasn't an existing solution, we built the Indic LLM suite, which consists of three projects:
- [Indic-llm](https://github.com/adithya-s-k/Indic-llm): An open-source framework designed to adapt pretrained LLMs, such as Llama, Mistral, and Mixtral, to a wide array of domains and languages.
- [Indic-Eval](https://github.com/adithya-s-k/indic_eval): A lightweight evaluation suite tailored specifically for assessing Indic LLMs across a diverse range of tasks, aiding in performance assessment and comparison within the Indian language context.
- [Indic LLM Leaderboard](https://huggingface.co/spaces/Cognitive-Lab/indic_llm_leaderboard): Utilizes the [indic_eval](https://github.com/adithya-s-k/indic_eval) evaluation framework, incorporating state-of-the-art translated benchmarks like ARC, Hellaswag, MMLU, among others. Supporting seven Indic languages, it offers a comprehensive platform for assessing model performance and comparing results within the Indic language modeling landscape.
**Contribute**
All the projects are completely open source with different licenses, so anyone can contribute.
The current leaderboard is in alpha release, and many more changes are forthcoming:
- More robust benchmarks tailored for Indic languages.
- Easier integration with [indic_eval](https://github.com/adithya-s-k/indic_eval).
''')
    # FAQ tab
    with FAQ_tab:
        st.markdown('''
Boolq , MMLU , Translation is still being tested
**What is the minimum requirement for GPUs to run the evaluation?**
- The evaluation can easily run on a single A100 GPU, but the framework also supports multi-GPU based evaluation to speed up the process.
**What languages are supported by the evaluation framework?**
- The following languages are supported by default: English, Kannada, Hindi, Tamil, Telugu, Gujarati, Marathi, Malayalam.
**How can I put my model on the leaderboard?**
- Please follow the steps shown in the Submit tab or refer to the indic_eval for more details.
**How does the leaderboard work?**
- After running indic_eval on the model of your choice, the results are pushed to a server and stored in a database. The Frontend Leaderboard accesses the server and retrieves the latest models in the database along with their respective benchmarks and metadata. The entire system is deployed in India and is as secure as possible.
**How is it different from the Open LLM leaderboard?**
- This project was mainly inspired by the Open LLM leaderboard. However, due to limited computation resources, we standardized the evaluation library with standard benchmarks. You can run the evaluation on your GPUs and the leaderboard will serve as a unified platform to compare models. We used indictrans2 and other translation APIs to translate the benchmarking dataset into seven Indian languages to ensure reliability and consistency in the output.
**Why does it take so much time to load the results?**
- We are running the server on a serverless instance which has a cold start problem, so it might sometimes take a while.
**What benchmarks are offered?**
- The current Indic Benchmarks offered by the indic_eval library can be found in this collection: https://huggingface.co/collections/Cognitive-Lab/indic-llm-leaderboard-eval-suite-660ac4818695a785edee4e6f. They include ARC Easy, ARC Challenge, Hellaswag, Boolq, and MMLU.
**How much time does it take to run the evaluation using indic_eval?**
- Depending on which GPU you are running, the time for evaluation varies.
- From our testing, it takes 7 to 8 hours to run the whole evaluation on a single A100 GPU.
- It's much faster when using multiple GPUs.
**How does the verification step happen?**
- While running the evaluation, you are given an option to push results to the leaderboard with `-push_to_leaderboard <yourname@company.com>`. You will need to provide an email address through which we can contact you. If we find any anomaly in the evaluation score, we will contact you through this email for verification of results.
''')
    # Submit tab
    with Submit_tab:
        st.markdown('''
Here are the steps you will have to follows to put your model on the Indic LLM leaderboard
Clone the repo:
```bash
git clone <https://github.com/adithya-s-k/indic_eval>
cd indic_eval
```
Create a virtual environment using virtualenv or conda depending on your preferences. We require Python 3.10 or above:
```bash
conda create -n indic-eval-venv python=3.10 && conda activate indic-eval-venv
```
Install the dependencies. For the default installation, you just need:
```bash
pip install .
```
If you want to evaluate models with frameworks like `accelerate` or `peft`, you will need to specify the optional dependencies group that fits your use case (`accelerate`,`tgi`,`optimum`,`quantization`,`adapters`,`nanotron`):
```bash
pip install '.[optional1,optional2]'
```
The setup tested most is:
```bash
pip install '.[accelerate,quantization,adapters]'
```
If you want to push your results to the Hugging Face Hub, don't forget to add your access token to the environment variable `HUGGING_FACE_HUB_TOKEN`. You can do this by running:
```
huggingface-cli login
```
## Command to Run Indic Eval and Push to Indic LLM Leaderboard
```bash
accelerate launch run_indic_evals_accelerate.py \\
--model_args="pretrained=<path to model on the hub>" \\
--tasks indic_llm_leaderboard \\
--output_dir output_dir \\
--push_to_leaderboard <yourname@company.com> \\
```
It's as simple as that.πŸ‘
For `--push_to_leaderboard`, provide an email id through which we can contact you in case of verification. This email won't be shared anywhere. It's only required for future verification of the model's scores and for authenticity.
After you have installed all the required packages, run the following command:
For multi-GPU configuration, please refer to the docs of [Indic_Eval](https://github.com/adithya-s-k/indic_eval).
''')
        with st.expander(label="πŸ“™ Citation"):
            # BibTeX entry shown to users who want to cite the leaderboard.
            code = '''
@misc{indic-llm-leaderboard,
author = {Adithya S Kolavi},
title = {Indic LLM Leaderboard},
year = {2024},
publisher = {Cognitivelab},
howpublished = "url{https://huggingface.co/spaces/Cognitive-Lab/indic_llm_leaderboard}",
}
'''
            st.code(code, language='python')
# Script entry point: launch the Streamlit app when executed directly.
if __name__ == "__main__":
    main()