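"""YALL - Yet Another LLM Leaderboard.

Streamlit app that renders the Nous benchmark leaderboard: it converts the
markdown table produced by yall.create_yall() into a DataFrame, enriches it
with Hugging Face model metadata (likes, tags), and displays filterable
tables and per-benchmark bar charts.
"""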
import streamlit as st
import pandas as pd
from huggingface_hub import HfApi, ModelCard
from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError
import re
from io import StringIO
from yall import create_yall
import plotly.graph_objs as go

def calculate_pages(df, items_per_page):
    """Calculate the number of pages needed for pagination."""
    return -(-len(df) // items_per_page)  # Equivalent to math.ceil(len(df) / items_per_page)

@st.cache_data
def cached_model_info(_api, model):
    """Fetch model information from the Hugging Face API and cache the result."""
    try:
        return _api.model_info(repo_id=str(model))
    except (RepositoryNotFoundError, RevisionNotFoundError):
        return None

@st.cache_data
def get_model_info(df):
    """Get model information and update the DataFrame with likes and tags."""
    api = HfApi()
    with st.spinner("Fetching model information..."):
        for index, row in df.iterrows():
            model_info = cached_model_info(api, row['Model'].strip())
            if model_info:
                df.loc[index, 'Likes'] = model_info.likes
                df.loc[index, 'Tags'] = ', '.join(model_info.tags)
            else:
                df.loc[index, 'Likes'] = -1
                df.loc[index, 'Tags'] = ''
    return df

def convert_markdown_table_to_dataframe(md_content):
    """Convert a markdown table to a pandas DataFrame."""
    # Strip the leading and trailing pipes on every line so the table parses cleanly.
    cleaned_content = re.sub(r'\|\s*$', '', re.sub(r'^\|\s*', '', md_content, flags=re.MULTILINE), flags=re.MULTILINE)
    df = pd.read_csv(StringIO(cleaned_content), sep=r"\|", engine='python')
    # Drop the markdown separator row (---|---|...).
    df = df.drop(0, axis=0)
    df.columns = df.columns.str.strip()
    # Pull the model URL out of the markdown link and keep only the plain model name.
    model_link_pattern = r'\[(.*?)\]\((.*?)\)\s*\[.*?\]\(.*?\)'
    df['URL'] = df['Model'].apply(lambda x: re.search(model_link_pattern, x).group(2) if re.search(model_link_pattern, x) else None)
    df['Model'] = df['Model'].apply(lambda x: re.sub(model_link_pattern, r'\1', x))
    return df

def create_bar_chart(df, category):
    """Create a horizontal bar chart for the specified category."""
    st.write(f"### {category} Scores")
    sorted_df = df[['Model', category]].sort_values(by=category, ascending=True)
    fig = go.Figure(go.Bar(
        x=sorted_df[category],
        y=sorted_df['Model'],
        orientation='h',
        marker=dict(color=sorted_df[category], colorscale='Viridis'),
        hoverinfo='x+y',
        text=sorted_df[category],
        textposition='auto'
    ))
    fig.update_layout(
        margin=dict(l=20, r=20, t=20, b=20),
        title=f"Leaderboard for {category} Scores",
        height=len(df) * 35,  # Scale the chart height with the number of models.
    )
    st.plotly_chart(fig, use_container_width=True)

def fetch_merge_configs(df):
    """Fetch and save merge configurations for the top models."""
    df_sorted = df.sort_values(by='Average', ascending=False)
    try:
        with open('/tmp/configurations.txt', 'a') as file:
            for index, row in df_sorted.head(20).iterrows():
                model_name = row['Model'].rstrip()
                try:
                    card = ModelCard.load(model_name)
                    file.write(f'Model Name: {model_name}\n')
                    file.write(f'Scores: {row["Average"]}\n')
                    file.write(f'AGIEval: {row["AGIEval"]}\n')
                    file.write(f'GPT4All: {row["GPT4All"]}\n')
                    file.write(f'TruthfulQA: {row["TruthfulQA"]}\n')
                    file.write(f'Bigbench: {row["Bigbench"]}\n')
                    file.write(f'Model Card: {card}\n')
                except Exception as e:
                    st.error(f"Error loading model card for {model_name}: {str(e)}")
        # Re-read the dump and extract the YAML merge-configuration blocks embedded
        # in the model cards (everything between a 'yaml' marker and the closing fence).
        with open('/tmp/configurations.txt', 'r') as file:
            content = file.read()
            matches = re.findall(r'yaml(.*?)```', content, re.DOTALL)
        with open('/tmp/configurations2.txt', 'w') as file:
            for row, match in zip(df_sorted[['Model', 'Average', 'AGIEval', 'GPT4All', 'TruthfulQA', 'Bigbench']].head(20).values, matches):
                file.write(f'Model Name: {row[0]}\n')
                file.write(f'Scores: {row[1]}\n')
                file.write(f'AGIEval: {row[2]}\n')
                file.write(f'GPT4All: {row[3]}\n')
                file.write(f'TruthfulQA: {row[4]}\n')
                file.write(f'Bigbench: {row[5]}\n')
                file.write('yaml' + match + '```\n')
    except Exception as e:
        st.error(f"Error while fetching merge configs: {str(e)}")

def main():
    """Main function to set up the Streamlit app and display the leaderboard."""
    st.set_page_config(page_title="YALL - Yet Another LLM Leaderboard", layout="wide")
    st.title("🏆 YALL - Yet Another LLM Leaderboard")
    st.markdown("Leaderboard made with 🧐 [LLM AutoEval](https://github.com/mlabonne/llm-autoeval) using the [Nous](https://huggingface.co/NousResearch) benchmark suite.")
    content = create_yall()
    tab1, tab2 = st.tabs(["🏆 Leaderboard", "📝 About"])

    with tab1:
        if content:
            try:
                score_columns = ['Average', 'AGIEval', 'GPT4All', 'TruthfulQA', 'Bigbench']
                full_df = convert_markdown_table_to_dataframe(content)

                for col in score_columns:
                    full_df[col] = pd.to_numeric(full_df[col].str.strip(), errors='coerce')

                full_df = get_model_info(full_df)
                full_df['Tags'] = full_df['Tags'].fillna('')
                df = pd.DataFrame(columns=full_df.columns)

                show_phi = st.checkbox("Phi (2.8B)", value=True)
                show_mistral = st.checkbox("Mistral (7B)", value=True)
                show_other = st.checkbox("Other", value=True)

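                # Filter by model family using the Hugging Face tags fetched above.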
                dfs_to_concat = []
                if show_phi:
                    dfs_to_concat.append(full_df[full_df['Tags'].str.lower().str.contains('phi,|phi-msft,')])
                if show_mistral:
                    dfs_to_concat.append(full_df[full_df['Tags'].str.lower().str.contains('mistral,')])
                if show_other:
                    other_df = full_df[~full_df['Tags'].str.lower().str.contains('phi,|phi-msft,|mistral,')]
                    dfs_to_concat.append(other_df)

                if dfs_to_concat:
                    df = pd.concat(dfs_to_concat, ignore_index=True)

                search_query = st.text_input("Search models", "")
                if search_query:
                    df = df[df['Model'].str.contains(search_query, case=False, regex=False)]

                items_per_page = 50
                pages = calculate_pages(df, items_per_page)
                page = st.selectbox("Page", list(range(1, pages + 1)))

                df = df.sort_values(by='Average', ascending=False)
                start = (page - 1) * items_per_page
                end = start + items_per_page
                df = df[start:end]

                selected_benchmarks = st.multiselect('Select benchmarks to include in the average', score_columns, default=score_columns)

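                # Recompute the average over only the selected benchmarks and re-rank.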
                if selected_benchmarks:
                    df['Filtered Average'] = df[selected_benchmarks].mean(axis=1)
                    df = df.sort_values(by='Filtered Average', ascending=False)
                    st.dataframe(
                        df[['Model'] + selected_benchmarks + ['Filtered Average', 'Likes', 'URL']],
                        use_container_width=True,
                        column_config={
                            "Likes": st.column_config.NumberColumn(
                                "Likes",
                                help="Number of likes on Hugging Face",
                                format="%d ❤️",
                            ),
                            "URL": st.column_config.LinkColumn("URL"),
                        },
                        hide_index=True,
                        height=len(df) * 37,
                    )

                selected_models = st.multiselect('Select models to compare', df['Model'].unique())
                comparison_df = df[df['Model'].isin(selected_models)]
                st.dataframe(comparison_df)

                if st.button("Export to CSV"):
                    csv_data = df.to_csv(index=False)
                    st.download_button(
                        label="Download CSV",
                        data=csv_data,
                        file_name="leaderboard.csv",
                        key="download-csv",
                        help="Click to download the CSV file",
                    )
                if st.button("Fetch Merge-Configs"):
                    fetch_merge_configs(full_df)
                    st.success("Merge configurations have been fetched and saved.")

                # 'Filtered Average' only exists when at least one benchmark is selected.
                if 'Filtered Average' in df.columns:
                    create_bar_chart(df, 'Filtered Average')

                col1, col2 = st.columns(2)
                with col1:
                    create_bar_chart(df, score_columns[1])
                with col2:
                    create_bar_chart(df, score_columns[2])

                col3, col4 = st.columns(2)
                with col3:
                    create_bar_chart(df, score_columns[3])
                with col4:
                    create_bar_chart(df, score_columns[4])

            except Exception as e:
                st.error("An error occurred while processing the markdown table.")
                st.error(str(e))
        else:
            st.error("Failed to download the content from the URL provided.")
            
    with tab2:
        st.markdown('''
            ### Nous benchmark suite
            Popularized by [Teknium](https://huggingface.co/teknium) and [NousResearch](https://huggingface.co/NousResearch), this benchmark suite aggregates four benchmarks:
            * [**AGIEval**](https://arxiv.org/abs/2304.06364) (0-shot): `agieval_aqua_rat,agieval_logiqa_en,agieval_lsat_ar,agieval_lsat_lr,agieval_lsat_rc,agieval_sat_en,agieval_sat_en_without_passage,agieval_sat_math`
            * **GPT4ALL** (0-shot): `hellaswag,openbookqa,winogrande,arc_easy,arc_challenge,boolq,piqa`
            * [**TruthfulQA**](https://arxiv.org/abs/2109.07958) (0-shot): `truthfulqa_mc`
            * [**Bigbench**](https://arxiv.org/abs/2206.04615) (0-shot): `bigbench_causal_judgement,bigbench_date_understanding,bigbench_disambiguation_qa,bigbench_geometric_shapes,bigbench_logical_deduction_five_objects,bigbench_logical_deduction_seven_objects,bigbench_logical_deduction_three_objects,bigbench_movie_recommendation,bigbench_navigate,bigbench_reasoning_about_colored_objects,bigbench_ruin_names,bigbench_salient_translation_error_detection,bigbench_snarks,bigbench_sports_understanding,bigbench_temporal_sequences,bigbench_tracking_shuffled_objects_five_objects,bigbench_tracking_shuffled_objects_seven_objects,bigbench_tracking_shuffled_objects_three_objects`
            ### Reproducibility
            You can easily reproduce these results using 🧐 [LLM AutoEval](https://github.com/mlabonne/llm-autoeval/tree/master), a Colab notebook that automates the evaluation process (benchmark: `nous`). This will upload the results to GitHub as gists. You can find the entire table with the links to the detailed results [here](https://gist.github.com/mlabonne/90294929a2dbcb8877f9696f28105fdf).
            ### Clone this space
            You can create your own leaderboard with your LLM AutoEval results on GitHub Gist. You just need to clone this space and specify two variables:
            * Change the `gist_id` in [yall.py](https://huggingface.co/spaces/mlabonne/Yet_Another_LLM_Leaderboard/blob/main/yall.py#L126).
            * Create "New Secret" in Settings > Variables and secrets (name: "github", value: [your GitHub token](https://github.com/settings/tokens))
            A special thanks to [gblazex](https://huggingface.co/gblazex) for providing many evaluations.
        ''')

if __name__ == "__main__":
    main()