CultriX committed on
Commit
8a0dd37
•
1 Parent(s): cf543f7

Upload app.py

Files changed (1)
  1. app.py +331 -0
app.py ADDED
@@ -0,0 +1,331 @@
+ # Importing necessary libraries
+ import re
+ import streamlit as st
+ import requests
+ import pandas as pd
+ from io import StringIO
+ import plotly.graph_objs as go
+ from huggingface_hub import HfApi
+ from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError
+ from yall import create_yall
+ from functools import cache
+ from itertools import combinations  # needed by calculate_highest_combined_score below
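+
+ # Note: create_yall() (from the local yall.py module) is expected to return the
+ # leaderboard as a markdown table fetched from a GitHub gist; see the "Clone this
+ # space" section in the About tab below.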
+
+ # Function to get model info from Hugging Face API using caching
+ @cache
+ def cached_model_info(api, model):
+     try:
+         return api.model_info(repo_id=str(model))
+     except (RepositoryNotFoundError, RevisionNotFoundError):
+         return None
+
+ # Function to update the DataFrame with likes and tags fetched from the Hugging Face API
+ @st.cache_data
+ def get_model_info(df):
+     api = HfApi()
+
+     # Initialize new columns for likes and tags
+     df['Likes'] = None
+     df['Tags'] = None
+
+     # Iterate through DataFrame rows, marking models that no longer resolve with -1 likes
+     for index, row in df.iterrows():
+         model_info = cached_model_info(api, row['Model'].strip())
+         if model_info:
+             df.loc[index, 'Likes'] = model_info.likes
+             df.loc[index, 'Tags'] = ', '.join(model_info.tags)
+         else:
+             df.loc[index, 'Likes'] = -1
+             df.loc[index, 'Tags'] = ''
+     return df
+
+ # Function to convert markdown table to DataFrame and extract Hugging Face URLs
+ def convert_markdown_table_to_dataframe(md_content):
+     """
+     Converts a markdown table to a Pandas DataFrame, handling special characters and links,
+     extracts Hugging Face URLs, and adds them to a new column.
+     """
+     # Remove leading and trailing | characters on each line
+     cleaned_content = re.sub(r'\|\s*$', '', re.sub(r'^\|\s*', '', md_content, flags=re.MULTILINE), flags=re.MULTILINE)
+
+     # Create DataFrame from cleaned content
+     df = pd.read_csv(StringIO(cleaned_content), sep="\|", engine='python')
+
+     # Remove the markdown alignment row that follows the header
+     df = df.drop(0, axis=0)
+
+     # Strip whitespace from column names
+     df.columns = df.columns.str.strip()
+
+     # Extract Hugging Face URLs and add them to a new column
+     model_link_pattern = r'\[(.*?)\]\((.*?)\)\s*\[.*?\]\(.*?\)'
+     df['URL'] = df['Model'].apply(lambda x: re.search(model_link_pattern, x).group(2) if re.search(model_link_pattern, x) else None)
+
+     # Clean Model column to have only the model link text
+     df['Model'] = df['Model'].apply(lambda x: re.sub(model_link_pattern, r'\1', x))
+
+     return df
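+
+ # For reference, the parser above assumes leaderboard rows shaped roughly like this
+ # (hypothetical model name and scores; the Model cell holds two adjacent links,
+ # the model page followed by the evaluation gist, which is what the regex expects):
+ # | Model | Average | AGIEval | GPT4All | TruthfulQA | Bigbench |
+ # |---|---|---|---|---|---|
+ # | [example-7B](https://huggingface.co/org/example-7B) [📄](https://gist.github.com/...) | 52.4 | 38.1 | 71.2 | 55.9 | 44.3 |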
+
+ # Define the score columns (global for use in calculations)
+ score_columns = ['Average', 'AGIEval', 'GPT4All', 'TruthfulQA', 'Bigbench']
+
+ # Function to calculate the highest combined scores for a given column across
+ # combinations of 2 to 6 models, keeping the top 3 combinations per size
+ def calculate_highest_combined_score(data, column):
+     scores = data[column].tolist()
+     models = data['Model'].tolist()
+     top_combinations = {2: [], 3: [], 4: [], 5: [], 6: []}
+     for r in range(2, 7):
+         for combination in combinations(zip(scores, models), r):
+             combined_score = sum(score for score, _ in combination)
+             top_combinations[r].append((combined_score, tuple(model for _, model in combination)))
+         top_combinations[r] = sorted(top_combinations[r], key=lambda x: x[0], reverse=True)[:3]
+     return column, top_combinations
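+
+ # Worked example (hypothetical scores): with Average scores {A: 70, B: 65, C: 60},
+ # the best 2-model combination is (A, B) with a combined score of 135. Note that
+ # this enumerates C(n, r) subsets for every r in 2..6, which grows rapidly with
+ # the number of models n, so it is only practical for small leaderboards.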
+
+ # Function to display the results of the highest combined scores
+ def display_highest_combined_scores(data):
+     with st.spinner('Calculating highest combined scores...'):
+         results = [calculate_highest_combined_score(data, col) for col in score_columns]
+         for column, top_combinations in results:
+             st.subheader(f"Top Combinations for {column}")
+             # 'combos' avoids shadowing itertools.combinations imported above
+             for r, combos in top_combinations.items():
+                 st.write(f"**Number of Models: {r}**")
+                 for score, combination in combos:
+                     st.write(f"Score: {score}, Models: {', '.join(combination)}")
+
+ # Function to create bar chart for a given category
+ def create_bar_chart(df, category):
+     """Create and display a bar chart for a given category."""
+     st.write(f"### {category} Scores")
+
+     # Sort the DataFrame based on the category score
+     sorted_df = df[['Model', category]].sort_values(by=category, ascending=True)
+
+     # Create the bar chart with a color gradient (using the 'Spectral' color scale;
+     # swap in any other Plotly color scale if preferred)
+     fig = go.Figure(go.Bar(
+         x=sorted_df[category],
+         y=sorted_df['Model'],
+         orientation='h',
+         marker=dict(color=sorted_df[category], colorscale='Spectral')
+     ))
+
+     # Update layout for better readability; scale the chart height with the number
+     # of rows (set here rather than on st.plotly_chart, which ignores a height kwarg)
+     fig.update_layout(
+         margin=dict(l=20, r=20, t=20, b=20),
+         height=len(df) * 35
+     )
+
+     st.plotly_chart(fig, use_container_width=True)
+
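+ # Usage sketch (hypothetical call): create_bar_chart(df, 'Average') draws a
+ # horizontal bar chart; because rows are sorted in ascending order and Plotly
+ # draws horizontal bars bottom-up, the highest-scoring model lands on top.
+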
+ # Main function to run the Streamlit app
+ def main():
+     # Set page configuration and title
+     st.set_page_config(page_title="YALL - Yet Another LLM Leaderboard", layout="wide")
+
+     st.title("🏆 YALL - Yet Another LLM Leaderboard")
+     st.markdown("Leaderboard made with 🧐 [LLM AutoEval](https://github.com/mlabonne/llm-autoeval) using the [Nous](https://huggingface.co/NousResearch) benchmark suite.")
+
+     # Create tabs for the leaderboard and about section
+     content = create_yall()
+     tab1, tab2 = st.tabs(["🏆 Leaderboard", "📝 About"])
+
+     # Leaderboard tab
+     with tab1:
+         if content:
+             try:
+                 score_columns = ['Average', 'AGIEval', 'GPT4All', 'TruthfulQA', 'Bigbench']
+
+                 # Build the DataFrame from the markdown table
+                 full_df = convert_markdown_table_to_dataframe(content)
+
+                 # Convert score columns to numeric, coercing malformed entries to NaN
+                 for col in score_columns:
+                     full_df[col] = pd.to_numeric(full_df[col].str.strip(), errors='coerce')
+
+                 full_df = get_model_info(full_df)
+                 full_df['Tags'] = full_df['Tags'].fillna('')
+                 df = pd.DataFrame(columns=full_df.columns)
+
+                 # Toggles for filtering by tags
+                 show_phi = st.checkbox("Phi (2.8B)", value=True)
+                 show_mistral = st.checkbox("Mistral (7B)", value=True)
+                 show_other = st.checkbox("Other", value=True)
+
+                 # Create a DataFrame based on selected filters
+                 dfs_to_concat = []
+
+                 if show_phi:
+                     dfs_to_concat.append(full_df[full_df['Tags'].str.lower().str.contains('phi,|phi-msft,')])
+                 if show_mistral:
+                     dfs_to_concat.append(full_df[full_df['Tags'].str.lower().str.contains('mistral,')])
+                 if show_other:
+                     other_df = full_df[~full_df['Tags'].str.lower().str.contains('phi,|phi-msft,|mistral,')]
+                     dfs_to_concat.append(other_df)
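+                 # Note: get_model_info joins tags with ', ', so the trailing comma in 'phi,'
+                 # prevents prefix matches on other tags (e.g. 'phi-msft', matched explicitly);
+                 # a model whose tag list ends in 'phi' would therefore fall through to "Other".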
+
+                 # Concatenate the DataFrames
+                 if dfs_to_concat:
+                     df = pd.concat(dfs_to_concat, ignore_index=True)
+
+                 # Add a search bar
+                 search_query = st.text_input("Search models", "")
+
+                 # Filter the DataFrame based on the search query
+                 if search_query:
+                     df = df[df['Model'].str.contains(search_query, case=False)]
+
+                 # Display the filtered DataFrame or the entire leaderboard
+                 st.dataframe(
+                     df[['Model'] + score_columns + ['Likes', 'URL']],
+                     use_container_width=True,
+                     column_config={
+                         "Likes": st.column_config.NumberColumn(
+                             "Likes",
+                             help="Number of likes on Hugging Face",
+                             format="%d ❤️",
+                         ),
+                         "URL": st.column_config.LinkColumn("URL"),
+                     },
+                     hide_index=True,
+                     height=len(df) * 37,
+                 )
+
+                 # Compare selected models side by side
+                 selected_models = st.multiselect('Select models to compare', df['Model'].unique())
+                 comparison_df = df[df['Model'].isin(selected_models)]
+                 st.dataframe(comparison_df)
+
+                 # Add a button to export data to CSV
+                 if st.button("Export to CSV"):
+                     # Export the DataFrame to CSV
+                     csv_data = df.to_csv(index=False)
+
+                     # Create a download link for the CSV file
+                     st.download_button(
+                         label="Download CSV",
+                         data=csv_data,
+                         file_name="leaderboard.csv",
+                         key="download-csv",
+                         help="Click to download the CSV file",
+                     )
+
+                 # Full-width plot for the first category
+                 create_bar_chart(df, score_columns[0])
+
+                 # Next two plots in two columns
+                 col1, col2 = st.columns(2)
+                 with col1:
+                     create_bar_chart(df, score_columns[1])
+                 with col2:
+                     create_bar_chart(df, score_columns[2])
+
+                 # Last two plots in two columns
+                 col3, col4 = st.columns(2)
+                 with col3:
+                     create_bar_chart(df, score_columns[3])
+                 with col4:
+                     create_bar_chart(df, score_columns[4])
+
+             except Exception as e:
+                 st.error("An error occurred while processing the markdown table.")
+                 st.error(str(e))
+         else:
+             st.error("Failed to download the content from the URL provided.")
+
+     # About tab
+     with tab2:
+         st.markdown('''
+         ### Nous benchmark suite
+
+         Popularized by [Teknium](https://huggingface.co/teknium) and [NousResearch](https://huggingface.co/NousResearch), this benchmark suite aggregates four benchmarks:
+
+         * [**AGIEval**](https://arxiv.org/abs/2304.06364) (0-shot): `agieval_aqua_rat,agieval_logiqa_en,agieval_lsat_ar,agieval_lsat_lr,agieval_lsat_rc,agieval_sat_en,agieval_sat_en_without_passage,agieval_sat_math`
+         * **GPT4ALL** (0-shot): `hellaswag,openbookqa,winogrande,arc_easy,arc_challenge,boolq,piqa`
+         * [**TruthfulQA**](https://arxiv.org/abs/2109.07958) (0-shot): `truthfulqa_mc`
+         * [**Bigbench**](https://arxiv.org/abs/2206.04615) (0-shot): `bigbench_causal_judgement,bigbench_date_understanding,bigbench_disambiguation_qa,bigbench_geometric_shapes,bigbench_logical_deduction_five_objects,bigbench_logical_deduction_seven_objects,bigbench_logical_deduction_three_objects,bigbench_movie_recommendation,bigbench_navigate,bigbench_reasoning_about_colored_objects,bigbench_ruin_names,bigbench_salient_translation_error_detection,bigbench_snarks,bigbench_sports_understanding,bigbench_temporal_sequences,bigbench_tracking_shuffled_objects_five_objects,bigbench_tracking_shuffled_objects_seven_objects,bigbench_tracking_shuffled_objects_three_objects`
+
+         ### Reproducibility
+
+         You can easily reproduce these results using 🧐 [LLM AutoEval](https://github.com/mlabonne/llm-autoeval/tree/master), a Colab notebook that automates the evaluation process (benchmark: `nous`). This will upload the results to GitHub as gists. You can find the entire table with the links to the detailed results [here](https://gist.github.com/mlabonne/90294929a2dbcb8877f9696f28105fdf).
+
+         ### Clone this space
+
+         You can create your own leaderboard with your LLM AutoEval results on GitHub Gist. You just need to clone this space and specify two variables:
+
+         * Change the `gist_id` in [yall.py](https://huggingface.co/spaces/mlabonne/Yet_Another_LLM_Leaderboard/blob/main/yall.py#L126), as sketched below.
+         * Create a "New Secret" in Settings > Variables and secrets (name: "github", value: [your GitHub token](https://github.com/settings/tokens)).
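+
+         For example (hypothetical value), the first step amounts to editing a single assignment in yall.py:
+
+         ```python
+         gist_id = "YOUR_GIST_ID"  # the ID portion of your gist's URL
+         ```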
+
+         A special thanks to [gblazex](https://huggingface.co/gblazex) for providing many evaluations.
+         ''')
+
+ # Run the main function if this script is run directly
+ if __name__ == "__main__":
+     main()