Ludwig Stumpp committed on
Commit
cfeff2f
2 Parent(s): 5a5e2af 1c71762

Merge branch 'main' into hf-launch

Browse files
Files changed (7) hide show
  1. README.md +1 -0
  2. poetry.lock +0 -0
  3. pyproject.toml +14 -0
  4. requirements-dev.txt +0 -4
  5. requirements.txt +0 -2
  6. setup.cfg +2 -0
  7. streamlit_app.py +35 -13
README.md CHANGED
@@ -56,6 +56,7 @@ https://huggingface.co/spaces/ludwigstumpp/llm-leaderboard
56
  | [llama-13b](https://arxiv.org/abs/2302.13971) | Meta AI | no | [932](https://lmsys.org/blog/2023-05-03-arena/) | | [0.792](https://arxiv.org/abs/2302.13971) | | [0.158](https://drive.google.com/file/d/1cN-b9GnWtHzQRoE7M7gAEyivY0kl4BYs/view) | | | | | | | [0.730](https://arxiv.org/abs/2302.13971v1) | | |
57
  | [llama-33b](https://arxiv.org/abs/2302.13971) | Meta AI | no | | | [0.828](https://arxiv.org/abs/2302.13971) | | [0.217](https://drive.google.com/file/d/1cN-b9GnWtHzQRoE7M7gAEyivY0kl4BYs/view) | | | | | | | [0.760](https://arxiv.org/abs/2302.13971v1) | | |
58
  | [llama-65b](https://arxiv.org/abs/2302.13971) | Meta AI | no | | | [0.842](https://arxiv.org/abs/2302.13971) | | [0.237](https://drive.google.com/file/d/1cN-b9GnWtHzQRoE7M7gAEyivY0kl4BYs/view) | | | | [0.634](https://arxiv.org/abs/2302.13971v1) | | | [0.770](https://arxiv.org/abs/2302.13971v1) | | |
 
59
  | [mpt-7b](https://huggingface.co/mosaicml/mpt-7b) | MosaicML | yes | | | [0.761](https://www.mosaicml.com/blog/mpt-7b) | | | [0.702](https://www.mosaicml.com/blog/mpt-7b) | | [0.296](https://www.mosaicml.com/blog/mpt-7b) | | [0.343](https://www.mosaicml.com/blog/mpt-7b) | | | | |
60
  | [oasst-pythia-12b](https://huggingface.co/OpenAssistant/pythia-12b-pre-v8-12.5k-steps) | Open Assistant | yes | [1065](https://lmsys.org/blog/2023-05-03-arena/) | | [0.681](https://gpt4all.io/reports/GPT4All_Technical_Report_3.pdf) | | | | | | | | | [0.650](https://gpt4all.io/reports/GPT4All_Technical_Report_3.pdf) | | |
61
  | [opt-7b](https://huggingface.co/facebook/opt-6.7b) | Meta AI | no | | | [0.677](https://www.mosaicml.com/blog/mpt-7b) | | | [0.677](https://www.mosaicml.com/blog/mpt-7b) | | [0.251](https://www.mosaicml.com/blog/mpt-7b) | | [0.227](https://www.mosaicml.com/blog/mpt-7b) | | | | |
 
56
  | [llama-13b](https://arxiv.org/abs/2302.13971) | Meta AI | no | [932](https://lmsys.org/blog/2023-05-03-arena/) | | [0.792](https://arxiv.org/abs/2302.13971) | | [0.158](https://drive.google.com/file/d/1cN-b9GnWtHzQRoE7M7gAEyivY0kl4BYs/view) | | | | | | | [0.730](https://arxiv.org/abs/2302.13971v1) | | |
57
  | [llama-33b](https://arxiv.org/abs/2302.13971) | Meta AI | no | | | [0.828](https://arxiv.org/abs/2302.13971) | | [0.217](https://drive.google.com/file/d/1cN-b9GnWtHzQRoE7M7gAEyivY0kl4BYs/view) | | | | | | | [0.760](https://arxiv.org/abs/2302.13971v1) | | |
58
  | [llama-65b](https://arxiv.org/abs/2302.13971) | Meta AI | no | | | [0.842](https://arxiv.org/abs/2302.13971) | | [0.237](https://drive.google.com/file/d/1cN-b9GnWtHzQRoE7M7gAEyivY0kl4BYs/view) | | | | [0.634](https://arxiv.org/abs/2302.13971v1) | | | [0.770](https://arxiv.org/abs/2302.13971v1) | | |
59
+ | [llama-2-70b](https://arxiv.org/abs/2307.09288) | Meta AI | yes | | [0.873](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) | | | | | | | [0.698](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) | | | | | |
60
  | [mpt-7b](https://huggingface.co/mosaicml/mpt-7b) | MosaicML | yes | | | [0.761](https://www.mosaicml.com/blog/mpt-7b) | | | [0.702](https://www.mosaicml.com/blog/mpt-7b) | | [0.296](https://www.mosaicml.com/blog/mpt-7b) | | [0.343](https://www.mosaicml.com/blog/mpt-7b) | | | | |
61
  | [oasst-pythia-12b](https://huggingface.co/OpenAssistant/pythia-12b-pre-v8-12.5k-steps) | Open Assistant | yes | [1065](https://lmsys.org/blog/2023-05-03-arena/) | | [0.681](https://gpt4all.io/reports/GPT4All_Technical_Report_3.pdf) | | | | | | | | | [0.650](https://gpt4all.io/reports/GPT4All_Technical_Report_3.pdf) | | |
62
  | [opt-7b](https://huggingface.co/facebook/opt-6.7b) | Meta AI | no | | | [0.677](https://www.mosaicml.com/blog/mpt-7b) | | | [0.677](https://www.mosaicml.com/blog/mpt-7b) | | [0.251](https://www.mosaicml.com/blog/mpt-7b) | | [0.227](https://www.mosaicml.com/blog/mpt-7b) | | | | |
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.poetry]
2
+ package-mode = false
3
+ description = ""
4
+ authors = ["Ludwig Stumpp <42147848+LudwigStumpp@users.noreply.github.com>"]
5
+ readme = "README.md"
6
+
7
+ [tool.poetry.dependencies]
8
+ python = "^3.10"
9
+ pandas = "^2.2.2"
10
+ streamlit = "^1.37.1"
11
+
12
+ [build-system]
13
+ requires = ["poetry-core"]
14
+ build-backend = "poetry.core.masonry.api"
requirements-dev.txt DELETED
@@ -1,4 +0,0 @@
1
- black
2
- flake
3
- isort
4
- mypy
 
 
 
 
 
requirements.txt DELETED
@@ -1,2 +0,0 @@
1
- pandas~=2.0.1
2
- streamlit~=1.22.0
 
 
 
setup.cfg ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ [flake8]
2
+ max-line-length = 88
streamlit_app.py CHANGED
@@ -4,7 +4,8 @@ from collections.abc import Iterable
4
 
5
  import pandas as pd
6
  import streamlit as st
7
- from pandas.api.types import is_bool_dtype, is_datetime64_any_dtype, is_numeric_dtype
 
8
 
9
  GITHUB_URL = "https://github.com/LudwigStumpp/llm-leaderboard"
10
  NON_BENCHMARK_COLS = ["Open?", "Publisher"]
@@ -22,11 +23,13 @@ def extract_table_and_format_from_markdown_text(markdown_table: str) -> pd.DataF
22
  df = (
23
  pd.read_table(io.StringIO(markdown_table), sep="|", header=0, index_col=1)
24
  .dropna(axis=1, how="all") # drop empty columns
25
- .iloc[1:] # drop first row which is the "----" separator of the original markdown table
 
 
26
  .sort_index(ascending=True)
27
  .apply(lambda x: x.str.strip() if x.dtype == "object" else x)
28
  .replace("", float("NaN"))
29
- .astype(float, errors="ignore")
30
  )
31
 
32
  # remove whitespace from column names and index
@@ -37,7 +40,9 @@ def extract_table_and_format_from_markdown_text(markdown_table: str) -> pd.DataF
37
  return df
38
 
39
 
40
- def extract_markdown_table_from_multiline(multiline: str, table_headline: str, next_headline_start: str = "#") -> str:
 
 
41
  """Extracts the markdown table from a multiline string.
42
 
43
  Args:
@@ -89,7 +94,9 @@ def remove_markdown_links(text: str) -> str:
89
  return text
90
 
91
 
92
- def filter_dataframe_by_row_and_columns(df: pd.DataFrame, ignore_columns: list[str] | None = None) -> pd.DataFrame:
 
 
93
  """
94
  Filter dataframe by the rows and columns to display.
95
 
@@ -116,7 +123,8 @@ def filter_dataframe_by_row_and_columns(df: pd.DataFrame, ignore_columns: list[s
116
  df = pd.DataFrame(df.loc[to_filter_index])
117
 
118
  to_filter_columns = st.multiselect(
119
- "Filter by benchmark:", sorted([c for c in df.columns if c not in ignore_columns])
 
120
  )
121
  if to_filter_columns:
122
  df = pd.DataFrame(df[ignore_columns + to_filter_columns])
@@ -173,7 +181,9 @@ def filter_dataframe_by_column_values(df: pd.DataFrame) -> pd.DataFrame:
173
  ),
174
  )
175
  if isinstance(user_date_input, Iterable) and len(user_date_input) == 2:
176
- user_date_input_datetime = tuple(map(pd.to_datetime, user_date_input))
 
 
177
  start_date, end_date = user_date_input_datetime
178
  df = df.loc[df[column].between(start_date, end_date)]
179
 
@@ -207,22 +217,30 @@ def setup_basic():
207
 
208
 
209
  def setup_leaderboard(readme: str):
210
- leaderboard_table = extract_markdown_table_from_multiline(readme, table_headline="## Leaderboard")
 
 
211
  leaderboard_table = remove_markdown_links(leaderboard_table)
212
  df_leaderboard = extract_table_and_format_from_markdown_text(leaderboard_table)
213
- df_leaderboard["Open?"] = df_leaderboard["Open?"].map({"yes": 1, "no": 0}).astype(bool)
 
 
214
 
215
  st.markdown("## Leaderboard")
216
  modify = st.checkbox("Add filters")
217
  clear_empty_entries = st.checkbox("Clear empty entries", value=True)
218
 
219
  if modify:
220
- df_leaderboard = filter_dataframe_by_row_and_columns(df_leaderboard, ignore_columns=NON_BENCHMARK_COLS)
 
 
221
  df_leaderboard = filter_dataframe_by_column_values(df_leaderboard)
222
 
223
  if clear_empty_entries:
224
  df_leaderboard = df_leaderboard.dropna(axis=1, how="all")
225
- benchmark_columns = [c for c in df_leaderboard.columns if df_leaderboard[c].dtype == float]
 
 
226
  rows_wo_any_benchmark = df_leaderboard[benchmark_columns].isna().all(axis=1)
227
  df_leaderboard = df_leaderboard[~rows_wo_any_benchmark]
228
 
@@ -246,12 +264,16 @@ def setup_leaderboard(readme: str):
246
 
247
 
248
  def setup_benchmarks(readme: str):
249
- benchmarks_table = extract_markdown_table_from_multiline(readme, table_headline="## Benchmarks")
 
 
250
  df_benchmarks = extract_table_and_format_from_markdown_text(benchmarks_table)
251
 
252
  st.markdown("## Covered Benchmarks")
253
 
254
- selected_benchmark = st.selectbox("Select a benchmark to learn more:", df_benchmarks.index.unique())
 
 
255
  df_selected = df_benchmarks.loc[selected_benchmark]
256
  text = [
257
  f"Name: {selected_benchmark}",
 
4
 
5
  import pandas as pd
6
  import streamlit as st
7
+ from pandas.api.types import (is_bool_dtype, is_datetime64_any_dtype,
8
+ is_numeric_dtype)
9
 
10
  GITHUB_URL = "https://github.com/LudwigStumpp/llm-leaderboard"
11
  NON_BENCHMARK_COLS = ["Open?", "Publisher"]
 
23
  df = (
24
  pd.read_table(io.StringIO(markdown_table), sep="|", header=0, index_col=1)
25
  .dropna(axis=1, how="all") # drop empty columns
26
+ .iloc[
27
+ 1:
28
+ ] # drop first row which is the "----" separator of the original markdown table
29
  .sort_index(ascending=True)
30
  .apply(lambda x: x.str.strip() if x.dtype == "object" else x)
31
  .replace("", float("NaN"))
32
+ .apply(pd.to_numeric, errors="ignore")
33
  )
34
 
35
  # remove whitespace from column names and index
 
40
  return df
41
 
42
 
43
+ def extract_markdown_table_from_multiline(
44
+ multiline: str, table_headline: str, next_headline_start: str = "#"
45
+ ) -> str:
46
  """Extracts the markdown table from a multiline string.
47
 
48
  Args:
 
94
  return text
95
 
96
 
97
+ def filter_dataframe_by_row_and_columns(
98
+ df: pd.DataFrame, ignore_columns: list[str] | None = None
99
+ ) -> pd.DataFrame:
100
  """
101
  Filter dataframe by the rows and columns to display.
102
 
 
123
  df = pd.DataFrame(df.loc[to_filter_index])
124
 
125
  to_filter_columns = st.multiselect(
126
+ "Filter by benchmark:",
127
+ sorted([c for c in df.columns if c not in ignore_columns]),
128
  )
129
  if to_filter_columns:
130
  df = pd.DataFrame(df[ignore_columns + to_filter_columns])
 
181
  ),
182
  )
183
  if isinstance(user_date_input, Iterable) and len(user_date_input) == 2:
184
+ user_date_input_datetime = tuple(
185
+ map(pd.to_datetime, user_date_input)
186
+ )
187
  start_date, end_date = user_date_input_datetime
188
  df = df.loc[df[column].between(start_date, end_date)]
189
 
 
217
 
218
 
219
  def setup_leaderboard(readme: str):
220
+ leaderboard_table = extract_markdown_table_from_multiline(
221
+ readme, table_headline="## Leaderboard"
222
+ )
223
  leaderboard_table = remove_markdown_links(leaderboard_table)
224
  df_leaderboard = extract_table_and_format_from_markdown_text(leaderboard_table)
225
+ df_leaderboard["Open?"] = (
226
+ df_leaderboard["Open?"].map({"yes": 1, "no": 0}).astype(bool)
227
+ )
228
 
229
  st.markdown("## Leaderboard")
230
  modify = st.checkbox("Add filters")
231
  clear_empty_entries = st.checkbox("Clear empty entries", value=True)
232
 
233
  if modify:
234
+ df_leaderboard = filter_dataframe_by_row_and_columns(
235
+ df_leaderboard, ignore_columns=NON_BENCHMARK_COLS
236
+ )
237
  df_leaderboard = filter_dataframe_by_column_values(df_leaderboard)
238
 
239
  if clear_empty_entries:
240
  df_leaderboard = df_leaderboard.dropna(axis=1, how="all")
241
+ benchmark_columns = [
242
+ c for c in df_leaderboard.columns if df_leaderboard[c].dtype == float
243
+ ]
244
  rows_wo_any_benchmark = df_leaderboard[benchmark_columns].isna().all(axis=1)
245
  df_leaderboard = df_leaderboard[~rows_wo_any_benchmark]
246
 
 
264
 
265
 
266
  def setup_benchmarks(readme: str):
267
+ benchmarks_table = extract_markdown_table_from_multiline(
268
+ readme, table_headline="## Benchmarks"
269
+ )
270
  df_benchmarks = extract_table_and_format_from_markdown_text(benchmarks_table)
271
 
272
  st.markdown("## Covered Benchmarks")
273
 
274
+ selected_benchmark = st.selectbox(
275
+ "Select a benchmark to learn more:", df_benchmarks.index.unique()
276
+ )
277
  df_selected = df_benchmarks.loc[selected_benchmark]
278
  text = [
279
  f"Name: {selected_benchmark}",