pavlichenko committed on
Commit
2940af9
1 Parent(s): 4c379fb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -4
app.py CHANGED
@@ -6,8 +6,19 @@ import pandas as pd
6
  description = """The Toloka LLM leaderboard provides a human evaluation framework. Here, we invite annotators from the [Toloka](https://toloka.ai/) crowdsourcing platform to assess the model's responses. For this purpose, responses are generated by open-source LLMs based on a dataset of real-world user prompts. These prompts are categorized as per the [InstructGPT paper](https://arxiv.org/abs/2203.02155). Subsequently, annotators evaluate these responses in the manner of [AlpacaEval](https://tatsu-lab.github.io/alpaca_eval/). It's worth noting that we employ [Guanaco 13B](https://huggingface.co/timdettmers/guanaco-13b) instead of text-davinci-003. This is because Guanaco 13B is the closest counterpart to the now-deprecated text-davinci-003 in AlpacaEval.
7
  The metrics on the leaderboard represent the win rate of the respective model in comparison to Guanaco 13B across various prompt categories. The "all" category denotes the aggregation of all prompts and is not a mere average of metrics from individual categories."""
8
 
 
 
 
 
 
 
 
 
 
 
9
  leaderboard_results = requests.get("https://llmleaderboard.blob.core.windows.net/llmleaderboard/evaluation_resuls.json").json()
10
  categories = list(leaderboard_results.keys())
 
11
  categories.sort()
12
  models = set()
13
 
@@ -25,18 +36,20 @@ table = []
25
  for model in models:
26
  row = [model]
27
  for category in categories:
 
 
28
  if category not in model_ratings[model]:
29
- ignore = True
30
  else:
31
  row.append(model_ratings[model][category] * 100)
32
  table.append(row)
33
 
34
- table = pd.DataFrame(table, columns=['Model'] + categories)
35
- table = table.sort_values(by=['all'], ascending=False)
36
  table = table.head(5)
37
  table.index = range(1, len(table) + 1)
38
 
39
- for category in categories:
40
  table[category] = table[category].map('{:,.2f}%'.format)
41
 
42
  st.set_page_config(layout="wide")
 
6
  description = """The Toloka LLM leaderboard provides a human evaluation framework. Here, we invite annotators from the [Toloka](https://toloka.ai/) crowdsourcing platform to assess the model's responses. For this purpose, responses are generated by open-source LLMs based on a dataset of real-world user prompts. These prompts are categorized as per the [InstructGPT paper](https://arxiv.org/abs/2203.02155). Subsequently, annotators evaluate these responses in the manner of [AlpacaEval](https://tatsu-lab.github.io/alpaca_eval/). It's worth noting that we employ [Guanaco 13B](https://huggingface.co/timdettmers/guanaco-13b) instead of text-davinci-003. This is because Guanaco 13B is the closest counterpart to the now-deprecated text-davinci-003 in AlpacaEval.
7
  The metrics on the leaderboard represent the win rate of the respective model in comparison to Guanaco 13B across various prompt categories. The "all" category denotes the aggregation of all prompts and is not a mere average of metrics from individual categories."""
8
 
9
+ pretty_category_names = {
10
+ "all": "Total",
11
+ "brainstorming": "Brainstorming",
12
+ "closed_qa": "Closed QA",
13
+ "generation": "Generation",
14
+ "open_qa": "Open QA",
15
+ "rewrite": "Rewrite",
16
+ }
17
+
18
+
19
  leaderboard_results = requests.get("https://llmleaderboard.blob.core.windows.net/llmleaderboard/evaluation_resuls.json").json()
20
  categories = list(leaderboard_results.keys())
21
+ pretty_categories = [pretty_category_names[category] for category in categories if category in pretty_category_names]
22
  categories.sort()
23
  models = set()
24
 
 
36
  for model in models:
37
  row = [model]
38
  for category in categories:
39
+ if category not in pretty_category_names:
40
+ continue
41
  if category not in model_ratings[model]:
42
+ row.append(0.0)
43
  else:
44
  row.append(model_ratings[model][category] * 100)
45
  table.append(row)
46
 
47
+ table = pd.DataFrame(table, columns=['Model'] + pretty_categories)
48
+ table = table.sort_values(by=['Total'], ascending=False)
49
  table = table.head(5)
50
  table.index = range(1, len(table) + 1)
51
 
52
+ for category in pretty_category_names.values():
53
  table[category] = table[category].map('{:,.2f}%'.format)
54
 
55
  st.set_page_config(layout="wide")