sherzod-hakimov Koshti10 commited on
Commit
e16fd64
·
1 Parent(s): c69d931

Initial commit (#1)

Browse files

- Initial commit (d4c52034a11be912014a2ecc30b9c8a9d4ca6ba3)


Co-authored-by: Koshti <Koshti10@users.noreply.huggingface.co>

app.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ from src.assets.text_content import TITLE, INTRODUCTION_TEXT
4
+ from src.utils import get_data, compare_plots, filter_search
5
+
6
############################ For Leaderboards #############################
DATA_PATH = 'versions'
latest_flag = True  # Set flag to include latest data in Details and Versions Tab
latest_df, latest_vname, previous_df, previous_vname = get_data(DATA_PATH, latest_flag)

# Frame shown in the "Versions and Details" tab before the user picks a version.
prev_df = previous_df[0]


def select_prev_df(name):
    '''
    Look up the dataframe that belongs to a version name
    Args:
        name: Version name as listed in previous_vname
    Returns:
        The dataframe stored at the same position in previous_df
    Raises:
        ValueError: If name is not one of the known version names
    '''
    ind = previous_vname.index(name)
    return previous_df[ind]


def _select_version_tables(name):
    '''
    Return the selected version twice: once for the visible table and once for
    the hidden table that the search bar filters from. Both must be refreshed
    together, otherwise a search after a version switch would still filter the
    version that was loaded at start-up.
    '''
    selected = select_prev_df(name)
    return selected, selected


############################ For Plots ####################################
plot_df = latest_df[0]
MODEL_COLS = list(plot_df['Model'].unique())


############# MAIN APPLICATION ######################
demo = gr.Blocks()
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🥇 Clem Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
            with gr.Row():
                search_bar = gr.Textbox(
                    placeholder=" 🔍 Search for models - separate multiple queries with `;` and press ENTER...",
                    show_label=False,
                    elem_id="search-bar",
                )

            leaderboard_table = gr.components.Dataframe(
                value=latest_df[0],
                elem_id="leaderboard-table",
                interactive=False,
                visible=True,
            )

            # Hidden copy of the full table: searches always filter from this
            # one, so the visible table can be overwritten with the results.
            dummy_leaderboard_table = gr.components.Dataframe(
                value=latest_df[0],
                elem_id="leaderboard-table",
                interactive=False,
                visible=False,
            )

            search_bar.submit(
                filter_search,
                [dummy_leaderboard_table, search_bar],
                leaderboard_table,
                queue=True
            )

        with gr.TabItem("📈 Plot", id=3):
            with gr.Row():
                model_cols = gr.CheckboxGroup(
                    MODEL_COLS,
                    label="Select Models 🤖",
                    value=[],
                    elem_id="column-select",
                    interactive=True,
                )

            with gr.Row():
                # Hidden frame that feeds the plotting callback its data
                plot_grdf = gr.DataFrame(
                    value=plot_df,
                    visible=False
                )
            with gr.Row():
                # Output block for the plot
                plot_output = gr.Plot()

            model_cols.change(
                compare_plots,
                [plot_grdf, model_cols],
                plot_output,
                queue=True
            )

        with gr.TabItem("🔄 Versions and Details", elem_id="details", id=2):
            with gr.Row():
                ver_selection = gr.Dropdown(
                    previous_vname, label="Select Version 🕹️", value=previous_vname[0]
                )
            with gr.Row():
                search_bar_prev = gr.Textbox(
                    placeholder=" 🔍 Search for models - separate multiple queries with `;` and press ENTER...",
                    show_label=False,
                    elem_id="search-bar-2",
                )

            prev_table = gr.components.Dataframe(
                value=prev_df,
                elem_id="leaderboard-table",
                interactive=False,
                visible=True,
            )

            # Hidden, unfiltered copy of the currently selected version
            dummy_prev_table = gr.components.Dataframe(
                value=prev_df,
                elem_id="leaderboard-table",
                interactive=False,
                visible=False,
            )

            search_bar_prev.submit(
                filter_search,
                [dummy_prev_table, search_bar_prev],
                prev_table,
                queue=True
            )

            # Refresh the visible AND the hidden table so that later searches
            # run against the version the user actually selected.
            ver_selection.change(
                _select_version_tables,
                [ver_selection],
                [prev_table, dummy_prev_table],
                queue=True
            )

    demo.load()
demo.queue()
demo.launch()
requirements.txt ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accelerate==0.23.0
2
+ aiofiles==23.1.0
3
+ aiohttp==3.8.4
4
+ aiosignal==1.3.1
5
+ altair==4.2.2
6
+ anyio==3.6.2
7
+ APScheduler==3.10.1
8
+ async-timeout==4.0.2
9
+ attrs==23.1.0
10
+ certifi==2022.12.7
11
+ charset-normalizer==3.1.0
12
+ click==8.1.3
13
+ contourpy==1.0.7
14
+ cycler==0.11.0
15
+ datasets==2.12.0
16
+ entrypoints==0.4
17
+ fastapi==0.95.1
18
+ ffmpy==0.3.0
19
+ filelock==3.11.0
20
+ fonttools==4.39.3
21
+ frozenlist==1.3.3
22
+ fsspec==2023.5.0
23
+ gradio==3.43.2
24
+ gradio-client==0.5.0
25
+ h11==0.14.0
26
+ httpcore==0.17.0
27
+ httpx==0.24.0
28
+ huggingface-hub==0.16.4
29
+ idna==3.4
30
+ Jinja2==3.1.2
31
+ jsonschema==4.17.3
32
+ kiwisolver==1.4.4
33
+ linkify-it-py==2.0.0
34
+ markdown-it-py==2.2.0
35
+ MarkupSafe==2.1.2
36
+ matplotlib==3.7.1
37
+ mdit-py-plugins==0.3.3
38
+ mdurl==0.1.2
39
+ multidict==6.0.4
40
+ numpy==1.24.2
41
+ orjson==3.8.10
42
+ packaging==23.1
43
+ pandas==2.0.0
44
+ Pillow==9.5.0
45
+ plotly==5.14.1
46
+ pyarrow==11.0.0
47
+ pydantic==1.10.7
48
+ pydub==0.25.1
49
+ pyparsing==3.0.9
50
+ pyrsistent==0.19.3
51
+ python-dateutil==2.8.2
52
+ python-multipart==0.0.6
53
+ pytz==2023.3
54
+ pytz-deprecation-shim==0.1.0.post0
55
+ PyYAML==6.0
56
+ requests==2.28.2
57
+ semantic-version==2.10.0
58
+ six==1.16.0
59
+ sniffio==1.3.0
60
+ starlette==0.26.1
61
+ toolz==0.12.0
62
+ tqdm==4.65.0
63
+ transformers@git+https://github.com/clefourrier/transformers.git
64
+ tokenizers==0.14
65
+ #tokenizers==0.14.1 wait for tokenizers patch in dependencies with hf_hub
66
+ #transformers==4.34
67
+ typing_extensions==4.5.0
68
+ tzdata==2023.3
69
+ tzlocal==4.3
70
+ uc-micro-py==1.0.1
71
+ urllib3==1.26.15
72
+ uvicorn==0.21.1
73
+ websockets==11.0.1
74
+ yarl==1.8.2
75
+ hf_transfer==0.1.3
src/assets/text_content.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# HTML heading of the page; app.py passes it to gr.HTML.
TITLE = """<h1 align="center" id="space-title"> 🏆 CLEM Leaderboard</h1>"""

# Markdown introduction; app.py passes it to gr.Markdown below the title.
INTRODUCTION_TEXT = """
🔝 The CLEM Leaderboard aims to track, rank and evaluate current cLLMs (chat-optimized Large Language Models, “clems”) as described in [Clembench: Using Game Play to Evaluate Chat-Optimized Language Models as Conversational Agents](https://arxiv.org/abs/2305.13455).
"""

# Short labels used to annotate points in the plot.
# label_map in src/utils.py looks a model up as `<model name> + '-'`, which is
# why the model keys end with a dash.
# NOTE(review): the "t0.0" key has no trailing dash, so that lookup cannot hit
# it - it appears to be unused; confirm before removing.
SHORT_NAMES = {
    "t0.0": "",
    "claude-v1.3-": "cl",
    "gpt-3.5-turbo-": "3.5",
    "gpt-4-": "4",
    "text-davinci-003-": "3",
    "luminous-supreme-": "lm",
    "koala-13b-": "ko",
    "falcon-40b-": "flc",
    "oasst-12b-": "ost",
    "vicuna-13b-": "vcn"
}
src/utils.py ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ import matplotlib.pyplot as plt
4
+ import numpy as np
5
+
6
+ from src.assets.text_content import SHORT_NAMES
7
+
8
def update_cols(df: pd.DataFrame) -> pd.DataFrame:
    '''
    Collapse the three header rows of a raw results table into a single one
    Args:
        df: Raw dataframe whose header is spread over the column names and the
            first two data rows
            (not needed when the CSV already has a single header row)

    Returns:
        Dataframe with descriptive column names and without the two leftover
        header rows
    '''
    original_names = list(df.columns)

    # The first four columns have fixed names; every column after them belongs
    # to a game, and each game contributes three consecutive metric columns.
    new_names = ['Model', 'Clemscore', 'All(Played)', 'All(Quality Score)']
    metric_suffixes = ("(Played)", "(Quality Score)", "(Quality Score[std])")

    # Every third column carries the plain game name (the following two are
    # the duplicated header entries of the same game).
    for game in original_names[4:][::3]:
        title = str(game).capitalize()
        new_names.extend(title + suffix for suffix in metric_suffixes)

    # Pair old and new names positionally; surplus new names are ignored.
    renamed = df.rename(columns=dict(zip(original_names, new_names)))

    # The first two rows are the remaining header rows, not data.
    return renamed.iloc[2:]
41
+
42
def _shorten_model_name(model_name: str) -> str:
    '''
    Drop the "-t0.0" suffix and collapse "x--x" pairs into a plain "x"
    '''
    parts = [part.replace('-t0.0', '') for part in model_name.split('--')]  # Comment to not remove -t0.0
    # A name without "--" is a single model and is kept as it is
    if len(parts) == 1 or parts[0] == parts[1]:
        return parts[0]
    return parts[0] + "--" + parts[1]


def process_df(df: pd.DataFrame) -> pd.DataFrame:
    '''
    Process dataframe - Remove repetition in model names, convert datatypes to sort by "float" instead of "str"
    Args:
        df: Unprocessed Dataframe (after using update_cols); the first column
            holds the model names, all other columns hold numeric values
    Returns:
        A processed copy of the dataframe; the input is left untouched
    '''
    # Work on a copy: the input is usually a slice of another frame and
    # assigning into it would modify (or warn about) the caller's data.
    df = df.copy()

    list_column_names = list(df.columns)
    if not list_column_names:
        return df
    model_col_name = list_column_names[0]

    # Change column type to float from str
    for col in list_column_names:
        if col != model_col_name:
            df[col] = df[col].astype(float)

    # Remove repetition in model names, if any
    df[model_col_name] = [_shorten_model_name(name) for name in df[model_col_name]]

    return df
71
+
72
def _version_sort_key(file_name: str):
    '''
    Sort key that orders version files by their numeric parts, so that
    "v0.10" ranks above "v0.9" (plain string sorting gets this wrong)
    '''
    stem = os.path.splitext(file_name)[0]
    # Replace every non-digit by a blank and read the remaining number groups
    numbers = ''.join(ch if ch.isdigit() else ' ' for ch in stem).split()
    # The stem is the tie-breaker for names with identical numbers
    return tuple(int(number) for number in numbers), stem


def get_data(path: str, flag: bool):
    '''
    Get a list of all version names and respective Dataframes
    Args:
        path: Path to the directory containing CSVs of different versions -> v0.9.csv, v1.0.csv, ....
        flag: Set this flag to include the latest version in Details and Versions tab
    Returns:
        latest_df: singular list containing dataframe of the latest version of the leaderboard with only 4 columns
        latest_vname: list of the name of latest version
        previous_df: list of dataframes for previous versions (can skip latest version if required)
        previous_vname: list of the names for the previous versions (INCLUDED IN Details and Versions Tab)

        None is returned (after printing a message) when the directory holds
        no CSV file at all
    '''
    # Check if Directory is empty
    list_versions = os.listdir(path)
    if not list_versions:
        print("Directory is empty")
        return None

    files = [file for file in list_versions if file.endswith('.csv')]
    if not files:
        # Other files exist, but nothing that can be loaded as a version
        print(f"No CSV files found in {path}")
        return None

    # Newest version first, compared numerically instead of as plain text
    files.sort(key=_version_sort_key, reverse=True)
    file_names = [os.path.splitext(file)[0] for file in files]

    DFS = []
    for file in files:
        df = pd.read_csv(os.path.join(path, file))
        df = update_cols(df)  # Remove if by default there is only one header row
        df = process_df(df)  # Process Dataframe
        df = df.sort_values(by=list(df.columns)[1], ascending=False)  # Sort by clemscore
        DFS.append(df)

    # Only keep relevant columns for the main leaderboard
    newest = DFS[0]
    all_columns = list(newest.columns)
    keep_columns = all_columns[0:4]
    latest_df = [newest.drop(columns=[c for c in all_columns if c not in keep_columns])]
    latest_vname = [file_names[0]]

    # The details tab lists every version; the newest one is optional
    previous_df = list(DFS)
    previous_vname = list(file_names)
    if not flag:
        previous_df = previous_df[1:]
        previous_vname = previous_vname[1:]

    return latest_df, latest_vname, previous_df, previous_vname
124
+
125
+
126
+ # ['Model', 'Clemscore', 'All(Played)', 'All(Quality Score)']
127
def compare_plots(df: pd.DataFrame, LIST: list):
    '''
    Quality Score v/s % Played plot by selecting models
    Args:
        df: Leaderboard dataframe; column 0 holds the model names, column 2
            the x values (% Played) and column 3 the y values (Quality Score)
        LIST: The list of models to show in the plot, updated from frontend
    Returns:
        fig: The plot
    '''
    short_names = label_map(LIST)

    list_columns = list(df.columns)
    name_col, played_col, quality_col = list_columns[0], list_columns[2], list_columns[3]
    df = df[df[name_col].isin(LIST)]

    # Colours are scaled by the largest x value among the selected models.
    # Fall back to 1 when there is nothing to scale by (no rows, all NaN, 0).
    x_max = df[played_col].astype(float).max()
    if pd.isna(x_max) or x_max == 0:
        x_max = 1

    # Draw through the Axes object only: the callback runs inside a server and
    # must not depend on pyplot's global "current figure".
    fig, ax = plt.subplots()
    for model in LIST:
        short, same_flag = short_names[model]
        model_df = df[df[name_col] == model]
        x = model_df[played_col].to_numpy(dtype=float)
        y = model_df[quality_col].to_numpy(dtype=float)
        ax.scatter(x, y, color=plt.cm.rainbow(x / x_max))  # Use a colormap for different colors

        # Labels of single models go below the point, pair labels to the right
        offset = (0, -15) if same_flag else (20, -3)
        for x_val, y_val in zip(x, y):
            if np.isnan(x_val) or np.isnan(y_val):
                continue  # a missing score has no position to attach a label to
            ax.annotate(f'{short}', (x_val, y_val), textcoords="offset points", xytext=offset, ha='center', rotation=0)

    ax.grid(which='both', color='grey', linewidth=1, linestyle='-', alpha=0.2)
    ax.set_xticks(np.arange(0, 110, 10))
    ax.set_xlim(-10, 110)
    ax.set_ylim(-10, 110)
    ax.set_xlabel('% Played')
    ax.set_ylabel('Quality Score')
    ax.set_title('Overview of benchmark results')

    # No plt.show(): the figure is returned to the caller for rendering.
    # Unregister it from pyplot so repeated calls do not pile up open figures;
    # the returned Figure object itself stays usable.
    plt.close(fig)

    return fig
164
+
165
+
166
def label_map(model_list: list) -> dict:
    '''
    Generate a map from long names to short names, to plot them in frontend graph
    Define the short names in src/assets/text_content.py
    Args:
        model_list: A list of long model names
    Returns:
        short_name: A map from long to list of short name + indication if models are same or different
                    (1 -> single model name, 0 -> pair of two models joined by "--")
    '''
    short_name = {}
    for model_name in model_list:
        splits = model_name.split('--')
        # Keys in SHORT_NAMES end with '-'. A model without an entry keeps its
        # full name instead of raising a KeyError, so newly added models can
        # be plotted before a short name has been defined for them.
        shorts = [SHORT_NAMES.get(split + '-', split) for split in splits[:2]]
        if len(splits) != 1:
            # Define the short name and indicate there are two different models
            short_name[model_name] = [shorts[0] + '--' + shorts[1], 0]
        else:
            # Define the short name and indicate both models are same
            short_name[model_name] = [shorts[0], 1]

    return short_name
189
+
190
def filter_search(df: pd.DataFrame, query: str) -> pd.DataFrame:
    '''
    Filter the dataframe based on the search query
    Args:
        df: Unfiltered dataframe; the first column holds the model names
        query: a string of queries separated by ";"
    Return:
        filtered_df: Dataframe containing the rows whose model name contains
                     at least one query (case-insensitive). The unfiltered
                     dataframe is returned when the query has no search terms
    '''
    # Ignore blanks around the terms and drop empty ones: an empty term is a
    # substring of every name, so "gpt;" would otherwise match all rows.
    terms = [term.strip().lower() for term in (query or "").split(';')]
    terms = [term for term in terms if term]
    if not terms:
        return df

    model_col = list(df.columns)[0]
    matches = [
        any(term in str(model_name).lower() for term in terms)
        for model_name in df[model_col]
    ]
    # An explicit boolean array also selects rows correctly for an empty frame
    mask = pd.Series(matches, dtype=bool).to_numpy()

    return df[mask]
217
+
versions/v0.7.csv ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ,-,all,all,imagegame,imagegame,imagegame,privateshared,privateshared,privateshared,referencegame,referencegame,referencegame,taboo,taboo,taboo,wordle,wordle,wordle,wordle_withclue,wordle_withclue,wordle_withclue,wordle_withcritic,wordle_withcritic,wordle_withcritic
2
+ ,clemscore,Average % Played,Average Quality Score,% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std)
3
+ model,,,,,,,,,,,,,,,,,,,,,,,,
4
+ claude-v1.3-t0.0--claude-v1.3-t0.0,37.07,74.76,49.58,0.0,,,100.0,84.87,18.87,100.0,82.5,38.48,76.92,68.75,38.71,100.0,0.0,0.0,100.0,30.56,40.13,46.43,30.77,48.04
5
+ falcon-40b-t0.0--falcon-40b-t0.0,0.71,0.95,75.0,0.0,,,0.0,,,0.0,,,0.0,,,0.0,,,3.33,50.0,,3.33,100.0,
6
+ gpt-3.5-turbo-t0.0--gpt-3.5-turbo-t0.0,37.02,85.86,43.12,97.5,60.28,25.95,64.0,72.83,13.07,100.0,55.0,50.38,69.49,71.95,44.79,100.0,0.0,0.0,93.33,28.57,46.0,76.67,13.19,30.16
7
+ gpt-3.5-turbo-t0.0--gpt-4-t0.0,42.39,86.75,48.87,97.5,64.95,25.45,,,,100.0,57.5,50.06,69.49,62.6,45.15,,,,,,,80.0,10.42,17.42
8
+ gpt-4-t0.0--gpt-3.5-turbo-t0.0,55.62,82.78,67.19,65.0,81.0,21.54,,,,100.0,47.5,50.57,66.1,93.59,23.45,,,,,,,100.0,46.67,42.92
9
+ gpt-4-t0.0--gpt-4-t0.0,7.77,96.06,61.93,77.5,89.06,22.28,100.0,90.79,8.2,100.0,75.0,43.85,94.92,76.19,37.45,100.0,3.67,8.4,100.0,49.67,42.09,100.0,49.11,38.46
10
+ koala-13b-t0.0--koala-13b-t0.0,1.48,14.76,10.0,0.0,,,0.0,,,0.0,,,0.0,,,86.67,0.0,0.0,16.67,20.0,44.72,0.0,,
11
+ luminous-supreme-t0.0--luminous-supreme-t0.0,0.0,16.24,0.0,0.0,,,0.0,,,0.0,,,0.0,,,100.0,0.0,0.0,3.33,0.0,,10.34,0.0,0.0
12
+ oasst-12b-t0.0--oasst-12b-t0.0,1.74,20.85,8.33,0.0,,,0.0,,,15.0,33.33,51.64,0.0,,,100.0,0.0,0.0,16.67,0.0,0.0,14.29,0.0,0.0
13
+ text-davinci-003-t0.0--text-davinci-003-t0.0,15.78,44.5,35.46,57.5,38.7,27.78,16.0,14.1,25.21,82.5,36.36,48.85,28.81,76.47,43.72,66.67,1.25,5.59,36.67,31.36,38.99,23.33,50.0,50.0
14
+ vicuna-13b-t0.0--vicuna-13b-t0.0,4.24,13.58,31.25,0.0,,,0.0,,,0.0,,,5.08,100.0,0.0,56.67,0.0,0.0,13.33,25.0,50.0,20.0,0.0,0.0
versions/v0.8.csv ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ,-,all,all,imagegame,imagegame,imagegame,privateshared,privateshared,privateshared,referencegame,referencegame,referencegame,taboo,taboo,taboo,wordle,wordle,wordle,wordle_withclue,wordle_withclue,wordle_withclue,wordle_withcritic,wordle_withcritic,wordle_withcritic
2
+ ,clemscore,Average % Played,Average Quality Score,% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std)
3
+ model,,,,,,,,,,,,,,,,,,,,,,,,
4
+ claude-v1.3-t0.0--claude-v1.3-t0.0,37.07,74.76,49.58,0.0,,,100.0,84.87,18.87,100.0,82.5,38.48,76.92,68.75,38.71,100.0,0.0,0.0,100.0,30.56,40.13,46.43,30.77,48.04
5
+ falcon-40b-t0.0--falcon-40b-t0.0,0.71,0.95,75.0,0.0,,,0.0,,,0.0,,,0.0,,,0.0,,,3.33,50.0,,3.33,100.0,
6
+ gpt-3.5-turbo-t0.0--gpt-3.5-turbo-t0.0,37.02,85.86,43.12,97.5,60.28,25.95,64.0,72.83,13.07,100.0,55.0,50.38,69.49,71.95,44.79,100.0,0.0,0.0,93.33,28.57,46.0,76.67,13.19,30.16
7
+ gpt-3.5-turbo-t0.0--gpt-4-t0.0,42.39,86.75,48.87,97.5,64.95,25.45,,,,100.0,57.5,50.06,69.49,62.6,45.15,,,,,,,80.0,10.42,17.42
8
+ gpt-4-t0.0--gpt-3.5-turbo-t0.0,55.62,82.78,67.19,65.0,81.0,21.54,,,,100.0,47.5,50.57,66.1,93.59,23.45,,,,,,,100.0,46.67,42.92
9
+ gpt-4-t0.0--gpt-4-t0.0,8.88,96.06,61.93,77.5,89.06,22.28,100.0,90.79,8.2,100.0,75.0,43.85,94.92,76.19,37.45,100.0,3.67,8.4,100.0,49.67,42.09,100.0,49.11,38.46
10
+ koala-13b-t0.0--koala-13b-t0.0,1.48,14.76,10.0,0.0,,,0.0,,,0.0,,,0.0,,,86.67,0.0,0.0,16.67,20.0,44.72,0.0,,
11
+ luminous-supreme-t0.0--luminous-supreme-t0.0,0.0,16.24,0.0,0.0,,,0.0,,,0.0,,,0.0,,,100.0,0.0,0.0,3.33,0.0,,10.34,0.0,0.0
12
+ oasst-12b-t0.0--oasst-12b-t0.0,1.74,20.85,8.33,0.0,,,0.0,,,15.0,33.33,51.64,0.0,,,100.0,0.0,0.0,16.67,0.0,0.0,14.29,0.0,0.0
13
+ text-davinci-003-t0.0--text-davinci-003-t0.0,15.78,44.5,35.46,57.5,38.7,27.78,16.0,14.1,25.21,82.5,36.36,48.85,28.81,76.47,43.72,66.67,1.25,5.59,36.67,31.36,38.99,23.33,50.0,50.0
14
+ vicuna-13b-t0.0--vicuna-13b-t0.0,4.24,13.58,31.25,0.0,,,0.0,,,0.0,,,5.08,100.0,0.0,56.67,0.0,0.0,13.33,25.0,50.0,20.0,0.0,0.0
versions/v0.9.csv ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ,-,all,all,imagegame,imagegame,imagegame,privateshared,privateshared,privateshared,referencegame,referencegame,referencegame,taboo,taboo,taboo,wordle,wordle,wordle,wordle_withclue,wordle_withclue,wordle_withclue,wordle_withcritic,wordle_withcritic,wordle_withcritic
2
+ ,clemscore,Average % Played,Average Quality Score,% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std)
3
+ model,,,,,,,,,,,,,,,,,,,,,,,,
4
+ claude-v1.3-t0.0--claude-v1.3-t0.0,37.07,74.76,49.58,0.0,,,100.0,84.87,18.87,100.0,82.5,38.48,76.92,68.75,38.71,100.0,0.0,0.0,100.0,30.56,40.13,46.43,30.77,48.04
5
+ falcon-40b-t0.0--falcon-40b-t0.0,0.71,0.95,75.0,0.0,,,0.0,,,0.0,,,0.0,,,0.0,,,3.33,50.0,,3.33,100.0,
6
+ gpt-3.5-turbo-t0.0--gpt-3.5-turbo-t0.0,37.02,85.86,43.12,97.5,60.28,25.95,64.0,72.83,13.07,100.0,55.0,50.38,69.49,71.95,44.79,100.0,0.0,0.0,93.33,28.57,46.0,76.67,13.19,30.16
7
+ gpt-3.5-turbo-t0.0--gpt-4-t0.0,42.39,86.75,48.87,97.5,64.95,25.45,,,,100.0,57.5,50.06,69.49,62.6,45.15,,,,,,,80.0,10.42,17.42
8
+ gpt-4-t0.0--gpt-3.5-turbo-t0.0,55.62,82.78,67.19,65.0,81.0,21.54,,,,100.0,47.5,50.57,66.1,93.59,23.45,,,,,,,100.0,46.67,42.92
9
+ gpt-4-t0.0--gpt-4-t0.0,9.99,96.06,61.93,77.5,89.06,22.28,100.0,90.79,8.2,100.0,75.0,43.85,94.92,76.19,37.45,100.0,3.67,8.4,100.0,49.67,42.09,100.0,49.11,38.46
10
+ koala-13b-t0.0--koala-13b-t0.0,1.48,14.76,10.0,0.0,,,0.0,,,0.0,,,0.0,,,86.67,0.0,0.0,16.67,20.0,44.72,0.0,,
11
+ luminous-supreme-t0.0--luminous-supreme-t0.0,0.0,16.24,0.0,0.0,,,0.0,,,0.0,,,0.0,,,100.0,0.0,0.0,3.33,0.0,,10.34,0.0,0.0
12
+ oasst-12b-t0.0--oasst-12b-t0.0,1.74,20.85,8.33,0.0,,,0.0,,,15.0,33.33,51.64,0.0,,,100.0,0.0,0.0,16.67,0.0,0.0,14.29,0.0,0.0
13
+ text-davinci-003-t0.0--text-davinci-003-t0.0,15.78,44.5,35.46,57.5,38.7,27.78,16.0,14.1,25.21,82.5,36.36,48.85,28.81,76.47,43.72,66.67,1.25,5.59,36.67,31.36,38.99,23.33,50.0,50.0
14
+ vicuna-13b-t0.0--vicuna-13b-t0.0,4.24,13.58,31.25,0.0,,,0.0,,,0.0,,,5.08,100.0,0.0,56.67,0.0,0.0,13.33,25.0,50.0,20.0,0.0,0.0
versions/v1.0.csv ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ,-,all,all,imagegame,imagegame,imagegame,privateshared,privateshared,privateshared,referencegame,referencegame,referencegame,taboo,taboo,taboo,wordle,wordle,wordle,wordle_withclue,wordle_withclue,wordle_withclue,wordle_withcritic,wordle_withcritic,wordle_withcritic
2
+ ,clemscore,Average % Played,Average Quality Score,% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std)
3
+ model,,,,,,,,,,,,,,,,,,,,,,,,
4
+ claude-v1.3-t0.0--claude-v1.3-t0.0,37.07,74.76,49.58,0.0,,,100.0,84.87,18.87,100.0,82.5,38.48,76.92,68.75,38.71,100.0,0.0,0.0,100.0,30.56,40.13,46.43,30.77,48.04
5
+ falcon-40b-t0.0--falcon-40b-t0.0,0.71,0.95,75.0,0.0,,,0.0,,,0.0,,,0.0,,,0.0,,,3.33,50.0,,3.33,100.0,
6
+ gpt-3.5-turbo-t0.0--gpt-3.5-turbo-t0.0,37.02,85.86,43.12,97.5,60.28,25.95,64.0,72.83,13.07,100.0,55.0,50.38,69.49,71.95,44.79,100.0,0.0,0.0,93.33,28.57,46.0,76.67,13.19,30.16
7
+ gpt-3.5-turbo-t0.0--gpt-4-t0.0,42.39,86.75,48.87,97.5,64.95,25.45,,,,100.0,57.5,50.06,69.49,62.6,45.15,,,,,,,80.0,10.42,17.42
8
+ gpt-4-t0.0--gpt-3.5-turbo-t0.0,55.62,82.78,67.19,65.0,81.0,21.54,,,,100.0,47.5,50.57,66.1,93.59,23.45,,,,,,,100.0,46.67,42.92
9
+ gpt-4-t0.0--gpt-4-t0.0,59.49,96.06,61.93,77.5,89.06,22.28,100.0,90.79,8.2,100.0,75.0,43.85,94.92,76.19,37.45,100.0,3.67,8.4,100.0,49.67,42.09,100.0,49.11,38.46
10
+ koala-13b-t0.0--koala-13b-t0.0,1.48,14.76,10.0,0.0,,,0.0,,,0.0,,,0.0,,,86.67,0.0,0.0,16.67,20.0,44.72,0.0,,
11
+ luminous-supreme-t0.0--luminous-supreme-t0.0,0.0,16.24,0.0,0.0,,,0.0,,,0.0,,,0.0,,,100.0,0.0,0.0,3.33,0.0,,10.34,0.0,0.0
12
+ oasst-12b-t0.0--oasst-12b-t0.0,1.74,20.85,8.33,0.0,,,0.0,,,15.0,33.33,51.64,0.0,,,100.0,0.0,0.0,16.67,0.0,0.0,14.29,0.0,0.0
13
+ text-davinci-003-t0.0--text-davinci-003-t0.0,15.78,44.5,35.46,57.5,38.7,27.78,16.0,14.1,25.21,82.5,36.36,48.85,28.81,76.47,43.72,66.67,1.25,5.59,36.67,31.36,38.99,23.33,50.0,50.0
14
+ vicuna-13b-t0.0--vicuna-13b-t0.0,4.24,13.58,31.25,0.0,,,0.0,,,0.0,,,5.08,100.0,0.0,56.67,0.0,0.0,13.33,25.0,50.0,20.0,0.0,0.0