CPunisher committed on
Commit
23f22ff
1 Parent(s): 948d4dc
app.py CHANGED
@@ -1,102 +1,70 @@
1
  import gradio as gr
2
- from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
3
  import pandas as pd
 
 
4
  from apscheduler.schedulers.background import BackgroundScheduler
5
- from huggingface_hub import snapshot_download
6
 
7
  from src.about import (
8
  CITATION_BUTTON_LABEL,
9
  CITATION_BUTTON_TEXT,
10
  EVALUATION_QUEUE_TEXT,
11
  INTRODUCTION_TEXT,
12
- LLM_BENCHMARKS_TEXT,
13
  TITLE,
14
  )
15
  from src.display.css_html_js import custom_css
16
  from src.display.utils import (
17
- BENCHMARK_COLS,
18
- COLS,
19
- EVAL_COLS,
20
- EVAL_TYPES,
21
  AutoEvalColumn,
22
- ModelType,
23
- fields,
24
- WeightType,
25
- Precision
26
  )
27
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
28
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
29
- from src.submission.submit import add_new_eval
30
 
31
 
32
  def restart_space():
33
  API.restart_space(repo_id=REPO_ID)
34
 
35
- ### Space initialisation
36
- try:
37
- print(EVAL_REQUESTS_PATH)
38
- snapshot_download(
39
- repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
40
- )
41
- except Exception:
42
- restart_space()
43
- try:
44
- print(EVAL_RESULTS_PATH)
45
- snapshot_download(
46
- repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
47
- )
48
- except Exception:
49
- restart_space()
50
 
 
 
 
51
 
52
- LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 
 
 
 
53
 
54
- (
55
- finished_eval_queue_df,
56
- running_eval_queue_df,
57
- pending_eval_queue_df,
58
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
59
 
60
- def init_leaderboard(dataframe):
61
- if dataframe is None or dataframe.empty:
62
- raise ValueError("Leaderboard DataFrame is empty or None.")
63
- return Leaderboard(
64
  value=dataframe,
 
65
  datatype=[c.type for c in fields(AutoEvalColumn)],
66
- select_columns=SelectColumns(
67
- default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
68
- cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
69
- label="Select Columns to Display:",
70
- ),
71
- search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
72
- hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
73
- filter_columns=[
74
- ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
75
- ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
76
- ColumnFilter(
77
- AutoEvalColumn.params.name,
78
- type="slider",
79
- min=0.01,
80
- max=150,
81
- label="Select the number of parameters (B)",
82
- ),
83
- ColumnFilter(
84
- AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
85
- ),
86
- ],
87
- bool_checkboxgroup_label="Hide models",
88
  interactive=False,
89
  )
90
 
91
-
92
  demo = gr.Blocks(css=custom_css)
93
  with demo:
94
  gr.HTML(TITLE)
95
  gr.HTML(INTRODUCTION_TEXT, elem_classes="markdown-text")
96
 
97
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
98
- with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
99
- leaderboard = init_leaderboard(LEADERBOARD_DF)
 
 
 
 
 
 
100
 
101
  # with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
102
  # gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
1
  import gradio as gr
 
2
  import pandas as pd
3
+ import json
4
+ from gradio_leaderboard import Leaderboard, SelectColumns
5
  from apscheduler.schedulers.background import BackgroundScheduler
 
6
 
7
  from src.about import (
8
  CITATION_BUTTON_LABEL,
9
  CITATION_BUTTON_TEXT,
10
  EVALUATION_QUEUE_TEXT,
11
  INTRODUCTION_TEXT,
 
12
  TITLE,
13
  )
14
  from src.display.css_html_js import custom_css
15
  from src.display.utils import (
 
 
 
 
16
  AutoEvalColumn,
17
+ fields
 
 
 
18
  )
19
+ from src.envs import API, REPO_ID
 
 
20
 
21
 
22
  def restart_space():
23
  API.restart_space(repo_id=REPO_ID)
24
 
25
def init_leaderboard(data_file):
    """Load per-metric results from a JSON file and render them as a table.

    The file is read as a JSON object whose keys are metric names and whose
    values are lists of records. Each record is expected to carry the
    ``Context``, ``Method`` and ``Model`` identifiers plus a ``Pass_at_1``
    value. The per-metric tables are outer-merged on the three identifiers,
    so every (context, method, model) combination gets one row with one
    column per metric.

    A ``Score`` column holds the average of the metric columns; a metric
    that is missing for a row counts as 0 in that average. All numeric
    columns are turned into percentages rounded to one decimal, ``Score``
    is moved to the fourth column, and rows are sorted by ``Score`` in
    descending order.

    Args:
        data_file: Path of the JSON results file.

    Returns:
        A non-interactive ``gr.components.DataFrame`` showing the table.

    Raises:
        ValueError: If the file holds no metric records at all.
    """
    key_cols = ['Context', 'Method', 'Model']

    # Explicit encoding so the result does not depend on the platform locale.
    with open(data_file, "r", encoding="utf-8") as fp:
        data = json.load(fp)

    dataframe = None
    metric_cols = []
    for metric, records in data.items():
        if not records:
            # An empty record list yields a frame without the key columns,
            # which cannot take part in the merge below.
            continue
        col_df = pd.DataFrame(records).rename(columns={"Pass_at_1": metric})
        metric_cols.append(metric)
        if dataframe is None:
            dataframe = col_df
        else:
            dataframe = dataframe.merge(col_df, on=key_cols, how='outer')

    if dataframe is None:
        raise ValueError(f"No leaderboard records found in {data_file!r}.")

    # Average over however many metrics the file provides instead of a fixed
    # count. ``sum`` skips missing values, so a metric that is absent for a
    # row contributes 0 to that row's score.
    dataframe['Score'] = dataframe[metric_cols].sum(axis=1) / len(metric_cols)

    # Fractions -> percentages with one decimal (applies to Score as well).
    numeric_cols = dataframe.select_dtypes(include='number').columns
    dataframe[numeric_cols] = (dataframe[numeric_cols] * 100).round(1)

    # Show Score right after the first three columns, best rows first.
    cols = [c for c in dataframe.columns if c != 'Score']
    cols.insert(3, 'Score')
    dataframe = dataframe[cols].sort_values(by='Score', ascending=False)

    # NOTE(review): ``headers`` skips hidden AutoEvalColumn fields while
    # ``datatype`` does not, so the two lists can differ in length, and both
    # are derived from AutoEvalColumn rather than from the columns built
    # above -- confirm they match what this table actually displays.
    return gr.components.DataFrame(
        value=dataframe,
        headers=[c.name for c in fields(AutoEvalColumn) if not c.hidden],
        datatype=[c.type for c in fields(AutoEvalColumn)],
        interactive=False,
    )
53
 
 
54
  demo = gr.Blocks(css=custom_css)
55
  with demo:
56
  gr.HTML(TITLE)
57
  gr.HTML(INTRODUCTION_TEXT, elem_classes="markdown-text")
58
 
59
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
60
+ with gr.TabItem("[Method] Evaluation", elem_id="llm-benchmark-tab-table", id=0):
61
+ leaderboard = init_leaderboard("./data/data_method.json")
62
+
63
+ with gr.TabItem("[Context] Evaluation", elem_id="llm-benchmark-tab-table", id=1):
64
+ leaderboard = init_leaderboard("./data/data_context.json")
65
+
66
+ with gr.TabItem("[Incremental] Evaluation", elem_id="llm-benchmark-tab-table", id=2):
67
+ leaderboard = init_leaderboard("./data/data_incr-order.json")
68
 
69
  # with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
70
  # gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
data/data_context.json ADDED
@@ -0,0 +1,492 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "completion": [
3
+ {
4
+ "Context": "selective",
5
+ "Method": "holistic",
6
+ "Model": "gpt-4o-2024-05-13",
7
+ "Pass_at_1": 1.0
8
+ },
9
+ {
10
+ "Context": "maximum",
11
+ "Method": "holistic",
12
+ "Model": "deepseek-coder-33b-instruct",
13
+ "Pass_at_1": 0.9714285714
14
+ },
15
+ {
16
+ "Context": "maximum",
17
+ "Method": "holistic",
18
+ "Model": "deepseek-coder-6.7b-instruct",
19
+ "Pass_at_1": 0.945
20
+ },
21
+ {
22
+ "Context": "selective",
23
+ "Method": "holistic",
24
+ "Model": "deepseek-coder-6.7b-instruct",
25
+ "Pass_at_1": 0.9378571429
26
+ },
27
+ {
28
+ "Context": "selective",
29
+ "Method": "holistic",
30
+ "Model": "deepseek-coder-33b-instruct",
31
+ "Pass_at_1": 0.9357142857
32
+ },
33
+ {
34
+ "Context": "maximum",
35
+ "Method": "holistic",
36
+ "Model": "gpt-3.5-turbo-1106",
37
+ "Pass_at_1": 0.9328571429
38
+ },
39
+ {
40
+ "Context": "selective",
41
+ "Method": "holistic",
42
+ "Model": "gpt-3.5-turbo-1106",
43
+ "Pass_at_1": 0.9214285714
44
+ },
45
+ {
46
+ "Context": "minimum",
47
+ "Method": "holistic",
48
+ "Model": "deepseek-coder-6.7b-instruct",
49
+ "Pass_at_1": 0.9007142857
50
+ },
51
+ {
52
+ "Context": "maximum",
53
+ "Method": "holistic",
54
+ "Model": "Phind-CodeLlama-34B-v2",
55
+ "Pass_at_1": 0.8907142857
56
+ },
57
+ {
58
+ "Context": "minimum",
59
+ "Method": "holistic",
60
+ "Model": "deepseek-coder-33b-instruct",
61
+ "Pass_at_1": 0.8828571429
62
+ },
63
+ {
64
+ "Context": "minimum",
65
+ "Method": "holistic",
66
+ "Model": "Phind-CodeLlama-34B-v2",
67
+ "Pass_at_1": 0.8728571429
68
+ },
69
+ {
70
+ "Context": "minimum",
71
+ "Method": "holistic",
72
+ "Model": "gpt-3.5-turbo-1106",
73
+ "Pass_at_1": 0.8578571429
74
+ },
75
+ {
76
+ "Context": "selective",
77
+ "Method": "holistic",
78
+ "Model": "Phind-CodeLlama-34B-v2",
79
+ "Pass_at_1": 0.845
80
+ },
81
+ {
82
+ "Context": "selective",
83
+ "Method": "holistic",
84
+ "Model": "WizardCoder-15B-V1.0",
85
+ "Pass_at_1": 0.7964285714
86
+ },
87
+ {
88
+ "Context": "minimum",
89
+ "Method": "holistic",
90
+ "Model": "WizardCoder-15B-V1.0",
91
+ "Pass_at_1": 0.7935714286
92
+ },
93
+ {
94
+ "Context": "maximum",
95
+ "Method": "holistic",
96
+ "Model": "WizardCoder-15B-V1.0",
97
+ "Pass_at_1": 0.7192857143
98
+ }
99
+ ],
100
+ "compilation_class_wise": [
101
+ {
102
+ "Context": "selective",
103
+ "Method": "holistic",
104
+ "Model": "gpt-3.5-turbo-1106",
105
+ "Pass_at_1": 0.7942857143
106
+ },
107
+ {
108
+ "Context": "selective",
109
+ "Method": "holistic",
110
+ "Model": "deepseek-coder-33b-instruct",
111
+ "Pass_at_1": 0.7414285714
112
+ },
113
+ {
114
+ "Context": "maximum",
115
+ "Method": "holistic",
116
+ "Model": "deepseek-coder-6.7b-instruct",
117
+ "Pass_at_1": 0.7385714286
118
+ },
119
+ {
120
+ "Context": "maximum",
121
+ "Method": "holistic",
122
+ "Model": "deepseek-coder-33b-instruct",
123
+ "Pass_at_1": 0.7314285714
124
+ },
125
+ {
126
+ "Context": "selective",
127
+ "Method": "holistic",
128
+ "Model": "deepseek-coder-6.7b-instruct",
129
+ "Pass_at_1": 0.7171428571
130
+ },
131
+ {
132
+ "Context": "selective",
133
+ "Method": "holistic",
134
+ "Model": "Phind-CodeLlama-34B-v2",
135
+ "Pass_at_1": 0.6978571429
136
+ },
137
+ {
138
+ "Context": "selective",
139
+ "Method": "holistic",
140
+ "Model": "gpt-4o-2024-05-13",
141
+ "Pass_at_1": 0.6607142857
142
+ },
143
+ {
144
+ "Context": "maximum",
145
+ "Method": "holistic",
146
+ "Model": "gpt-3.5-turbo-1106",
147
+ "Pass_at_1": 0.6592857143
148
+ },
149
+ {
150
+ "Context": "maximum",
151
+ "Method": "holistic",
152
+ "Model": "Phind-CodeLlama-34B-v2",
153
+ "Pass_at_1": 0.6414285714
154
+ },
155
+ {
156
+ "Context": "selective",
157
+ "Method": "holistic",
158
+ "Model": "WizardCoder-15B-V1.0",
159
+ "Pass_at_1": 0.58
160
+ },
161
+ {
162
+ "Context": "maximum",
163
+ "Method": "holistic",
164
+ "Model": "WizardCoder-15B-V1.0",
165
+ "Pass_at_1": 0.4814285714
166
+ },
167
+ {
168
+ "Context": "minimum",
169
+ "Method": "holistic",
170
+ "Model": "gpt-3.5-turbo-1106",
171
+ "Pass_at_1": 0.45
172
+ },
173
+ {
174
+ "Context": "minimum",
175
+ "Method": "holistic",
176
+ "Model": "Phind-CodeLlama-34B-v2",
177
+ "Pass_at_1": 0.39
178
+ },
179
+ {
180
+ "Context": "minimum",
181
+ "Method": "holistic",
182
+ "Model": "deepseek-coder-6.7b-instruct",
183
+ "Pass_at_1": 0.3692857143
184
+ },
185
+ {
186
+ "Context": "minimum",
187
+ "Method": "holistic",
188
+ "Model": "deepseek-coder-33b-instruct",
189
+ "Pass_at_1": 0.3457142857
190
+ },
191
+ {
192
+ "Context": "minimum",
193
+ "Method": "holistic",
194
+ "Model": "WizardCoder-15B-V1.0",
195
+ "Pass_at_1": 0.215
196
+ }
197
+ ],
198
+ "compilation_test_wise": [
199
+ {
200
+ "Context": "selective",
201
+ "Method": "holistic",
202
+ "Model": "gpt-4o-2024-05-13",
203
+ "Pass_at_1": 0.5035714286
204
+ },
205
+ {
206
+ "Context": "selective",
207
+ "Method": "holistic",
208
+ "Model": "deepseek-coder-33b-instruct",
209
+ "Pass_at_1": 0.4202826585
210
+ },
211
+ {
212
+ "Context": "selective",
213
+ "Method": "holistic",
214
+ "Model": "gpt-3.5-turbo-1106",
215
+ "Pass_at_1": 0.3443277311
216
+ },
217
+ {
218
+ "Context": "maximum",
219
+ "Method": "holistic",
220
+ "Model": "deepseek-coder-33b-instruct",
221
+ "Pass_at_1": 0.3405987395
222
+ },
223
+ {
224
+ "Context": "maximum",
225
+ "Method": "holistic",
226
+ "Model": "deepseek-coder-6.7b-instruct",
227
+ "Pass_at_1": 0.3387079832
228
+ },
229
+ {
230
+ "Context": "selective",
231
+ "Method": "holistic",
232
+ "Model": "Phind-CodeLlama-34B-v2",
233
+ "Pass_at_1": 0.3183823529
234
+ },
235
+ {
236
+ "Context": "selective",
237
+ "Method": "holistic",
238
+ "Model": "deepseek-coder-6.7b-instruct",
239
+ "Pass_at_1": 0.3121848739
240
+ },
241
+ {
242
+ "Context": "maximum",
243
+ "Method": "holistic",
244
+ "Model": "gpt-3.5-turbo-1106",
245
+ "Pass_at_1": 0.2858193277
246
+ },
247
+ {
248
+ "Context": "maximum",
249
+ "Method": "holistic",
250
+ "Model": "Phind-CodeLlama-34B-v2",
251
+ "Pass_at_1": 0.2283088235
252
+ },
253
+ {
254
+ "Context": "selective",
255
+ "Method": "holistic",
256
+ "Model": "WizardCoder-15B-V1.0",
257
+ "Pass_at_1": 0.2240546218
258
+ },
259
+ {
260
+ "Context": "maximum",
261
+ "Method": "holistic",
262
+ "Model": "WizardCoder-15B-V1.0",
263
+ "Pass_at_1": 0.1466911765
264
+ },
265
+ {
266
+ "Context": "minimum",
267
+ "Method": "holistic",
268
+ "Model": "deepseek-coder-33b-instruct",
269
+ "Pass_at_1": 0.1128676471
270
+ },
271
+ {
272
+ "Context": "minimum",
273
+ "Method": "holistic",
274
+ "Model": "gpt-3.5-turbo-1106",
275
+ "Pass_at_1": 0.03125
276
+ },
277
+ {
278
+ "Context": "minimum",
279
+ "Method": "holistic",
280
+ "Model": "WizardCoder-15B-V1.0",
281
+ "Pass_at_1": 0.0147058824
282
+ },
283
+ {
284
+ "Context": "minimum",
285
+ "Method": "holistic",
286
+ "Model": "deepseek-coder-6.7b-instruct",
287
+ "Pass_at_1": 0.0125
288
+ },
289
+ {
290
+ "Context": "minimum",
291
+ "Method": "holistic",
292
+ "Model": "Phind-CodeLlama-34B-v2",
293
+ "Pass_at_1": 0.0
294
+ }
295
+ ],
296
+ "pass_class_wise": [
297
+ {
298
+ "Context": "selective",
299
+ "Method": "holistic",
300
+ "Model": "gpt-3.5-turbo-1106",
301
+ "Pass_at_1": 0.7832360347
302
+ },
303
+ {
304
+ "Context": "selective",
305
+ "Method": "holistic",
306
+ "Model": "deepseek-coder-33b-instruct",
307
+ "Pass_at_1": 0.723699056
308
+ },
309
+ {
310
+ "Context": "maximum",
311
+ "Method": "holistic",
312
+ "Model": "deepseek-coder-33b-instruct",
313
+ "Pass_at_1": 0.715291943
314
+ },
315
+ {
316
+ "Context": "maximum",
317
+ "Method": "holistic",
318
+ "Model": "deepseek-coder-6.7b-instruct",
319
+ "Pass_at_1": 0.7033228696
320
+ },
321
+ {
322
+ "Context": "selective",
323
+ "Method": "holistic",
324
+ "Model": "deepseek-coder-6.7b-instruct",
325
+ "Pass_at_1": 0.6855203826
326
+ },
327
+ {
328
+ "Context": "selective",
329
+ "Method": "holistic",
330
+ "Model": "Phind-CodeLlama-34B-v2",
331
+ "Pass_at_1": 0.6808480861
332
+ },
333
+ {
334
+ "Context": "selective",
335
+ "Method": "holistic",
336
+ "Model": "gpt-4o-2024-05-13",
337
+ "Pass_at_1": 0.6545897285
338
+ },
339
+ {
340
+ "Context": "maximum",
341
+ "Method": "holistic",
342
+ "Model": "gpt-3.5-turbo-1106",
343
+ "Pass_at_1": 0.6417690022
344
+ },
345
+ {
346
+ "Context": "maximum",
347
+ "Method": "holistic",
348
+ "Model": "Phind-CodeLlama-34B-v2",
349
+ "Pass_at_1": 0.6293667264
350
+ },
351
+ {
352
+ "Context": "selective",
353
+ "Method": "holistic",
354
+ "Model": "WizardCoder-15B-V1.0",
355
+ "Pass_at_1": 0.5674101922
356
+ },
357
+ {
358
+ "Context": "maximum",
359
+ "Method": "holistic",
360
+ "Model": "WizardCoder-15B-V1.0",
361
+ "Pass_at_1": 0.4741970721
362
+ },
363
+ {
364
+ "Context": "minimum",
365
+ "Method": "holistic",
366
+ "Model": "gpt-3.5-turbo-1106",
367
+ "Pass_at_1": 0.4489821662
368
+ },
369
+ {
370
+ "Context": "minimum",
371
+ "Method": "holistic",
372
+ "Model": "Phind-CodeLlama-34B-v2",
373
+ "Pass_at_1": 0.3864123669
374
+ },
375
+ {
376
+ "Context": "minimum",
377
+ "Method": "holistic",
378
+ "Model": "deepseek-coder-6.7b-instruct",
379
+ "Pass_at_1": 0.3599216366
380
+ },
381
+ {
382
+ "Context": "minimum",
383
+ "Method": "holistic",
384
+ "Model": "deepseek-coder-33b-instruct",
385
+ "Pass_at_1": 0.3272885064
386
+ },
387
+ {
388
+ "Context": "minimum",
389
+ "Method": "holistic",
390
+ "Model": "WizardCoder-15B-V1.0",
391
+ "Pass_at_1": 0.2139054163
392
+ }
393
+ ],
394
+ "pass_test_wise": [
395
+ {
396
+ "Context": "selective",
397
+ "Method": "holistic",
398
+ "Model": "gpt-4o-2024-05-13",
399
+ "Pass_at_1": 0.3438179726
400
+ },
401
+ {
402
+ "Context": "selective",
403
+ "Method": "holistic",
404
+ "Model": "deepseek-coder-33b-instruct",
405
+ "Pass_at_1": 0.3047552867
406
+ },
407
+ {
408
+ "Context": "maximum",
409
+ "Method": "holistic",
410
+ "Model": "deepseek-coder-33b-instruct",
411
+ "Pass_at_1": 0.3032497787
412
+ },
413
+ {
414
+ "Context": "selective",
415
+ "Method": "holistic",
416
+ "Model": "gpt-3.5-turbo-1106",
417
+ "Pass_at_1": 0.2941156144
418
+ },
419
+ {
420
+ "Context": "selective",
421
+ "Method": "holistic",
422
+ "Model": "Phind-CodeLlama-34B-v2",
423
+ "Pass_at_1": 0.2544265255
424
+ },
425
+ {
426
+ "Context": "maximum",
427
+ "Method": "holistic",
428
+ "Model": "gpt-3.5-turbo-1106",
429
+ "Pass_at_1": 0.2393000344
430
+ },
431
+ {
432
+ "Context": "selective",
433
+ "Method": "holistic",
434
+ "Model": "deepseek-coder-6.7b-instruct",
435
+ "Pass_at_1": 0.2028454735
436
+ },
437
+ {
438
+ "Context": "maximum",
439
+ "Method": "holistic",
440
+ "Model": "Phind-CodeLlama-34B-v2",
441
+ "Pass_at_1": 0.1966660863
442
+ },
443
+ {
444
+ "Context": "maximum",
445
+ "Method": "holistic",
446
+ "Model": "deepseek-coder-6.7b-instruct",
447
+ "Pass_at_1": 0.1877858469
448
+ },
449
+ {
450
+ "Context": "selective",
451
+ "Method": "holistic",
452
+ "Model": "WizardCoder-15B-V1.0",
453
+ "Pass_at_1": 0.1669267449
454
+ },
455
+ {
456
+ "Context": "maximum",
457
+ "Method": "holistic",
458
+ "Model": "WizardCoder-15B-V1.0",
459
+ "Pass_at_1": 0.1370849195
460
+ },
461
+ {
462
+ "Context": "minimum",
463
+ "Method": "holistic",
464
+ "Model": "deepseek-coder-33b-instruct",
465
+ "Pass_at_1": 0.1123372221
466
+ },
467
+ {
468
+ "Context": "minimum",
469
+ "Method": "holistic",
470
+ "Model": "gpt-3.5-turbo-1106",
471
+ "Pass_at_1": 0.03125
472
+ },
473
+ {
474
+ "Context": "minimum",
475
+ "Method": "holistic",
476
+ "Model": "WizardCoder-15B-V1.0",
477
+ "Pass_at_1": 0.0147058824
478
+ },
479
+ {
480
+ "Context": "minimum",
481
+ "Method": "holistic",
482
+ "Model": "deepseek-coder-6.7b-instruct",
483
+ "Pass_at_1": 0.0125
484
+ },
485
+ {
486
+ "Context": "minimum",
487
+ "Method": "holistic",
488
+ "Model": "Phind-CodeLlama-34B-v2",
489
+ "Pass_at_1": 0.0
490
+ }
491
+ ]
492
+ }
data/data_incr-order.json ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "completion": [
3
+ {
4
+ "Context": "selective",
5
+ "Method": "incremental",
6
+ "Model": "deepseek-coder-6.7b-instruct",
7
+ "Pass_at_1": 0.8642857143
8
+ },
9
+ {
10
+ "Context": "selective",
11
+ "Method": "incremental_random",
12
+ "Model": "deepseek-coder-6.7b-instruct",
13
+ "Pass_at_1": 0.8514285714
14
+ },
15
+ {
16
+ "Context": "selective",
17
+ "Method": "incremental",
18
+ "Model": "gpt-3.5-turbo-1106",
19
+ "Pass_at_1": 0.8392857143
20
+ },
21
+ {
22
+ "Context": "selective",
23
+ "Method": "incremental_rev",
24
+ "Model": "deepseek-coder-6.7b-instruct",
25
+ "Pass_at_1": 0.8257142857
26
+ },
27
+ {
28
+ "Context": "selective",
29
+ "Method": "incremental_random",
30
+ "Model": "gpt-3.5-turbo-1106",
31
+ "Pass_at_1": 0.8135714286
32
+ },
33
+ {
34
+ "Context": "selective",
35
+ "Method": "incremental_rev",
36
+ "Model": "gpt-3.5-turbo-1106",
37
+ "Pass_at_1": 0.8121428571
38
+ },
39
+ {
40
+ "Context": "selective",
41
+ "Method": "incremental",
42
+ "Model": "deepseek-coder-33b-instruct",
43
+ "Pass_at_1": 0.7485714286
44
+ },
45
+ {
46
+ "Context": "selective",
47
+ "Method": "incremental",
48
+ "Model": "Phind-CodeLlama-34B-v2",
49
+ "Pass_at_1": 0.5992857143
50
+ },
51
+ {
52
+ "Context": "selective",
53
+ "Method": "incremental",
54
+ "Model": "WizardCoder-15B-V1.0",
55
+ "Pass_at_1": 0.5057142857
56
+ }
57
+ ],
58
+ "compilation_class_wise": [
59
+ {
60
+ "Context": "selective",
61
+ "Method": "incremental_random",
62
+ "Model": "gpt-3.5-turbo-1106",
63
+ "Pass_at_1": 0.7285714286
64
+ },
65
+ {
66
+ "Context": "selective",
67
+ "Method": "incremental",
68
+ "Model": "deepseek-coder-6.7b-instruct",
69
+ "Pass_at_1": 0.725
70
+ },
71
+ {
72
+ "Context": "selective",
73
+ "Method": "incremental",
74
+ "Model": "gpt-3.5-turbo-1106",
75
+ "Pass_at_1": 0.7
76
+ },
77
+ {
78
+ "Context": "selective",
79
+ "Method": "incremental_random",
80
+ "Model": "deepseek-coder-6.7b-instruct",
81
+ "Pass_at_1": 0.695
82
+ },
83
+ {
84
+ "Context": "selective",
85
+ "Method": "incremental_rev",
86
+ "Model": "deepseek-coder-6.7b-instruct",
87
+ "Pass_at_1": 0.6864285714
88
+ },
89
+ {
90
+ "Context": "selective",
91
+ "Method": "incremental_rev",
92
+ "Model": "gpt-3.5-turbo-1106",
93
+ "Pass_at_1": 0.6642857143
94
+ },
95
+ {
96
+ "Context": "selective",
97
+ "Method": "incremental",
98
+ "Model": "deepseek-coder-33b-instruct",
99
+ "Pass_at_1": 0.5778571429
100
+ },
101
+ {
102
+ "Context": "selective",
103
+ "Method": "incremental",
104
+ "Model": "Phind-CodeLlama-34B-v2",
105
+ "Pass_at_1": 0.5021428571
106
+ },
107
+ {
108
+ "Context": "selective",
109
+ "Method": "incremental",
110
+ "Model": "WizardCoder-15B-V1.0",
111
+ "Pass_at_1": 0.3507142857
112
+ }
113
+ ],
114
+ "compilation_test_wise": [
115
+ {
116
+ "Context": "selective",
117
+ "Method": "incremental",
118
+ "Model": "deepseek-coder-6.7b-instruct",
119
+ "Pass_at_1": 0.2621848739
120
+ },
121
+ {
122
+ "Context": "selective",
123
+ "Method": "incremental_rev",
124
+ "Model": "deepseek-coder-6.7b-instruct",
125
+ "Pass_at_1": 0.2589381207
126
+ },
127
+ {
128
+ "Context": "selective",
129
+ "Method": "incremental_random",
130
+ "Model": "deepseek-coder-6.7b-instruct",
131
+ "Pass_at_1": 0.2505252101
132
+ },
133
+ {
134
+ "Context": "selective",
135
+ "Method": "incremental_rev",
136
+ "Model": "gpt-3.5-turbo-1106",
137
+ "Pass_at_1": 0.2421457219
138
+ },
139
+ {
140
+ "Context": "selective",
141
+ "Method": "incremental_random",
142
+ "Model": "gpt-3.5-turbo-1106",
143
+ "Pass_at_1": 0.1824818564
144
+ },
145
+ {
146
+ "Context": "selective",
147
+ "Method": "incremental",
148
+ "Model": "gpt-3.5-turbo-1106",
149
+ "Pass_at_1": 0.0955357143
150
+ },
151
+ {
152
+ "Context": "selective",
153
+ "Method": "incremental",
154
+ "Model": "Phind-CodeLlama-34B-v2",
155
+ "Pass_at_1": 0.0701680672
156
+ },
157
+ {
158
+ "Context": "selective",
159
+ "Method": "incremental",
160
+ "Model": "deepseek-coder-33b-instruct",
161
+ "Pass_at_1": 0.0352941176
162
+ },
163
+ {
164
+ "Context": "selective",
165
+ "Method": "incremental",
166
+ "Model": "WizardCoder-15B-V1.0",
167
+ "Pass_at_1": 0.0
168
+ }
169
+ ],
170
+ "pass_class_wise": [
171
+ {
172
+ "Context": "selective",
173
+ "Method": "incremental_random",
174
+ "Model": "gpt-3.5-turbo-1106",
175
+ "Pass_at_1": 0.7209670092
176
+ },
177
+ {
178
+ "Context": "selective",
179
+ "Method": "incremental",
180
+ "Model": "deepseek-coder-6.7b-instruct",
181
+ "Pass_at_1": 0.7036481325
182
+ },
183
+ {
184
+ "Context": "selective",
185
+ "Method": "incremental",
186
+ "Model": "gpt-3.5-turbo-1106",
187
+ "Pass_at_1": 0.6846699639
188
+ },
189
+ {
190
+ "Context": "selective",
191
+ "Method": "incremental_random",
192
+ "Model": "deepseek-coder-6.7b-instruct",
193
+ "Pass_at_1": 0.6816620162
194
+ },
195
+ {
196
+ "Context": "selective",
197
+ "Method": "incremental_rev",
198
+ "Model": "deepseek-coder-6.7b-instruct",
199
+ "Pass_at_1": 0.6650646981
200
+ },
201
+ {
202
+ "Context": "selective",
203
+ "Method": "incremental_rev",
204
+ "Model": "gpt-3.5-turbo-1106",
205
+ "Pass_at_1": 0.6537268945
206
+ },
207
+ {
208
+ "Context": "selective",
209
+ "Method": "incremental",
210
+ "Model": "deepseek-coder-33b-instruct",
211
+ "Pass_at_1": 0.5603969625
212
+ },
213
+ {
214
+ "Context": "selective",
215
+ "Method": "incremental",
216
+ "Model": "Phind-CodeLlama-34B-v2",
217
+ "Pass_at_1": 0.4878321662
218
+ },
219
+ {
220
+ "Context": "selective",
221
+ "Method": "incremental",
222
+ "Model": "WizardCoder-15B-V1.0",
223
+ "Pass_at_1": 0.3468474087
224
+ }
225
+ ],
226
+ "pass_test_wise": [
227
+ {
228
+ "Context": "selective",
229
+ "Method": "incremental_random",
230
+ "Model": "deepseek-coder-6.7b-instruct",
231
+ "Pass_at_1": 0.2107590067
232
+ },
233
+ {
234
+ "Context": "selective",
235
+ "Method": "incremental_rev",
236
+ "Model": "deepseek-coder-6.7b-instruct",
237
+ "Pass_at_1": 0.2085373337
238
+ },
239
+ {
240
+ "Context": "selective",
241
+ "Method": "incremental",
242
+ "Model": "deepseek-coder-6.7b-instruct",
243
+ "Pass_at_1": 0.1967821219
244
+ },
245
+ {
246
+ "Context": "selective",
247
+ "Method": "incremental_rev",
248
+ "Model": "gpt-3.5-turbo-1106",
249
+ "Pass_at_1": 0.1865232907
250
+ },
251
+ {
252
+ "Context": "selective",
253
+ "Method": "incremental_random",
254
+ "Model": "gpt-3.5-turbo-1106",
255
+ "Pass_at_1": 0.1179619378
256
+ },
257
+ {
258
+ "Context": "selective",
259
+ "Method": "incremental",
260
+ "Model": "Phind-CodeLlama-34B-v2",
261
+ "Pass_at_1": 0.061012122
262
+ },
263
+ {
264
+ "Context": "selective",
265
+ "Method": "incremental",
266
+ "Model": "gpt-3.5-turbo-1106",
267
+ "Pass_at_1": 0.0514928193
268
+ },
269
+ {
270
+ "Context": "selective",
271
+ "Method": "incremental",
272
+ "Model": "deepseek-coder-33b-instruct",
273
+ "Pass_at_1": 0.0350620781
274
+ },
275
+ {
276
+ "Context": "selective",
277
+ "Method": "incremental",
278
+ "Model": "WizardCoder-15B-V1.0",
279
+ "Pass_at_1": 0.0
280
+ }
281
+ ]
282
+ }
data/data_method.json ADDED
@@ -0,0 +1,492 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "completion": [
3
+ {
4
+ "Context": "selective",
5
+ "Method": "holistic",
6
+ "Model": "gpt-4o-2024-05-13",
7
+ "Pass_at_1": 1.0
8
+ },
9
+ {
10
+ "Context": "selective",
11
+ "Method": "holistic",
12
+ "Model": "deepseek-coder-6.7b-instruct",
13
+ "Pass_at_1": 0.9378571429
14
+ },
15
+ {
16
+ "Context": "selective",
17
+ "Method": "holistic",
18
+ "Model": "deepseek-coder-33b-instruct",
19
+ "Pass_at_1": 0.9357142857
20
+ },
21
+ {
22
+ "Context": "selective",
23
+ "Method": "holistic",
24
+ "Model": "gpt-3.5-turbo-1106",
25
+ "Pass_at_1": 0.9214285714
26
+ },
27
+ {
28
+ "Context": "selective",
29
+ "Method": "incremental",
30
+ "Model": "deepseek-coder-6.7b-instruct",
31
+ "Pass_at_1": 0.8642857143
32
+ },
33
+ {
34
+ "Context": "selective",
35
+ "Method": "independent",
36
+ "Model": "gpt-3.5-turbo-1106",
37
+ "Pass_at_1": 0.8535714286
38
+ },
39
+ {
40
+ "Context": "selective",
41
+ "Method": "independent",
42
+ "Model": "deepseek-coder-6.7b-instruct",
43
+ "Pass_at_1": 0.8471428571
44
+ },
45
+ {
46
+ "Context": "selective",
47
+ "Method": "holistic",
48
+ "Model": "Phind-CodeLlama-34B-v2",
49
+ "Pass_at_1": 0.845
50
+ },
51
+ {
52
+ "Context": "selective",
53
+ "Method": "incremental",
54
+ "Model": "gpt-3.5-turbo-1106",
55
+ "Pass_at_1": 0.8392857143
56
+ },
57
+ {
58
+ "Context": "selective",
59
+ "Method": "independent",
60
+ "Model": "deepseek-coder-33b-instruct",
61
+ "Pass_at_1": 0.8192857143
62
+ },
63
+ {
64
+ "Context": "selective",
65
+ "Method": "holistic",
66
+ "Model": "WizardCoder-15B-V1.0",
67
+ "Pass_at_1": 0.7964285714
68
+ },
69
+ {
70
+ "Context": "selective",
71
+ "Method": "incremental",
72
+ "Model": "deepseek-coder-33b-instruct",
73
+ "Pass_at_1": 0.7485714286
74
+ },
75
+ {
76
+ "Context": "selective",
77
+ "Method": "independent",
78
+ "Model": "Phind-CodeLlama-34B-v2",
79
+ "Pass_at_1": 0.6378571429
80
+ },
81
+ {
82
+ "Context": "selective",
83
+ "Method": "incremental",
84
+ "Model": "Phind-CodeLlama-34B-v2",
85
+ "Pass_at_1": 0.5992857143
86
+ },
87
+ {
88
+ "Context": "selective",
89
+ "Method": "independent",
90
+ "Model": "WizardCoder-15B-V1.0",
91
+ "Pass_at_1": 0.5971428571
92
+ },
93
+ {
94
+ "Context": "selective",
95
+ "Method": "incremental",
96
+ "Model": "WizardCoder-15B-V1.0",
97
+ "Pass_at_1": 0.5057142857
98
+ }
99
+ ],
100
+ "compilation_class_wise": [
101
+ {
102
+ "Context": "selective",
103
+ "Method": "holistic",
104
+ "Model": "gpt-3.5-turbo-1106",
105
+ "Pass_at_1": 0.7942857143
106
+ },
107
+ {
108
+ "Context": "selective",
109
+ "Method": "holistic",
110
+ "Model": "deepseek-coder-33b-instruct",
111
+ "Pass_at_1": 0.7414285714
112
+ },
113
+ {
114
+ "Context": "selective",
115
+ "Method": "incremental",
116
+ "Model": "deepseek-coder-6.7b-instruct",
117
+ "Pass_at_1": 0.725
118
+ },
119
+ {
120
+ "Context": "selective",
121
+ "Method": "holistic",
122
+ "Model": "deepseek-coder-6.7b-instruct",
123
+ "Pass_at_1": 0.7171428571
124
+ },
125
+ {
126
+ "Context": "selective",
127
+ "Method": "incremental",
128
+ "Model": "gpt-3.5-turbo-1106",
129
+ "Pass_at_1": 0.7
130
+ },
131
+ {
132
+ "Context": "selective",
133
+ "Method": "holistic",
134
+ "Model": "Phind-CodeLlama-34B-v2",
135
+ "Pass_at_1": 0.6978571429
136
+ },
137
+ {
138
+ "Context": "selective",
139
+ "Method": "independent",
140
+ "Model": "gpt-3.5-turbo-1106",
141
+ "Pass_at_1": 0.6828571429
142
+ },
143
+ {
144
+ "Context": "selective",
145
+ "Method": "independent",
146
+ "Model": "deepseek-coder-6.7b-instruct",
147
+ "Pass_at_1": 0.6814285714
148
+ },
149
+ {
150
+ "Context": "selective",
151
+ "Method": "independent",
152
+ "Model": "deepseek-coder-33b-instruct",
153
+ "Pass_at_1": 0.6764285714
154
+ },
155
+ {
156
+ "Context": "selective",
157
+ "Method": "holistic",
158
+ "Model": "gpt-4o-2024-05-13",
159
+ "Pass_at_1": 0.6607142857
160
+ },
161
+ {
162
+ "Context": "selective",
163
+ "Method": "holistic",
164
+ "Model": "WizardCoder-15B-V1.0",
165
+ "Pass_at_1": 0.58
166
+ },
167
+ {
168
+ "Context": "selective",
169
+ "Method": "incremental",
170
+ "Model": "deepseek-coder-33b-instruct",
171
+ "Pass_at_1": 0.5778571429
172
+ },
173
+ {
174
+ "Context": "selective",
175
+ "Method": "incremental",
176
+ "Model": "Phind-CodeLlama-34B-v2",
177
+ "Pass_at_1": 0.5021428571
178
+ },
179
+ {
180
+ "Context": "selective",
181
+ "Method": "independent",
182
+ "Model": "Phind-CodeLlama-34B-v2",
183
+ "Pass_at_1": 0.4985714286
184
+ },
185
+ {
186
+ "Context": "selective",
187
+ "Method": "independent",
188
+ "Model": "WizardCoder-15B-V1.0",
189
+ "Pass_at_1": 0.4335714286
190
+ },
191
+ {
192
+ "Context": "selective",
193
+ "Method": "incremental",
194
+ "Model": "WizardCoder-15B-V1.0",
195
+ "Pass_at_1": 0.3507142857
196
+ }
197
+ ],
198
+ "compilation_test_wise": [
199
+ {
200
+ "Context": "selective",
201
+ "Method": "holistic",
202
+ "Model": "gpt-4o-2024-05-13",
203
+ "Pass_at_1": 0.5035714286
204
+ },
205
+ {
206
+ "Context": "selective",
207
+ "Method": "holistic",
208
+ "Model": "deepseek-coder-33b-instruct",
209
+ "Pass_at_1": 0.4202826585
210
+ },
211
+ {
212
+ "Context": "selective",
213
+ "Method": "holistic",
214
+ "Model": "gpt-3.5-turbo-1106",
215
+ "Pass_at_1": 0.3443277311
216
+ },
217
+ {
218
+ "Context": "selective",
219
+ "Method": "independent",
220
+ "Model": "deepseek-coder-33b-instruct",
221
+ "Pass_at_1": 0.3378676471
222
+ },
223
+ {
224
+ "Context": "selective",
225
+ "Method": "holistic",
226
+ "Model": "Phind-CodeLlama-34B-v2",
227
+ "Pass_at_1": 0.3183823529
228
+ },
229
+ {
230
+ "Context": "selective",
231
+ "Method": "holistic",
232
+ "Model": "deepseek-coder-6.7b-instruct",
233
+ "Pass_at_1": 0.3121848739
234
+ },
235
+ {
236
+ "Context": "selective",
237
+ "Method": "independent",
238
+ "Model": "deepseek-coder-6.7b-instruct",
239
+ "Pass_at_1": 0.2836134454
240
+ },
241
+ {
242
+ "Context": "selective",
243
+ "Method": "incremental",
244
+ "Model": "deepseek-coder-6.7b-instruct",
245
+ "Pass_at_1": 0.2621848739
246
+ },
247
+ {
248
+ "Context": "selective",
249
+ "Method": "holistic",
250
+ "Model": "WizardCoder-15B-V1.0",
251
+ "Pass_at_1": 0.2240546218
252
+ },
253
+ {
254
+ "Context": "selective",
255
+ "Method": "independent",
256
+ "Model": "gpt-3.5-turbo-1106",
257
+ "Pass_at_1": 0.1930147059
258
+ },
259
+ {
260
+ "Context": "selective",
261
+ "Method": "incremental",
262
+ "Model": "gpt-3.5-turbo-1106",
263
+ "Pass_at_1": 0.0955357143
264
+ },
265
+ {
266
+ "Context": "selective",
267
+ "Method": "independent",
268
+ "Model": "Phind-CodeLlama-34B-v2",
269
+ "Pass_at_1": 0.0932773109
270
+ },
271
+ {
272
+ "Context": "selective",
273
+ "Method": "incremental",
274
+ "Model": "Phind-CodeLlama-34B-v2",
275
+ "Pass_at_1": 0.0701680672
276
+ },
277
+ {
278
+ "Context": "selective",
279
+ "Method": "incremental",
280
+ "Model": "deepseek-coder-33b-instruct",
281
+ "Pass_at_1": 0.0352941176
282
+ },
283
+ {
284
+ "Context": "selective",
285
+ "Method": "independent",
286
+ "Model": "WizardCoder-15B-V1.0",
287
+ "Pass_at_1": 0.0
288
+ },
289
+ {
290
+ "Context": "selective",
291
+ "Method": "incremental",
292
+ "Model": "WizardCoder-15B-V1.0",
293
+ "Pass_at_1": 0.0
294
+ }
295
+ ],
296
+ "pass_class_wise": [
297
+ {
298
+ "Context": "selective",
299
+ "Method": "holistic",
300
+ "Model": "gpt-3.5-turbo-1106",
301
+ "Pass_at_1": 0.7832360347
302
+ },
303
+ {
304
+ "Context": "selective",
305
+ "Method": "holistic",
306
+ "Model": "deepseek-coder-33b-instruct",
307
+ "Pass_at_1": 0.723699056
308
+ },
309
+ {
310
+ "Context": "selective",
311
+ "Method": "incremental",
312
+ "Model": "deepseek-coder-6.7b-instruct",
313
+ "Pass_at_1": 0.7036481325
314
+ },
315
+ {
316
+ "Context": "selective",
317
+ "Method": "holistic",
318
+ "Model": "deepseek-coder-6.7b-instruct",
319
+ "Pass_at_1": 0.6855203826
320
+ },
321
+ {
322
+ "Context": "selective",
323
+ "Method": "incremental",
324
+ "Model": "gpt-3.5-turbo-1106",
325
+ "Pass_at_1": 0.6846699639
326
+ },
327
+ {
328
+ "Context": "selective",
329
+ "Method": "holistic",
330
+ "Model": "Phind-CodeLlama-34B-v2",
331
+ "Pass_at_1": 0.6808480861
332
+ },
333
+ {
334
+ "Context": "selective",
335
+ "Method": "independent",
336
+ "Model": "gpt-3.5-turbo-1106",
337
+ "Pass_at_1": 0.6772858762
338
+ },
339
+ {
340
+ "Context": "selective",
341
+ "Method": "independent",
342
+ "Model": "deepseek-coder-6.7b-instruct",
343
+ "Pass_at_1": 0.6547244155
344
+ },
345
+ {
346
+ "Context": "selective",
347
+ "Method": "independent",
348
+ "Model": "deepseek-coder-33b-instruct",
349
+ "Pass_at_1": 0.6547232007
350
+ },
351
+ {
352
+ "Context": "selective",
353
+ "Method": "holistic",
354
+ "Model": "gpt-4o-2024-05-13",
355
+ "Pass_at_1": 0.6545897285
356
+ },
357
+ {
358
+ "Context": "selective",
359
+ "Method": "holistic",
360
+ "Model": "WizardCoder-15B-V1.0",
361
+ "Pass_at_1": 0.5674101922
362
+ },
363
+ {
364
+ "Context": "selective",
365
+ "Method": "incremental",
366
+ "Model": "deepseek-coder-33b-instruct",
367
+ "Pass_at_1": 0.5603969625
368
+ },
369
+ {
370
+ "Context": "selective",
371
+ "Method": "incremental",
372
+ "Model": "Phind-CodeLlama-34B-v2",
373
+ "Pass_at_1": 0.4878321662
374
+ },
375
+ {
376
+ "Context": "selective",
377
+ "Method": "independent",
378
+ "Model": "Phind-CodeLlama-34B-v2",
379
+ "Pass_at_1": 0.4863639752
380
+ },
381
+ {
382
+ "Context": "selective",
383
+ "Method": "independent",
384
+ "Model": "WizardCoder-15B-V1.0",
385
+ "Pass_at_1": 0.4261740357
386
+ },
387
+ {
388
+ "Context": "selective",
389
+ "Method": "incremental",
390
+ "Model": "WizardCoder-15B-V1.0",
391
+ "Pass_at_1": 0.3468474087
392
+ }
393
+ ],
394
+ "pass_test_wise": [
395
+ {
396
+ "Context": "selective",
397
+ "Method": "holistic",
398
+ "Model": "gpt-4o-2024-05-13",
399
+ "Pass_at_1": 0.3438179726
400
+ },
401
+ {
402
+ "Context": "selective",
403
+ "Method": "holistic",
404
+ "Model": "deepseek-coder-33b-instruct",
405
+ "Pass_at_1": 0.3047552867
406
+ },
407
+ {
408
+ "Context": "selective",
409
+ "Method": "holistic",
410
+ "Model": "gpt-3.5-turbo-1106",
411
+ "Pass_at_1": 0.2941156144
412
+ },
413
+ {
414
+ "Context": "selective",
415
+ "Method": "holistic",
416
+ "Model": "Phind-CodeLlama-34B-v2",
417
+ "Pass_at_1": 0.2544265255
418
+ },
419
+ {
420
+ "Context": "selective",
421
+ "Method": "independent",
422
+ "Model": "deepseek-coder-33b-instruct",
423
+ "Pass_at_1": 0.2224382166
424
+ },
425
+ {
426
+ "Context": "selective",
427
+ "Method": "independent",
428
+ "Model": "deepseek-coder-6.7b-instruct",
429
+ "Pass_at_1": 0.2083516025
430
+ },
431
+ {
432
+ "Context": "selective",
433
+ "Method": "holistic",
434
+ "Model": "deepseek-coder-6.7b-instruct",
435
+ "Pass_at_1": 0.2028454735
436
+ },
437
+ {
438
+ "Context": "selective",
439
+ "Method": "incremental",
440
+ "Model": "deepseek-coder-6.7b-instruct",
441
+ "Pass_at_1": 0.1967821219
442
+ },
443
+ {
444
+ "Context": "selective",
445
+ "Method": "independent",
446
+ "Model": "gpt-3.5-turbo-1106",
447
+ "Pass_at_1": 0.1930147059
448
+ },
449
+ {
450
+ "Context": "selective",
451
+ "Method": "holistic",
452
+ "Model": "WizardCoder-15B-V1.0",
453
+ "Pass_at_1": 0.1669267449
454
+ },
455
+ {
456
+ "Context": "selective",
457
+ "Method": "independent",
458
+ "Model": "Phind-CodeLlama-34B-v2",
459
+ "Pass_at_1": 0.0714792814
460
+ },
461
+ {
462
+ "Context": "selective",
463
+ "Method": "incremental",
464
+ "Model": "Phind-CodeLlama-34B-v2",
465
+ "Pass_at_1": 0.061012122
466
+ },
467
+ {
468
+ "Context": "selective",
469
+ "Method": "incremental",
470
+ "Model": "gpt-3.5-turbo-1106",
471
+ "Pass_at_1": 0.0514928193
472
+ },
473
+ {
474
+ "Context": "selective",
475
+ "Method": "incremental",
476
+ "Model": "deepseek-coder-33b-instruct",
477
+ "Pass_at_1": 0.0350620781
478
+ },
479
+ {
480
+ "Context": "selective",
481
+ "Method": "independent",
482
+ "Model": "WizardCoder-15B-V1.0",
483
+ "Pass_at_1": 0.0
484
+ },
485
+ {
486
+ "Context": "selective",
487
+ "Method": "incremental",
488
+ "Model": "WizardCoder-15B-V1.0",
489
+ "Pass_at_1": 0.0
490
+ }
491
+ ]
492
+ }
data/models.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "model": "gpt-3.5-turbo-1106",
4
+ "link": "https://openai.com/"
5
+ },
6
+ {
7
+ "model": "gpt-4o-2024-05-13",
8
+ "link": "https://openai.com/"
9
+ },
10
+ {
11
+ "model": "deepseek-coder-33b-instruct",
12
+ "link": "https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct",
13
+ "size": 33
14
+ },
15
+ {
16
+ "model": "deepseek-coder-6.7b-instruct",
17
+ "link": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct",
18
+ "size": 6.7
19
+ },
20
+ {
21
+ "model": "Phind-CodeLlama-34B-v2",
22
+ "link": "https://huggingface.co/Phind/Phind-CodeLlama-34B-v2",
23
+ "size": 34
24
+ },
25
+ {
26
+ "model": "WizardCoder-15B-V1.0",
27
+ "link": "https://huggingface.co/WizardLMTeam/WizardCoder-15B-V1.0",
28
+ "size": 15
29
+ }
30
+ ]
src/display/utils.py CHANGED
@@ -23,22 +23,28 @@ class ColumnContent:
23
  ## Leaderboard columns
24
  auto_eval_column_dict = []
25
  # Init
26
- auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
27
- auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
28
  #Scores
29
- auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
30
- for task in Tasks:
31
- auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
32
- # Model information
33
- auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
34
- auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
35
- auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
36
- auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
37
- auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
38
- auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
39
- auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
40
- auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
41
- auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 
 
 
 
 
 
42
 
43
  # We use make dataclass to dynamically fill the scores from Tasks
44
  AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
23
  ## Leaderboard columns
24
  auto_eval_column_dict = []
25
  # Init
26
# Leaderboard columns, one row per column:
#   (AutoEvalColumn attribute name, header text, column type).
# Each ColumnContent is created with `True` as its third positional argument
# (presumably "displayed by default" -- confirm against ColumnContent) and with
# never_hidden=True.
_LEADERBOARD_COLUMNS = (
    ("model", "Model", "markdown"),
    ("context", "Context", "str"),
    ("method", "Method", "str"),
    # Scores
    ("completion", "Completion", "number"),
    ("compilation_class_wise", "Compilation(class)", "number"),
    ("compilation_test_wise", "Compilation(test)", "number"),
    ("pass_class_wise", "Pass(class)", "number"),
    ("pass_test_wise", "Pass(test)", "number"),
)

# Each entry is [field name, field type, default value], the shape that
# make_dataclass expects for its `fields` argument.
auto_eval_column_dict.extend(
    [attr, ColumnContent, ColumnContent(header, col_type, True, never_hidden=True)]
    for attr, header, col_type in _LEADERBOARD_COLUMNS
)
48
 
49
  # We use make_dataclass to build AutoEvalColumn from the entries in auto_eval_column_dict
50
  AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)