jszheng commited on
Commit
790dc55
β€’
1 Parent(s): 310a5d6

add results of 9 LLMs

Browse files
Files changed (2) hide show
  1. RESULTS.json +817 -401
  2. text_content.py +6 -2
RESULTS.json CHANGED
@@ -1,18 +1,67 @@
1
  {
2
- "gpt-4o-2024-05-13": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  "readability": {
4
  "R*": 80.5,
5
- "RN_p": 81.1,
6
- "RN_if": 91.8,
7
- "RN": 75.3,
8
  "RL_p": 78.9,
9
  "RL_if": 78.9,
10
  "RL": 63.2,
11
  "RC_p": 79.8,
12
- "RC_if": 78.7,
13
- "RC": 64.3,
14
- "MBPP*": 64.6,
15
- "Readability": 67.6
16
  },
17
  "maintainability": {
18
  "MI*": 38.0,
@@ -26,31 +75,77 @@
26
  "efficiency": {
27
  "E*": 59.4,
28
  "E_p": 58.4,
29
- "E_NI_T": 44.8,
30
  "E_NI_S": 42.0,
31
- "Efficiency": 43.4
32
  },
 
 
 
 
 
33
  "correctness": {
34
- "Correctness": 59.9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  },
36
  "overall": {
37
- "RACE Score": 56.5
38
  }
39
  },
40
- "gpt-3.5-turbo-0125": {
 
 
 
 
 
 
 
 
41
  "readability": {
42
  "R*": 62.8,
43
  "RN_p": 63.2,
44
- "RN_if": 74.4,
45
- "RN": 48.3,
46
  "RL_p": 60.4,
47
  "RL_if": 76.8,
48
  "RL": 46.1,
49
  "RC_p": 65.8,
50
- "RC_if": 60.0,
51
- "RC": 41.5,
52
- "MBPP*": 62.2,
53
- "Readability": 45.3
54
  },
55
  "maintainability": {
56
  "MI*": 28.0,
@@ -68,65 +163,115 @@
68
  "E_NI_S": 36.5,
69
  "Efficiency": 32.0
70
  },
 
 
 
 
 
71
  "correctness": {
72
- "Correctness": 44.7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  },
74
  "overall": {
75
- "RACE Score": 42.8
76
  }
77
  },
78
- "claude-3.5-sonnet": {
79
  "correctness": {
80
- "Correctness": 64.6
 
 
 
 
 
81
  },
82
  "readability": {
83
- "R*": 77.4,
84
- "RN_p": 76.3,
85
- "RN_if": 92.3,
86
- "RN": 71.9,
87
- "RL_p": 62.2,
88
- "RL_if": 70.3,
89
- "RL": 52.0,
90
- "RC_p": 74.1,
91
- "RC_if": 72.2,
92
- "RC": 58.0,
93
- "MBPP*": 63.5,
94
- "Readability": 60.6
95
  },
96
  "maintainability": {
97
- "MI*": 42.0,
98
- "MI_p": 32.0,
99
- "MI": 75.3,
100
- "MC*": 71.7,
101
- "MC_p": 68.5,
102
- "MC": 59.8,
103
- "Maintainability": 67.5
104
  },
105
  "efficiency": {
106
- "E*": 68.3,
107
- "E_p": 66.3,
108
- "E_NI_T": 56.8,
109
- "E_NI_S": 49.7,
110
- "Efficiency": 53.2
111
  },
112
  "overall": {
113
- "RACE Score": 61.5
114
  }
115
  },
116
- "CodeLlama-7b-Instruct": {
 
 
 
 
 
 
 
 
117
  "readability": {
118
  "R*": 32.3,
119
  "RN_p": 31.5,
120
- "RN_if": 55.5,
121
- "RN": 17.0,
122
  "RL_p": 31.7,
123
  "RL_if": 59.7,
124
  "RL": 23.4,
125
  "RC_p": 30.2,
126
- "RC_if": 67.4,
127
- "RC": 18.3,
128
- "MBPP*": 43.1,
129
- "Readability": 19.6
130
  },
131
  "maintainability": {
132
  "MI*": 16.0,
@@ -144,65 +289,73 @@
144
  "E_NI_S": 8.8,
145
  "Efficiency": 8.5
146
  },
147
- "correctness": {
148
- "Correctness": 23.9
149
- },
150
  "overall": {
151
- "RACE Score": 22.9
152
  }
153
  },
154
- "CodeLlama-7b-Python": {
 
 
 
 
 
 
 
 
155
  "readability": {
156
- "R*": 29.3,
157
- "RN_p": 29.5,
158
- "RN_if": 66.4,
159
- "RN": 20.4,
160
- "RL_p": 30.1,
161
- "RL_if": 76.6,
162
- "RL": 25.8,
163
- "RC_p": 24.7,
164
- "RC_if": 42.1,
165
- "RC": 11.6,
166
- "MBPP*": 41.3,
167
- "Readability": 19.3
168
  },
169
  "maintainability": {
170
- "MI*": 11.0,
171
- "MI_p": 10.0,
172
- "MI": 79.4,
173
- "MC*": 5.6,
174
- "MC_p": 6.5,
175
- "MC": 3.7,
176
- "Maintainability": 41.6
177
  },
178
  "efficiency": {
179
- "E*": 14.9,
180
- "E_p": 15.8,
181
- "E_NI_T": 14.3,
182
- "E_NI_S": 14.4,
183
- "Efficiency": 14.4
184
- },
185
- "correctness": {
186
- "Correctness": 20.4
187
  },
188
  "overall": {
189
- "RACE Score": 23.9
190
  }
191
  },
192
- "CodeLlama-13b-Instruct": {
 
 
 
 
 
 
 
 
193
  "readability": {
194
  "R*": 36.0,
195
  "RN_p": 37.7,
196
- "RN_if": 57.8,
197
- "RN": 22.0,
198
  "RL_p": 35.0,
199
  "RL_if": 59.9,
200
  "RL": 23.6,
201
  "RC_p": 35.7,
202
- "RC_if": 64.3,
203
- "RC": 23.2,
204
- "MBPP*": 40.7,
205
- "Readability": 22.9
206
  },
207
  "maintainability": {
208
  "MI*": 17.0,
@@ -220,65 +373,73 @@
220
  "E_NI_S": 16.1,
221
  "Efficiency": 13.2
222
  },
223
- "correctness": {
224
- "Correctness": 24.4
225
- },
226
  "overall": {
227
- "RACE Score": 26.4
228
  }
229
  },
230
- "CodeLlama-13b-Python": {
 
 
 
 
 
 
 
 
231
  "readability": {
232
- "R*": 40.2,
233
- "RN_p": 35.0,
234
- "RN_if": 61.3,
235
- "RN": 22.4,
236
- "RL_p": 34.8,
237
- "RL_if": 83.5,
238
- "RL": 30.9,
239
- "RC_p": 30.2,
240
- "RC_if": 60.7,
241
- "RC": 20.4,
242
- "MBPP*": 29.4,
243
- "Readability": 24.6
244
  },
245
  "maintainability": {
246
- "MI*": 16.0,
247
- "MI_p": 15.0,
248
- "MI": 78.6,
249
- "MC*": 6.1,
250
- "MC_p": 4.8,
251
- "MC": 2.4,
252
- "Maintainability": 40.5
253
  },
254
  "efficiency": {
255
- "E*": 16.8,
256
- "E_p": 17.8,
257
- "E_NI_T": 13.8,
258
- "E_NI_S": 14.7,
259
- "Efficiency": 14.2
260
- },
261
- "correctness": {
262
- "Correctness": 21.7
263
  },
264
  "overall": {
265
- "RACE Score": 25.3
266
  }
267
  },
268
- "CodeLlama-34b-Instruct": {
 
 
 
 
 
 
 
 
269
  "readability": {
270
  "R*": 36.0,
271
  "RN_p": 36.5,
272
- "RN_if": 54.3,
273
- "RN": 21.1,
274
  "RL_p": 35.8,
275
  "RL_if": 41.7,
276
  "RL": 17.5,
277
  "RC_p": 36.3,
278
- "RC_if": 32.0,
279
- "RC": 9.4,
280
- "MBPP*": 45.8,
281
- "Readability": 16.0
282
  },
283
  "maintainability": {
284
  "MI*": 12.0,
@@ -296,65 +457,199 @@
296
  "E_NI_S": 13.8,
297
  "Efficiency": 14.1
298
  },
 
 
 
 
 
299
  "correctness": {
300
- "Correctness": 26.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
301
  },
302
  "overall": {
303
- "RACE Score": 24.2
304
  }
305
  },
306
- "CodeLlama-34b-Python": {
 
 
 
 
 
 
 
 
307
  "readability": {
308
- "R*": 31.7,
309
- "RN_p": 27.2,
310
- "RN_if": 66.9,
311
- "RN": 18.6,
312
- "RL_p": 32.5,
313
- "RL_if": 73.2,
314
- "RL": 26.7,
315
- "RC_p": 27.8,
316
- "RC_if": 39.4,
317
- "RC": 6.7,
318
- "MBPP*": 36.2,
319
- "Readability": 17.3
320
  },
321
  "maintainability": {
322
- "MI*": 3.0,
323
- "MI_p": 2.0,
324
- "MI": 85.3,
325
- "MC*": 7.2,
326
- "MC_p": 5.4,
327
- "MC": 2.2,
328
- "Maintainability": 43.8
329
  },
330
  "efficiency": {
331
- "E*": 17.8,
332
- "E_p": 11.9,
333
- "E_NI_T": 12.0,
334
- "E_NI_S": 14.4,
335
- "Efficiency": 13.2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
336
  },
337
- "correctness": {
338
- "Correctness": 19.2
 
 
 
 
339
  },
340
  "overall": {
341
- "RACE Score": 23.4
342
  }
343
  },
344
- "DeepSeek-Coder-Instruct-6.7B": {
 
 
 
 
 
 
 
 
345
  "readability": {
346
  "R*": 65.2,
347
  "RN_p": 65.5,
348
- "RN_if": 67.2,
349
- "RN": 44.4,
350
  "RL_p": 61.2,
351
  "RL_if": 73.6,
352
  "RL": 46.6,
353
  "RC_p": 61.2,
354
- "RC_if": 65.5,
355
- "RC": 42.0,
356
- "MBPP*": 57.1,
357
- "Readability": 44.3
358
  },
359
  "maintainability": {
360
  "MI*": 26.0,
@@ -372,27 +667,31 @@
372
  "E_NI_S": 30.0,
373
  "Efficiency": 28.6
374
  },
375
- "correctness": {
376
- "Correctness": 39.2
377
- },
378
  "overall": {
379
- "RACE Score": 39.0
380
  }
381
  },
382
- "DeepSeek-Coder-Instruct-7B": {
 
 
 
 
 
 
 
 
383
  "readability": {
384
  "R*": 61.0,
385
  "RN_p": 61.5,
386
- "RN_if": 57.8,
387
- "RN": 35.2,
388
  "RL_p": 62.6,
389
  "RL_if": 70.9,
390
  "RL": 46.0,
391
  "RC_p": 62.8,
392
- "RC_if": 70.2,
393
- "RC": 46.0,
394
- "MBPP*": 59.3,
395
- "Readability": 42.4
396
  },
397
  "maintainability": {
398
  "MI*": 23.0,
@@ -410,27 +709,31 @@
410
  "E_NI_S": 26.8,
411
  "Efficiency": 26.0
412
  },
413
- "correctness": {
414
- "Correctness": 39.9
415
- },
416
  "overall": {
417
- "RACE Score": 38.1
418
  }
419
  },
420
- "DeepSeek-Coder-Instruct-33B": {
 
 
 
 
 
 
 
 
421
  "readability": {
422
  "R*": 65.9,
423
  "RN_p": 64.6,
424
- "RN_if": 86.8,
425
- "RN": 57.7,
426
  "RL_p": 65.0,
427
  "RL_if": 82.7,
428
  "RL": 53.5,
429
  "RC_p": 66.5,
430
- "RC_if": 70.8,
431
- "RC": 46.4,
432
- "MBPP*": 61.9,
433
- "Readability": 52.5
434
  },
435
  "maintainability": {
436
  "MI*": 28.0,
@@ -448,27 +751,31 @@
448
  "E_NI_S": 36.1,
449
  "Efficiency": 35.7
450
  },
451
- "correctness": {
452
- "Correctness": 44.7
453
- },
454
  "overall": {
455
- "RACE Score": 44.1
456
  }
457
  },
458
  "DeepSeek-Coder-V2-Lite-Instruct-16B": {
 
 
 
 
 
 
 
 
459
  "readability": {
460
  "R*": 72.0,
461
  "RN_p": 71.2,
462
- "RN_if": 55.3,
463
- "RN": 40.2,
464
  "RL_p": 66.5,
465
  "RL_if": 83.7,
466
  "RL": 57.7,
467
  "RC_p": 67.1,
468
- "RC_if": 63.5,
469
- "RC": 42.7,
470
- "MBPP*": 62.7,
471
- "Readability": 46.9
472
  },
473
  "maintainability": {
474
  "MI*": 26.0,
@@ -486,217 +793,73 @@
486
  "E_NI_S": 47.7,
487
  "Efficiency": 44.0
488
  },
489
- "correctness": {
490
- "Correctness": 50.9
491
- },
492
- "overall": {
493
- "RACE Score": 47.7
494
- }
495
- },
496
- "DeepSeek-Coder-V2-Instruct-236B": {
497
- "readability": {
498
- "R*": 73.8,
499
- "RN_p": 75.3,
500
- "RN_if": 91.8,
501
- "RN": 70.0,
502
- "RL_p": 75.2,
503
- "RL_if": 88.4,
504
- "RL": 67.1,
505
- "RC_p": 76.5,
506
- "RC_if": 74.1,
507
- "RC": 58.5,
508
- "MBPP*": 68.5,
509
- "Readability": 65.2
510
- },
511
- "maintainability": {
512
- "MI*": 35.0,
513
- "MI_p": 38.0,
514
- "MI": 77.3,
515
- "MC*": 58.9,
516
- "MC_p": 58.9,
517
- "MC": 35.0,
518
- "Maintainability": 56.1
519
- },
520
- "efficiency": {
521
- "E*": 57.3,
522
- "E_p": 53.5,
523
- "E_NI_T": 41.1,
524
- "E_NI_S": 49.4,
525
- "Efficiency": 45.2
526
- },
527
- "correctness": {
528
- "Correctness": 58.7
529
- },
530
- "overall": {
531
- "RACE Score": 56.3
532
- }
533
- },
534
- "WizardCoder-Python-7B-V1.0": {
535
- "readability": {
536
- "R*": 34.8,
537
- "RN_p": 35.8,
538
- "RN_if": 58.3,
539
- "RN": 22.4,
540
- "RL_p": 34.3,
541
- "RL_if": 79.7,
542
- "RL": 28.0,
543
- "RC_p": 35.4,
544
- "RC_if": 25.0,
545
- "RC": 8.6,
546
- "MBPP*": 41.8,
547
- "Readability": 19.7
548
- },
549
- "maintainability": {
550
- "MI*": 19.0,
551
- "MI_p": 23.0,
552
- "MI": 79.3,
553
- "MC*": 10.6,
554
- "MC_p": 9.8,
555
- "MC": 7.2,
556
- "Maintainability": 43.2
557
- },
558
- "efficiency": {
559
- "E*": 19.8,
560
- "E_p": 19.8,
561
- "E_NI_T": 15.3,
562
- "E_NI_S": 16.7,
563
- "Efficiency": 16.0
564
- },
565
- "correctness": {
566
- "Correctness": 25.2
567
- },
568
  "overall": {
569
- "RACE Score": 26.0
570
  }
571
  },
572
- "WizardCoder-Python-13B-V1.0": {
573
- "readability": {
574
- "R*": 36.0,
575
- "RN_p": 38.2,
576
- "RN_if": 58.4,
577
- "RN": 23.1,
578
- "RL_p": 38.4,
579
- "RL_if": 83.1,
580
- "RL": 33.1,
581
- "RC_p": 43.6,
582
- "RC_if": 59.8,
583
- "RC": 27.4,
584
- "MBPP*": 42.1,
585
- "Readability": 27.9
586
- },
587
- "maintainability": {
588
- "MI*": 20.0,
589
- "MI_p": 21.0,
590
- "MI": 78.8,
591
- "MC*": 12.8,
592
- "MC_p": 12.8,
593
- "MC": 8.5,
594
- "Maintainability": 43.6
595
- },
596
- "efficiency": {
597
- "E*": 20.8,
598
- "E_p": 18.8,
599
- "E_NI_T": 16.2,
600
- "E_NI_S": 19.8,
601
- "Efficiency": 18.0
602
- },
603
  "correctness": {
604
- "Correctness": 26.3
 
 
 
 
 
605
  },
606
- "overall": {
607
- "RACE Score": 29.0
608
- }
609
- },
610
- "WizardCoder-15B-V1.0": {
611
  "readability": {
612
- "R*": 38.4,
613
- "RN_p": 38.7,
614
- "RN_if": 59.0,
615
- "RN": 23.2,
616
- "RL_p": 41.9,
617
- "RL_if": 64.8,
618
- "RL": 27.8,
619
- "RC_p": 40.0,
620
- "RC_if": 57.3,
621
- "RC": 24.4,
622
- "MBPP*": 46.3,
623
- "Readability": 25.1
624
  },
625
  "maintainability": {
626
- "MI*": 22.0,
627
- "MI_p": 21.0,
628
- "MI": 80.0,
629
- "MC*": 11.7,
630
- "MC_p": 11.5,
631
- "MC": 7.8,
632
- "Maintainability": 43.9
633
  },
634
  "efficiency": {
635
- "E*": 21.8,
636
- "E_p": 22.8,
637
- "E_NI_T": 21.8,
638
- "E_NI_S": 24.2,
639
- "Efficiency": 23.0
640
- },
641
- "correctness": {
642
- "Correctness": 28.0
643
  },
644
  "overall": {
645
- "RACE Score": 30.0
646
  }
647
  },
648
- "WizardCoder-33B-V1.1": {
649
- "readability": {
650
- "R*": 58.5,
651
- "RN_p": 58.8,
652
- "RN_if": 65.4,
653
- "RN": 39.9,
654
- "RL_p": 62.2,
655
- "RL_if": 76.0,
656
- "RL": 47.6,
657
- "RC_p": 58.8,
658
- "RC_if": 61.0,
659
- "RC": 37.2,
660
- "MBPP*": 64.6,
661
- "Readability": 41.6
662
- },
663
- "maintainability": {
664
- "MI*": 34.0,
665
- "MI_p": 34.0,
666
- "MI": 71.2,
667
- "MC*": 26.1,
668
- "MC_p": 25.0,
669
- "MC": 9.3,
670
- "Maintainability": 40.2
671
- },
672
- "efficiency": {
673
- "E*": 38.6,
674
- "E_p": 35.6,
675
- "E_NI_T": 33.9,
676
- "E_NI_S": 34.9,
677
- "Efficiency": 34.4
678
- },
679
  "correctness": {
680
- "Correctness": 44.4
 
 
 
 
 
681
  },
682
- "overall": {
683
- "RACE Score": 40.1
684
- }
685
- },
686
- "CodeQwen1.5-7B-Chat": {
687
  "readability": {
688
  "R*": 76.2,
689
  "RN_p": 76.8,
690
- "RN_if": 60.8,
691
- "RN": 47.0,
692
  "RL_p": 73.4,
693
  "RL_if": 60.8,
694
  "RL": 47.0,
695
  "RC_p": 74.7,
696
- "RC_if": 71.3,
697
- "RC": 54.2,
698
- "MBPP*": 60.3,
699
- "Readability": 49.4
700
  },
701
  "maintainability": {
702
  "MI*": 22.0,
@@ -714,30 +877,73 @@
714
  "E_NI_S": 37.7,
715
  "Efficiency": 34.2
716
  },
 
 
 
 
 
717
  "correctness": {
718
- "Correctness": 46.3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
719
  },
720
  "overall": {
721
- "RACE Score": 44.4
722
  }
723
  },
724
  "Qwen2-72B-Instruct": {
725
  "correctness": {
 
 
 
 
 
726
  "Correctness": 53.1
727
  },
728
  "readability": {
729
  "R*": 73.2,
730
  "RN_p": 76.8,
731
- "RN_if": 93.8,
732
- "RN": 72.0,
733
  "RL_p": 74.8,
734
  "RL_if": 64.4,
735
  "RL": 47.6,
736
  "RC_p": 71.1,
737
- "RC_if": 74.4,
738
- "RC": 54.0,
739
- "MBPP*": 64.0,
740
- "Readability": 57.9
741
  },
742
  "maintainability": {
743
  "MI*": 40.0,
@@ -756,7 +962,217 @@
756
  "Efficiency": 35.8
757
  },
758
  "overall": {
759
- "RACE Score": 49.5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
760
  }
761
  }
762
  }
 
1
  {
2
+ "Claude-3.5-Sonnet": {
3
+ "correctness": {
4
+ "HumanEval+": 77.4,
5
+ "MBPP+": 63.5,
6
+ "ClassEval": 42.0,
7
+ "LeetCode": 71.7,
8
+ "LeetCode_Efficiency": 68.3,
9
+ "Correctness": 64.6
10
+ },
11
+ "readability": {
12
+ "R*": 77.4,
13
+ "RN_p": 76.3,
14
+ "RN_if": 95.5,
15
+ "RN": 74.4,
16
+ "RL_p": 62.2,
17
+ "RL_if": 70.3,
18
+ "RL": 52.0,
19
+ "RC_p": 74.1,
20
+ "RC_if": 85.1,
21
+ "RC": 65.5,
22
+ "Readability": 64.0
23
+ },
24
+ "maintainability": {
25
+ "MI*": 42.0,
26
+ "MI_p": 32.0,
27
+ "MI": 75.3,
28
+ "MC*": 71.7,
29
+ "MC_p": 68.5,
30
+ "MC": 59.8,
31
+ "Maintainability": 67.5
32
+ },
33
+ "efficiency": {
34
+ "E*": 68.3,
35
+ "E_p": 66.3,
36
+ "E_NI_T": 56.8,
37
+ "E_NI_S": 49.7,
38
+ "Efficiency": 53.2
39
+ },
40
+ "overall": {
41
+ "RACE Score": 62.3
42
+ }
43
+ },
44
+ "GPT-4o-2024-05-13": {
45
+ "correctness": {
46
+ "HumanEval+": 80.5,
47
+ "MBPP+": 64.6,
48
+ "ClassEval": 38.0,
49
+ "LeetCode": 57.2,
50
+ "LeetCode_Efficiency": 59.4,
51
+ "Correctness": 59.9
52
+ },
53
  "readability": {
54
  "R*": 80.5,
55
+ "RN_p": 81.2,
56
+ "RN_if": 95.6,
57
+ "RN": 78.6,
58
  "RL_p": 78.9,
59
  "RL_if": 78.9,
60
  "RL": 63.2,
61
  "RC_p": 79.8,
62
+ "RC_if": 87.5,
63
+ "RC": 70.4,
64
+ "Readability": 70.7
 
65
  },
66
  "maintainability": {
67
  "MI*": 38.0,
 
75
  "efficiency": {
76
  "E*": 59.4,
77
  "E_p": 58.4,
78
+ "E_NI_T": 44.0,
79
  "E_NI_S": 42.0,
80
+ "Efficiency": 43.0
81
  },
82
+ "overall": {
83
+ "RACE Score": 57.2
84
+ }
85
+ },
86
+ "GPT-4o-mini": {
87
  "correctness": {
88
+ "HumanEval+": 78.0,
89
+ "MBPP+": 63.0,
90
+ "ClassEval": 37.0,
91
+ "LeetCode": 51.7,
92
+ "LeetCode_Efficiency": 52.5,
93
+ "Correctness": 56.4
94
+ },
95
+ "readability": {
96
+ "R*": 78.0,
97
+ "RN_p": 76.4,
98
+ "RN_if": 87.0,
99
+ "RN": 67.6,
100
+ "RL_p": 70.3,
101
+ "RL_if": 74.8,
102
+ "RL": 55.7,
103
+ "RC_p": 74.1,
104
+ "RC_if": 96.9,
105
+ "RC": 72.9,
106
+ "Readability": 65.4
107
+ },
108
+ "maintainability": {
109
+ "MI*": 37.0,
110
+ "MI_p": 27.0,
111
+ "MI": 73.5,
112
+ "MC*": 51.7,
113
+ "MC_p": 49.1,
114
+ "MC": 23.3,
115
+ "Maintainability": 48.4
116
+ },
117
+ "efficiency": {
118
+ "E*": 52.5,
119
+ "E_p": 46.5,
120
+ "E_NI_T": 40.3,
121
+ "E_NI_S": 39.5,
122
+ "Efficiency": 39.9
123
  },
124
  "overall": {
125
+ "RACE Score": 52.5
126
  }
127
  },
128
+ "GPT-3.5-Turbo-0125": {
129
+ "correctness": {
130
+ "HumanEval+": 62.8,
131
+ "MBPP+": 62.2,
132
+ "ClassEval": 28.0,
133
+ "LeetCode": 31.1,
134
+ "LeetCode_Efficiency": 39.6,
135
+ "Correctness": 44.7
136
+ },
137
  "readability": {
138
  "R*": 62.8,
139
  "RN_p": 63.2,
140
+ "RN_if": 79.2,
141
+ "RN": 51.4,
142
  "RL_p": 60.4,
143
  "RL_if": 76.8,
144
  "RL": 46.1,
145
  "RC_p": 65.8,
146
+ "RC_if": 70.1,
147
+ "RC": 47.5,
148
+ "Readability": 48.3
 
149
  },
150
  "maintainability": {
151
  "MI*": 28.0,
 
163
  "E_NI_S": 36.5,
164
  "Efficiency": 32.0
165
  },
166
+ "overall": {
167
+ "RACE Score": 43.6
168
+ }
169
+ },
170
+ "o1-mini-2024-09-12": {
171
  "correctness": {
172
+ "HumanEval+": 82.9,
173
+ "MBPP+": 64.8,
174
+ "ClassEval": 36.0,
175
+ "LeetCode": 79.6,
176
+ "LeetCode_Efficiency": 87.1,
177
+ "Correctness": 70.1
178
+ },
179
+ "readability": {
180
+ "R*": 82.9,
181
+ "RN_p": 83.2,
182
+ "RN_if": 95.0,
183
+ "RN": 80.7,
184
+ "RL_p": 76.4,
185
+ "RL_if": 56.7,
186
+ "RL": 47.5,
187
+ "RC_p": 80.2,
188
+ "RC_if": 94.2,
189
+ "RC": 77.7,
190
+ "Readability": 68.6
191
+ },
192
+ "maintainability": {
193
+ "MI*": 36.0,
194
+ "MI_p": 25.0,
195
+ "MI": 64.4,
196
+ "MC*": 79.6,
197
+ "MC_p": 83.3,
198
+ "MC": 66.1,
199
+ "Maintainability": 65.2
200
+ },
201
+ "efficiency": {
202
+ "E*": 87.1,
203
+ "E_p": 77.4,
204
+ "E_NI_T": 60.3,
205
+ "E_NI_S": 40.0,
206
+ "Efficiency": 50.1
207
  },
208
  "overall": {
209
+ "RACE Score": 63.5
210
  }
211
  },
212
+ "CodeLlama-7B-Python": {
213
  "correctness": {
214
+ "HumanEval+": 29.3,
215
+ "MBPP+": 41.3,
216
+ "ClassEval": 11.0,
217
+ "LeetCode": 5.6,
218
+ "LeetCode_Efficiency": 14.9,
219
+ "Correctness": 20.4
220
  },
221
  "readability": {
222
+ "R*": 29.3,
223
+ "RN_p": 29.5,
224
+ "RN_if": 69.0,
225
+ "RN": 20.9,
226
+ "RL_p": 30.1,
227
+ "RL_if": 76.6,
228
+ "RL": 25.8,
229
+ "RC_p": 24.7,
230
+ "RC_if": 57.9,
231
+ "RC": 12.5,
232
+ "Readability": 19.7
 
233
  },
234
  "maintainability": {
235
+ "MI*": 11.0,
236
+ "MI_p": 10.0,
237
+ "MI": 79.4,
238
+ "MC*": 5.6,
239
+ "MC_p": 6.5,
240
+ "MC": 3.7,
241
+ "Maintainability": 41.6
242
  },
243
  "efficiency": {
244
+ "E*": 14.9,
245
+ "E_p": 15.8,
246
+ "E_NI_T": 14.3,
247
+ "E_NI_S": 14.4,
248
+ "Efficiency": 14.4
249
  },
250
  "overall": {
251
+ "RACE Score": 24.0
252
  }
253
  },
254
+ "CodeLlama-7B-Instruct": {
255
+ "correctness": {
256
+ "HumanEval+": 32.3,
257
+ "MBPP+": 43.1,
258
+ "ClassEval": 16.0,
259
+ "LeetCode": 12.2,
260
+ "LeetCode_Efficiency": 15.8,
261
+ "Correctness": 23.9
262
+ },
263
  "readability": {
264
  "R*": 32.3,
265
  "RN_p": 31.5,
266
+ "RN_if": 58.2,
267
+ "RN": 17.8,
268
  "RL_p": 31.7,
269
  "RL_if": 59.7,
270
  "RL": 23.4,
271
  "RC_p": 30.2,
272
+ "RC_if": 76.2,
273
+ "RC": 22.2,
274
+ "Readability": 21.1
 
275
  },
276
  "maintainability": {
277
  "MI*": 16.0,
 
289
  "E_NI_S": 8.8,
290
  "Efficiency": 8.5
291
  },
 
 
 
292
  "overall": {
293
+ "RACE Score": 23.2
294
  }
295
  },
296
+ "CodeLlama-13B-Python": {
297
+ "correctness": {
298
+ "HumanEval+": 40.2,
299
+ "MBPP+": 29.4,
300
+ "ClassEval": 16.0,
301
+ "LeetCode": 6.1,
302
+ "LeetCode_Efficiency": 16.8,
303
+ "Correctness": 21.7
304
+ },
305
  "readability": {
306
+ "R*": 40.2,
307
+ "RN_p": 35.0,
308
+ "RN_if": 63.6,
309
+ "RN": 23.1,
310
+ "RL_p": 34.8,
311
+ "RL_if": 83.5,
312
+ "RL": 30.9,
313
+ "RC_p": 30.2,
314
+ "RC_if": 77.4,
315
+ "RC": 24.4,
316
+ "Readability": 26.1
 
317
  },
318
  "maintainability": {
319
+ "MI*": 16.0,
320
+ "MI_p": 15.0,
321
+ "MI": 78.6,
322
+ "MC*": 6.1,
323
+ "MC_p": 4.8,
324
+ "MC": 2.4,
325
+ "Maintainability": 40.5
326
  },
327
  "efficiency": {
328
+ "E*": 16.8,
329
+ "E_p": 17.8,
330
+ "E_NI_T": 13.8,
331
+ "E_NI_S": 14.7,
332
+ "Efficiency": 14.2
 
 
 
333
  },
334
  "overall": {
335
+ "RACE Score": 25.6
336
  }
337
  },
338
+ "CodeLlama-13B-Instruct": {
339
+ "correctness": {
340
+ "HumanEval+": 36.0,
341
+ "MBPP+": 40.7,
342
+ "ClassEval": 17.0,
343
+ "LeetCode": 10.6,
344
+ "LeetCode_Efficiency": 17.8,
345
+ "Correctness": 24.4
346
+ },
347
  "readability": {
348
  "R*": 36.0,
349
  "RN_p": 37.7,
350
+ "RN_if": 60.2,
351
+ "RN": 22.9,
352
  "RL_p": 35.0,
353
  "RL_if": 59.9,
354
  "RL": 23.6,
355
  "RC_p": 35.7,
356
+ "RC_if": 75.0,
357
+ "RC": 29.0,
358
+ "Readability": 25.2
 
359
  },
360
  "maintainability": {
361
  "MI*": 17.0,
 
373
  "E_NI_S": 16.1,
374
  "Efficiency": 13.2
375
  },
 
 
 
376
  "overall": {
377
+ "RACE Score": 26.9
378
  }
379
  },
380
+ "CodeLlama-34B-Python": {
381
+ "correctness": {
382
+ "HumanEval+": 31.7,
383
+ "MBPP+": 36.2,
384
+ "ClassEval": 3.0,
385
+ "LeetCode": 7.2,
386
+ "LeetCode_Efficiency": 17.8,
387
+ "Correctness": 19.2
388
+ },
389
  "readability": {
390
+ "R*": 31.7,
391
+ "RN_p": 27.2,
392
+ "RN_if": 68.6,
393
+ "RN": 18.8,
394
+ "RL_p": 32.5,
395
+ "RL_if": 73.2,
396
+ "RL": 26.7,
397
+ "RC_p": 27.8,
398
+ "RC_if": 48.8,
399
+ "RC": 8.6,
400
+ "Readability": 18.0
 
401
  },
402
  "maintainability": {
403
+ "MI*": 3.0,
404
+ "MI_p": 2.0,
405
+ "MI": 85.3,
406
+ "MC*": 7.2,
407
+ "MC_p": 5.4,
408
+ "MC": 2.2,
409
+ "Maintainability": 43.8
410
  },
411
  "efficiency": {
412
+ "E*": 17.8,
413
+ "E_p": 11.9,
414
+ "E_NI_T": 12.0,
415
+ "E_NI_S": 14.4,
416
+ "Efficiency": 13.2
 
 
 
417
  },
418
  "overall": {
419
+ "RACE Score": 23.6
420
  }
421
  },
422
+ "CodeLlama-34B-Instruct": {
423
+ "correctness": {
424
+ "HumanEval+": 36.0,
425
+ "MBPP+": 45.8,
426
+ "ClassEval": 12.0,
427
+ "LeetCode": 15.6,
428
+ "LeetCode_Efficiency": 20.8,
429
+ "Correctness": 26.0
430
+ },
431
  "readability": {
432
  "R*": 36.0,
433
  "RN_p": 36.5,
434
+ "RN_if": 56.8,
435
+ "RN": 21.9,
436
  "RL_p": 35.8,
437
  "RL_if": 41.7,
438
  "RL": 17.5,
439
  "RC_p": 36.3,
440
+ "RC_if": 36.2,
441
+ "RC": 10.7,
442
+ "Readability": 16.7
 
443
  },
444
  "maintainability": {
445
  "MI*": 12.0,
 
457
  "E_NI_S": 13.8,
458
  "Efficiency": 14.1
459
  },
460
+ "overall": {
461
+ "RACE Score": 24.4
462
+ }
463
+ },
464
+ "WizardCoder-15B-V1.0": {
465
  "correctness": {
466
+ "HumanEval+": 38.4,
467
+ "MBPP+": 46.3,
468
+ "ClassEval": 22.0,
469
+ "LeetCode": 11.7,
470
+ "LeetCode_Efficiency": 21.8,
471
+ "Correctness": 28.0
472
+ },
473
+ "readability": {
474
+ "R*": 38.4,
475
+ "RN_p": 38.7,
476
+ "RN_if": 61.0,
477
+ "RN": 24.0,
478
+ "RL_p": 41.9,
479
+ "RL_if": 64.8,
480
+ "RL": 27.8,
481
+ "RC_p": 40.0,
482
+ "RC_if": 65.0,
483
+ "RC": 28.1,
484
+ "Readability": 26.6
485
+ },
486
+ "maintainability": {
487
+ "MI*": 22.0,
488
+ "MI_p": 21.0,
489
+ "MI": 80.0,
490
+ "MC*": 11.7,
491
+ "MC_p": 11.5,
492
+ "MC": 7.8,
493
+ "Maintainability": 43.9
494
+ },
495
+ "efficiency": {
496
+ "E*": 21.8,
497
+ "E_p": 22.8,
498
+ "E_NI_T": 21.8,
499
+ "E_NI_S": 24.2,
500
+ "Efficiency": 23.0
501
  },
502
  "overall": {
503
+ "RACE Score": 30.4
504
  }
505
  },
506
+ "WizardCoder-33B-V1.1": {
507
+ "correctness": {
508
+ "HumanEval+": 58.5,
509
+ "MBPP+": 64.6,
510
+ "ClassEval": 34.0,
511
+ "LeetCode": 26.1,
512
+ "LeetCode_Efficiency": 38.6,
513
+ "Correctness": 44.4
514
+ },
515
  "readability": {
516
+ "R*": 58.5,
517
+ "RN_p": 58.8,
518
+ "RN_if": 68.0,
519
+ "RN": 40.9,
520
+ "RL_p": 62.2,
521
+ "RL_if": 76.0,
522
+ "RL": 47.6,
523
+ "RC_p": 58.8,
524
+ "RC_if": 73.8,
525
+ "RC": 44.8,
526
+ "Readability": 44.4
 
527
  },
528
  "maintainability": {
529
+ "MI*": 34.0,
530
+ "MI_p": 34.0,
531
+ "MI": 71.2,
532
+ "MC*": 26.1,
533
+ "MC_p": 25.0,
534
+ "MC": 9.3,
535
+ "Maintainability": 40.2
536
  },
537
  "efficiency": {
538
+ "E*": 38.6,
539
+ "E_p": 35.6,
540
+ "E_NI_T": 33.9,
541
+ "E_NI_S": 34.9,
542
+ "Efficiency": 34.4
543
+ },
544
+ "overall": {
545
+ "RACE Score": 40.8
546
+ }
547
+ },
548
+ "WizardCoder-Python-7B-V1.0": {
549
+ "correctness": {
550
+ "HumanEval+": 34.8,
551
+ "MBPP+": 41.8,
552
+ "ClassEval": 19.0,
553
+ "LeetCode": 10.6,
554
+ "LeetCode_Efficiency": 19.8,
555
+ "Correctness": 25.2
556
+ },
557
+ "readability": {
558
+ "R*": 34.8,
559
+ "RN_p": 35.8,
560
+ "RN_if": 60.2,
561
+ "RN": 22.8,
562
+ "RL_p": 34.3,
563
+ "RL_if": 79.7,
564
+ "RL": 28.0,
565
+ "RC_p": 35.4,
566
+ "RC_if": 31.8,
567
+ "RC": 10.1,
568
+ "Readability": 20.3
569
+ },
570
+ "maintainability": {
571
+ "MI*": 19.0,
572
+ "MI_p": 23.0,
573
+ "MI": 79.3,
574
+ "MC*": 10.6,
575
+ "MC_p": 9.8,
576
+ "MC": 7.2,
577
+ "Maintainability": 43.2
578
+ },
579
+ "efficiency": {
580
+ "E*": 19.8,
581
+ "E_p": 19.8,
582
+ "E_NI_T": 15.3,
583
+ "E_NI_S": 16.7,
584
+ "Efficiency": 16.0
585
+ },
586
+ "overall": {
587
+ "RACE Score": 26.2
588
+ }
589
+ },
590
+ "WizardCoder-Python-13B-V1.0": {
591
+ "correctness": {
592
+ "HumanEval+": 36.0,
593
+ "MBPP+": 42.1,
594
+ "ClassEval": 20.0,
595
+ "LeetCode": 12.8,
596
+ "LeetCode_Efficiency": 20.8,
597
+ "Correctness": 26.3
598
+ },
599
+ "readability": {
600
+ "R*": 36.0,
601
+ "RN_p": 38.2,
602
+ "RN_if": 60.2,
603
+ "RN": 23.9,
604
+ "RL_p": 38.4,
605
+ "RL_if": 83.1,
606
+ "RL": 33.1,
607
+ "RC_p": 43.6,
608
+ "RC_if": 67.7,
609
+ "RC": 30.5,
610
+ "Readability": 29.2
611
+ },
612
+ "maintainability": {
613
+ "MI*": 20.0,
614
+ "MI_p": 21.0,
615
+ "MI": 78.8,
616
+ "MC*": 12.8,
617
+ "MC_p": 12.8,
618
+ "MC": 8.5,
619
+ "Maintainability": 43.6
620
  },
621
+ "efficiency": {
622
+ "E*": 20.8,
623
+ "E_p": 18.8,
624
+ "E_NI_T": 16.2,
625
+ "E_NI_S": 19.8,
626
+ "Efficiency": 18.0
627
  },
628
  "overall": {
629
+ "RACE Score": 29.3
630
  }
631
  },
632
+ "DeepSeek-Coder-6.7B-Instruct": {
633
+ "correctness": {
634
+ "HumanEval+": 65.2,
635
+ "MBPP+": 57.1,
636
+ "ClassEval": 26.0,
637
+ "LeetCode": 18.9,
638
+ "LeetCode_Efficiency": 28.7,
639
+ "Correctness": 39.2
640
+ },
641
  "readability": {
642
  "R*": 65.2,
643
  "RN_p": 65.5,
644
+ "RN_if": 69.5,
645
+ "RN": 45.8,
646
  "RL_p": 61.2,
647
  "RL_if": 73.6,
648
  "RL": 46.6,
649
  "RC_p": 61.2,
650
+ "RC_if": 78.3,
651
+ "RC": 50.0,
652
+ "Readability": 47.5
 
653
  },
654
  "maintainability": {
655
  "MI*": 26.0,
 
667
  "E_NI_S": 30.0,
668
  "Efficiency": 28.6
669
  },
 
 
 
670
  "overall": {
671
+ "RACE Score": 39.8
672
  }
673
  },
674
+ "DeepSeek-Coder-7B-Instruct-V1.5": {
675
+ "correctness": {
676
+ "HumanEval+": 61.0,
677
+ "MBPP+": 59.3,
678
+ "ClassEval": 23.0,
679
+ "LeetCode": 23.3,
680
+ "LeetCode_Efficiency": 32.7,
681
+ "Correctness": 39.9
682
+ },
683
  "readability": {
684
  "R*": 61.0,
685
  "RN_p": 61.5,
686
+ "RN_if": 60.5,
687
+ "RN": 36.8,
688
  "RL_p": 62.6,
689
  "RL_if": 70.9,
690
  "RL": 46.0,
691
  "RC_p": 62.8,
692
+ "RC_if": 83.0,
693
+ "RC": 53.7,
694
+ "Readability": 45.5
 
695
  },
696
  "maintainability": {
697
  "MI*": 23.0,
 
709
  "E_NI_S": 26.8,
710
  "Efficiency": 26.0
711
  },
 
 
 
712
  "overall": {
713
+ "RACE Score": 38.9
714
  }
715
  },
716
+ "DeepSeek-Coder-33B-Instruct": {
717
+ "correctness": {
718
+ "HumanEval+": 65.9,
719
+ "MBPP+": 61.9,
720
+ "ClassEval": 28.0,
721
+ "LeetCode": 22.2,
722
+ "LeetCode_Efficiency": 45.5,
723
+ "Correctness": 44.7
724
+ },
725
  "readability": {
726
  "R*": 65.9,
727
  "RN_p": 64.6,
728
+ "RN_if": 90.1,
729
+ "RN": 59.0,
730
  "RL_p": 65.0,
731
  "RL_if": 82.7,
732
  "RL": 53.5,
733
  "RC_p": 66.5,
734
+ "RC_if": 80.8,
735
+ "RC": 54.0,
736
+ "Readability": 55.5
 
737
  },
738
  "maintainability": {
739
  "MI*": 28.0,
 
751
  "E_NI_S": 36.1,
752
  "Efficiency": 35.7
753
  },
 
 
 
754
  "overall": {
755
+ "RACE Score": 44.8
756
  }
757
  },
758
  "DeepSeek-Coder-V2-Lite-Instruct-16B": {
759
+ "correctness": {
760
+ "HumanEval+": 72.0,
761
+ "MBPP+": 62.7,
762
+ "ClassEval": 26.0,
763
+ "LeetCode": 44.4,
764
+ "LeetCode_Efficiency": 49.5,
765
+ "Correctness": 50.9
766
+ },
767
  "readability": {
768
  "R*": 72.0,
769
  "RN_p": 71.2,
770
+ "RN_if": 57.8,
771
+ "RN": 41.8,
772
  "RL_p": 66.5,
773
  "RL_if": 83.7,
774
  "RL": 57.7,
775
  "RC_p": 67.1,
776
+ "RC_if": 71.0,
777
+ "RC": 47.5,
778
+ "Readability": 49.0
 
779
  },
780
  "maintainability": {
781
  "MI*": 26.0,
 
793
  "E_NI_S": 47.7,
794
  "Efficiency": 44.0
795
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
796
  "overall": {
797
+ "RACE Score": 48.2
798
  }
799
  },
800
+ "DeepSeek-V2.5-236B": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
801
  "correctness": {
802
+ "HumanEval+": 72.0,
803
+ "MBPP+": 63.0,
804
+ "ClassEval": 41.0,
805
+ "LeetCode": 61.7,
806
+ "LeetCode_Efficiency": 57.4,
807
+ "Correctness": 59.0
808
  },
 
 
 
 
 
809
  "readability": {
810
+ "R*": 72.0,
811
+ "RN_p": 74.5,
812
+ "RN_if": 95.8,
813
+ "RN": 72.2,
814
+ "RL_p": 72.8,
815
+ "RL_if": 89.8,
816
+ "RL": 66.1,
817
+ "RC_p": 74.1,
818
+ "RC_if": 87.5,
819
+ "RC": 65.8,
820
+ "Readability": 68.0
 
821
  },
822
  "maintainability": {
823
+ "MI*": 41.0,
824
+ "MI_p": 36.0,
825
+ "MI": 72.9,
826
+ "MC*": 61.7,
827
+ "MC_p": 59.1,
828
+ "MC": 33.9,
829
+ "Maintainability": 53.4
830
  },
831
  "efficiency": {
832
+ "E*": 57.4,
833
+ "E_p": 54.5,
834
+ "E_NI_T": 46.4,
835
+ "E_NI_S": 49.5,
836
+ "Efficiency": 48.0
 
 
 
837
  },
838
  "overall": {
839
+ "RACE Score": 57.1
840
  }
841
  },
842
+ "CodeQwen1.5-7B-Chat": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
843
  "correctness": {
844
+ "HumanEval+": 76.2,
845
+ "MBPP+": 60.3,
846
+ "ClassEval": 22.0,
847
+ "LeetCode": 33.3,
848
+ "LeetCode_Efficiency": 39.6,
849
+ "Correctness": 46.3
850
  },
 
 
 
 
 
851
  "readability": {
852
  "R*": 76.2,
853
  "RN_p": 76.8,
854
+ "RN_if": 63.2,
855
+ "RN": 48.8,
856
  "RL_p": 73.4,
857
  "RL_if": 60.8,
858
  "RL": 47.0,
859
  "RC_p": 74.7,
860
+ "RC_if": 80.8,
861
+ "RC": 62.2,
862
+ "Readability": 52.7
 
863
  },
864
  "maintainability": {
865
  "MI*": 22.0,
 
877
  "E_NI_S": 37.7,
878
  "Efficiency": 34.2
879
  },
880
+ "overall": {
881
+ "RACE Score": 45.2
882
+ }
883
+ },
884
+ "Qwen2.5-Coder-7B-Instruct": {
885
  "correctness": {
886
+ "HumanEval+": 78.0,
887
+ "MBPP+": 64.8,
888
+ "ClassEval": 29.0,
889
+ "LeetCode": 54.4,
890
+ "LeetCode_Efficiency": 59.4,
891
+ "Correctness": 57.1
892
+ },
893
+ "readability": {
894
+ "R*": 78.0,
895
+ "RN_p": 81.4,
896
+ "RN_if": 64.9,
897
+ "RN": 53.0,
898
+ "RL_p": 77.4,
899
+ "RL_if": 65.4,
900
+ "RL": 51.8,
901
+ "RC_p": 75.3,
902
+ "RC_if": 80.2,
903
+ "RC": 61.3,
904
+ "Readability": 55.4
905
+ },
906
+ "maintainability": {
907
+ "MI*": 29.0,
908
+ "MI_p": 27.0,
909
+ "MI": 78.6,
910
+ "MC*": 54.4,
911
+ "MC_p": 50.4,
912
+ "MC": 17.6,
913
+ "Maintainability": 48.1
914
+ },
915
+ "efficiency": {
916
+ "E*": 59.4,
917
+ "E_p": 48.5,
918
+ "E_NI_T": 37.0,
919
+ "E_NI_S": 33.7,
920
+ "Efficiency": 35.4
921
  },
922
  "overall": {
923
+ "RACE Score": 49.0
924
  }
925
  },
926
  "Qwen2-72B-Instruct": {
927
  "correctness": {
928
+ "HumanEval+": 73.2,
929
+ "MBPP+": 64.0,
930
+ "ClassEval": 40.0,
931
+ "LeetCode": 42.8,
932
+ "LeetCode_Efficiency": 45.5,
933
  "Correctness": 53.1
934
  },
935
  "readability": {
936
  "R*": 73.2,
937
  "RN_p": 76.8,
938
+ "RN_if": 95.9,
939
+ "RN": 73.6,
940
  "RL_p": 74.8,
941
  "RL_if": 64.4,
942
  "RL": 47.6,
943
  "RC_p": 71.1,
944
+ "RC_if": 82.9,
945
+ "RC": 60.1,
946
+ "Readability": 60.4
 
947
  },
948
  "maintainability": {
949
  "MI*": 40.0,
 
962
  "Efficiency": 35.8
963
  },
964
  "overall": {
965
+ "RACE Score": 50.1
966
+ }
967
+ },
968
+ "Qwen2.5-72B-Instruct": {
969
+ "correctness": {
970
+ "HumanEval+": 79.3,
971
+ "MBPP+": 65.9,
972
+ "ClassEval": 34.0,
973
+ "LeetCode": 72.8,
974
+ "LeetCode_Efficiency": 68.3,
975
+ "Correctness": 64.1
976
+ },
977
+ "readability": {
978
+ "R*": 79.3,
979
+ "RN_p": 79.6,
980
+ "RN_if": 97.0,
981
+ "RN": 77.2,
982
+ "RL_p": 77.4,
983
+ "RL_if": 92.1,
984
+ "RL": 72.1,
985
+ "RC_p": 80.5,
986
+ "RC_if": 89.3,
987
+ "RC": 72.8,
988
+ "Readability": 74.0
989
+ },
990
+ "maintainability": {
991
+ "MI*": 34.0,
992
+ "MI_p": 32.0,
993
+ "MI": 76.7,
994
+ "MC*": 72.8,
995
+ "MC_p": 71.8,
996
+ "MC": 40.4,
997
+ "Maintainability": 58.5
998
+ },
999
+ "efficiency": {
1000
+ "E*": 68.3,
1001
+ "E_p": 69.3,
1002
+ "E_NI_T": 47.9,
1003
+ "E_NI_S": 49.4,
1004
+ "Efficiency": 48.6
1005
+ },
1006
+ "overall": {
1007
+ "RACE Score": 61.3
1008
+ }
1009
+ },
1010
+ "Mixtral-8x22B": {
1011
+ "correctness": {
1012
+ "HumanEval+": 61.0,
1013
+ "MBPP+": 60.6,
1014
+ "ClassEval": 33.0,
1015
+ "LeetCode": 20.0,
1016
+ "LeetCode_Efficiency": 35.6,
1017
+ "Correctness": 42.0
1018
+ },
1019
+ "readability": {
1020
+ "R*": 61.0,
1021
+ "RN_p": 64.4,
1022
+ "RN_if": 87.0,
1023
+ "RN": 56.2,
1024
+ "RL_p": 62.4,
1025
+ "RL_if": 73.2,
1026
+ "RL": 47.8,
1027
+ "RC_p": 64.9,
1028
+ "RC_if": 84.8,
1029
+ "RC": 56.1,
1030
+ "Readability": 53.4
1031
+ },
1032
+ "maintainability": {
1033
+ "MI*": 33.0,
1034
+ "MI_p": 30.0,
1035
+ "MI": 79.6,
1036
+ "MC*": 20.0,
1037
+ "MC_p": 22.6,
1038
+ "MC": 9.1,
1039
+ "Maintainability": 44.3
1040
+ },
1041
+ "efficiency": {
1042
+ "E*": 35.6,
1043
+ "E_p": 31.7,
1044
+ "E_NI_T": 24.7,
1045
+ "E_NI_S": 33.2,
1046
+ "Efficiency": 29.0
1047
+ },
1048
+ "overall": {
1049
+ "RACE Score": 42.2
1050
+ }
1051
+ },
1052
+ "Llama3-8B-Instruct": {
1053
+ "correctness": {
1054
+ "HumanEval+": 49.4,
1055
+ "MBPP+": 50.5,
1056
+ "ClassEval": 24.0,
1057
+ "LeetCode": 20.6,
1058
+ "LeetCode_Efficiency": 33.7,
1059
+ "Correctness": 35.6
1060
+ },
1061
+ "readability": {
1062
+ "R*": 49.4,
1063
+ "RN_p": 45.5,
1064
+ "RN_if": 85.5,
1065
+ "RN": 44.3,
1066
+ "RL_p": 28.7,
1067
+ "RL_if": 45.9,
1068
+ "RL": 23.6,
1069
+ "RC_p": 48.1,
1070
+ "RC_if": 79.9,
1071
+ "RC": 40.0,
1072
+ "Readability": 36.0
1073
+ },
1074
+ "maintainability": {
1075
+ "MI*": 24.0,
1076
+ "MI_p": 19.0,
1077
+ "MI": 79.8,
1078
+ "MC*": 20.6,
1079
+ "MC_p": 19.1,
1080
+ "MC": 8.1,
1081
+ "Maintainability": 43.9
1082
+ },
1083
+ "efficiency": {
1084
+ "E*": 33.7,
1085
+ "E_p": 31.7,
1086
+ "E_NI_T": 23.5,
1087
+ "E_NI_S": 26.9,
1088
+ "Efficiency": 25.2
1089
+ },
1090
+ "overall": {
1091
+ "RACE Score": 35.2
1092
+ }
1093
+ },
1094
+ "Llama3-70B-Instruct": {
1095
+ "correctness": {
1096
+ "HumanEval+": 65.2,
1097
+ "MBPP+": 58.5,
1098
+ "ClassEval": 28.0,
1099
+ "LeetCode": 31.7,
1100
+ "LeetCode_Efficiency": 38.6,
1101
+ "Correctness": 44.4
1102
+ },
1103
+ "readability": {
1104
+ "R*": 65.2,
1105
+ "RN_p": 67.8,
1106
+ "RN_if": 96.7,
1107
+ "RN": 66.0,
1108
+ "RL_p": 56.1,
1109
+ "RL_if": 75.8,
1110
+ "RL": 47.8,
1111
+ "RC_p": 64.6,
1112
+ "RC_if": 84.8,
1113
+ "RC": 54.2,
1114
+ "Readability": 56.0
1115
+ },
1116
+ "maintainability": {
1117
+ "MI*": 28.0,
1118
+ "MI_p": 29.0,
1119
+ "MI": 79.8,
1120
+ "MC*": 31.7,
1121
+ "MC_p": 31.7,
1122
+ "MC": 25.2,
1123
+ "Maintainability": 52.5
1124
+ },
1125
+ "efficiency": {
1126
+ "E*": 38.6,
1127
+ "E_p": 38.6,
1128
+ "E_NI_T": 29.2,
1129
+ "E_NI_S": 42.8,
1130
+ "Efficiency": 36.0
1131
+ },
1132
+ "overall": {
1133
+ "RACE Score": 47.2
1134
+ }
1135
+ },
1136
+ "StarCoder2-15B": {
1137
+ "correctness": {
1138
+ "HumanEval+": 36.0,
1139
+ "MBPP+": 39.9,
1140
+ "ClassEval": 24.0,
1141
+ "LeetCode": 16.1,
1142
+ "LeetCode_Efficiency": 26.7,
1143
+ "Correctness": 28.5
1144
+ },
1145
+ "readability": {
1146
+ "R*": 36.0,
1147
+ "RN_p": 39.5,
1148
+ "RN_if": 64.3,
1149
+ "RN": 25.8,
1150
+ "RL_p": 40.2,
1151
+ "RL_if": 66.1,
1152
+ "RL": 27.9,
1153
+ "RC_p": 35.4,
1154
+ "RC_if": 59.4,
1155
+ "RC": 22.0,
1156
+ "Readability": 25.2
1157
+ },
1158
+ "maintainability": {
1159
+ "MI*": 24.0,
1160
+ "MI_p": 25.0,
1161
+ "MI": 74.2,
1162
+ "MC*": 16.1,
1163
+ "MC_p": 13.7,
1164
+ "MC": 6.1,
1165
+ "Maintainability": 40.1
1166
+ },
1167
+ "efficiency": {
1168
+ "E*": 26.7,
1169
+ "E_p": 25.7,
1170
+ "E_NI_T": 20.6,
1171
+ "E_NI_S": 25.1,
1172
+ "Efficiency": 22.9
1173
+ },
1174
+ "overall": {
1175
+ "RACE Score": 29.2
1176
  }
1177
  }
1178
  }
text_content.py CHANGED
@@ -1,9 +1,13 @@
1
  HEAD_TEXT = """
2
  Based on the 🏎️RACE benchmark, we demonstrated the ability of different LLMs to generate code that is **_correct_** and **_meets the requirements of real-world development scenarios_**.
3
 
4
- More details about how to evaluate the LLM are available in the [🏎️RACE GitHub repository](https://github.com/jszheng21/RACE). For a complete description of the RACE benchmark and related experimental analysis, please refer to the paper: [Beyond Correctness: Benchmarking Multi-dimensional Code Generation for Large Language Models](https://arxiv.org/abs/2407.11470). [![](https://img.shields.io/badge/arXiv-2407.11470-b31b1b.svg)](https://arxiv.org/abs/2407.11470)
5
 
6
  **_Latest News_** 🔥
 
 
 
 
7
  - [24/07/24] We add the evaluation results of `claude-3.5-sonnet` and `Qwen2-72B-Instruct` in [RACE leaderboard](https://huggingface.co/spaces/jszheng/RACE_leaderboard).
8
  - [24/07/16] We release our RACE benchmark, leaderboard and paper.
9
  """
@@ -58,7 +62,7 @@ Inspired from the [🤗 Open LLM Leaderboard](https://huggingface.co/spaces/Hugg
58
  NOTES_TEXT = """
59
  **Notes:**
60
  - `💯 RACE Score` denotes the final evaluation result based on the 🏎️RACE benchmark, which is the average of the scores in the four dimensions: `✅ Correctness`, `📖 Readability`, `🔨 Maintainability`, and `🚀 Efficiency`.
61
- - All fine-grained evaluation results are provided in `⏬ Hidden Columns`. `📖 R` denotes code **R**eadability, `🔨 M` denotes code **M**aintainability, and `🚀 E` denotes code **E**fficiency. `*` denotes the correctness of the code in the corresponding dimension. More details about the abbreviations are as follows:
62
  - `📖 R*`: The code accuracy (baseline).
63
  - `📖 RN`: The proportion of code that is both functionally correct and follows customized instructions related to `Naming Convention`.
64
  - `📖 RL`: The proportion of code that is both functionally correct and follows customized instructions related to `Code Length`.
 
1
  HEAD_TEXT = """
2
  Based on the 🏎️RACE benchmark, we demonstrated the ability of different LLMs to generate code that is **_correct_** and **_meets the requirements of real-world development scenarios_**.
3
 
4
+ More details about how to evaluate the LLM are available in the [🏎️RACE GitHub repository](https://github.com/jszheng21/RACE). For a complete description of the RACE benchmark and related experimental analysis, please refer to the paper: [Beyond Correctness: Benchmarking Multi-dimensional Code Generation for Large Language Models](https://arxiv.org/abs/2407.11470).
5
 
6
  **_Latest News_** 🔥
7
+ - [24/10/09] We release the second version of [RACE paper](https://arxiv.org/abs/2407.11470).
8
+ - [24/10/09] We add the evaluation results of 9 LLMs (including `o1-mini-2024-09-12`) in [RACE leaderboard](https://huggingface.co/spaces/jszheng/RACE_leaderboard).
9
+ - [24/10/01] We have improved the calculation methods for readability-related metrics and enhanced the robustness of the code post-processing techniques.
10
+ - [24/10/01] We have revised the test code in the LeetCode evaluation data to support the cases with multiple correct answers.
11
  - [24/07/24] We add the evaluation results of `claude-3.5-sonnet` and `Qwen2-72B-Instruct` in [RACE leaderboard](https://huggingface.co/spaces/jszheng/RACE_leaderboard).
12
  - [24/07/16] We release our RACE benchmark, leaderboard and paper.
13
  """
 
62
  NOTES_TEXT = """
63
  **Notes:**
64
  - `💯 RACE Score` denotes the final evaluation result based on the 🏎️RACE benchmark, which is the average of the scores in the four dimensions: `✅ Correctness`, `📖 Readability`, `🔨 Maintainability`, and `🚀 Efficiency`.
65
+ - All fine-grained evaluation results are provided in `⏬ Hidden Columns`. `📖 R` denotes code **R**eadability, `🔨 M` denotes code **M**aintainability, and `🚀 E` denotes code **E**fficiency. `*` denotes the code accuracy in the absence of customized instructions. More details about the abbreviations are as follows:
66
  - `📖 R*`: The code accuracy (baseline).
67
  - `📖 RN`: The proportion of code that is both functionally correct and follows customized instructions related to `Naming Convention`.
68
  - `📖 RL`: The proportion of code that is both functionally correct and follows customized instructions related to `Code Length`.