Spaces:

jszheng
/

RACE_leaderboard

Running

App Files Files Community

jszheng committed on Oct 12

Commit

790dc55

•

1 Parent(s): 310a5d6

add results of 9 LLMs

Browse files

Files changed (2) hide show

RESULTS.json +817 -401
text_content.py +6 -2

RESULTS.json CHANGED Viewed

@@ -1,18 +1,67 @@
 {
-    "gpt-4o-2024-05-13": {
         "readability": {
             "R*": 80.5,
-            "RN_p": 81.1,
-            "RN_if": 91.8,
-            "RN": 75.3,
             "RL_p": 78.9,
             "RL_if": 78.9,
             "RL": 63.2,
             "RC_p": 79.8,
-            "RC_if": 78.7,
-            "RC": 64.3,
-            "MBPP*": 64.6,
-            "Readability": 67.6
         },
         "maintainability": {
             "MI*": 38.0,
@@ -26,31 +75,77 @@
         "efficiency": {
             "E*": 59.4,
             "E_p": 58.4,
-            "E_NI_T": 44.8,
             "E_NI_S": 42.0,
-            "Efficiency": 43.4
         },
         "correctness": {
-            "Correctness": 59.9
         },
         "overall": {
-            "RACE Score": 56.5
         }
     },
-    "gpt-3.5-turbo-0125": {
         "readability": {
             "R*": 62.8,
             "RN_p": 63.2,
-            "RN_if": 74.4,
-            "RN": 48.3,
             "RL_p": 60.4,
             "RL_if": 76.8,
             "RL": 46.1,
             "RC_p": 65.8,
-            "RC_if": 60.0,
-            "RC": 41.5,
-            "MBPP*": 62.2,
-            "Readability": 45.3
         },
         "maintainability": {
             "MI*": 28.0,
@@ -68,65 +163,115 @@
             "E_NI_S": 36.5,
             "Efficiency": 32.0
         },
         "correctness": {
-            "Correctness": 44.7
         },
         "overall": {
-            "RACE Score": 42.8
         }
     },
-    "claude-3.5-sonnet": {
         "correctness": {
-            "Correctness": 64.6
         },
         "readability": {
-            "R*": 77.4,
-            "RN_p": 76.3,
-            "RN_if": 92.3,
-            "RN": 71.9,
-            "RL_p": 62.2,
-            "RL_if": 70.3,
-            "RL": 52.0,
-            "RC_p": 74.1,
-            "RC_if": 72.2,
-            "RC": 58.0,
-            "MBPP*": 63.5,
-            "Readability": 60.6
         },
         "maintainability": {
-            "MI*": 42.0,
-            "MI_p": 32.0,
-            "MI": 75.3,
-            "MC*": 71.7,
-            "MC_p": 68.5,
-            "MC": 59.8,
-            "Maintainability": 67.5
         },
         "efficiency": {
-            "E*": 68.3,
-            "E_p": 66.3,
-            "E_NI_T": 56.8,
-            "E_NI_S": 49.7,
-            "Efficiency": 53.2
         },
         "overall": {
-            "RACE Score": 61.5
         }
     },
-    "CodeLlama-7b-Instruct": {
         "readability": {
             "R*": 32.3,
             "RN_p": 31.5,
-            "RN_if": 55.5,
-            "RN": 17.0,
             "RL_p": 31.7,
             "RL_if": 59.7,
             "RL": 23.4,
             "RC_p": 30.2,
-            "RC_if": 67.4,
-            "RC": 18.3,
-            "MBPP*": 43.1,
-            "Readability": 19.6
         },
         "maintainability": {
             "MI*": 16.0,
@@ -144,65 +289,73 @@
             "E_NI_S": 8.8,
             "Efficiency": 8.5
         },
-        "correctness": {
-            "Correctness": 23.9
-        },
         "overall": {
-            "RACE Score": 22.9
         }
     },
-    "CodeLlama-7b-Python": {
         "readability": {
-            "R*": 29.3,
-            "RN_p": 29.5,
-            "RN_if": 66.4,
-            "RN": 20.4,
-            "RL_p": 30.1,
-            "RL_if": 76.6,
-            "RL": 25.8,
-            "RC_p": 24.7,
-            "RC_if": 42.1,
-            "RC": 11.6,
-            "MBPP*": 41.3,
-            "Readability": 19.3
         },
         "maintainability": {
-            "MI*": 11.0,
-            "MI_p": 10.0,
-            "MI": 79.4,
-            "MC*": 5.6,
-            "MC_p": 6.5,
-            "MC": 3.7,
-            "Maintainability": 41.6
         },
         "efficiency": {
-            "E*": 14.9,
-            "E_p": 15.8,
-            "E_NI_T": 14.3,
-            "E_NI_S": 14.4,
-            "Efficiency": 14.4
-        },
-        "correctness": {
-            "Correctness": 20.4
         },
         "overall": {
-            "RACE Score": 23.9
         }
     },
-    "CodeLlama-13b-Instruct": {
         "readability": {
             "R*": 36.0,
             "RN_p": 37.7,
-            "RN_if": 57.8,
-            "RN": 22.0,
             "RL_p": 35.0,
             "RL_if": 59.9,
             "RL": 23.6,
             "RC_p": 35.7,
-            "RC_if": 64.3,
-            "RC": 23.2,
-            "MBPP*": 40.7,
-            "Readability": 22.9
         },
         "maintainability": {
             "MI*": 17.0,
@@ -220,65 +373,73 @@
             "E_NI_S": 16.1,
             "Efficiency": 13.2
         },
-        "correctness": {
-            "Correctness": 24.4
-        },
         "overall": {
-            "RACE Score": 26.4
         }
     },
-    "CodeLlama-13b-Python": {
         "readability": {
-            "R*": 40.2,
-            "RN_p": 35.0,
-            "RN_if": 61.3,
-            "RN": 22.4,
-            "RL_p": 34.8,
-            "RL_if": 83.5,
-            "RL": 30.9,
-            "RC_p": 30.2,
-            "RC_if": 60.7,
-            "RC": 20.4,
-            "MBPP*": 29.4,
-            "Readability": 24.6
         },
         "maintainability": {
-            "MI*": 16.0,
-            "MI_p": 15.0,
-            "MI": 78.6,
-            "MC*": 6.1,
-            "MC_p": 4.8,
-            "MC": 2.4,
-            "Maintainability": 40.5
         },
         "efficiency": {
-            "E*": 16.8,
-            "E_p": 17.8,
-            "E_NI_T": 13.8,
-            "E_NI_S": 14.7,
-            "Efficiency": 14.2
-        },
-        "correctness": {
-            "Correctness": 21.7
         },
         "overall": {
-            "RACE Score": 25.3
         }
     },
-    "CodeLlama-34b-Instruct": {
         "readability": {
             "R*": 36.0,
             "RN_p": 36.5,
-            "RN_if": 54.3,
-            "RN": 21.1,
             "RL_p": 35.8,
             "RL_if": 41.7,
             "RL": 17.5,
             "RC_p": 36.3,
-            "RC_if": 32.0,
-            "RC": 9.4,
-            "MBPP*": 45.8,
-            "Readability": 16.0
         },
         "maintainability": {
             "MI*": 12.0,
@@ -296,65 +457,199 @@
             "E_NI_S": 13.8,
             "Efficiency": 14.1
         },
         "correctness": {
-            "Correctness": 26.0
         },
         "overall": {
-            "RACE Score": 24.2
         }
     },
-    "CodeLlama-34b-Python": {
         "readability": {
-            "R*": 31.7,
-            "RN_p": 27.2,
-            "RN_if": 66.9,
-            "RN": 18.6,
-            "RL_p": 32.5,
-            "RL_if": 73.2,
-            "RL": 26.7,
-            "RC_p": 27.8,
-            "RC_if": 39.4,
-            "RC": 6.7,
-            "MBPP*": 36.2,
-            "Readability": 17.3
         },
         "maintainability": {
-            "MI*": 3.0,
-            "MI_p": 2.0,
-            "MI": 85.3,
-            "MC*": 7.2,
-            "MC_p": 5.4,
-            "MC": 2.2,
-            "Maintainability": 43.8
         },
         "efficiency": {
-            "E*": 17.8,
-            "E_p": 11.9,
-            "E_NI_T": 12.0,
-            "E_NI_S": 14.4,
-            "Efficiency": 13.2
         },
-        "correctness": {
-            "Correctness": 19.2
         },
         "overall": {
-            "RACE Score": 23.4
         }
     },
-    "DeepSeek-Coder-Instruct-6.7B": {
         "readability": {
             "R*": 65.2,
             "RN_p": 65.5,
-            "RN_if": 67.2,
-            "RN": 44.4,
             "RL_p": 61.2,
             "RL_if": 73.6,
             "RL": 46.6,
             "RC_p": 61.2,
-            "RC_if": 65.5,
-            "RC": 42.0,
-            "MBPP*": 57.1,
-            "Readability": 44.3
         },
         "maintainability": {
             "MI*": 26.0,
@@ -372,27 +667,31 @@
             "E_NI_S": 30.0,
             "Efficiency": 28.6
         },
-        "correctness": {
-            "Correctness": 39.2
-        },
         "overall": {
-            "RACE Score": 39.0
         }
     },
-    "DeepSeek-Coder-Instruct-7B": {
         "readability": {
             "R*": 61.0,
             "RN_p": 61.5,
-            "RN_if": 57.8,
-            "RN": 35.2,
             "RL_p": 62.6,
             "RL_if": 70.9,
             "RL": 46.0,
             "RC_p": 62.8,
-            "RC_if": 70.2,
-            "RC": 46.0,
-            "MBPP*": 59.3,
-            "Readability": 42.4
         },
         "maintainability": {
             "MI*": 23.0,
@@ -410,27 +709,31 @@
             "E_NI_S": 26.8,
             "Efficiency": 26.0
         },
-        "correctness": {
-            "Correctness": 39.9
-        },
         "overall": {
-            "RACE Score": 38.1
         }
     },
-    "DeepSeek-Coder-Instruct-33B": {
         "readability": {
             "R*": 65.9,
             "RN_p": 64.6,
-            "RN_if": 86.8,
-            "RN": 57.7,
             "RL_p": 65.0,
             "RL_if": 82.7,
             "RL": 53.5,
             "RC_p": 66.5,
-            "RC_if": 70.8,
-            "RC": 46.4,
-            "MBPP*": 61.9,
-            "Readability": 52.5
         },
         "maintainability": {
             "MI*": 28.0,
@@ -448,27 +751,31 @@
             "E_NI_S": 36.1,
             "Efficiency": 35.7
         },
-        "correctness": {
-            "Correctness": 44.7
-        },
         "overall": {
-            "RACE Score": 44.1
         }
     },
     "DeepSeek-Coder-V2-Lite-Instruct-16B": {
         "readability": {
             "R*": 72.0,
             "RN_p": 71.2,
-            "RN_if": 55.3,
-            "RN": 40.2,
             "RL_p": 66.5,
             "RL_if": 83.7,
             "RL": 57.7,
             "RC_p": 67.1,
-            "RC_if": 63.5,
-            "RC": 42.7,
-            "MBPP*": 62.7,
-            "Readability": 46.9
         },
         "maintainability": {
             "MI*": 26.0,
@@ -486,217 +793,73 @@
             "E_NI_S": 47.7,
             "Efficiency": 44.0
         },
-        "correctness": {
-            "Correctness": 50.9
-        },
-        "overall": {
-            "RACE Score": 47.7
-        }
-    },
-    "DeepSeek-Coder-V2-Instruct-236B": {
-        "readability": {
-            "R*": 73.8,
-            "RN_p": 75.3,
-            "RN_if": 91.8,
-            "RN": 70.0,
-            "RL_p": 75.2,
-            "RL_if": 88.4,
-            "RL": 67.1,
-            "RC_p": 76.5,
-            "RC_if": 74.1,
-            "RC": 58.5,
-            "MBPP*": 68.5,
-            "Readability": 65.2
-        },
-        "maintainability": {
-            "MI*": 35.0,
-            "MI_p": 38.0,
-            "MI": 77.3,
-            "MC*": 58.9,
-            "MC_p": 58.9,
-            "MC": 35.0,
-            "Maintainability": 56.1
-        },
-        "efficiency": {
-            "E*": 57.3,
-            "E_p": 53.5,
-            "E_NI_T": 41.1,
-            "E_NI_S": 49.4,
-            "Efficiency": 45.2
-        },
-        "correctness": {
-            "Correctness": 58.7
-        },
-        "overall": {
-            "RACE Score": 56.3
-        }
-    },
-    "WizardCoder-Python-7B-V1.0": {
-        "readability": {
-            "R*": 34.8,
-            "RN_p": 35.8,
-            "RN_if": 58.3,
-            "RN": 22.4,
-            "RL_p": 34.3,
-            "RL_if": 79.7,
-            "RL": 28.0,
-            "RC_p": 35.4,
-            "RC_if": 25.0,
-            "RC": 8.6,
-            "MBPP*": 41.8,
-            "Readability": 19.7
-        },
-        "maintainability": {
-            "MI*": 19.0,
-            "MI_p": 23.0,
-            "MI": 79.3,
-            "MC*": 10.6,
-            "MC_p": 9.8,
-            "MC": 7.2,
-            "Maintainability": 43.2
-        },
-        "efficiency": {
-            "E*": 19.8,
-            "E_p": 19.8,
-            "E_NI_T": 15.3,
-            "E_NI_S": 16.7,
-            "Efficiency": 16.0
-        },
-        "correctness": {
-            "Correctness": 25.2
-        },
         "overall": {
-            "RACE Score": 26.0
         }
     },
-    "WizardCoder-Python-13B-V1.0": {
-        "readability": {
-            "R*": 36.0,
-            "RN_p": 38.2,
-            "RN_if": 58.4,
-            "RN": 23.1,
-            "RL_p": 38.4,
-            "RL_if": 83.1,
-            "RL": 33.1,
-            "RC_p": 43.6,
-            "RC_if": 59.8,
-            "RC": 27.4,
-            "MBPP*": 42.1,
-            "Readability": 27.9
-        },
-        "maintainability": {
-            "MI*": 20.0,
-            "MI_p": 21.0,
-            "MI": 78.8,
-            "MC*": 12.8,
-            "MC_p": 12.8,
-            "MC": 8.5,
-            "Maintainability": 43.6
-        },
-        "efficiency": {
-            "E*": 20.8,
-            "E_p": 18.8,
-            "E_NI_T": 16.2,
-            "E_NI_S": 19.8,
-            "Efficiency": 18.0
-        },
         "correctness": {
-            "Correctness": 26.3
         },
-        "overall": {
-            "RACE Score": 29.0
-        }
-    },
-    "WizardCoder-15B-V1.0": {
         "readability": {
-            "R*": 38.4,
-            "RN_p": 38.7,
-            "RN_if": 59.0,
-            "RN": 23.2,
-            "RL_p": 41.9,
-            "RL_if": 64.8,
-            "RL": 27.8,
-            "RC_p": 40.0,
-            "RC_if": 57.3,
-            "RC": 24.4,
-            "MBPP*": 46.3,
-            "Readability": 25.1
         },
         "maintainability": {
-            "MI*": 22.0,
-            "MI_p": 21.0,
-            "MI": 80.0,
-            "MC*": 11.7,
-            "MC_p": 11.5,
-            "MC": 7.8,
-            "Maintainability": 43.9
         },
         "efficiency": {
-            "E*": 21.8,
-            "E_p": 22.8,
-            "E_NI_T": 21.8,
-            "E_NI_S": 24.2,
-            "Efficiency": 23.0
-        },
-        "correctness": {
-            "Correctness": 28.0
         },
         "overall": {
-            "RACE Score": 30.0
         }
     },
-    "WizardCoder-33B-V1.1": {
-        "readability": {
-            "R*": 58.5,
-            "RN_p": 58.8,
-            "RN_if": 65.4,
-            "RN": 39.9,
-            "RL_p": 62.2,
-            "RL_if": 76.0,
-            "RL": 47.6,
-            "RC_p": 58.8,
-            "RC_if": 61.0,
-            "RC": 37.2,
-            "MBPP*": 64.6,
-            "Readability": 41.6
-        },
-        "maintainability": {
-            "MI*": 34.0,
-            "MI_p": 34.0,
-            "MI": 71.2,
-            "MC*": 26.1,
-            "MC_p": 25.0,
-            "MC": 9.3,
-            "Maintainability": 40.2
-        },
-        "efficiency": {
-            "E*": 38.6,
-            "E_p": 35.6,
-            "E_NI_T": 33.9,
-            "E_NI_S": 34.9,
-            "Efficiency": 34.4
-        },
         "correctness": {
-            "Correctness": 44.4
         },
-        "overall": {
-            "RACE Score": 40.1
-        }
-    },
-    "CodeQwen1.5-7B-Chat": {
         "readability": {
             "R*": 76.2,
             "RN_p": 76.8,
-            "RN_if": 60.8,
-            "RN": 47.0,
             "RL_p": 73.4,
             "RL_if": 60.8,
             "RL": 47.0,
             "RC_p": 74.7,
-            "RC_if": 71.3,
-            "RC": 54.2,
-            "MBPP*": 60.3,
-            "Readability": 49.4
         },
         "maintainability": {
             "MI*": 22.0,
@@ -714,30 +877,73 @@
             "E_NI_S": 37.7,
             "Efficiency": 34.2
         },
         "correctness": {
-            "Correctness": 46.3
         },
         "overall": {
-            "RACE Score": 44.4
         }
     },
     "Qwen2-72B-Instruct": {
         "correctness": {
             "Correctness": 53.1
         },
         "readability": {
             "R*": 73.2,
             "RN_p": 76.8,
-            "RN_if": 93.8,
-            "RN": 72.0,
             "RL_p": 74.8,
             "RL_if": 64.4,
             "RL": 47.6,
             "RC_p": 71.1,
-            "RC_if": 74.4,
-            "RC": 54.0,
-            "MBPP*": 64.0,
-            "Readability": 57.9
         },
         "maintainability": {
             "MI*": 40.0,
@@ -756,7 +962,217 @@
             "Efficiency": 35.8
         },
         "overall": {
-            "RACE Score": 49.5
         }
     }
 }

 {
+    "Claude-3.5-Sonnet": {
+        "correctness": {
+            "HumanEval+": 77.4,
+            "MBPP+": 63.5,
+            "ClassEval": 42.0,
+            "LeetCode": 71.7,
+            "LeetCode_Efficiency": 68.3,
+            "Correctness": 64.6
+        },
+        "readability": {
+            "R*": 77.4,
+            "RN_p": 76.3,
+            "RN_if": 95.5,
+            "RN": 74.4,
+            "RL_p": 62.2,
+            "RL_if": 70.3,
+            "RL": 52.0,
+            "RC_p": 74.1,
+            "RC_if": 85.1,
+            "RC": 65.5,
+            "Readability": 64.0
+        },
+        "maintainability": {
+            "MI*": 42.0,
+            "MI_p": 32.0,
+            "MI": 75.3,
+            "MC*": 71.7,
+            "MC_p": 68.5,
+            "MC": 59.8,
+            "Maintainability": 67.5
+        },
+        "efficiency": {
+            "E*": 68.3,
+            "E_p": 66.3,
+            "E_NI_T": 56.8,
+            "E_NI_S": 49.7,
+            "Efficiency": 53.2
+        },
+        "overall": {
+            "RACE Score": 62.3
+        }
+    },
+    "GPT-4o-2024-05-13": {
+        "correctness": {
+            "HumanEval+": 80.5,
+            "MBPP+": 64.6,
+            "ClassEval": 38.0,
+            "LeetCode": 57.2,
+            "LeetCode_Efficiency": 59.4,
+            "Correctness": 59.9
+        },
         "readability": {
             "R*": 80.5,
+            "RN_p": 81.2,
+            "RN_if": 95.6,
+            "RN": 78.6,
             "RL_p": 78.9,
             "RL_if": 78.9,
             "RL": 63.2,
             "RC_p": 79.8,
+            "RC_if": 87.5,
+            "RC": 70.4,
+            "Readability": 70.7
         },
         "maintainability": {
             "MI*": 38.0,
         "efficiency": {
             "E*": 59.4,
             "E_p": 58.4,
+            "E_NI_T": 44.0,
             "E_NI_S": 42.0,
+            "Efficiency": 43.0
         },
+        "overall": {
+            "RACE Score": 57.2
+        }
+    },
+    "GPT-4o-mini": {
         "correctness": {
+            "HumanEval+": 78.0,
+            "MBPP+": 63.0,
+            "ClassEval": 37.0,
+            "LeetCode": 51.7,
+            "LeetCode_Efficiency": 52.5,
+            "Correctness": 56.4
+        },
+        "readability": {
+            "R*": 78.0,
+            "RN_p": 76.4,
+            "RN_if": 87.0,
+            "RN": 67.6,
+            "RL_p": 70.3,
+            "RL_if": 74.8,
+            "RL": 55.7,
+            "RC_p": 74.1,
+            "RC_if": 96.9,
+            "RC": 72.9,
+            "Readability": 65.4
+        },
+        "maintainability": {
+            "MI*": 37.0,
+            "MI_p": 27.0,
+            "MI": 73.5,
+            "MC*": 51.7,
+            "MC_p": 49.1,
+            "MC": 23.3,
+            "Maintainability": 48.4
+        },
+        "efficiency": {
+            "E*": 52.5,
+            "E_p": 46.5,
+            "E_NI_T": 40.3,
+            "E_NI_S": 39.5,
+            "Efficiency": 39.9
         },
         "overall": {
+            "RACE Score": 52.5
         }
     },
+    "GPT-3.5-Turbo-0125": {
+        "correctness": {
+            "HumanEval+": 62.8,
+            "MBPP+": 62.2,
+            "ClassEval": 28.0,
+            "LeetCode": 31.1,
+            "LeetCode_Efficiency": 39.6,
+            "Correctness": 44.7
+        },
         "readability": {
             "R*": 62.8,
             "RN_p": 63.2,
+            "RN_if": 79.2,
+            "RN": 51.4,
             "RL_p": 60.4,
             "RL_if": 76.8,
             "RL": 46.1,
             "RC_p": 65.8,
+            "RC_if": 70.1,
+            "RC": 47.5,
+            "Readability": 48.3
         },
         "maintainability": {
             "MI*": 28.0,
             "E_NI_S": 36.5,
             "Efficiency": 32.0
         },
+        "overall": {
+            "RACE Score": 43.6
+        }
+    },
+    "o1-mini-2024-09-12": {
         "correctness": {
+            "HumanEval+": 82.9,
+            "MBPP+": 64.8,
+            "ClassEval": 36.0,
+            "LeetCode": 79.6,
+            "LeetCode_Efficiency": 87.1,
+            "Correctness": 70.1
+        },
+        "readability": {
+            "R*": 82.9,
+            "RN_p": 83.2,
+            "RN_if": 95.0,
+            "RN": 80.7,
+            "RL_p": 76.4,
+            "RL_if": 56.7,
+            "RL": 47.5,
+            "RC_p": 80.2,
+            "RC_if": 94.2,
+            "RC": 77.7,
+            "Readability": 68.6
+        },
+        "maintainability": {
+            "MI*": 36.0,
+            "MI_p": 25.0,
+            "MI": 64.4,
+            "MC*": 79.6,
+            "MC_p": 83.3,
+            "MC": 66.1,
+            "Maintainability": 65.2
+        },
+        "efficiency": {
+            "E*": 87.1,
+            "E_p": 77.4,
+            "E_NI_T": 60.3,
+            "E_NI_S": 40.0,
+            "Efficiency": 50.1
         },
         "overall": {
+            "RACE Score": 63.5
         }
     },
+    "CodeLlama-7B-Python": {
         "correctness": {
+            "HumanEval+": 29.3,
+            "MBPP+": 41.3,
+            "ClassEval": 11.0,
+            "LeetCode": 5.6,
+            "LeetCode_Efficiency": 14.9,
+            "Correctness": 20.4
         },
         "readability": {
+            "R*": 29.3,
+            "RN_p": 29.5,
+            "RN_if": 69.0,
+            "RN": 20.9,
+            "RL_p": 30.1,
+            "RL_if": 76.6,
+            "RL": 25.8,
+            "RC_p": 24.7,
+            "RC_if": 57.9,
+            "RC": 12.5,
+            "Readability": 19.7
         },
         "maintainability": {
+            "MI*": 11.0,
+            "MI_p": 10.0,
+            "MI": 79.4,
+            "MC*": 5.6,
+            "MC_p": 6.5,
+            "MC": 3.7,
+            "Maintainability": 41.6
         },
         "efficiency": {
+            "E*": 14.9,
+            "E_p": 15.8,
+            "E_NI_T": 14.3,
+            "E_NI_S": 14.4,
+            "Efficiency": 14.4
         },
         "overall": {
+            "RACE Score": 24.0
         }
     },
+    "CodeLlama-7B-Instruct": {
+        "correctness": {
+            "HumanEval+": 32.3,
+            "MBPP+": 43.1,
+            "ClassEval": 16.0,
+            "LeetCode": 12.2,
+            "LeetCode_Efficiency": 15.8,
+            "Correctness": 23.9
+        },
         "readability": {
             "R*": 32.3,
             "RN_p": 31.5,
+            "RN_if": 58.2,
+            "RN": 17.8,
             "RL_p": 31.7,
             "RL_if": 59.7,
             "RL": 23.4,
             "RC_p": 30.2,
+            "RC_if": 76.2,
+            "RC": 22.2,
+            "Readability": 21.1
         },
         "maintainability": {
             "MI*": 16.0,
             "E_NI_S": 8.8,
             "Efficiency": 8.5
         },
         "overall": {
+            "RACE Score": 23.2
         }
     },
+    "CodeLlama-13B-Python": {
+        "correctness": {
+            "HumanEval+": 40.2,
+            "MBPP+": 29.4,
+            "ClassEval": 16.0,
+            "LeetCode": 6.1,
+            "LeetCode_Efficiency": 16.8,
+            "Correctness": 21.7
+        },
         "readability": {
+            "R*": 40.2,
+            "RN_p": 35.0,
+            "RN_if": 63.6,
+            "RN": 23.1,
+            "RL_p": 34.8,
+            "RL_if": 83.5,
+            "RL": 30.9,
+            "RC_p": 30.2,
+            "RC_if": 77.4,
+            "RC": 24.4,
+            "Readability": 26.1
         },
         "maintainability": {
+            "MI*": 16.0,
+            "MI_p": 15.0,
+            "MI": 78.6,
+            "MC*": 6.1,
+            "MC_p": 4.8,
+            "MC": 2.4,
+            "Maintainability": 40.5
         },
         "efficiency": {
+            "E*": 16.8,
+            "E_p": 17.8,
+            "E_NI_T": 13.8,
+            "E_NI_S": 14.7,
+            "Efficiency": 14.2
         },
         "overall": {
+            "RACE Score": 25.6
         }
     },
+    "CodeLlama-13B-Instruct": {
+        "correctness": {
+            "HumanEval+": 36.0,
+            "MBPP+": 40.7,
+            "ClassEval": 17.0,
+            "LeetCode": 10.6,
+            "LeetCode_Efficiency": 17.8,
+            "Correctness": 24.4
+        },
         "readability": {
             "R*": 36.0,
             "RN_p": 37.7,
+            "RN_if": 60.2,
+            "RN": 22.9,
             "RL_p": 35.0,
             "RL_if": 59.9,
             "RL": 23.6,
             "RC_p": 35.7,
+            "RC_if": 75.0,
+            "RC": 29.0,
+            "Readability": 25.2
         },
         "maintainability": {
             "MI*": 17.0,
             "E_NI_S": 16.1,
             "Efficiency": 13.2
         },
         "overall": {
+            "RACE Score": 26.9
         }
     },
+    "CodeLlama-34B-Python": {
+        "correctness": {
+            "HumanEval+": 31.7,
+            "MBPP+": 36.2,
+            "ClassEval": 3.0,
+            "LeetCode": 7.2,
+            "LeetCode_Efficiency": 17.8,
+            "Correctness": 19.2
+        },
         "readability": {
+            "R*": 31.7,
+            "RN_p": 27.2,
+            "RN_if": 68.6,
+            "RN": 18.8,
+            "RL_p": 32.5,
+            "RL_if": 73.2,
+            "RL": 26.7,
+            "RC_p": 27.8,
+            "RC_if": 48.8,
+            "RC": 8.6,
+            "Readability": 18.0
         },
         "maintainability": {
+            "MI*": 3.0,
+            "MI_p": 2.0,
+            "MI": 85.3,
+            "MC*": 7.2,
+            "MC_p": 5.4,
+            "MC": 2.2,
+            "Maintainability": 43.8
         },
         "efficiency": {
+            "E*": 17.8,
+            "E_p": 11.9,
+            "E_NI_T": 12.0,
+            "E_NI_S": 14.4,
+            "Efficiency": 13.2
         },
         "overall": {
+            "RACE Score": 23.6
         }
     },
+    "CodeLlama-34B-Instruct": {
+        "correctness": {
+            "HumanEval+": 36.0,
+            "MBPP+": 45.8,
+            "ClassEval": 12.0,
+            "LeetCode": 15.6,
+            "LeetCode_Efficiency": 20.8,
+            "Correctness": 26.0
+        },
         "readability": {
             "R*": 36.0,
             "RN_p": 36.5,
+            "RN_if": 56.8,
+            "RN": 21.9,
             "RL_p": 35.8,
             "RL_if": 41.7,
             "RL": 17.5,
             "RC_p": 36.3,
+            "RC_if": 36.2,
+            "RC": 10.7,
+            "Readability": 16.7
         },
         "maintainability": {
             "MI*": 12.0,
             "E_NI_S": 13.8,
             "Efficiency": 14.1
         },
+        "overall": {
+            "RACE Score": 24.4
+        }
+    },
+    "WizardCoder-15B-V1.0": {
         "correctness": {
+            "HumanEval+": 38.4,
+            "MBPP+": 46.3,
+            "ClassEval": 22.0,
+            "LeetCode": 11.7,
+            "LeetCode_Efficiency": 21.8,
+            "Correctness": 28.0
+        },
+        "readability": {
+            "R*": 38.4,
+            "RN_p": 38.7,
+            "RN_if": 61.0,
+            "RN": 24.0,
+            "RL_p": 41.9,
+            "RL_if": 64.8,
+            "RL": 27.8,
+            "RC_p": 40.0,
+            "RC_if": 65.0,
+            "RC": 28.1,
+            "Readability": 26.6
+        },
+        "maintainability": {
+            "MI*": 22.0,
+            "MI_p": 21.0,
+            "MI": 80.0,
+            "MC*": 11.7,
+            "MC_p": 11.5,
+            "MC": 7.8,
+            "Maintainability": 43.9
+        },
+        "efficiency": {
+            "E*": 21.8,
+            "E_p": 22.8,
+            "E_NI_T": 21.8,
+            "E_NI_S": 24.2,
+            "Efficiency": 23.0
         },
         "overall": {
+            "RACE Score": 30.4
         }
     },
+    "WizardCoder-33B-V1.1": {
+        "correctness": {
+            "HumanEval+": 58.5,
+            "MBPP+": 64.6,
+            "ClassEval": 34.0,
+            "LeetCode": 26.1,
+            "LeetCode_Efficiency": 38.6,
+            "Correctness": 44.4
+        },
         "readability": {
+            "R*": 58.5,
+            "RN_p": 58.8,
+            "RN_if": 68.0,
+            "RN": 40.9,
+            "RL_p": 62.2,
+            "RL_if": 76.0,
+            "RL": 47.6,
+            "RC_p": 58.8,
+            "RC_if": 73.8,
+            "RC": 44.8,
+            "Readability": 44.4
         },
         "maintainability": {
+            "MI*": 34.0,
+            "MI_p": 34.0,
+            "MI": 71.2,
+            "MC*": 26.1,
+            "MC_p": 25.0,
+            "MC": 9.3,
+            "Maintainability": 40.2
         },
         "efficiency": {
+            "E*": 38.6,
+            "E_p": 35.6,
+            "E_NI_T": 33.9,
+            "E_NI_S": 34.9,
+            "Efficiency": 34.4
+        },
+        "overall": {
+            "RACE Score": 40.8
+        }
+    },
+    "WizardCoder-Python-7B-V1.0": {
+        "correctness": {
+            "HumanEval+": 34.8,
+            "MBPP+": 41.8,
+            "ClassEval": 19.0,
+            "LeetCode": 10.6,
+            "LeetCode_Efficiency": 19.8,
+            "Correctness": 25.2
+        },
+        "readability": {
+            "R*": 34.8,
+            "RN_p": 35.8,
+            "RN_if": 60.2,
+            "RN": 22.8,
+            "RL_p": 34.3,
+            "RL_if": 79.7,
+            "RL": 28.0,
+            "RC_p": 35.4,
+            "RC_if": 31.8,
+            "RC": 10.1,
+            "Readability": 20.3
+        },
+        "maintainability": {
+            "MI*": 19.0,
+            "MI_p": 23.0,
+            "MI": 79.3,
+            "MC*": 10.6,
+            "MC_p": 9.8,
+            "MC": 7.2,
+            "Maintainability": 43.2
+        },
+        "efficiency": {
+            "E*": 19.8,
+            "E_p": 19.8,
+            "E_NI_T": 15.3,
+            "E_NI_S": 16.7,
+            "Efficiency": 16.0
+        },
+        "overall": {
+            "RACE Score": 26.2
+        }
+    },
+    "WizardCoder-Python-13B-V1.0": {
+        "correctness": {
+            "HumanEval+": 36.0,
+            "MBPP+": 42.1,
+            "ClassEval": 20.0,
+            "LeetCode": 12.8,
+            "LeetCode_Efficiency": 20.8,
+            "Correctness": 26.3
+        },
+        "readability": {
+            "R*": 36.0,
+            "RN_p": 38.2,
+            "RN_if": 60.2,
+            "RN": 23.9,
+            "RL_p": 38.4,
+            "RL_if": 83.1,
+            "RL": 33.1,
+            "RC_p": 43.6,
+            "RC_if": 67.7,
+            "RC": 30.5,
+            "Readability": 29.2
+        },
+        "maintainability": {
+            "MI*": 20.0,
+            "MI_p": 21.0,
+            "MI": 78.8,
+            "MC*": 12.8,
+            "MC_p": 12.8,
+            "MC": 8.5,
+            "Maintainability": 43.6
         },
+        "efficiency": {
+            "E*": 20.8,
+            "E_p": 18.8,
+            "E_NI_T": 16.2,
+            "E_NI_S": 19.8,
+            "Efficiency": 18.0
         },
         "overall": {
+            "RACE Score": 29.3
         }
     },
+    "DeepSeek-Coder-6.7B-Instruct": {
+        "correctness": {
+            "HumanEval+": 65.2,
+            "MBPP+": 57.1,
+            "ClassEval": 26.0,
+            "LeetCode": 18.9,
+            "LeetCode_Efficiency": 28.7,
+            "Correctness": 39.2
+        },
         "readability": {
             "R*": 65.2,
             "RN_p": 65.5,
+            "RN_if": 69.5,
+            "RN": 45.8,
             "RL_p": 61.2,
             "RL_if": 73.6,
             "RL": 46.6,
             "RC_p": 61.2,
+            "RC_if": 78.3,
+            "RC": 50.0,
+            "Readability": 47.5
         },
         "maintainability": {
             "MI*": 26.0,
             "E_NI_S": 30.0,
             "Efficiency": 28.6
         },
         "overall": {
+            "RACE Score": 39.8
         }
     },
+    "DeepSeek-Coder-7B-Instruct-V1.5": {
+        "correctness": {
+            "HumanEval+": 61.0,
+            "MBPP+": 59.3,
+            "ClassEval": 23.0,
+            "LeetCode": 23.3,
+            "LeetCode_Efficiency": 32.7,
+            "Correctness": 39.9
+        },
         "readability": {
             "R*": 61.0,
             "RN_p": 61.5,
+            "RN_if": 60.5,
+            "RN": 36.8,
             "RL_p": 62.6,
             "RL_if": 70.9,
             "RL": 46.0,
             "RC_p": 62.8,
+            "RC_if": 83.0,
+            "RC": 53.7,
+            "Readability": 45.5
         },
         "maintainability": {
             "MI*": 23.0,
             "E_NI_S": 26.8,
             "Efficiency": 26.0
         },
         "overall": {
+            "RACE Score": 38.9
         }
     },
+    "DeepSeek-Coder-33B-Instruct": {
+        "correctness": {
+            "HumanEval+": 65.9,
+            "MBPP+": 61.9,
+            "ClassEval": 28.0,
+            "LeetCode": 22.2,
+            "LeetCode_Efficiency": 45.5,
+            "Correctness": 44.7
+        },
         "readability": {
             "R*": 65.9,
             "RN_p": 64.6,
+            "RN_if": 90.1,
+            "RN": 59.0,
             "RL_p": 65.0,
             "RL_if": 82.7,
             "RL": 53.5,
             "RC_p": 66.5,
+            "RC_if": 80.8,
+            "RC": 54.0,
+            "Readability": 55.5
         },
         "maintainability": {
             "MI*": 28.0,
             "E_NI_S": 36.1,
             "Efficiency": 35.7
         },
         "overall": {
+            "RACE Score": 44.8
         }
     },
     "DeepSeek-Coder-V2-Lite-Instruct-16B": {
+        "correctness": {
+            "HumanEval+": 72.0,
+            "MBPP+": 62.7,
+            "ClassEval": 26.0,
+            "LeetCode": 44.4,
+            "LeetCode_Efficiency": 49.5,
+            "Correctness": 50.9
+        },
         "readability": {
             "R*": 72.0,
             "RN_p": 71.2,
+            "RN_if": 57.8,
+            "RN": 41.8,
             "RL_p": 66.5,
             "RL_if": 83.7,
             "RL": 57.7,
             "RC_p": 67.1,
+            "RC_if": 71.0,
+            "RC": 47.5,
+            "Readability": 49.0
         },
         "maintainability": {
             "MI*": 26.0,
             "E_NI_S": 47.7,
             "Efficiency": 44.0
         },
         "overall": {
+            "RACE Score": 48.2
         }
     },
+    "DeepSeek-V2.5-236B": {
         "correctness": {
+            "HumanEval+": 72.0,
+            "MBPP+": 63.0,
+            "ClassEval": 41.0,
+            "LeetCode": 61.7,
+            "LeetCode_Efficiency": 57.4,
+            "Correctness": 59.0
         },
         "readability": {
+            "R*": 72.0,
+            "RN_p": 74.5,
+            "RN_if": 95.8,
+            "RN": 72.2,
+            "RL_p": 72.8,
+            "RL_if": 89.8,
+            "RL": 66.1,
+            "RC_p": 74.1,
+            "RC_if": 87.5,
+            "RC": 65.8,
+            "Readability": 68.0
         },
         "maintainability": {
+            "MI*": 41.0,
+            "MI_p": 36.0,
+            "MI": 72.9,
+            "MC*": 61.7,
+            "MC_p": 59.1,
+            "MC": 33.9,
+            "Maintainability": 53.4
         },
         "efficiency": {
+            "E*": 57.4,
+            "E_p": 54.5,
+            "E_NI_T": 46.4,
+            "E_NI_S": 49.5,
+            "Efficiency": 48.0
         },
         "overall": {
+            "RACE Score": 57.1
         }
     },
+    "CodeQwen1.5-7B-Chat": {
         "correctness": {
+            "HumanEval+": 76.2,
+            "MBPP+": 60.3,
+            "ClassEval": 22.0,
+            "LeetCode": 33.3,
+            "LeetCode_Efficiency": 39.6,
+            "Correctness": 46.3
         },
         "readability": {
             "R*": 76.2,
             "RN_p": 76.8,
+            "RN_if": 63.2,
+            "RN": 48.8,
             "RL_p": 73.4,
             "RL_if": 60.8,
             "RL": 47.0,
             "RC_p": 74.7,
+            "RC_if": 80.8,
+            "RC": 62.2,
+            "Readability": 52.7
         },
         "maintainability": {
             "MI*": 22.0,
             "E_NI_S": 37.7,
             "Efficiency": 34.2
         },
+        "overall": {
+            "RACE Score": 45.2
+        }
+    },
+    "Qwen2.5-Coder-7B-Instruct": {
         "correctness": {
+            "HumanEval+": 78.0,
+            "MBPP+": 64.8,
+            "ClassEval": 29.0,
+            "LeetCode": 54.4,
+            "LeetCode_Efficiency": 59.4,
+            "Correctness": 57.1
+        },
+        "readability": {
+            "R*": 78.0,
+            "RN_p": 81.4,
+            "RN_if": 64.9,
+            "RN": 53.0,
+            "RL_p": 77.4,
+            "RL_if": 65.4,
+            "RL": 51.8,
+            "RC_p": 75.3,
+            "RC_if": 80.2,
+            "RC": 61.3,
+            "Readability": 55.4
+        },
+        "maintainability": {
+            "MI*": 29.0,
+            "MI_p": 27.0,
+            "MI": 78.6,
+            "MC*": 54.4,
+            "MC_p": 50.4,
+            "MC": 17.6,
+            "Maintainability": 48.1
+        },
+        "efficiency": {
+            "E*": 59.4,
+            "E_p": 48.5,
+            "E_NI_T": 37.0,
+            "E_NI_S": 33.7,
+            "Efficiency": 35.4
         },
         "overall": {
+            "RACE Score": 49.0
         }
     },
     "Qwen2-72B-Instruct": {
         "correctness": {
+            "HumanEval+": 73.2,
+            "MBPP+": 64.0,
+            "ClassEval": 40.0,
+            "LeetCode": 42.8,
+            "LeetCode_Efficiency": 45.5,
             "Correctness": 53.1
         },
         "readability": {
             "R*": 73.2,
             "RN_p": 76.8,
+            "RN_if": 95.9,
+            "RN": 73.6,
             "RL_p": 74.8,
             "RL_if": 64.4,
             "RL": 47.6,
             "RC_p": 71.1,
+            "RC_if": 82.9,
+            "RC": 60.1,
+            "Readability": 60.4
         },
         "maintainability": {
             "MI*": 40.0,
             "Efficiency": 35.8
         },
         "overall": {
+            "RACE Score": 50.1
+        }
+    },
+    "Qwen2.5-72B-Instruct": {
+        "correctness": {
+            "HumanEval+": 79.3,
+            "MBPP+": 65.9,
+            "ClassEval": 34.0,
+            "LeetCode": 72.8,
+            "LeetCode_Efficiency": 68.3,
+            "Correctness": 64.1
+        },
+        "readability": {
+            "R*": 79.3,
+            "RN_p": 79.6,
+            "RN_if": 97.0,
+            "RN": 77.2,
+            "RL_p": 77.4,
+            "RL_if": 92.1,
+            "RL": 72.1,
+            "RC_p": 80.5,
+            "RC_if": 89.3,
+            "RC": 72.8,
+            "Readability": 74.0
+        },
+        "maintainability": {
+            "MI*": 34.0,
+            "MI_p": 32.0,
+            "MI": 76.7,
+            "MC*": 72.8,
+            "MC_p": 71.8,
+            "MC": 40.4,
+            "Maintainability": 58.5
+        },
+        "efficiency": {
+            "E*": 68.3,
+            "E_p": 69.3,
+            "E_NI_T": 47.9,
+            "E_NI_S": 49.4,
+            "Efficiency": 48.6
+        },
+        "overall": {
+            "RACE Score": 61.3
+        }
+    },
+    "Mixtral-8x22B": {
+        "correctness": {
+            "HumanEval+": 61.0,
+            "MBPP+": 60.6,
+            "ClassEval": 33.0,
+            "LeetCode": 20.0,
+            "LeetCode_Efficiency": 35.6,
+            "Correctness": 42.0
+        },
+        "readability": {
+            "R*": 61.0,
+            "RN_p": 64.4,
+            "RN_if": 87.0,
+            "RN": 56.2,
+            "RL_p": 62.4,
+            "RL_if": 73.2,
+            "RL": 47.8,
+            "RC_p": 64.9,
+            "RC_if": 84.8,
+            "RC": 56.1,
+            "Readability": 53.4
+        },
+        "maintainability": {
+            "MI*": 33.0,
+            "MI_p": 30.0,
+            "MI": 79.6,
+            "MC*": 20.0,
+            "MC_p": 22.6,
+            "MC": 9.1,
+            "Maintainability": 44.3
+        },
+        "efficiency": {
+            "E*": 35.6,
+            "E_p": 31.7,
+            "E_NI_T": 24.7,
+            "E_NI_S": 33.2,
+            "Efficiency": 29.0
+        },
+        "overall": {
+            "RACE Score": 42.2
+        }
+    },
+    "Llama3-8B-Instruct": {
+        "correctness": {
+            "HumanEval+": 49.4,
+            "MBPP+": 50.5,
+            "ClassEval": 24.0,
+            "LeetCode": 20.6,
+            "LeetCode_Efficiency": 33.7,
+            "Correctness": 35.6
+        },
+        "readability": {
+            "R*": 49.4,
+            "RN_p": 45.5,
+            "RN_if": 85.5,
+            "RN": 44.3,
+            "RL_p": 28.7,
+            "RL_if": 45.9,
+            "RL": 23.6,
+            "RC_p": 48.1,
+            "RC_if": 79.9,
+            "RC": 40.0,
+            "Readability": 36.0
+        },
+        "maintainability": {
+            "MI*": 24.0,
+            "MI_p": 19.0,
+            "MI": 79.8,
+            "MC*": 20.6,
+            "MC_p": 19.1,
+            "MC": 8.1,
+            "Maintainability": 43.9
+        },
+        "efficiency": {
+            "E*": 33.7,
+            "E_p": 31.7,
+            "E_NI_T": 23.5,
+            "E_NI_S": 26.9,
+            "Efficiency": 25.2
+        },
+        "overall": {
+            "RACE Score": 35.2
+        }
+    },
+    "Llama3-70B-Instruct": {
+        "correctness": {
+            "HumanEval+": 65.2,
+            "MBPP+": 58.5,
+            "ClassEval": 28.0,
+            "LeetCode": 31.7,
+            "LeetCode_Efficiency": 38.6,
+            "Correctness": 44.4
+        },
+        "readability": {
+            "R*": 65.2,
+            "RN_p": 67.8,
+            "RN_if": 96.7,
+            "RN": 66.0,
+            "RL_p": 56.1,
+            "RL_if": 75.8,
+            "RL": 47.8,
+            "RC_p": 64.6,
+            "RC_if": 84.8,
+            "RC": 54.2,
+            "Readability": 56.0
+        },
+        "maintainability": {
+            "MI*": 28.0,
+            "MI_p": 29.0,
+            "MI": 79.8,
+            "MC*": 31.7,
+            "MC_p": 31.7,
+            "MC": 25.2,
+            "Maintainability": 52.5
+        },
+        "efficiency": {
+            "E*": 38.6,
+            "E_p": 38.6,
+            "E_NI_T": 29.2,
+            "E_NI_S": 42.8,
+            "Efficiency": 36.0
+        },
+        "overall": {
+            "RACE Score": 47.2
+        }
+    },
+    "StarCoder2-15B": {
+        "correctness": {
+            "HumanEval+": 36.0,
+            "MBPP+": 39.9,
+            "ClassEval": 24.0,
+            "LeetCode": 16.1,
+            "LeetCode_Efficiency": 26.7,
+            "Correctness": 28.5
+        },
+        "readability": {
+            "R*": 36.0,
+            "RN_p": 39.5,
+            "RN_if": 64.3,
+            "RN": 25.8,
+            "RL_p": 40.2,
+            "RL_if": 66.1,
+            "RL": 27.9,
+            "RC_p": 35.4,
+            "RC_if": 59.4,
+            "RC": 22.0,
+            "Readability": 25.2
+        },
+        "maintainability": {
+            "MI*": 24.0,
+            "MI_p": 25.0,
+            "MI": 74.2,
+            "MC*": 16.1,
+            "MC_p": 13.7,
+            "MC": 6.1,
+            "Maintainability": 40.1
+        },
+        "efficiency": {
+            "E*": 26.7,
+            "E_p": 25.7,
+            "E_NI_T": 20.6,
+            "E_NI_S": 25.1,
+            "Efficiency": 22.9
+        },
+        "overall": {
+            "RACE Score": 29.2
         }
     }
 }

text_content.py CHANGED Viewed

@@ -1,9 +1,13 @@
 HEAD_TEXT = """
 Based on the 🏎️RACE benchmark, we demonstrated the ability of different LLMs to generate code that is **_correct_** and **_meets the requirements of real-world development scenarios_**.
-More details about how to evalute the LLM are available in the [🏎️RACE GitHub repository](https://github.com/jszheng21/RACE). For a complete description of RACE benchmark and related experimental analysis, please refer to the paper: [Beyond Correctness: Benchmarking Multi-dimensional Code Generation for Large Language Models](https://arxiv.org/abs/2407.11470). [![](https://img.shields.io/badge/arXiv-2407.11470-b31b1b.svg)](https://arxiv.org/abs/2407.11470)
 **_Latest News_** 🔥
 - [24/07/24] We add the evaluation results of `claude-3.5-sonnet` and `Qwen2-72B-Instruct` in [RACE leaderboard](https://huggingface.co/spaces/jszheng/RACE_leaderboard).
 - [24/07/16] We release our RACE benchmark, leaderboard and paper.
 """
@@ -58,7 +62,7 @@ Inspired from the [🤗 Open LLM Leaderboard](https://huggingface.co/spaces/Hugg
 NOTES_TEXT = """
 **Notes:**
 - `💯 RACE Score` denotes the final evaluation result based on 🏎️RACE benchmark, which is the average of the scores in the four dimensions: `✅ Correctness`, `📖 Readability`, `🔨 Maintainability`, and `🚀 Efficiency`.
-- All fine-grained evaluation results are provided in `⏬ Hidden Columns`. `📖 R` denotes code **R**eadability, `🔨 M` denotes code **M**aintainability, and `🚀 E` denotes code **E**fficiency. `*` denotes the correctness of the code in the corresponding dimension. More details about the abbreviations are as follows:
     - `📖 R*`: The code accuracy (baseline).
     - `📖 RN`: The proportion of code that is both functionally correct and follows customized instructions related to `Naming Convention`.
     - `📖 RL`: The proportion of code that is both functionally correct and follows customized instructions related to `Code Length`.

 HEAD_TEXT = """
 Based on the 🏎️RACE benchmark, we demonstrated the ability of different LLMs to generate code that is **_correct_** and **_meets the requirements of real-world development scenarios_**.
+More details about how to evaluate the LLM are available in the [🏎️RACE GitHub repository](https://github.com/jszheng21/RACE). For a complete description of RACE benchmark and related experimental analysis, please refer to the paper: [Beyond Correctness: Benchmarking Multi-dimensional Code Generation for Large Language Models](https://arxiv.org/abs/2407.11470).
 **_Latest News_** 🔥
+- [24/10/09] We release the second version of [RACE paper](https://arxiv.org/abs/2407.11470).
+- [24/10/09] We add the evaluation results of 9 LLMs (including `o1-mini-2024-09-12`) in [RACE leaderboard](https://huggingface.co/spaces/jszheng/RACE_leaderboard).
+- [24/10/01] We have improved the calculation methods for readability-related metrics and enhanced the robustness of the code post-processing techniques.
+- [24/10/01] We have revised the test code in the LeetCode evaluation data to support the cases with multiple correct answers.
 - [24/07/24] We add the evaluation results of `claude-3.5-sonnet` and `Qwen2-72B-Instruct` in [RACE leaderboard](https://huggingface.co/spaces/jszheng/RACE_leaderboard).
 - [24/07/16] We release our RACE benchmark, leaderboard and paper.
 """
 NOTES_TEXT = """
 **Notes:**
 - `💯 RACE Score` denotes the final evaluation result based on 🏎️RACE benchmark, which is the average of the scores in the four dimensions: `✅ Correctness`, `📖 Readability`, `🔨 Maintainability`, and `🚀 Efficiency`.
+- All fine-grained evaluation results are provided in `⏬ Hidden Columns`. `📖 R` denotes code **R**eadability, `🔨 M` denotes code **M**aintainability, and `🚀 E` denotes code **E**fficiency. `*` denotes the code accuracy in the absence of customized instructions. More details about the abbreviations are as follows:
     - `📖 R*`: The code accuracy (baseline).
     - `📖 RN`: The proportion of code that is both functionally correct and follows customized instructions related to `Naming Convention`.
     - `📖 RL`: The proportion of code that is both functionally correct and follows customized instructions related to `Code Length`.