Spaces:
Sleeping
Sleeping
Commit
·
f53870f
1
Parent(s):
a9249e6
results from the paper
Browse files- app.py +1 -1
- slr_bench_results.csv +15 -0
app.py
CHANGED
|
@@ -9,7 +9,7 @@ This is a benchmark grid displaying model performance metrics.
|
|
| 9 |
""")
|
| 10 |
|
| 11 |
# Load the CSV file
|
| 12 |
-
csv_file_path = "
|
| 13 |
try:
|
| 14 |
data = pd.read_csv(csv_file_path)
|
| 15 |
|
|
|
|
| 9 |
""")
|
| 10 |
|
| 11 |
# Load the CSV file
|
| 12 |
+
csv_file_path = "slr_bench_results.csv" # Update this path to your CSV file
|
| 13 |
try:
|
| 14 |
data = pd.read_csv(csv_file_path)
|
| 15 |
|
slr_bench_results.csv
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Model,LRL,Syntax Score,Logical-Reasoning Acc. Basic,Logical-Reasoning Acc. Easy,Logical-Reasoning Acc. Medium,Logical-Reasoning Acc. Hard,Total Compute Tokens (M),Total Compute Costs ($)
|
| 2 |
+
o3,15.5,80,99,93,74,45,4.30,207.24
|
| 3 |
+
o4-mini,12.3,86,93,88,52,13,3.98,21.43
|
| 4 |
+
o1,11.9,68,92,89,41,15,5.19,364.72
|
| 5 |
+
o3-mini,11.6,75,97,90,37,7,4.73,24.71
|
| 6 |
+
o1-mini,10.1,95,97,82,20,3,3.65,19.98
|
| 7 |
+
R1-Llama-70B,8.8,75,98,67,8,4,11.61,5.33
|
| 8 |
+
Gemini-thinking,8.6,83,93,65,13,1,---,---
|
| 9 |
+
gpt-4.5-prev,7.3,100,94,47,5,1,0.37,576.40
|
| 10 |
+
gpt-4o,6.2,100,93,29,2,0,0.26,20.03
|
| 11 |
+
Llama 3.3-70B,5.6,100,90,22,1,0,0.49,0.82
|
| 12 |
+
gpt-4-turbo,5.4,100,89,18,2,0,0.41,81.30
|
| 13 |
+
Llama 3.1-8B,3.8,80,70,7,0,0,0.20,0.14
|
| 14 |
+
Llama 3.2-3B,1.0,25,19,1,0,0,0.10,0.11
|
| 15 |
+
Llama 3.2-1B,0.0,93,0,0,0,0,0.47,0.12
|