new metrics
src/leaderboard/processor.py  CHANGED  (+6 -25)

@@ -32,13 +32,11 @@ def calculate_integral_score(row: pd.Series) -> float:
             integral_score *= row[metric_col]
             metric_count += 1

-    #
+    # If no accuracy metrics were found at all, the score remains 1.0 before penalties.
+    # The library returns 0.0 in this case (`return integral_score if count > 0 else 0.0`)
+    # Let's add that check back before applying penalties.
     if metric_count == 0:
-        if "macro_accuracy" in row and pd.notna(row["macro_accuracy"]):
-            integral_score *= row["macro_accuracy"]
-            metric_count += 1
-        else:
-            return 0.0  # Cannot calculate score without primary metrics
+        return 0.0

     # Error Penalty
     micro_error_col = "micro_avg_error_ratio"
@@ -46,32 +44,15 @@ def calculate_integral_score(row: pd.Series) -> float:
         # Micro error is stored as %, convert back to ratio
         micro_error_ratio = row[micro_error_col] / 100.0
         integral_score *= (1.0 - micro_error_ratio)
-    else:
-        # Fallback: Calculate average error from per-test-type
-        error_ratios = []
-        for test_type in TEST_TYPES:
-            error_col = f"{test_type}_error_ratio"
-            if error_col in row and pd.notna(row[error_col]):
-                error_ratios.append(row[error_col])
-        if error_ratios:
-            avg_error_ratio = np.mean(error_ratios)
-            integral_score *= (1.0 - avg_error_ratio)

     # Runtime Penalty
+    avg_runtime_ms = None  # Initialize
     micro_runtime_col = "micro_avg_runtime_ms"
     if micro_runtime_col in row and pd.notna(row[micro_runtime_col]):
         avg_runtime_ms = row[micro_runtime_col]
-    else:
-        # Fallback: Calculate average runtime from per-test-type
-        runtimes = []
-        for test_type in TEST_TYPES:
-            runtime_col = f"{test_type}_avg_runtime_ms"
-            if runtime_col in row and pd.notna(row[runtime_col]):
-                runtimes.append(row[runtime_col])
-        avg_runtime_ms = np.mean(runtimes) if runtimes else None

     if avg_runtime_ms is not None:
-        # Apply penalty based on runtime (
+        # Apply penalty based on runtime (only if micro avg runtime was found)
         runtime = max(
             min(avg_runtime_ms, MAX_PUNISHABLE_RUNTIME_MS),
             MIN_PUNISHABLE_RUNTIME_MS,