new metrics
src/leaderboard/processor.py  CHANGED  (+6 -25)

@@ -32,13 +32,11 @@ def calculate_integral_score(row: pd.Series) -> float:
             integral_score *= row[metric_col]
             metric_count += 1

-    #
+    # If no accuracy metrics were found at all, the score remains 1.0 before penalties.
+    # The library returns 0.0 in this case (`return integral_score if count > 0 else 0.0`)
+    # Let's add that check back before applying penalties.
     if metric_count == 0:
-        if "macro_accuracy" in row and pd.notna(row["macro_accuracy"]):
-            integral_score *= row["macro_accuracy"]
-            metric_count += 1
-        else:
-            return 0.0  # Cannot calculate score without primary metrics
+        return 0.0

     # Error Penalty
     micro_error_col = "micro_avg_error_ratio"
@@ -46,32 +44,15 @@ def calculate_integral_score(row: pd.Series) -> float:
         # Micro error is stored as %, convert back to ratio
         micro_error_ratio = row[micro_error_col] / 100.0
         integral_score *= (1.0 - micro_error_ratio)
-    else:
-        # Fallback: Calculate average error from per-test-type
-        error_ratios = []
-        for test_type in TEST_TYPES:
-            error_col = f"{test_type}_error_ratio"
-            if error_col in row and pd.notna(row[error_col]):
-                error_ratios.append(row[error_col])
-        if error_ratios:
-            avg_error_ratio = np.mean(error_ratios)
-            integral_score *= (1.0 - avg_error_ratio)

     # Runtime Penalty
+    avg_runtime_ms = None  # Initialize
     micro_runtime_col = "micro_avg_runtime_ms"
     if micro_runtime_col in row and pd.notna(row[micro_runtime_col]):
         avg_runtime_ms = row[micro_runtime_col]
-    else:
-        # Fallback: Calculate average runtime from per-test-type
-        runtimes = []
-        for test_type in TEST_TYPES:
-            runtime_col = f"{test_type}_avg_runtime_ms"
-            if runtime_col in row and pd.notna(row[runtime_col]):
-                runtimes.append(row[runtime_col])
-        avg_runtime_ms = np.mean(runtimes) if runtimes else None

     if avg_runtime_ms is not None:
-        # Apply penalty based on runtime (
+        # Apply penalty based on runtime (only if micro avg runtime was found)
         runtime = max(
             min(avg_runtime_ms, MAX_PUNISHABLE_RUNTIME_MS),
             MIN_PUNISHABLE_RUNTIME_MS,