"""
utils.py
====================================
This module contains helper utilities for:
1. ⏱️ Performance measurement (deterministic timing)
2. 🧹 Parsing agent responses (clean JSON extraction)
3. ⚡ Speed comparison (optimization scoring)
4. 📊 Logging performance reports (feedback to agent)
🧠 WHY THIS MATTERS:
--------------------
In our environment, we are not just fixing code —
we are MEASURING how much better it becomes.
This is critical for:
- Reward shaping
- Fair evaluation
- Reproducibility (required by OpenEnv)
"""
import time
import json
import re
from typing import Any, Dict, Callable
# ============================================================
# ⏱️ SPEED MEASUREMENT
# ============================================================
def measure_execution_time(code_str: str, inputs: Dict[str, Any], iterations: int = 5) -> float:
    """
    Measure the best-case wall-clock time needed to execute a code snippet.

    The snippet is compiled once, then executed ``iterations`` times. Each
    run gets a fresh namespace seeded with a shallow copy of ``inputs``, and
    the MINIMUM observed time is returned.

    Why the minimum of several runs?
    --------------------------------
    Individual timings fluctuate (background processes, OS scheduling,
    cache effects). The fastest run is the one least disturbed by noise.

    Why time.perf_counter() instead of time.time()?
    ------------------------------------------------
    time.perf_counter() is a monotonic, high-resolution clock intended for
    measuring short durations; time.time() has lower precision and can jump
    when the system clock is adjusted.

    Why a single namespace?
    -----------------------
    Passing separate globals and locals to exec() makes the code run as if
    it were inside a class body: functions defined by the snippet could not
    see other top-level names (including themselves, so recursion would
    fail with NameError). Using one dict gives module-like semantics.

    Parameters:
    -----------
    code_str : str
        Python source code to execute.
    inputs : dict
        Variables made available to the code. The dict itself is not
        modified, but it is only copied shallowly, so mutable values
        mutated in place by the code are shared between iterations.
    iterations : int
        Number of times to run the code.

    Returns:
    --------
    float
        Minimum execution time in seconds, or float("inf") when the code
        does not compile, raises an Exception during any run, or when
        ``iterations`` is less than 1.
    """
    failed = float("inf")

    # Compile outside the timed region so parsing cost is not measured and
    # syntax errors are reported as a failure exactly once.
    try:
        compiled = compile(code_str, "<string>", "exec")
    except Exception:
        return failed

    best = failed
    for _ in range(iterations):
        # Fresh namespace per run so names created by one run do not leak
        # into the next one.
        namespace = dict(inputs)
        start = time.perf_counter()
        try:
            # SECURITY NOTE: exec() runs arbitrary code with full interpreter
            # privileges. This is not a sandbox; only use it on code you are
            # prepared to run in this process.
            exec(compiled, namespace)
        except Exception:
            # A failing snippet is treated as infinitely slow.
            return failed
        elapsed = time.perf_counter() - start
        best = min(best, elapsed)

    return best
# ============================================================
# 🧹 JSON CLEANER
# ============================================================
def parse_agent_response(response_str: str) -> Dict[str, Any]:
    """
    Parse an agent response into a JSON dictionary.

    Problem:
    --------
    LLMs often wrap JSON in markdown fences or surround it with prose, e.g.

        Here is my answer:
        ```json
        {"key": "value"}
        ```

    Strategy:
    ---------
    1. Take the span from the first "{" to the last "}" of the raw text
       and try to parse it. Working on the raw text first keeps any
       backtick fences that legitimately appear INSIDE JSON string values.
    2. If that fails, strip all markdown fences ("```json" / "```") and
       try the same extraction again.

    Only JSON objects are accepted: valid JSON that is not an object
    (a list, number, string, null, ...) yields an empty dict so the
    declared return type always holds.

    Parameters:
    -----------
    response_str : str
        Raw text returned by the agent.

    Returns:
    --------
    dict
        The parsed JSON object, or {} when the input is empty, not a
        string, or contains no parseable JSON object.
    """
    if not response_str or not isinstance(response_str, str):
        return {}

    # Same fence removal as before, but only used as a second attempt.
    without_fences = re.sub(r"```json|```", "", response_str, flags=re.IGNORECASE)

    for text in (response_str, without_fences):
        # Greedy match: first "{" through last "}". For text that is already
        # a bare JSON object this is the whole (stripped) text.
        match = re.search(r"\{.*\}", text, re.DOTALL)
        if match is None:
            continue
        try:
            parsed = json.loads(match.group())
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            return parsed

    # Fallback: nothing usable found.
    return {}
# ============================================================
# ⚡ SPEEDUP CALCULATOR
# ============================================================
def compute_speedup(baseline_time: float, new_time: float) -> float:
    """
    Return how many times faster the new run is than the baseline.

    Formula:
        speedup = baseline_time / new_time

    Example:
    --------
        baseline = 1.0s, new = 0.5s  ->  speedup = 2.0 (2x faster)

    Edge Cases (checked in this order):
    -----------------------------------
    - new_time == 0             ->  float("inf")
    - baseline_time == infinity ->  0.0

    Returns:
    --------
    float -> speedup ratio
    """
    infinity = float("inf")

    # A zero-duration run is reported as an unbounded speedup; this check
    # deliberately comes before the failed-baseline check.
    if new_time == 0:
        return infinity

    # An infinite baseline cannot be meaningfully divided, so it yields no speedup.
    return 0.0 if baseline_time == infinity else baseline_time / new_time
# ============================================================
# 📊 PERFORMANCE REPORT FORMATTER
# ============================================================
def format_performance_report(
    baseline_time: float,
    new_time: float,
    speedup: float
) -> str:
    """
    Create a human-readable performance report.

    Per the module's design, this text is fed back to the agent as part of
    its Observation.

    Formatting rules:
    -----------------
    - A time equal to float("inf") is shown as "Failed".
    - Finite times are shown in seconds with 6 decimal places.
    - A speedup equal to float("inf") is shown as the infinity symbol
      followed by "(instant)"; any other value is shown with 2 decimals
      and the suffix "x faster".

    Example Output:
    ---------------
        Performance Report:
        - Baseline Time: 0.523100 sec
        - Optimized Time: 0.051200 sec
        - Speedup: 10.22x faster

    Parameters:
    -----------
    baseline_time : float
        Execution time of the baseline code in seconds (inf = failed).
    new_time : float
        Execution time of the new code in seconds (inf = failed).
    speedup : float
        Speedup ratio to display.

    Returns:
    --------
    str
        The four-line report, without a trailing newline.
    """
    infinity = float("inf")

    if baseline_time == infinity:
        baseline_str = "Failed"
    else:
        baseline_str = f"{baseline_time:.6f} sec"

    if new_time == infinity:
        new_str = "Failed"
    else:
        new_str = f"{new_time:.6f} sec"

    if speedup == infinity:
        # "\u221e" is the infinity sign; written as an escape so the label
        # cannot be corrupted again by a wrong file encoding.
        speedup_str = "\u221e (instant)"
    else:
        speedup_str = f"{speedup:.2f}x faster"

    report = (
        f"Performance Report:\n"
        f"- Baseline Time: {baseline_str}\n"
        f"- Optimized Time: {new_str}\n"
        f"- Speedup: {speedup_str}"
    )
    return report