Spaces:
Runtime error
Runtime error
| """ | |
| utils.py | |
| ==================================== | |
| This module contains helper utilities for: | |
| 1. Performance measurement (deterministic timing) | |
| 2. Parsing agent responses (clean JSON extraction) | |
| 3. Speed comparison (optimization scoring) | |
| 4. Logging performance reports (feedback to agent) | |
| WHY THIS MATTERS: | |
| -------------------- | |
| In our environment, we are not just fixing code — | |
| we are MEASURING how much better it becomes. | |
| This is critical for: | |
| - Reward shaping | |
| - Fair evaluation | |
| - Reproducibility (required by OpenEnv) | |
| """ | |
| import time | |
| import json | |
| import re | |
| from typing import Any, Dict, Callable | |
| # ============================================================ | |
| # SPEED MEASUREMENT | |
| # ============================================================ | |
def measure_execution_time(code_str: str, inputs: Dict[str, Any], iterations: int = 5) -> float:
    """
    Measure the best-case wall-clock time needed to execute a code string.

    The code is compiled once and then executed ``iterations`` times, each
    time in a fresh namespace seeded from ``inputs``. The minimum observed
    duration is returned, because the minimum is the sample least disturbed
    by background processes, OS scheduling and cache effects.

    ``time.perf_counter()`` is used instead of ``time.time()`` because it is
    a high-resolution monotonic clock that is unaffected by system clock
    adjustments.

    Parameters:
    -----------
    code_str : str
        Python source code to execute.
    inputs : dict
        Variables made available to the code. The mapping itself is never
        modified (a shallow copy is used for every run); ``None`` is treated
        as "no inputs".
    iterations : int
        Number of timed runs.

    Returns:
    --------
    float - minimum execution time in seconds, or ``float("inf")`` when the
    code does not compile, raises an exception, or ``iterations`` is not
    positive.
    """
    # SECURITY NOTE: exec() runs arbitrary code with the privileges of this
    # process. Only call this on code that is trusted or already sandboxed.

    # Compile outside the timed region so parsing cost is not reported as
    # run time. A snippet that cannot be compiled counts as a failure, just
    # as it did when exec() compiled the string itself.
    try:
        compiled = compile(code_str, "<measured-code>", "exec")
    except Exception:
        return float("inf")

    seed = {} if inputs is None else inputs
    best = float("inf")

    for _ in range(iterations):
        # Use ONE namespace as both globals and locals. With separate dicts,
        # functions defined by the code could not see the inputs, other
        # helpers, or even themselves (recursion raised NameError), because
        # a function resolves free names through its globals only.
        namespace = dict(seed)

        start = time.perf_counter()
        try:
            exec(compiled, namespace)
        except Exception:
            # Deliberate best-effort: failing code is treated as infinitely slow.
            return float("inf")
        elapsed = time.perf_counter() - start

        if elapsed < best:
            best = elapsed

    return best
| # ============================================================ | |
| # JSON CLEANER | |
| # ============================================================ | |
def parse_agent_response(response_str: str) -> Dict[str, Any]:
    """
    Parse an agent response into a JSON object (Python dictionary).

    LLMs often wrap JSON in markdown fences or surround it with prose, e.g.:

        ```json
        {
            "key": "value"
        }
        ```

    Strategy:
    ---------
    1. Strip markdown fence markers.
    2. Try to parse the whole remaining text as JSON.
    3. Otherwise scan for the first ``{`` that starts a decodable JSON
       object and return that object. Text after the object is ignored.

    Parameters:
    -----------
    response_str : str
        Raw text produced by the agent.

    Returns:
    --------
    dict - the decoded JSON object. An empty dict is returned when the
    input is empty, contains no decodable JSON object, or is valid JSON
    that is not an object (list, number, string, ...), so callers can
    always rely on receiving a dictionary.
    """
    if not response_str:
        return {}

    # Remove markdown code-fence markers (with or without the "json" tag).
    cleaned = re.sub(r"```json|```", "", response_str, flags=re.IGNORECASE).strip()

    # Fast path: the whole payload is a JSON document.
    try:
        parsed = json.loads(cleaned)
    except json.JSONDecodeError:
        pass
    else:
        # Valid JSON that is not an object would break the Dict contract.
        return parsed if isinstance(parsed, dict) else {}

    # Slow path: find an embedded object. raw_decode() stops at the end of
    # the first complete JSON value, so trailing prose or stray braces after
    # the object do not prevent extraction (a greedy "{.*}" regex would).
    decoder = json.JSONDecoder()
    start = cleaned.find("{")
    while start != -1:
        try:
            candidate, _end = decoder.raw_decode(cleaned, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        start = cleaned.find("{", start + 1)

    # Fallback: nothing usable was found.
    return {}
| # ============================================================ | |
| # SPEEDUP CALCULATOR | |
| # ============================================================ | |
def compute_speedup(baseline_time: float, new_time: float) -> float:
    """
    Return how many times faster the new run is than the baseline.

    Formula:
        speedup = baseline_time / new_time

    Example:
    --------
    baseline = 3.0s, new = 1.5s  ->  speedup = 2.0 (2x faster)

    Edge Cases (checked in this order):
    -----------------------------------
    - new_time == 0               -> float("inf")
    - baseline_time == infinity   -> 0.0

    Parameters:
    -----------
    baseline_time : float
        Execution time of the reference code, in seconds.
    new_time : float
        Execution time of the candidate code, in seconds.

    Returns:
    --------
    float - speedup ratio
    """
    infinity = float("inf")

    # A zero-duration run cannot be divided by; report it as unbounded speedup.
    if new_time == 0:
        return infinity

    # A baseline of infinity yields a speedup of zero rather than inf or nan.
    return 0.0 if baseline_time == infinity else baseline_time / new_time
| # ============================================================ | |
| # PERFORMANCE REPORT FORMATTER | |
| # ============================================================ | |
def format_performance_report(
    baseline_time: float,
    new_time: float,
    speedup: float
) -> str:
    """
    Create a human-readable performance report.

    The report is plain text with one line per metric, so it can be
    included in the feedback given to the agent.

    Example Output:
    ---------------
    Performance Report:
    - Baseline Time: 0.523100 sec
    - Optimized Time: 0.051200 sec
    - Speedup: 10.22x faster

    Parameters:
    -----------
    baseline_time : float
        Baseline duration in seconds; ``float("inf")`` is shown as "Failed".
    new_time : float
        Optimized duration in seconds; ``float("inf")`` is shown as "Failed".
    speedup : float
        Ratio baseline/new. Rendered as:
        - "N/A (run failed)" when either time is infinite,
        - the infinity sign followed by "(instant)" when it is infinite,
        - "<ratio>x faster" when it is >= 1,
        - "<ratio>x (slower than baseline)" when it is < 1.

    Returns:
    --------
    str
    """
    infinity = float("inf")

    def _describe_time(seconds: float) -> str:
        # An infinite duration marks a run that did not complete.
        return "Failed" if seconds == infinity else f"{seconds:.6f} sec"

    baseline_str = _describe_time(baseline_time)
    new_str = _describe_time(new_time)

    if baseline_time == infinity or new_time == infinity:
        # A ratio is meaningless when one side failed; printing
        # "0.00x faster" next to "Failed" would contradict itself.
        speedup_str = "N/A (run failed)"
    elif speedup == infinity:
        # Escape sequence for the infinity sign keeps the source ASCII-safe.
        speedup_str = "\u221e (instant)"
    elif speedup >= 1.0:
        speedup_str = f"{speedup:.2f}x faster"
    else:
        # A ratio below 1 is a slowdown and must not be labelled "faster".
        speedup_str = f"{speedup:.2f}x (slower than baseline)"

    return (
        f"Performance Report:\n"
        f"- Baseline Time: {baseline_str}\n"
        f"- Optimized Time: {new_str}\n"
        f"- Speedup: {speedup_str}"
    )