SidhaGarg committed on
Commit
ec24749
·
1 Parent(s): f713f41

Avoid rounded endpoint scores in inference END logs

Browse files
Files changed (1) hide show
  1. inference.py +36 -20
inference.py CHANGED
@@ -1,6 +1,7 @@
1
  import asyncio
2
  import json
3
  import os
 
4
  from typing import Any, Dict, List, Tuple
5
 
6
  from openai import OpenAI
@@ -11,35 +12,43 @@ from models import CloudAction
11
 
12
  API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
13
  MODEL_NAME = os.getenv("MODEL_NAME", "google/gemma-4-31B-it")
14
- HF_TOKEN = os.getenv("HF_TOKEN")
15
 
16
  BENCHMARK = "CloudDevOpsEnv"
17
  MAX_STEPS = 15
18
  MAX_TOTAL_REWARD = 1.0
19
- SCORE_EPSILON = 1e-6
 
20
  SUCCESS_SCORE_THRESHOLD = 0.8
21
 
22
 
23
  def log_start(task: str, env: str, model: str) -> None:
24
- log_data = {"task": task, "env": env, "model": model}
25
- print(f"[START] {json.dumps(log_data)}", flush=True)
26
 
27
 
28
  def log_step(step: int, action: Any, reward: float, done: bool, error: Any) -> None:
29
  action_dict = action.model_dump() if hasattr(action, "model_dump") else str(action)
30
- log_data = {
31
- "step": step,
32
- "action": action_dict,
33
- "reward": reward,
34
- "done": done,
35
- "error": error,
36
- }
37
- print(f"[STEP] {json.dumps(log_data)}", flush=True)
 
 
 
 
38
 
39
 
40
  def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
41
- log_data = {"success": success, "steps": steps, "score": score, "rewards": rewards}
42
- print(f"[END] {json.dumps(log_data)}", flush=True)
 
 
 
 
43
 
44
 
45
  def get_model_action(
@@ -85,10 +94,10 @@ def get_model_action(
85
  action_dict = json.loads(raw_text)
86
  return CloudAction(**action_dict), raw_text
87
  except (json.JSONDecodeError, ValidationError) as exc:
88
- print(f"[DEBUG] Model parse failed: {exc}", flush=True)
89
  return CloudAction(command="list_resources"), "failed_parse"
90
  except Exception as exc:
91
- print(f"[DEBUG] API request failed: {exc}", flush=True)
92
  return CloudAction(command="list_resources"), "api_error"
93
 
94
 
@@ -141,23 +150,30 @@ async def run_task(task_name: str, client: OpenAI) -> None:
141
  break
142
 
143
  score = sum(rewards)
144
- # Phase-2 validator expects each task score to be strictly within (0, 1).
145
- score = max(SCORE_EPSILON, min(score, MAX_TOTAL_REWARD - SCORE_EPSILON))
146
  success = score >= SUCCESS_SCORE_THRESHOLD
147
 
148
  finally:
 
 
 
 
149
  log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
150
 
151
 
152
  async def main() -> None:
153
  if not HF_TOKEN:
154
- print("[WARNING] HF_TOKEN environment variable not set. API calls will likely fail.")
 
 
 
 
155
 
156
  client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
157
 
158
  tasks = ["easy", "medium", "hard"]
159
  for task in tasks:
160
- print(f"\n--- Running Task: {task.upper()} ---")
161
  await run_task(task, client)
162
 
163
 
 
1
  import asyncio
2
  import json
3
  import os
4
+ import sys
5
  from typing import Any, Dict, List, Tuple
6
 
7
  from openai import OpenAI
 
12
 
13
  API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
14
  MODEL_NAME = os.getenv("MODEL_NAME", "google/gemma-4-31B-it")
15
+ HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
16
 
17
  BENCHMARK = "CloudDevOpsEnv"
18
  MAX_STEPS = 15
19
  MAX_TOTAL_REWARD = 1.0
20
+ SCORE_MIN = 0.001
21
+ SCORE_MAX = 0.999
22
  SUCCESS_SCORE_THRESHOLD = 0.8
23
 
24
 
25
  def log_start(task: str, env: str, model: str) -> None:
26
+ print(f"[START] task={task} env={env} model={model}", flush=True)
 
27
 
28
 
29
  def log_step(step: int, action: Any, reward: float, done: bool, error: Any) -> None:
30
  action_dict = action.model_dump() if hasattr(action, "model_dump") else str(action)
31
+ if isinstance(action_dict, dict):
32
+ action_str = json.dumps(action_dict, separators=(",", ":"))
33
+ else:
34
+ action_str = str(action_dict)
35
+ action_str = action_str.replace("\n", " ").replace("\r", " ")
36
+
37
+ error_str = "null" if not error else str(error).replace("\n", " ").replace("\r", " ")
38
+ done_str = str(done).lower()
39
+ print(
40
+ f"[STEP] step={step} action={action_str} reward={reward:.2f} done={done_str} error={error_str}",
41
+ flush=True,
42
+ )
43
 
44
 
45
  def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
46
+ rewards_str = ",".join(f"{r:.2f}" for r in rewards)
47
+ success_str = str(success).lower()
48
+ print(
49
+ f"[END] success={success_str} steps={steps} score={score:.3f} rewards={rewards_str}",
50
+ flush=True,
51
+ )
52
 
53
 
54
  def get_model_action(
 
94
  action_dict = json.loads(raw_text)
95
  return CloudAction(**action_dict), raw_text
96
  except (json.JSONDecodeError, ValidationError) as exc:
97
+ print(f"[DEBUG] Model parse failed: {exc}", file=sys.stderr, flush=True)
98
  return CloudAction(command="list_resources"), "failed_parse"
99
  except Exception as exc:
100
+ print(f"[DEBUG] API request failed: {exc}", file=sys.stderr, flush=True)
101
  return CloudAction(command="list_resources"), "api_error"
102
 
103
 
 
150
  break
151
 
152
  score = sum(rewards)
153
+ # Keep score strictly in (0,1) after formatting to avoid validator endpoint failures.
154
+ score = max(SCORE_MIN, min(score, SCORE_MAX))
155
  success = score >= SUCCESS_SCORE_THRESHOLD
156
 
157
  finally:
158
+ try:
159
+ await env.close()
160
+ except Exception as exc:
161
+ print(f"[DEBUG] env.close() failed: {exc}", file=sys.stderr, flush=True)
162
  log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
163
 
164
 
165
  async def main() -> None:
166
  if not HF_TOKEN:
167
+ print(
168
+ "[WARN] HF_TOKEN (or API_KEY fallback) is not set. API calls will fail in remote evaluation.",
169
+ file=sys.stderr,
170
+ flush=True,
171
+ )
172
 
173
  client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
174
 
175
  tasks = ["easy", "medium", "hard"]
176
  for task in tasks:
 
177
  await run_task(task, client)
178
 
179