vishaldhakad committed on
Commit
e301abd
·
1 Parent(s): e7d44a8

fix: add pyproject.toml for openenv validate

Browse files
Files changed (1) hide show
  1. inference.py +82 -143
inference.py CHANGED
@@ -1,15 +1,6 @@
1
  """
2
  SecureCodeEnv - Baseline Inference Script
3
  Required by hackathon. Runs an LLM agent through the environment.
4
-
5
- Usage:
6
- export API_BASE_URL=https://api.openai.com/v1
7
- export MODEL_NAME=gpt-4o-mini
8
- export HF_TOKEN=hf_your_token
9
- export ENV_URL=http://localhost:7860 # or your HF Space URL
10
- python inference.py
11
-
12
- Completes in under 20 minutes on 2 vCPU / 8GB RAM.
13
  """
14
  import os
15
  import json
@@ -17,19 +8,24 @@ import time
17
  import sys
18
  import requests
19
  from openai import OpenAI
 
20
 
21
- # ── Required environment variables ──────────────────────────────────────────
22
  API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.openai.com/v1")
23
- MODEL_NAME = os.environ.get("MODEL_NAME", "gpt-4o-mini")
24
- HF_TOKEN = os.environ.get("HF_TOKEN", "")
25
- ENV_URL = os.environ.get("ENV_URL", "http://localhost:7860")
 
 
 
 
 
26
 
27
  if not HF_TOKEN:
28
  print("⚠️ HF_TOKEN not set. Some model endpoints may reject requests.", file=sys.stderr)
29
 
30
  client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN or "sk-placeholder")
31
 
32
- # ── System prompt ─────────────────────────────────────────────────────────
33
  SYSTEM_PROMPT = """You are a senior Python security engineer.
34
  You write production-ready, secure Python code with no shortcuts.
35
 
@@ -42,17 +38,24 @@ Rules:
42
  6. Use hmac.compare_digest for secret comparison (not ==).
43
  7. Validate all inputs β€” handle None, empty string, type errors.
44
  8. Add type hints and docstrings to every function.
45
- 9. Follow the naming and style conventions shown in CODEBASE CONTEXT.
46
- 10. Use pathlib.Path.resolve() for file path validation (not string checks)."""
47
 
 
 
 
 
 
 
 
 
 
48
 
49
- def run_episode(difficulty: str = "medium") -> dict:
 
50
  """Run one full episode at the given difficulty and return results."""
51
- print(f"\n{'='*60}")
52
- print(f" Episode: {difficulty.upper()}")
53
- print(f"{'='*60}")
54
 
55
- # ── Step 1: Reset environment ─────────────────────────────────────────
56
  try:
57
  reset_resp = requests.post(
58
  f"{ENV_URL}/reset",
@@ -60,178 +63,114 @@ def run_episode(difficulty: str = "medium") -> dict:
60
  timeout=30,
61
  )
62
  reset_resp.raise_for_status()
63
- except requests.RequestException as e:
64
  print(f"❌ /reset failed: {e}")
65
- return {"task": "unknown", "scores": [], "final_score": 0.0, "improved": False, "error": str(e)}
66
 
67
  episode = reset_resp.json()
68
  sid = episode["session_id"]
69
  task_id = episode["task_id"]
70
- print(f" Task: {task_id}")
71
- print(f" CWE targets: {episode.get('cwe_targets', [])}")
72
-
73
- scores_history = []
74
- prev_feedback = {}
75
 
76
  for step_num in range(5):
77
- # ── Step 2: Build prompt ──────────────────────────────────────────
78
  context = episode.get("codegraph", {})
79
  context_prompt = context.get("context_prompt", "")
80
- # Cap context at 3000 chars to stay within token budget
81
- context_str = context_prompt[:3000] if context_prompt else json.dumps(context, indent=2)[:2000]
82
 
83
- feedback_str = ""
84
- if prev_feedback:
85
- feedback_str = "\n\nPREVIOUS ATTEMPT FEEDBACK:\n" + "\n".join(
86
- f" {k}: {v}" for k, v in prev_feedback.items() if v
87
- )
88
-
89
- user_message = f"""Task: {episode['problem_statement']}
90
-
91
- Security targets: {episode.get('cwe_targets', [])}
92
 
93
- {context_str}
94
- {feedback_str}
95
 
96
- Write the complete Python implementation now:"""
97
-
98
- messages = [
99
- {"role": "system", "content": SYSTEM_PROMPT},
100
- {"role": "user", "content": user_message},
101
- ]
102
-
103
- # ── Step 3: Call LLM ──────────────────────────────────────────────
104
  try:
105
  response = client.chat.completions.create(
106
  model=MODEL_NAME,
107
- messages=messages,
 
 
 
108
  max_tokens=1500,
109
- temperature=0.1, # Low temperature for consistent, focused code
110
  )
111
- code = response.choices[0].message.content.strip()
112
-
113
- # Strip markdown fences if model added them anyway
114
- if code.startswith("```python"):
115
- code = code[9:]
116
- if code.startswith("```"):
117
- code = code[3:]
118
- if code.endswith("```"):
119
- code = code[:-3]
120
- code = code.strip()
121
 
122
- except Exception as e:
123
- print(f" ⚠️ LLM call failed at step {step_num+1}: {e}")
124
- break
125
-
126
- # ── Step 4: Submit to environment ─────────────────────────────────
127
- try:
128
  step_resp = requests.post(
129
  f"{ENV_URL}/step",
130
  json={
131
  "session_id": sid,
132
  "code": code,
133
- "filename": f"solution_step{step_num}.py",
134
  "task_id": task_id,
135
  },
136
- timeout=60, # Grading can take up to 60s (bandit + attacks)
137
  )
138
  step_resp.raise_for_status()
139
- except requests.RequestException as e:
140
- print(f" ⚠️ /step failed: {e}")
141
- break
 
 
 
 
 
 
 
 
 
142
 
143
- result = step_resp.json()
144
- reward = result["total_reward"]
145
- scores_history.append(reward)
146
- prev_feedback = result.get("feedback", {})
147
-
148
- # Pretty print step result
149
- scores = result.get("scores", {})
150
- print(f"\n Step {step_num+1} β†’ reward={reward:.3f}")
151
- print(f" correctness={scores.get('correctness',0):.2f} "
152
- f"attack={scores.get('attack_resist',0):.2f} "
153
- f"static={scores.get('static_security',0):.2f} "
154
- f"consistency={scores.get('consistency',0):.2f}")
155
- print(f" summary: {prev_feedback.get('summary', '')}")
156
-
157
- if result["done"]:
158
- print(f"\n βœ… Episode complete in {step_num+1} steps!")
159
  break
160
 
161
- # Feed updated CodeGraph back for next step
162
- episode["codegraph"] = result.get("codegraph", {})
163
-
164
- if not scores_history:
165
- scores_history = [0.0]
166
-
167
- improved = len(scores_history) > 1 and scores_history[-1] > scores_history[0]
168
  return {
169
  "task": task_id,
170
  "difficulty": difficulty,
171
  "scores": scores_history,
172
- "final_score": scores_history[-1],
173
- "improved": improved,
174
  "steps": len(scores_history),
175
  }
176
 
177
 
178
- def main():
179
- """Run one episode per difficulty and print aggregate results."""
180
- print(f"\n{'='*60}")
181
- print(f" SecureCodeEnv β€” Baseline Inference")
182
- print(f" Model: {MODEL_NAME}")
183
- print(f" Env: {ENV_URL}")
184
- print(f"{'='*60}")
185
 
186
- # Verify environment is up
187
  try:
188
  health = requests.get(f"{ENV_URL}/health", timeout=10)
189
  health.raise_for_status()
190
- print(f"\n βœ… Environment healthy: {health.json()}")
191
  except Exception as e:
192
- print(f"\n ❌ Environment not reachable at {ENV_URL}: {e}")
193
- print(" Start the server: uvicorn app.main:app --host 0.0.0.0 --port 7860")
194
- sys.exit(1)
195
 
196
  results = []
197
- start = time.time()
198
 
199
- for difficulty in ["easy", "medium", "hard"]:
200
- r = run_episode(difficulty)
201
- results.append(r)
202
- # Small pause between episodes
 
203
  time.sleep(1)
204
 
205
- elapsed = time.time() - start
206
-
207
- # ── Final report ──────────────────────────────────────────────────────
208
- print(f"\n{'='*60}")
209
- print(f" FINAL RESULTS ({elapsed:.1f}s total)")
210
- print(f"{'='*60}")
211
-
212
- for r in results:
213
- status = "βœ…" if r["final_score"] >= 0.7 else "⚠️ " if r["final_score"] >= 0.4 else "❌"
214
- improved_str = "↑ improved" if r.get("improved") else "β€”"
215
- print(f" {status} {r['task']:45s} {r['final_score']:.3f} {improved_str}")
216
-
217
- valid_scores = [r["final_score"] for r in results]
218
- avg = sum(valid_scores) / len(valid_scores) if valid_scores else 0
219
- print(f"\n Average final score: {avg:.3f}")
220
- print(f" Scores: {[round(s, 3) for s in valid_scores]}")
221
-
222
- # Write machine-readable results
223
- output = {
224
- "model": MODEL_NAME,
225
- "env_url": ENV_URL,
226
- "elapsed_seconds": round(elapsed, 1),
227
- "results": results,
228
- "average_score": round(avg, 4),
229
- }
230
  with open("inference_results.json", "w") as f:
231
- json.dump(output, f, indent=2)
232
- print(f"\n Results saved to inference_results.json")
233
 
234
- return 0 if avg >= 0.4 else 1
 
 
235
 
236
 
237
  if __name__ == "__main__":
 
1
  """
2
  SecureCodeEnv - Baseline Inference Script
3
  Required by hackathon. Runs an LLM agent through the environment.
 
 
 
 
 
 
 
 
 
4
  """
5
  import os
6
  import json
 
8
  import sys
9
  import requests
10
  from openai import OpenAI
11
+ from typing import Dict, List, Any, Optional
12
 
13
+ # ── Constants & Configuration ──────────────────────────────────────────────
14
  API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.openai.com/v1")
15
+ MODEL_NAME = os.environ.get("MODEL_NAME", "gpt-4o-mini")
16
+ HF_TOKEN = os.environ.get("HF_TOKEN", "")
17
+ ENV_URL = os.environ.get("ENV_URL", "http://localhost:7860").rstrip("/")
18
+
19
# Dedicated exception type for sandbox-environment connectivity failures.
class EnvironmentConnectionError(Exception):
    """Signals that the sandbox environment could not be reached or replied with a server error (5xx)."""
23
 
24
  if not HF_TOKEN:
25
  print("⚠️ HF_TOKEN not set. Some model endpoints may reject requests.", file=sys.stderr)
26
 
27
  client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN or "sk-placeholder")
28
 
 
29
  SYSTEM_PROMPT = """You are a senior Python security engineer.
30
  You write production-ready, secure Python code with no shortcuts.
31
 
 
38
  6. Use hmac.compare_digest for secret comparison (not ==).
39
  7. Validate all inputs β€” handle None, empty string, type errors.
40
  8. Add type hints and docstrings to every function.
41
+ 9. Use pathlib.Path.resolve() for file path validation."""
42
+
43
 
44
def clean_code_output(raw_code: str) -> str:
    """Strip markdown code-fence lines and surrounding whitespace from LLM output.

    Any line whose stripped form begins with ``` is dropped entirely
    (the model sometimes wraps its answer in ```python ... ``` fences
    despite being told not to); the remaining lines are rejoined and
    trimmed of leading/trailing whitespace.
    """
    kept = []
    for line in raw_code.splitlines():
        # Skip markdown fence markers such as ``` or ```python.
        if line.strip().startswith("```"):
            continue
        kept.append(line)
    return "\n".join(kept).strip()
53
 
54
+
55
+ def run_episode(difficulty: str = "medium") -> Dict[str, Any]:
56
  """Run one full episode at the given difficulty and return results."""
57
+ print(f"\n{'='*60}\n Episode: {difficulty.upper()}\n{'='*60}")
 
 
58
 
 
59
  try:
60
  reset_resp = requests.post(
61
  f"{ENV_URL}/reset",
 
63
  timeout=30,
64
  )
65
  reset_resp.raise_for_status()
66
+ except Exception as e:
67
  print(f"❌ /reset failed: {e}")
68
+ return {"task": f"reset_fail_{difficulty}", "scores": [0.0], "final_score": 0.0, "error": str(e)}
69
 
70
  episode = reset_resp.json()
71
  sid = episode["session_id"]
72
  task_id = episode["task_id"]
73
+
74
+ scores_history: List[float] = []
75
+ prev_feedback: Dict[str, Any] = {}
 
 
76
 
77
  for step_num in range(5):
 
78
  context = episode.get("codegraph", {})
79
  context_prompt = context.get("context_prompt", "")
80
+ context_str = context_prompt[:3000] if context_prompt else json.dumps(context)[:2000]
 
81
 
82
+ feedback_list = [f"{k}: {v}" for k, v in prev_feedback.items() if v]
83
+ feedback_str = "\n\nPREVIOUS FEEDBACK:\n" + "\n".join(feedback_list) if feedback_list else ""
 
 
 
 
 
 
 
84
 
85
+ user_message = f"Task: {episode['problem_statement']}\nTargets: {episode.get('cwe_targets', [])}\n{context_str}{feedback_str}\n\nImplementation:"
 
86
 
 
 
 
 
 
 
 
 
87
  try:
88
  response = client.chat.completions.create(
89
  model=MODEL_NAME,
90
+ messages=[
91
+ {"role": "system", "content": SYSTEM_PROMPT},
92
+ {"role": "user", "content": user_message},
93
+ ],
94
  max_tokens=1500,
95
+ temperature=0.1,
96
  )
97
+ raw_content = response.choices[0].message.content or ""
98
+ code = clean_code_output(raw_content)
99
+
100
+ if not code:
101
+ print(f" ⚠️ Step {step_num}: LLM returned empty code.")
102
+ break
 
 
 
 
103
 
 
 
 
 
 
 
104
  step_resp = requests.post(
105
  f"{ENV_URL}/step",
106
  json={
107
  "session_id": sid,
108
  "code": code,
109
+ "filename": f"solution_s{step_num}.py",
110
  "task_id": task_id,
111
  },
112
+ timeout=65,
113
  )
114
  step_resp.raise_for_status()
115
+ result = step_resp.json()
116
+
117
+ reward = result.get("total_reward", 0.0)
118
+ scores_history.append(reward)
119
+ prev_feedback = result.get("feedback", {})
120
+
121
+ print(f" Step {step_num+1} β†’ reward={reward:.3f}")
122
+
123
+ if result.get("done"):
124
+ break
125
+
126
+ episode["codegraph"] = result.get("codegraph", {})
127
 
128
+ except Exception as e:
129
+ print(f" ⚠️ Error during step {step_num+1}: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  break
131
 
132
+ final_score = scores_history[-1] if scores_history else 0.0
 
 
 
 
 
 
133
  return {
134
  "task": task_id,
135
  "difficulty": difficulty,
136
  "scores": scores_history,
137
+ "final_score": final_score,
 
138
  "steps": len(scores_history),
139
  }
140
 
141
 
142
+ def main() -> int:
143
+ """Main execution loop."""
144
+ print(f"Model: {MODEL_NAME} | Env: {ENV_URL}")
 
 
 
 
145
 
 
146
  try:
147
  health = requests.get(f"{ENV_URL}/health", timeout=10)
148
  health.raise_for_status()
 
149
  except Exception as e:
150
+ print(f"❌ Environment unreachable at {ENV_URL}. Ensure server is running.\nError: {e}")
151
+ return 1
 
152
 
153
  results = []
154
+ start_time = time.time()
155
 
156
+ for diff in ["easy", "medium", "hard"]:
157
+ try:
158
+ results.append(run_episode(diff))
159
+ except Exception as e:
160
+ print(f"Critical failure in {diff} episode: {e}")
161
  time.sleep(1)
162
 
163
+ elapsed = time.time() - start_time
164
+ avg_score = sum(r["final_score"] for r in results) / len(results) if results else 0.0
165
+
166
+ print(f"\n{'='*60}\n FINAL AVERAGE: {avg_score:.3f} ({elapsed:.1f}s)\n{'='*60}")
167
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
  with open("inference_results.json", "w") as f:
169
+ json.dump({"results": results, "avg": avg_score}, f, indent=2)
 
170
 
171
+ # Return 0 to indicate the script finished its logic, regardless of score
172
+ # Unless there were absolutely no results (total failure)
173
+ return 0 if results else 1
174
 
175
 
176
  if __name__ == "__main__":