vishaldhakad committed on
Commit
82633d7
·
1 Parent(s): e301abd

fix: add pyproject.toml for openenv validate

Browse files
Files changed (1) hide show
  1. inference.py +51 -130
inference.py CHANGED
@@ -1,6 +1,7 @@
1
  """
2
  SecureCodeEnv - Baseline Inference Script
3
  Required by hackathon. Runs an LLM agent through the environment.
 
4
  """
5
  import os
6
  import json
@@ -8,170 +9,90 @@ import time
8
  import sys
9
  import requests
10
  from openai import OpenAI
11
- from typing import Dict, List, Any, Optional
12
 
13
- # ── Constants & Configuration ──────────────────────────────────────────────
14
  API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.openai.com/v1")
15
  MODEL_NAME = os.environ.get("MODEL_NAME", "gpt-4o-mini")
16
  HF_TOKEN = os.environ.get("HF_TOKEN", "")
17
  ENV_URL = os.environ.get("ENV_URL", "http://localhost:7860").rstrip("/")
18
 
19
- # Typed Exception for environment issues
20
class EnvironmentConnectionError(Exception):
    """Raised when the sandbox environment is unreachable or returns 5xx."""
23
-
24
- if not HF_TOKEN:
25
- print("⚠️ HF_TOKEN not set. Some model endpoints may reject requests.", file=sys.stderr)
26
-
27
  client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN or "sk-placeholder")
28
 
29
  SYSTEM_PROMPT = """You are a senior Python security engineer.
30
- You write production-ready, secure Python code with no shortcuts.
31
-
32
- Rules:
33
- 1. Output ONLY raw Python code β€” no markdown fences, no explanations.
34
- 2. Never use: eval(), exec(), shell=True, hashlib.md5, random.random() for security.
35
- 3. Always use parameterized queries (never f-string SQL).
36
- 4. Use secrets module (not random) for tokens and session IDs.
37
- 5. Use bcrypt (not hashlib) for password hashing.
38
- 6. Use hmac.compare_digest for secret comparison (not ==).
39
- 7. Validate all inputs β€” handle None, empty string, type errors.
40
- 8. Add type hints and docstrings to every function.
41
- 9. Use pathlib.Path.resolve() for file path validation."""
42
-
43
-
44
def clean_code_output(raw_code: str) -> str:
    """Strip markdown code-fence lines from *raw_code* and trim whitespace.

    Any line whose stripped form begins with ``` is discarded; the
    surviving lines are rejoined and the result is trimmed at both ends.
    An empty input yields an empty string.
    """
    kept = []
    for line in raw_code.splitlines():
        if line.strip().startswith("```"):
            continue  # drop fence markers such as ``` or ```python
        kept.append(line)
    return "\n".join(kept).strip()
53
 
 
 
 
 
54
 
55
def run_episode(difficulty: str = "medium") -> Dict[str, Any]:
    """Run one full episode at the given difficulty and return a result dict.

    Flow: POST /reset opens a session, then up to 5 /step rounds in which the
    LLM proposes code and the environment scores it.  Stops early when the
    environment reports ``done``, the LLM returns empty code, or a step errors.

    Returns a dict with a consistent schema on every path so callers can
    aggregate without key checks:
        task, difficulty, scores, final_score, steps (+ error on reset failure).
    """
    print(f"\n{'='*60}\n Episode: {difficulty.upper()}\n{'='*60}")

    try:
        reset_resp = requests.post(
            f"{ENV_URL}/reset",
            json={"difficulty": difficulty},
            timeout=30,
        )
        reset_resp.raise_for_status()
    except Exception as e:
        print(f"❌ /reset failed: {e}")
        # FIX: include "difficulty" and "steps" so the failure dict matches
        # the success-path schema instead of silently omitting keys.
        return {
            "task": f"reset_fail_{difficulty}",
            "difficulty": difficulty,
            "scores": [0.0],
            "final_score": 0.0,
            "steps": 0,
            "error": str(e),
        }

    episode = reset_resp.json()
    sid = episode["session_id"]
    task_id = episode["task_id"]

    scores_history: List[float] = []
    prev_feedback: Dict[str, Any] = {}

    for step_num in range(5):
        # Trim context so the prompt stays within model token limits.
        context = episode.get("codegraph", {})
        context_prompt = context.get("context_prompt", "")
        context_str = context_prompt[:3000] if context_prompt else json.dumps(context)[:2000]

        feedback_list = [f"{k}: {v}" for k, v in prev_feedback.items() if v]
        feedback_str = "\n\nPREVIOUS FEEDBACK:\n" + "\n".join(feedback_list) if feedback_list else ""

        user_message = f"Task: {episode['problem_statement']}\nTargets: {episode.get('cwe_targets', [])}\n{context_str}{feedback_str}\n\nImplementation:"

        try:
            response = client.chat.completions.create(
                model=MODEL_NAME,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": user_message},
                ],
                max_tokens=1500,
                temperature=0.1,
            )
            raw_content = response.choices[0].message.content or ""
            code = clean_code_output(raw_content)

            if not code:
                print(f" ⚠️ Step {step_num}: LLM returned empty code.")
                break

            step_resp = requests.post(
                f"{ENV_URL}/step",
                json={
                    "session_id": sid,
                    "code": code,
                    "filename": f"solution_s{step_num}.py",
                    "task_id": task_id,
                },
                timeout=65,
            )
            step_resp.raise_for_status()
            result = step_resp.json()

            reward = result.get("total_reward", 0.0)
            scores_history.append(reward)
            prev_feedback = result.get("feedback", {})

            # FIX: repaired mojibake ("β†’") in the progress arrow.
            print(f" Step {step_num+1} → reward={reward:.3f}")

            if result.get("done"):
                break

            # Feed the refreshed code graph into the next round's prompt.
            episode["codegraph"] = result.get("codegraph", {})

        except Exception as e:
            print(f" ⚠️ Error during step {step_num+1}: {e}")
            break

    final_score = scores_history[-1] if scores_history else 0.0
    return {
        "task": task_id,
        "difficulty": difficulty,
        "scores": scores_history,
        "final_score": final_score,
        "steps": len(scores_history),
    }
140
-
141
-
142
def main() -> int:
    """Health-check the env, run easy/medium/hard episodes, write results.

    Returns 0 when at least one episode produced a result dict, 1 when the
    environment is unreachable or no episode completed at all.
    """
    print(f"Model: {MODEL_NAME} | Env: {ENV_URL}")

    # Fail fast if the sandbox server is not answering.
    try:
        probe = requests.get(f"{ENV_URL}/health", timeout=10)
        probe.raise_for_status()
    except Exception as e:
        print(f"❌ Environment unreachable at {ENV_URL}. Ensure server is running.\nError: {e}")
        return 1

    started = time.time()
    results = []

    for level in ("easy", "medium", "hard"):
        try:
            results.append(run_episode(level))
        except Exception as e:
            print(f"Critical failure in {level} episode: {e}")
        time.sleep(1)

    elapsed = time.time() - started
    avg_score = sum(r["final_score"] for r in results) / len(results) if results else 0.0

    print(f"\n{'='*60}\n FINAL AVERAGE: {avg_score:.3f} ({elapsed:.1f}s)\n{'='*60}")

    with open("inference_results.json", "w") as f:
        json.dump({"results": results, "avg": avg_score}, f, indent=2)

    # Exit code reflects whether anything ran at all, not the score itself.
    return 0 if results else 1
174
-
175
-
176
  if __name__ == "__main__":
177
- sys.exit(main())
 
1
  """
2
  SecureCodeEnv - Baseline Inference Script
3
  Required by hackathon. Runs an LLM agent through the environment.
4
+ Outputs structured [START]/[STEP]/[END] blocks for automated parsing.
5
  """
6
  import os
7
  import json
 
9
  import sys
10
  import requests
11
  from openai import OpenAI
12
+ from typing import Dict, List, Any
13
 
14
+ # ── Configuration ──────────────────────────────────────────────────────────
15
  API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.openai.com/v1")
16
  MODEL_NAME = os.environ.get("MODEL_NAME", "gpt-4o-mini")
17
  HF_TOKEN = os.environ.get("HF_TOKEN", "")
18
  ENV_URL = os.environ.get("ENV_URL", "http://localhost:7860").rstrip("/")
19
 
 
 
 
 
 
 
 
 
20
  client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN or "sk-placeholder")
21
 
22
  SYSTEM_PROMPT = """You are a senior Python security engineer.
23
+ Output ONLY raw Python code with type hints and docstrings. No markdown.
24
+ Follow SOLID principles and use cryptographically secure libraries."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
def clean_code(raw: str) -> str:
    """Return *raw* with markdown code-fence lines removed and edges trimmed."""
    kept = (ln for ln in raw.splitlines() if not ln.strip().startswith("```"))
    return "\n".join(kept).strip()
30
 
31
def run_episode(difficulty: str) -> None:
    """Run one episode and print [START]/[STEP]/[END] blocks for parsers.

    Talks to the sandbox environment (POST /reset, then up to 5 POST /step
    rounds) and the LLM endpoint.  Emits, on stdout:
        [START] task=<id> difficulty=<level>
        [STEP]  step=<i> reward=<r>        (one per completed step)
        [END]   task=<id> score=<final> steps=<n>
    Nothing is printed to stdout if /reset fails; the reason goes to stderr.
    """
    try:
        r = requests.post(f"{ENV_URL}/reset", json={"difficulty": difficulty}, timeout=30)
        r.raise_for_status()
        data = r.json()
    except Exception as e:
        # FIX: was a silent `return` with an unused `e`; surface the reason
        # on stderr so a missing [START] block can be diagnosed.
        print(f"/reset failed for {difficulty}: {e}", file=sys.stderr)
        return

    sid = data["session_id"]
    tid = data["task_id"]

    # REQUIRED: [START] block
    print(f"[START] task={tid} difficulty={difficulty}", flush=True)

    final_score = 0.0
    total_steps = 0

    for i in range(1, 6):
        total_steps = i
        # FIX: cap the serialized codegraph (as the previous revision did) so
        # a large graph cannot push the prompt past the model's context window.
        context = json.dumps(data.get("codegraph", {}))[:3000]
        prompt = f"Task: {data['problem_statement']}\nCode context: {context}"

        try:
            resp = client.chat.completions.create(
                model=MODEL_NAME,
                messages=[{"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": prompt}],
                temperature=0.1
            )
            code = clean_code(resp.choices[0].message.content or "")

            step_r = requests.post(
                f"{ENV_URL}/step",
                json={"session_id": sid, "code": code, "filename": f"step_{i}.py", "task_id": tid},
                timeout=65
            )
            step_r.raise_for_status()
            res = step_r.json()

            reward = res.get("total_reward", 0.0)
            final_score = reward

            # REQUIRED: [STEP] block
            print(f"[STEP] step={i} reward={reward:.3f}", flush=True)

            if res.get("done"):
                break
            # Feed the refreshed code graph into the next round's prompt.
            data["codegraph"] = res.get("codegraph", {})

        except Exception as e:
            # FIX: was a bare silent `break`; record why the episode stopped.
            print(f"step {i} failed: {e}", file=sys.stderr)
            break

    # REQUIRED: [END] block
    print(f"[END] task={tid} score={final_score:.3f} steps={total_steps}", flush=True)
 
 
 
 
 
 
 
 
 
 
 
85
 
86
def main():
    """Health-check the environment, then run easy/medium/hard episodes.

    Exits with status 1 (after printing the cause) when the environment's
    /health endpoint is unreachable.
    """
    # Verify health first; abort with a message rather than failing later.
    try:
        requests.get(f"{ENV_URL}/health", timeout=5).raise_for_status()
    except Exception as e:
        # FIX: bare `except:` also swallowed KeyboardInterrupt/SystemExit and
        # exited with no diagnostic; narrow the clause and report the cause.
        print(f"Environment unreachable at {ENV_URL}: {e}", file=sys.stderr)
        sys.exit(1)

    for diff in ["easy", "medium", "hard"]:
        run_episode(diff)
        # Brief pause between episodes to avoid hammering the env server.
        time.sleep(1)
96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  if __name__ == "__main__":
98
+ main()