# NOTE(review): removed non-Python residue ("Spaces: / Running / Running")
# captured from the Hugging Face Spaces page chrome during extraction.
| """ | |
| Inference script for the Code Review Environment. | |
| Environment variables: | |
| API_BASE_URL β LLM API endpoint (e.g. https://openrouter.ai/api/v1) | |
| MODEL_NAME β Model identifier (e.g. openai/gpt-4o-mini) | |
| HF_TOKEN β API key for the LLM provider | |
| ENV_URL β Environment base URL (default: localhost:7860) | |
| Usage: | |
| export API_BASE_URL=https://openrouter.ai/api/v1 | |
| export MODEL_NAME=openai/gpt-4o-mini | |
| export HF_TOKEN=sk-... | |
| python inference.py | |
| """ | |
| from __future__ import annotations | |
| import os | |
| import sys | |
| import json | |
| import time | |
| import httpx | |
| API_BASE_URL: str = os.environ.get("API_BASE_URL", "").rstrip("/") | |
| MODEL_NAME: str = os.environ.get("MODEL_NAME", "gpt-4o-mini") | |
| HF_TOKEN: str = os.environ.get("HF_TOKEN", "") | |
| ENV_URL: str = os.environ.get("ENV_URL", "http://localhost:7860").rstrip("/") | |
| TASK_IDS = ["bug-detection", "security-audit", "comprehensive-review"] | |
| SYSTEM_PROMPT = """\ | |
| You are an expert software engineer performing a thorough code review. | |
| Your job is to identify bugs, security vulnerabilities, and performance issues in code. | |
| For each issue you find, respond with a single JSON object: | |
| {"action_type": "flag_issue", "line_number": <int>, "filename": "<file>", "issue_type": "bug|security|performance|logic", "severity": "low|medium|high|critical", "description": "<explanation>", "fix_suggestion": "<fix>"} | |
| When done, respond with: | |
| {"action_type": "submit_review"} | |
| Rules: | |
| - Respond with raw JSON only β no markdown fences, no extra text | |
| - One action per response | |
| - Be precise with line numbers (count from line 1) | |
| - Only flag real issues, not style preferences | |
| """ | |
def chat_completion(messages: list) -> str:
    """Send *messages* to the configured chat-completions endpoint.

    Uses the OpenAI-compatible client with MODEL_NAME / HF_TOKEN /
    API_BASE_URL from the environment.  Returns the assistant's reply
    text stripped of surrounding whitespace; returns "" when the
    provider sends no content (some providers return None on refusals,
    which previously crashed the `.strip()` call with AttributeError).

    Raises:
        ImportError: if the `openai` package is not installed.
    """
    try:
        # Deferred import: only needed when running in LLM mode.
        from openai import OpenAI
    except ImportError as exc:
        raise ImportError("pip install openai") from exc

    kwargs = {"api_key": HF_TOKEN or "no-key"}
    if API_BASE_URL:
        kwargs["base_url"] = API_BASE_URL
    client = OpenAI(**kwargs)
    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        temperature=0.0,  # deterministic reviews
        max_tokens=400,
    )
    # Guard against None content before stripping.
    content = response.choices[0].message.content
    return (content or "").strip()
def parse_action(text: str) -> dict:
    """Extract the first JSON action object from a model reply.

    Accepts raw JSON, markdown-fenced JSON (with an optional leading
    "json" language tag), and JSON arrays (the first dict element is
    used).  When nothing decodable is found, falls back to a
    submit_review action so the episode can end gracefully.
    """
    candidate = text.strip()

    # Unwrap a markdown fence if present: take the first fenced segment
    # that looks like the start of a JSON value.
    if "```" in candidate:
        for segment in candidate.split("```"):
            segment = segment.strip()
            if segment.startswith("json"):
                segment = segment[4:].strip()
            if segment[:1] in ("{", "["):
                candidate = segment
                break

    # Scan for the first position where a complete JSON value decodes.
    decoder = json.JSONDecoder()
    for start, char in enumerate(candidate):
        if char not in ("{", "["):
            continue
        try:
            value, _ = decoder.raw_decode(candidate, start)
        except json.JSONDecodeError:
            continue
        if isinstance(value, dict):
            return value
        if isinstance(value, list):
            for element in value:
                if isinstance(element, dict):
                    return element

    # Nothing parseable: end the review.
    return {"action_type": "submit_review"}
def run_keyword_fallback(base_url: str, task_id: str) -> dict:
    """Fallback: use the built-in /baseline endpoint (no LLM needed)."""
    # One short-lived client is enough for the single POST.
    with httpx.Client(timeout=30) as client:
        response = client.post(f"{base_url}/baseline")
        response.raise_for_status()
        payload = response.json()

    task_entry = payload["baseline_scores"].get(task_id, {})
    return {
        "task_id": task_id,
        "score": task_entry.get("score", 0.0),
        "steps": 0,
        "method": "keyword_heuristic",
    }
def run_task(task_id: str, http_client: httpx.Client) -> dict:
    """Run one full review episode for *task_id* against the environment.

    Resets the environment, shows the model the task description and all
    code files, then loops: ask the LLM for one JSON action, POST it to
    /step, and feed the environment's feedback back into the chat until
    the episode is done, the step budget is exhausted, or the model
    submits its review.

    Returns a summary dict: {"task_id", "score", "steps", "method"}.
    """
    # Start a fresh episode; the observation carries task text, code
    # files, and the step budget.
    resp = http_client.post(f"{ENV_URL}/reset", json={"task_id": task_id}, timeout=30)
    resp.raise_for_status()
    obs = resp.json()

    # Render every file as "=== name ===" followed by its contents.
    code_display = "\n\n".join(
        f"=== {fname} ===\n{code}"
        for fname, code in obs.get("code_files", {}).items()
    )

    # Seed the conversation: system prompt + one user turn with the task.
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {
            "role": "user",
            "content": (
                f"Task: {obs.get('task_description', '')}\n\n"
                f"{code_display}\n\n"
                f"Review this code carefully. Flag every issue you find. "
                f"You have {obs.get('max_steps', 20)} steps total."
            ),
        },
    ]

    done = False
    step_count = 0
    max_steps = obs.get("max_steps", 20)  # local cap mirrors the env's budget
    final_score = 0.0

    while not done and step_count < max_steps:
        # One LLM call -> one parsed action (falls back to submit_review).
        action_text = chat_completion(messages)
        action = parse_action(action_text)

        try:
            step_resp = http_client.post(f"{ENV_URL}/step", json=action, timeout=30)
            step_resp.raise_for_status()
            obs = step_resp.json()
        except Exception as e:
            # Transport/HTTP failure: abandon the episode, keep last score.
            print(f"  Step error: {e}")
            break

        done = obs.get("done", False)
        step_count += 1
        final_score = obs.get("current_score", 0.0)
        reward = obs.get("reward")

        # Thread the exchange back into the chat so the model sees the
        # environment's feedback and its remaining budget.
        messages.append({"role": "assistant", "content": action_text})
        messages.append({
            "role": "user",
            "content": (
                f"Feedback: {obs.get('feedback', '')} "
                f"(step {step_count}/{max_steps}, score: {obs.get('current_score', 0.0):.3f})"
            ),
        })

        atype = action.get("action_type", "")
        print(f"  Step {step_count:2d}: {atype:20s} | reward={str(reward):8s} | score={obs.get('current_score', 0.0):.3f}")

        if atype == "submit_review":
            # Final reward is preferred; fall back to the running score.
            # The trailing `or 0.0` also converts a None reward to 0.0.
            final_score = obs.get("reward", obs.get("current_score", 0.0)) or 0.0
            break

        # Small delay to avoid hammering the environment/LLM endpoints.
        time.sleep(0.3)

    return {
        "task_id": task_id,
        "score": float(final_score),
        "steps": step_count,
        "method": "llm",
    }
def main():
    """Entry point: probe the environment, run every task, print a summary.

    Runs in LLM mode when both HF_TOKEN and API_BASE_URL are set;
    otherwise uses the environment's built-in keyword-heuristic baseline.
    Exits with status 1 if the environment is unreachable.  Returns the
    per-task results dict.

    (Fixed mojibake: "β" in the banner/arrow strings was a mis-encoded
    "—"/"→".)
    """
    use_llm = bool(HF_TOKEN and API_BASE_URL)
    print("Code Review Environment — Inference")
    print(f"  Model   : {MODEL_NAME}")
    print(f"  API URL : {API_BASE_URL or '(not set — using keyword heuristic)'}")
    print(f"  Env URL : {ENV_URL}")
    print(f"  Tasks   : {TASK_IDS}\n")

    # Fail fast: nothing below works without a reachable environment.
    try:
        with httpx.Client(timeout=10) as probe:
            health = probe.get(f"{ENV_URL}/health")
            health.raise_for_status()
            print(f"  Health: {health.json()}\n")
    except Exception as e:
        print(f"ERROR: Cannot reach environment at {ENV_URL}: {e}")
        sys.exit(1)

    results = {}
    if use_llm:
        # Reuse one HTTP client across all episodes.
        with httpx.Client(timeout=60) as client:
            for task_id in TASK_IDS:
                print(f"Running task: {task_id}")
                result = run_task(task_id, client)
                results[task_id] = result
                print(f"  → score: {result['score']:.4f} ({result['steps']} steps)\n")
    else:
        print("HF_TOKEN / API_BASE_URL not set — using built-in keyword heuristic baseline.\n")
        for task_id in TASK_IDS:
            print(f"Running task: {task_id}")
            result = run_keyword_fallback(ENV_URL, task_id)
            results[task_id] = result
            print(f"  → score: {result['score']:.4f}\n")

    print("=" * 50)
    print("INFERENCE RESULTS")
    print("=" * 50)
    for task_id, r in results.items():
        print(f"  {task_id:30s} score={r['score']:.4f}")
    # max(..., 1) guards against ZeroDivisionError should TASK_IDS ever
    # be empty; with no results the average is reported as 0.0.
    overall = sum(r["score"] for r in results.values()) / max(len(results), 1)
    print(f"\n  Overall average: {overall:.4f}")
    print("=" * 50)
    return results
# Script entry point: only run when executed directly, not on import.
if __name__ == "__main__":
    main()