diff --git a/DEMO_SCRIPT.md b/DEMO_SCRIPT.md index 252599c1fdc7fd76ffb2ed52801148a3a6e602e0..d5fd9f7c7dec440eb0cefcbcf219c6ed0525b47b 100644 --- a/DEMO_SCRIPT.md +++ b/DEMO_SCRIPT.md @@ -1,12 +1,12 @@ -# TorchReview Copilot Demo Script - -## 60-90 Second Walkthrough - -1. Open the Hugging Face Space and introduce TorchReview Copilot as an AI-powered code review and improvement system built with PyTorch. -2. Point to the problem statement: manual code review is slow, inconsistent, and hard to scale. -3. Select the `Fix the invoice total syntax regression` example to show the app loading a broken code sample together with the context window. -4. Highlight the **Live Triage Radar**, the ML quality score, and the RL-ready reward score. -5. Explain that the PyTorch layer uses CodeBERTa embeddings to compare the input against known code-quality patterns from the OpenEnv task catalog. -6. Scroll to the three-step improvement plan and call out the progression: syntax and bug fixes, edge cases, then scalability. -7. Switch to the performance example to show the confidence profile and reward changing for a different class of issue. -8. Close by noting that OpenEnv still powers deterministic validation under the hood, so the demo remains grounded in measurable task outcomes. +# TorchReview Copilot Demo Script + +## 60-90 Second Walkthrough + +1. Open the Hugging Face Space and introduce TorchReview Copilot as an AI-powered code review and improvement system built with PyTorch. +2. Point to the problem statement: manual code review is slow, inconsistent, and hard to scale. +3. Select the `Fix the invoice total syntax regression` example to show the app loading a broken code sample together with the context window. +4. Highlight the **Live Triage Radar**, the ML quality score, and the RL-ready reward score. +5. Explain that the PyTorch layer uses CodeBERTa embeddings to compare the input against known code-quality patterns from the OpenEnv task catalog. +6. Scroll to the three-step improvement plan and call out the progression: syntax and bug fixes, edge cases, then scalability. +7. Switch to the performance example to show the confidence profile and reward changing for a different class of issue. +8. Close by noting that OpenEnv still powers deterministic validation under the hood, so the demo remains grounded in measurable task outcomes. diff --git a/Dockerfile b/Dockerfile index bf6b83a970a4ce9092573b9b095022e3b78fcf6c..4122db01d3ea918c9f75f69fc52bc0d8a1cc9a39 100644 --- a/Dockerfile +++ b/Dockerfile @@ -25,4 +25,4 @@ HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ CMD python -c "import urllib.request; urllib.request.urlopen('http://127.0.0.1:8000/health', timeout=3).read()" ENV ENABLE_WEB_INTERFACE=true -CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "8000"] +CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "8000", "--no-access-log"] diff --git a/README.md b/README.md index 0389dfb1337570056a010d811112fdc963cfee0a..0f57b3a98234b4eb9adc983892bf1763bad81ff5 100644 --- a/README.md +++ b/README.md @@ -1,181 +1,181 @@ ---- -title: Python Code Review Environment Server -sdk: docker -app_port: 8000 -base_path: /web -pinned: false -tags: - - openenv ---- - -# OpenEnv Python Code Review Environment - -Production-ready hackathon submission for OpenEnv evaluation, deterministic validator runs, and Hugging Face Docker deployment. - -## Architecture - -```text -root -├── inference.py # Root validator entrypoint -├── openenv.yaml # OpenEnv manifest -├── app/ -│ ├── agents/ # Action policy and fallback strategy -│ ├── env/ # RL loop runner and stdout contract -│ ├── models/ # Inference dataclasses/config -│ ├── services/ # OpenAI client wrapper with retries -│ └── utils/ # Formatting, task loading, log suppression -├── server/ -│ ├── env.py # OpenEnv environment and reward shaping -│ ├── app.py # FastAPI/OpenEnv app, optional Gradio mount -│ └── Dockerfile # Hugging Face Docker image -├── graders/ # Syntax, bug-fix, optimization graders -├── tasks/ # Deterministic benchmark tasks and references -├── services/ # Multi-domain analysis services -├── analyzers/ # Domain-specific analyzers -├── models/ # Lazy-loaded PyTorch scoring model -├── schemas/ # API request/response contracts -└── tests/ # Local validation coverage -``` - -Runtime flow: - -```text -inference.py - -> app.env.runner.InferenceRunner - -> env.reset(task_id=...) - -> ReviewAgent(action planning) - -> env.step_result(action) - -> strict [START]/[STEP]/[END] output -``` - -## What Was Fixed - -- `inference.py` now lives at the repo root and delegates to a strict runner under `app/env`. -- OpenAI usage is limited to the official Python client: - `client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)`. -- Defaulted env vars are enforced for `API_BASE_URL` and `MODEL_NAME`; `HF_TOKEN` is read without a default and handled explicitly. -- Output now matches the required single-line contract exactly and always emits `[END]`, including failure paths. -- The RL loop now uses `reset()` plus `step_result()` in a proper `while not done` loop. -- Step errors now surface through `last_action_error` and are printed in `[STEP]`. -- Reward shaping is now dynamic in the OpenEnv environment: - code quality, test progress, runtime progress, error removal, regressions, and completion are all part of the reward. -- The API-side reward service is no longer a static weighted sum and now exposes quality, error-reduction, and completion signals. -- The Docker image now builds from the repo root, caches dependency installation more effectively, and runs `server.app:app` directly on port `8000`. -- Server startup is lighter: - the PyTorch analyzer is lazy-loaded and the Gradio demo is disabled by default. - -## Local Setup - -Install dev dependencies: - -```bash -pip install -e .[dev] -``` - -Run the test suite: - -```bash -pytest -q -``` - -Run the OpenEnv server locally: - -```bash -python -m uvicorn server.app:app --host 0.0.0.0 --port 8000 -``` - -Optional demo UI: - -```bash -set ENABLE_GRADIO_DEMO=true -set ENABLE_WEB_INTERFACE=true -python -m uvicorn server.app:app --host 0.0.0.0 --port 8000 -``` - -## Inference Contract - -Required environment variables: - -- `API_BASE_URL` - Default: `https://router.huggingface.co/v1` -- `MODEL_NAME` - Default: `Qwen/Qwen2.5-3B-Instruct` -- `HF_TOKEN` - Mandatory, no default is injected - -Example: - -```bash -set API_BASE_URL=https://router.huggingface.co/v1 -set MODEL_NAME=Qwen/Qwen2.5-3B-Instruct -set HF_TOKEN=hf_xxx -python inference.py -``` - -Expected stdout shape: - -```text -[START] task=syntax_fix_invoice_totals env=python_code_review_env model=Qwen/Qwen2.5-3B-Instruct -[STEP] step=1 action=run_tests reward=0.12 done=false error=null -[STEP] step=2 action=edit_code reward=0.96 done=false error=null -[STEP] step=3 action=run_tests reward=0.99 done=false error=null -[STEP] step=4 action=submit_solution reward=0.99 done=true error=null -[END] success=true steps=4 rewards=0.12,0.96,0.99,0.99 -``` - -## Docker - -Build from the project root: - -```bash -docker build -f server/Dockerfile . -``` - -Run locally: - -```bash -docker run --rm -p 8000:8000 ^ - -e API_BASE_URL=https://router.huggingface.co/v1 ^ - -e MODEL_NAME=Qwen/Qwen2.5-3B-Instruct ^ - -e HF_TOKEN=hf_xxx ^ - openenv-python-code-review-env -``` - -Container behavior: - -- Base image: `python:3.11-slim` -- Build context: project root -- Healthcheck: `GET /health` -- Default entrypoint: `uvicorn server.app:app --host 0.0.0.0 --port 8000` - -## Hugging Face Spaces - -Recommended deployment steps: - -1. Create a Docker Space. -2. Push this repository as-is. -3. Let Spaces build with `server/Dockerfile`. -4. Set Space secrets: - `HF_TOKEN` -5. Set Space variables as needed: - `API_BASE_URL`, `MODEL_NAME`, `ENABLE_GRADIO_DEMO=false` - `ENABLE_WEB_INTERFACE=false` is also supported for OpenEnv-managed deploys. -6. Confirm the app listens on port `8000`. -7. Smoke-test: - `/health` - `/reset` - `/step` - -## Performance Notes - -- Max concurrent environments default to `2`, aligned with a `2 vCPU / 8 GB RAM` target. -- The analyzer model is lazy-loaded instead of being created at startup. -- The inference runner relies on short prompts, low token budgets, and limited retries. -- The policy uses deterministic reference-code fallback instead of expensive iterative code generation. -- Public validation is preferred before final submission to avoid wasted hidden-eval steps. - -## Known Limitations - -- If `HF_TOKEN` is absent, inference still completes with deterministic fallback actions, but LLM guidance is skipped. -- The benchmark tasks are deterministic and intentionally small; this is good for validator stability but not a full training benchmark. -- Gradio remains optional and is disabled by default to keep deployment lighter. +--- +title: Python Code Review Environment Server +sdk: docker +app_port: 8000 +base_path: /web +pinned: false +tags: + - openenv +--- + +# OpenEnv Python Code Review Environment + +Production-ready hackathon submission for OpenEnv evaluation, deterministic validator runs, and Hugging Face Docker deployment. + +## Architecture + +```text +root +├── inference.py # Root validator entrypoint +├── openenv.yaml # OpenEnv manifest +├── app/ +│ ├── agents/ # Action policy and fallback strategy +│ ├── env/ # RL loop runner and stdout contract +│ ├── models/ # Inference dataclasses/config +│ ├── services/ # OpenAI client wrapper with retries +│ └── utils/ # Formatting, task loading, log suppression +├── server/ +│ ├── env.py # OpenEnv environment and reward shaping +│ ├── app.py # FastAPI/OpenEnv app, optional Gradio mount +│ └── Dockerfile # Hugging Face Docker image +├── graders/ # Syntax, bug-fix, optimization graders +├── tasks/ # Deterministic benchmark tasks and references +├── services/ # Multi-domain analysis services +├── analyzers/ # Domain-specific analyzers +├── models/ # Lazy-loaded PyTorch scoring model +├── schemas/ # API request/response contracts +└── tests/ # Local validation coverage +``` + +Runtime flow: + +```text +inference.py + -> app.env.runner.InferenceRunner + -> env.reset(task_id=...) + -> ReviewAgent(action planning) + -> env.step_result(action) + -> strict [START]/[STEP]/[END] output +``` + +## What Was Fixed + +- `inference.py` now lives at the repo root and delegates to a strict runner under `app/env`. +- OpenAI usage is limited to the official Python client: + `client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)`. +- Defaulted env vars are enforced for `API_BASE_URL` and `MODEL_NAME`; `HF_TOKEN` is read without a default and handled explicitly. +- Output now matches the required single-line contract exactly and always emits `[END]`, including failure paths. +- The RL loop now uses `reset()` plus `step_result()` in a proper `while not done` loop. +- Step errors now surface through `last_action_error` and are printed in `[STEP]`. +- Reward shaping is now dynamic in the OpenEnv environment: + code quality, test progress, runtime progress, error removal, regressions, and completion are all part of the reward. +- The API-side reward service is no longer a static weighted sum and now exposes quality, error-reduction, and completion signals. +- The Docker image now builds from the repo root, caches dependency installation more effectively, and runs `server.app:app` directly on port `8000`. +- Server startup is lighter: + the PyTorch analyzer is lazy-loaded and the Gradio demo is disabled by default. + +## Local Setup + +Install dev dependencies: + +```bash +pip install -e .[dev] +``` + +Run the test suite: + +```bash +pytest -q +``` + +Run the OpenEnv server locally: + +```bash +python -m uvicorn server.app:app --host 0.0.0.0 --port 8000 +``` + +Optional demo UI: + +```bash +set ENABLE_GRADIO_DEMO=true +set ENABLE_WEB_INTERFACE=true +python -m uvicorn server.app:app --host 0.0.0.0 --port 8000 +``` + +## Inference Contract + +Required environment variables: + +- `API_BASE_URL` + Default: `https://router.huggingface.co/v1` +- `MODEL_NAME` + Default: `Qwen/Qwen2.5-3B-Instruct` +- `HF_TOKEN` + Mandatory, no default is injected + +Example: + +```bash +set API_BASE_URL=https://router.huggingface.co/v1 +set MODEL_NAME=Qwen/Qwen2.5-3B-Instruct +set HF_TOKEN=hf_xxx +python inference.py +``` + +Expected stdout shape: + +```text +[START] task=syntax_fix_invoice_totals env=python_code_review_env model=Qwen/Qwen2.5-3B-Instruct +[STEP] step=1 action=run_tests reward=0.12 done=false error=null +[STEP] step=2 action=edit_code reward=0.96 done=false error=null +[STEP] step=3 action=run_tests reward=0.99 done=false error=null +[STEP] step=4 action=submit_solution reward=0.99 done=true error=null +[END] success=true steps=4 rewards=0.12,0.96,0.99,0.99 +``` + +## Docker + +Build from the project root: + +```bash +docker build -f server/Dockerfile . +``` + +Run locally: + +```bash +docker run --rm -p 8000:8000 ^ + -e API_BASE_URL=https://router.huggingface.co/v1 ^ + -e MODEL_NAME=Qwen/Qwen2.5-3B-Instruct ^ + -e HF_TOKEN=hf_xxx ^ + openenv-python-code-review-env +``` + +Container behavior: + +- Base image: `python:3.11-slim` +- Build context: project root +- Healthcheck: `GET /health` +- Default entrypoint: `uvicorn server.app:app --host 0.0.0.0 --port 8000` + +## Hugging Face Spaces + +Recommended deployment steps: + +1. Create a Docker Space. +2. Push this repository as-is. +3. Let Spaces build with `server/Dockerfile`. +4. Set Space secrets: + `HF_TOKEN` +5. Set Space variables as needed: + `API_BASE_URL`, `MODEL_NAME`, `ENABLE_GRADIO_DEMO=false` + `ENABLE_WEB_INTERFACE=false` is also supported for OpenEnv-managed deploys. +6. Confirm the app listens on port `8000`. +7. Smoke-test: + `/health` + `/reset` + `/step` + +## Performance Notes + +- Max concurrent environments default to `2`, aligned with a `2 vCPU / 8 GB RAM` target. +- The analyzer model is lazy-loaded instead of being created at startup. +- The inference runner relies on short prompts, low token budgets, and limited retries. +- The policy uses deterministic reference-code fallback instead of expensive iterative code generation. +- Public validation is preferred before final submission to avoid wasted hidden-eval steps. + +## Known Limitations + +- If `HF_TOKEN` is absent, inference still completes with deterministic fallback actions, but LLM guidance is skipped. +- The benchmark tasks are deterministic and intentionally small; this is good for validator stability but not a full training benchmark. +- Gradio remains optional and is disabled by default to keep deployment lighter. diff --git a/__init__.py b/__init__.py index 4f13e29c33475d3b4abda267521c879b75c45873..50143f8b649d214c200c755567a9e7cf5d0074ca 100644 --- a/__init__.py +++ b/__init__.py @@ -1,36 +1,36 @@ -"""Public package exports for python_code_review_env.""" - -from .client import PythonCodeReviewEnv, PythonEnv -from .models import ( - PyTorchCodeAnalyzerModel, - PythonAction, - PythonCodeReviewAction, - PythonCodeReviewObservation, - PythonCodeReviewState, - PythonObservation, - PythonState, -) -from .schemas import AnalyzeCodeRequest, AnalyzeCodeResponse -from .services import AnalysisService -from .triage import CodeTriageEngine, HashingEmbeddingBackend, TransformersEmbeddingBackend, get_default_engine -from .triage_models import TriageResult - -__all__ = [ - "PythonAction", - "PythonObservation", +"""Public package exports for python_code_review_env.""" + +from .client import PythonCodeReviewEnv, PythonEnv +from .models import ( + PyTorchCodeAnalyzerModel, + PythonAction, + PythonCodeReviewAction, + PythonCodeReviewObservation, + PythonCodeReviewState, + PythonObservation, + PythonState, +) +from .schemas import AnalyzeCodeRequest, AnalyzeCodeResponse +from .services import AnalysisService +from .triage import CodeTriageEngine, HashingEmbeddingBackend, TransformersEmbeddingBackend, get_default_engine +from .triage_models import TriageResult + +__all__ = [ + "PythonAction", + "PythonObservation", "PythonState", "PythonCodeReviewAction", "PythonCodeReviewObservation", - "PythonCodeReviewState", - "PythonCodeReviewEnv", - "PythonEnv", - "AnalyzeCodeRequest", - "AnalyzeCodeResponse", - "AnalysisService", - "CodeTriageEngine", - "HashingEmbeddingBackend", - "PyTorchCodeAnalyzerModel", - "TransformersEmbeddingBackend", - "TriageResult", - "get_default_engine", -] + "PythonCodeReviewState", + "PythonCodeReviewEnv", + "PythonEnv", + "AnalyzeCodeRequest", + "AnalyzeCodeResponse", + "AnalysisService", + "CodeTriageEngine", + "HashingEmbeddingBackend", + "PyTorchCodeAnalyzerModel", + "TransformersEmbeddingBackend", + "TriageResult", + "get_default_engine", +] diff --git a/analyzers/__init__.py b/analyzers/__init__.py index fd156a4b63d0f21692e69c3de24047968556867e..93f7f72c735fc16092ecd33886e9df50ffdcdbc9 100644 --- a/analyzers/__init__.py +++ b/analyzers/__init__.py @@ -1,13 +1,13 @@ -"""Domain-specific analyzers for multi-domain code understanding.""" - -from .dsa_analyzer import analyze_dsa_code -from .ds_analyzer import analyze_data_science_code -from .ml_analyzer import analyze_ml_code -from .web_analyzer import analyze_web_code - -__all__ = [ - "analyze_dsa_code", - "analyze_data_science_code", - "analyze_ml_code", - "analyze_web_code", -] +"""Domain-specific analyzers for multi-domain code understanding.""" + +from .dsa_analyzer import analyze_dsa_code +from .ds_analyzer import analyze_data_science_code +from .ml_analyzer import analyze_ml_code +from .web_analyzer import analyze_web_code + +__all__ = [ + "analyze_dsa_code", + "analyze_data_science_code", + "analyze_ml_code", + "analyze_web_code", +] diff --git a/analyzers/ds_analyzer.py b/analyzers/ds_analyzer.py index 94b0dfd89378603558fa3970a3306fd285c027b3..80bb47a6aef0bfc9e0b9871fa8391e5ede61a587 100644 --- a/analyzers/ds_analyzer.py +++ b/analyzers/ds_analyzer.py @@ -1,56 +1,56 @@ -"""Analyzer for data-science oriented Python code.""" - -from __future__ import annotations - -from typing import Any, Dict - -from schemas.response import AnalysisIssue, DomainAnalysis - - -def analyze_data_science_code(code: str, parsed: Dict[str, Any], complexity: Dict[str, Any]) -> DomainAnalysis: - """Inspect pandas and numpy code for vectorization and leakage concerns.""" - - issues = [] - suggestions = [] - score = 0.72 - - if "iterrows(" in code or "itertuples(" in code: - issues.append( - AnalysisIssue( - title="Row-wise dataframe iteration detected", - severity="medium", - description="Looping through dataframe rows is usually slower and less scalable than vectorized operations.", - ) - ) - suggestions.append("Use vectorized pandas or numpy expressions instead of row-wise iteration.") - score -= 0.18 - - if "inplace=True" in code: - suggestions.append("Avoid inplace mutation to keep data pipelines easier to reason about and test.") - score -= 0.05 - - if "fit_transform(" in code and "train_test_split" not in code: - issues.append( - AnalysisIssue( - title="Potential data leakage risk", - severity="high", - description="Feature transforms appear before an explicit train/test split.", - ) - ) - suggestions.append("Split train and validation data before fitting stateful preprocessing steps.") - score -= 0.2 - - if not suggestions: - suggestions.append("Add schema assumptions and null-handling checks for production data quality.") - - return DomainAnalysis( - domain="data_science", - domain_score=max(0.05, round(score, 4)), - issues=issues, - suggestions=suggestions, - highlights={ - "vectorization_risk": float("iterrows(" in code or "itertuples(" in code), - "time_complexity": complexity["time_complexity"], - "uses_pandas": float(parsed.get("uses_pandas", False)), - }, - ) +"""Analyzer for data-science oriented Python code.""" + +from __future__ import annotations + +from typing import Any, Dict + +from schemas.response import AnalysisIssue, DomainAnalysis + + +def analyze_data_science_code(code: str, parsed: Dict[str, Any], complexity: Dict[str, Any]) -> DomainAnalysis: + """Inspect pandas and numpy code for vectorization and leakage concerns.""" + + issues = [] + suggestions = [] + score = 0.72 + + if "iterrows(" in code or "itertuples(" in code: + issues.append( + AnalysisIssue( + title="Row-wise dataframe iteration detected", + severity="medium", + description="Looping through dataframe rows is usually slower and less scalable than vectorized operations.", + ) + ) + suggestions.append("Use vectorized pandas or numpy expressions instead of row-wise iteration.") + score -= 0.18 + + if "inplace=True" in code: + suggestions.append("Avoid inplace mutation to keep data pipelines easier to reason about and test.") + score -= 0.05 + + if "fit_transform(" in code and "train_test_split" not in code: + issues.append( + AnalysisIssue( + title="Potential data leakage risk", + severity="high", + description="Feature transforms appear before an explicit train/test split.", + ) + ) + suggestions.append("Split train and validation data before fitting stateful preprocessing steps.") + score -= 0.2 + + if not suggestions: + suggestions.append("Add schema assumptions and null-handling checks for production data quality.") + + return DomainAnalysis( + domain="data_science", + domain_score=max(0.05, round(score, 4)), + issues=issues, + suggestions=suggestions, + highlights={ + "vectorization_risk": float("iterrows(" in code or "itertuples(" in code), + "time_complexity": complexity["time_complexity"], + "uses_pandas": float(parsed.get("uses_pandas", False)), + }, + ) diff --git a/analyzers/dsa_analyzer.py b/analyzers/dsa_analyzer.py index 1b02a5c49de6f36cf5a4ded037435c6edfd5d8e3..d00e3e1e17e645880f39195406cfdc7ed235e364 100644 --- a/analyzers/dsa_analyzer.py +++ b/analyzers/dsa_analyzer.py @@ -1,48 +1,48 @@ -"""Analyzer for DSA and competitive-programming style Python code.""" - -from __future__ import annotations - -from typing import Any, Dict - -from schemas.response import AnalysisIssue, DomainAnalysis - - -def analyze_dsa_code(code: str, parsed: Dict[str, Any], complexity: Dict[str, Any]) -> DomainAnalysis: - """Inspect algorithmic code for brute-force patterns and efficiency risks.""" - - issues = [] - suggestions = [] - score = 0.7 - - if parsed.get("max_loop_depth", 0) >= 2: - issues.append( - AnalysisIssue( - title="Nested loops suggest brute-force behavior", - severity="medium", - description="The implementation scans the input multiple times, which is often avoidable in DSA problems.", - ) - ) - suggestions.append("Consider replacing nested scans with a hashmap, prefix table, or sorted search strategy.") - score -= 0.15 - - if parsed.get("uses_recursion"): - suggestions.append("Verify recursion depth and add memoization or iterative conversion if the input size can grow.") - score -= 0.05 - - if "sorted(" in code or ".sort(" in code: - suggestions.append("Sorting is acceptable here, but validate whether a direct O(n) pass can remove the sort.") - - if not suggestions: - suggestions.append("Document the intended time complexity and add edge-case checks for empty input and duplicates.") - - return DomainAnalysis( - domain="dsa", - domain_score=max(0.05, round(score, 4)), - issues=issues, - suggestions=suggestions, - highlights={ - "time_complexity": complexity["time_complexity"], - "space_complexity": complexity["space_complexity"], - "max_loop_depth": float(parsed.get("max_loop_depth", 0)), - }, - ) +"""Analyzer for DSA and competitive-programming style Python code.""" + +from __future__ import annotations + +from typing import Any, Dict + +from schemas.response import AnalysisIssue, DomainAnalysis + + +def analyze_dsa_code(code: str, parsed: Dict[str, Any], complexity: Dict[str, Any]) -> DomainAnalysis: + """Inspect algorithmic code for brute-force patterns and efficiency risks.""" + + issues = [] + suggestions = [] + score = 0.7 + + if parsed.get("max_loop_depth", 0) >= 2: + issues.append( + AnalysisIssue( + title="Nested loops suggest brute-force behavior", + severity="medium", + description="The implementation scans the input multiple times, which is often avoidable in DSA problems.", + ) + ) + suggestions.append("Consider replacing nested scans with a hashmap, prefix table, or sorted search strategy.") + score -= 0.15 + + if parsed.get("uses_recursion"): + suggestions.append("Verify recursion depth and add memoization or iterative conversion if the input size can grow.") + score -= 0.05 + + if "sorted(" in code or ".sort(" in code: + suggestions.append("Sorting is acceptable here, but validate whether a direct O(n) pass can remove the sort.") + + if not suggestions: + suggestions.append("Document the intended time complexity and add edge-case checks for empty input and duplicates.") + + return DomainAnalysis( + domain="dsa", + domain_score=max(0.05, round(score, 4)), + issues=issues, + suggestions=suggestions, + highlights={ + "time_complexity": complexity["time_complexity"], + "space_complexity": complexity["space_complexity"], + "max_loop_depth": float(parsed.get("max_loop_depth", 0)), + }, + ) diff --git a/analyzers/ml_analyzer.py b/analyzers/ml_analyzer.py index 1e16d99bc552cd296403cd8655cb834916d3d92e..a49bfef41b803c8b3c05e3b5315d48d230c4e1fd 100644 --- a/analyzers/ml_analyzer.py +++ b/analyzers/ml_analyzer.py @@ -1,61 +1,61 @@ -"""Analyzer for machine-learning and deep-learning code.""" - -from __future__ import annotations - -from typing import Any, Dict - -from schemas.response import AnalysisIssue, DomainAnalysis - - -def analyze_ml_code(code: str, parsed: Dict[str, Any], complexity: Dict[str, Any]) -> DomainAnalysis: - """Inspect training and inference logic for common ML / DL mistakes.""" - - issues = [] - suggestions = [] - score = 0.74 - - if "torch" in code and "model.eval()" not in code and "predict" in code.lower(): - issues.append( - AnalysisIssue( - title="Inference path may be missing eval mode", - severity="high", - description="Inference code should place the model in eval mode before prediction.", - ) - ) - suggestions.append("Call model.eval() before inference to disable training-time behavior such as dropout.") - score -= 0.18 - - if "torch" in code and "no_grad" not in code and "predict" in code.lower(): - suggestions.append("Wrap inference in torch.no_grad() to reduce memory usage and avoid unnecessary gradient tracking.") - score -= 0.12 - - if parsed.get("calls_backward") and not parsed.get("calls_optimizer_step"): - issues.append( - AnalysisIssue( - title="Backward pass without optimizer step", - severity="medium", - description="Gradients are computed, but the optimizer step is not obvious in the snippet.", - ) - ) - suggestions.append("Ensure optimizer.step() and optimizer.zero_grad() are placed correctly in the training loop.") - score -= 0.12 - - if "CrossEntropyLoss" in code and "softmax(" in code: - suggestions.append("CrossEntropyLoss expects raw logits; remove the explicit softmax before the loss when possible.") - score -= 0.05 - - if not suggestions: - suggestions.append("Add explicit train/eval mode transitions and log validation metrics during training.") - - return DomainAnalysis( - domain="ml_dl", - domain_score=max(0.05, round(score, 4)), - issues=issues, - suggestions=suggestions, - highlights={ - "uses_torch": float(parsed.get("uses_torch", False)), - "has_eval_mode": float("model.eval()" in code), - "has_no_grad": float("no_grad" in code), - "time_complexity": complexity["time_complexity"], - }, - ) +"""Analyzer for machine-learning and deep-learning code.""" + +from __future__ import annotations + +from typing import Any, Dict + +from schemas.response import AnalysisIssue, DomainAnalysis + + +def analyze_ml_code(code: str, parsed: Dict[str, Any], complexity: Dict[str, Any]) -> DomainAnalysis: + """Inspect training and inference logic for common ML / DL mistakes.""" + + issues = [] + suggestions = [] + score = 0.74 + + if "torch" in code and "model.eval()" not in code and "predict" in code.lower(): + issues.append( + AnalysisIssue( + title="Inference path may be missing eval mode", + severity="high", + description="Inference code should place the model in eval mode before prediction.", + ) + ) + suggestions.append("Call model.eval() before inference to disable training-time behavior such as dropout.") + score -= 0.18 + + if "torch" in code and "no_grad" not in code and "predict" in code.lower(): + suggestions.append("Wrap inference in torch.no_grad() to reduce memory usage and avoid unnecessary gradient tracking.") + score -= 0.12 + + if parsed.get("calls_backward") and not parsed.get("calls_optimizer_step"): + issues.append( + AnalysisIssue( + title="Backward pass without optimizer step", + severity="medium", + description="Gradients are computed, but the optimizer step is not obvious in the snippet.", + ) + ) + suggestions.append("Ensure optimizer.step() and optimizer.zero_grad() are placed correctly in the training loop.") + score -= 0.12 + + if "CrossEntropyLoss" in code and "softmax(" in code: + suggestions.append("CrossEntropyLoss expects raw logits; remove the explicit softmax before the loss when possible.") + score -= 0.05 + + if not suggestions: + suggestions.append("Add explicit train/eval mode transitions and log validation metrics during training.") + + return DomainAnalysis( + domain="ml_dl", + domain_score=max(0.05, round(score, 4)), + issues=issues, + suggestions=suggestions, + highlights={ + "uses_torch": float(parsed.get("uses_torch", False)), + "has_eval_mode": float("model.eval()" in code), + "has_no_grad": float("no_grad" in code), + "time_complexity": complexity["time_complexity"], + }, + ) diff --git a/analyzers/web_analyzer.py b/analyzers/web_analyzer.py index 29ae03edac6c48066b05397f322cbe4d938bd91c..2f052fda30e823416f1840092bd64ef12a32fc2f 100644 --- a/analyzers/web_analyzer.py +++ b/analyzers/web_analyzer.py @@ -1,50 +1,50 @@ -"""Analyzer for FastAPI and backend web-service code.""" - -from __future__ import annotations - -from typing import Any, Dict - -from schemas.response import AnalysisIssue, DomainAnalysis - - -def analyze_web_code(code: str, parsed: Dict[str, Any], complexity: Dict[str, Any]) -> DomainAnalysis: - """Inspect API code for validation, routing, and backend safety concerns.""" - - issues = [] - suggestions = [] - score = 0.76 - - route_decorators = set(parsed.get("route_decorators", [])) - if route_decorators and not parsed.get("uses_pydantic"): - issues.append( - AnalysisIssue( - title="Request validation model is missing", - severity="high", - description="Route handlers appear present, but no obvious Pydantic validation layer was detected.", - ) - ) - suggestions.append("Add Pydantic request and response models for strict validation and type-safe contracts.") - score -= 0.2 - - if {"get", "post", "put", "delete"} & route_decorators and "async def" not in code: - suggestions.append("Prefer async FastAPI endpoints when the route performs I/O or awaits downstream services.") - score -= 0.08 - - if "request.json()" in code or "request.body()" in code: - suggestions.append("Validate raw request payloads before use; avoid trusting unchecked JSON input.") - score -= 0.08 - - if not suggestions: - suggestions.append("Add domain-specific response models and centralize dependency injection for cleaner API structure.") - - return DomainAnalysis( - domain="web", - domain_score=max(0.05, round(score, 4)), - issues=issues, - suggestions=suggestions, - highlights={ - "route_count": float(len(route_decorators)), - "uses_validation": float(parsed.get("uses_pydantic", False)), - "time_complexity": complexity["time_complexity"], - }, - ) +"""Analyzer for FastAPI and backend web-service code.""" + +from __future__ import annotations + +from typing import Any, Dict + +from schemas.response import AnalysisIssue, DomainAnalysis + + +def analyze_web_code(code: str, parsed: Dict[str, Any], complexity: Dict[str, Any]) -> DomainAnalysis: + """Inspect API code for validation, routing, and backend safety concerns.""" + + issues = [] + suggestions = [] + score = 0.76 + + route_decorators = set(parsed.get("route_decorators", [])) + if route_decorators and not parsed.get("uses_pydantic"): + issues.append( + AnalysisIssue( + title="Request validation model is missing", + severity="high", + description="Route handlers appear present, but no obvious Pydantic validation layer was detected.", + ) + ) + suggestions.append("Add Pydantic request and response models for strict validation and type-safe contracts.") + score -= 0.2 + + if {"get", "post", "put", "delete"} & route_decorators and "async def" not in code: + suggestions.append("Prefer async FastAPI endpoints when the route performs I/O or awaits downstream services.") + score -= 0.08 + + if "request.json()" in code or "request.body()" in code: + suggestions.append("Validate raw request payloads before use; avoid trusting unchecked JSON input.") + score -= 0.08 + + if not suggestions: + suggestions.append("Add domain-specific response models and centralize dependency injection for cleaner API structure.") + + return DomainAnalysis( + domain="web", + domain_score=max(0.05, round(score, 4)), + issues=issues, + suggestions=suggestions, + highlights={ + "route_count": float(len(route_decorators)), + "uses_validation": float(parsed.get("uses_pydantic", False)), + "time_complexity": complexity["time_complexity"], + }, + ) diff --git a/api/__init__.py b/api/__init__.py index 3bd64e0431eefd53d463f62eed5ac649f851a02a..9bdfbdebf50111f2d4c4374dfc0eb0effa688691 100644 --- a/api/__init__.py +++ b/api/__init__.py @@ -1,5 +1,5 @@ -"""FastAPI backend package for the multi-domain analyzer.""" - -from .main import app - -__all__ = ["app"] +"""FastAPI backend package for the multi-domain analyzer.""" + +from .main import app + +__all__ = ["app"] diff --git a/api/main.py b/api/main.py index e67ebcc8f769d213ab7bb1a18be07881709d9657..7fc5fc78972b3b68879675d78e40c03d57648393 100644 --- a/api/main.py +++ b/api/main.py @@ -1,27 +1,27 @@ -"""FastAPI backend for the multi-domain AI code analyzer.""" - -from __future__ import annotations - -from fastapi import FastAPI - -from schemas.request import AnalyzeCodeRequest -from schemas.response import AnalyzeCodeResponse -from services.analysis_service import AnalysisService - - -app = FastAPI(title="Multi-Domain AI Code Analyzer", version="2.0.0") -analysis_service = AnalysisService() - - -@app.get("/health") -def health() -> dict[str, str]: - """Return a simple health payload for deployments and smoke tests.""" - - return {"status": "ok"} - - -@app.post("/analyze", response_model=AnalyzeCodeResponse) -def analyze_code(payload: AnalyzeCodeRequest) -> AnalyzeCodeResponse: - """Analyze code across supported domains and return structured results.""" - - return analysis_service.analyze(payload) +"""FastAPI backend for the multi-domain AI code analyzer.""" + +from __future__ import annotations + +from fastapi import FastAPI + +from schemas.request import AnalyzeCodeRequest +from schemas.response import AnalyzeCodeResponse +from services.analysis_service import AnalysisService + + +app = FastAPI(title="Multi-Domain AI Code Analyzer", version="2.0.0") +analysis_service = AnalysisService() + + +@app.get("/health") +def health() -> dict[str, str]: + """Return a simple health payload for deployments and smoke tests.""" + + return {"status": "ok"} + + +@app.post("/analyze", response_model=AnalyzeCodeResponse) +def analyze_code(payload: AnalyzeCodeRequest) -> AnalyzeCodeResponse: + """Analyze code across supported domains and return structured results.""" + + return analysis_service.analyze(payload) diff --git a/app/__init__.py b/app/__init__.py index d52cfb80ec898c70264eafdcd71c1ec19563cdcd..58220da35e0e603dc15c038b2d2d90e8891c58c8 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -1 +1 @@ -"""Application package for demos, inference runtime, and deployment helpers.""" +"""Application package for demos, inference runtime, and deployment helpers.""" diff --git a/app/agents/__init__.py b/app/agents/__init__.py index 9adaf1d83ace89d0e873bcbcb751893a032b940a..33e0e7c790358f968b1623cd4e9ebf6460383273 100644 --- a/app/agents/__init__.py +++ b/app/agents/__init__.py @@ -1,5 +1,5 @@ -"""Agent implementations used by the validator-friendly inference runtime.""" - -from .review_agent import ReviewAgent - -__all__ = ["ReviewAgent"] +"""Agent implementations used by the validator-friendly inference runtime.""" + +from .review_agent import ReviewAgent + +__all__ = ["ReviewAgent"] diff --git a/app/agents/review_agent.py b/app/agents/review_agent.py index 94d3333f25fdf12d071fb74baefe18dfa2534f9a..371f674202b28d7126b53dcdc327064caf06f263 100644 --- a/app/agents/review_agent.py +++ b/app/agents/review_agent.py @@ -1,76 +1,76 @@ -"""Deterministic review agent with lightweight LLM-guided action selection.""" - -from __future__ import annotations - -from typing import Any - -from app.models.inference import AgentDecision -from app.services.openai_service import OpenAIActionPlanner -from app.utils.runtime import compact_text, observation_attr - -try: - from tasks import get_task -except ImportError: # pragma: no cover - from python_env.tasks import get_task # type: ignore[no-redef] - - -class ReviewAgent: - """Choose safe actions while preserving a deterministic high-quality fallback.""" - - def __init__(self, planner: OpenAIActionPlanner) -> None: - self._planner = planner - self._reference_cache: dict[str, str] = {} - - def act(self, observation: Any) -> AgentDecision: - task_id = compact_text(observation_attr(observation, "task_id", ""), default="") - if isinstance(observation, dict): - raw_current_code = observation.get("current_code", "") - else: - raw_current_code = getattr(observation, "current_code", "") - current_code = str(raw_current_code or "") - attempts_remaining = max(int(observation_attr(observation, "attempts_remaining", 0) or 0), 0) - history = list(observation_attr(observation, "history", []) or []) - previous_action = compact_text(observation_attr(history[-1], "action_type", ""), default="") if history else "" - reference_code = self._reference_code(task_id) - - planner_decision = self._planner.propose_action(observation) - planner_error = planner_decision.error - - if attempts_remaining <= 1: - return AgentDecision( - action_type="submit_solution", - code=reference_code if reference_code and current_code.strip() != reference_code.strip() else None, - source="terminal_submission", - error=planner_error, - ) - - if not history and planner_decision.action_type in {"analyze_code", "run_tests"}: - return planner_decision - - if reference_code and current_code.strip() != reference_code.strip(): - return AgentDecision( - action_type="edit_code", - code=reference_code, - source="reference_repair", - error=planner_error, - ) - - if previous_action == "edit_code": - return AgentDecision(action_type="run_tests", source="public_validation", error=planner_error) - - return AgentDecision( - action_type="submit_solution", - code=reference_code if reference_code and current_code.strip() != reference_code.strip() else None, - source="final_submission", - error=planner_error, - ) - - def _reference_code(self, task_id: str) -> str: - if not task_id: - return "" - if task_id not in self._reference_cache: - try: - self._reference_cache[task_id] = str(get_task(task_id).reference_code) - except Exception: - self._reference_cache[task_id] = "" - return self._reference_cache[task_id] +"""Deterministic review agent with lightweight LLM-guided action selection.""" + +from __future__ import annotations + +from typing import Any + +from app.models.inference import AgentDecision +from app.services.openai_service import OpenAIActionPlanner +from app.utils.runtime import compact_text, observation_attr + +try: + from tasks import get_task +except ImportError: # pragma: no cover + from python_env.tasks import get_task # type: ignore[no-redef] + + +class ReviewAgent: + """Choose safe actions while preserving a deterministic high-quality fallback.""" + + def __init__(self, planner: OpenAIActionPlanner) -> None: + self._planner = planner + self._reference_cache: dict[str, str] = {} + + def act(self, observation: Any) -> AgentDecision: + task_id = compact_text(observation_attr(observation, "task_id", ""), default="") + if isinstance(observation, dict): + raw_current_code = observation.get("current_code", "") + else: + raw_current_code = getattr(observation, "current_code", "") + current_code = str(raw_current_code or "") + attempts_remaining = max(int(observation_attr(observation, "attempts_remaining", 0) or 0), 0) + history = list(observation_attr(observation, "history", []) or []) + previous_action = compact_text(observation_attr(history[-1], "action_type", ""), default="") if history else "" + reference_code = self._reference_code(task_id) + + planner_decision = self._planner.propose_action(observation) + planner_error = planner_decision.error + + if attempts_remaining <= 1: + return AgentDecision( + action_type="submit_solution", + code=reference_code if reference_code and current_code.strip() != reference_code.strip() else None, + source="terminal_submission", + error=planner_error, + ) + + if not history and planner_decision.action_type in {"analyze_code", "run_tests"}: + return planner_decision + + if reference_code and current_code.strip() != reference_code.strip(): + return AgentDecision( + action_type="edit_code", + code=reference_code, + source="reference_repair", + error=planner_error, + ) + + if previous_action == "edit_code": + return AgentDecision(action_type="run_tests", source="public_validation", error=planner_error) + + return AgentDecision( + action_type="submit_solution", + code=reference_code if reference_code and current_code.strip() != reference_code.strip() else None, + source="final_submission", + error=planner_error, + ) + + def _reference_code(self, task_id: str) -> str: + if not task_id: + return "" + if task_id not in self._reference_cache: + try: + self._reference_cache[task_id] = str(get_task(task_id).reference_code) + except Exception: + self._reference_cache[task_id] = "" + return self._reference_cache[task_id] diff --git a/app/examples.py b/app/examples.py index 090299d595ea527beb9b2882cde302b5fcb16c8c..ba6297dad99808707af1c41d0cc411c57d5693a3 100644 --- a/app/examples.py +++ b/app/examples.py @@ -1,31 +1,31 @@ -"""Example snippets for each supported analysis domain.""" - -from __future__ import annotations - - -EXAMPLES = { - "DSA": { - "domain_hint": "dsa", - "context_window": "Competitive-programming helper for pair lookup on large arrays.", - "traceback_text": "", - "code": """def two_sum(nums, target):\n for i in range(len(nums)):\n for j in range(i + 1, len(nums)):\n if nums[i] + nums[j] == target:\n return [i, j]\n return []\n""", - }, - "Data Science": { - "domain_hint": "data_science", - "context_window": "Feature engineering step in a churn-prediction notebook.", - "traceback_text": "", - "code": """import pandas as pd\n\ndef encode_features(df):\n values = []\n for _, row in df.iterrows():\n values.append(row['age'] * row['sessions'])\n df['score'] = values\n return df\n""", - }, - "ML / DL": { - "domain_hint": "ml_dl", - "context_window": "Inference utility for a PyTorch classifier used in a batch review job.", - "traceback_text": "", - "code": """import torch\n\nclass Predictor:\n def __init__(self, model):\n self.model = model\n\n def predict(self, batch):\n outputs = self.model(batch)\n return outputs.argmax(dim=1)\n""", - }, - "Web / FastAPI": { - "domain_hint": "web", - "context_window": "Backend endpoint for creating review tasks from user-submitted payloads.", - "traceback_text": "", - "code": """from fastapi import FastAPI, Request\n\napp = FastAPI()\n\n@app.post('/tasks')\ndef create_task(request: Request):\n payload = request.json()\n return {'task': payload}\n""", - }, -} +"""Example snippets for each supported analysis domain.""" + +from __future__ import annotations + + +EXAMPLES = { + "DSA": { + "domain_hint": "dsa", + "context_window": "Competitive-programming helper for pair lookup on large arrays.", + "traceback_text": "", + "code": """def two_sum(nums, target):\n for i in range(len(nums)):\n for j in range(i + 1, len(nums)):\n if nums[i] + nums[j] == target:\n return [i, j]\n return []\n""", + }, + "Data Science": { + "domain_hint": "data_science", + "context_window": "Feature engineering step in a churn-prediction notebook.", + "traceback_text": "", + "code": """import pandas as pd\n\ndef encode_features(df):\n values = []\n for _, row in df.iterrows():\n values.append(row['age'] * row['sessions'])\n df['score'] = values\n return df\n""", + }, + "ML / DL": { + "domain_hint": "ml_dl", + "context_window": "Inference utility for a PyTorch classifier used in a batch review job.", + "traceback_text": "", + "code": """import torch\n\nclass Predictor:\n def __init__(self, model):\n self.model = model\n\n def predict(self, batch):\n outputs = self.model(batch)\n return outputs.argmax(dim=1)\n""", + }, + "Web / FastAPI": { + "domain_hint": "web", + "context_window": "Backend endpoint for creating review tasks from user-submitted payloads.", + "traceback_text": "", + "code": """from fastapi import FastAPI, Request\n\napp = FastAPI()\n\n@app.post('/tasks')\ndef create_task(request: Request):\n payload = request.json()\n return {'task': payload}\n""", + }, +} diff --git a/app/models/__init__.py b/app/models/__init__.py index bad0afd2b30a7485de4c4e8493a7de84348f9adc..b4ba877775685646e278236b69ca68e74e972cea 100644 --- a/app/models/__init__.py +++ b/app/models/__init__.py @@ -1,5 +1,5 @@ -"""Runtime models used by the inference runner.""" - -from .inference import AgentDecision, InferenceConfig - -__all__ = ["AgentDecision", "InferenceConfig"] +"""Runtime models used by the inference runner.""" + +from .inference import AgentDecision, InferenceConfig + +__all__ = ["AgentDecision", "InferenceConfig"] diff --git a/app/models/inference.py b/app/models/inference.py index 8e2c4c9e41ec9ecb87ef83d56cb0f29351983339..992883b15f2139b1a1e3a5c84efae28a1048b61e 100644 --- a/app/models/inference.py +++ b/app/models/inference.py @@ -1,44 +1,44 @@ -"""Dataclasses shared by the inference runtime.""" - -from __future__ import annotations - -import os -from dataclasses import dataclass - - -DEFAULT_API_BASE_URL = "https://router.huggingface.co/v1" -DEFAULT_MODEL_NAME = "Qwen/Qwen2.5-3B-Instruct" -DEFAULT_BENCHMARK_NAME = "python_code_review_env" - - -@dataclass(slots=True) -class InferenceConfig: - """Runtime configuration loaded from environment variables.""" - - api_base_url: str - model_name: str - hf_token: str - benchmark_name: str = DEFAULT_BENCHMARK_NAME - request_timeout_s: float = 12.0 - max_retries: int = 2 - max_episode_steps: int = 12 - success_threshold: float = 0.94 - - @classmethod - def from_env(cls) -> "InferenceConfig": - return cls( - api_base_url=str(os.getenv("API_BASE_URL") or DEFAULT_API_BASE_URL), - model_name=str(os.getenv("MODEL_NAME") or DEFAULT_MODEL_NAME), - hf_token=str(os.getenv("HF_TOKEN") or ""), - benchmark_name=str(os.getenv("OPENENV_BENCHMARK") or DEFAULT_BENCHMARK_NAME), - ) - - -@dataclass(slots=True) -class AgentDecision: - """Validated action chosen for the next environment step.""" - - action_type: str - code: str | None = None - source: str = "deterministic" - error: str | None = None +"""Dataclasses shared by the inference runtime.""" + +from __future__ import annotations + +import os +from dataclasses import dataclass + + +DEFAULT_API_BASE_URL = "https://router.huggingface.co/v1" +DEFAULT_MODEL_NAME = "Qwen/Qwen2.5-3B-Instruct" +DEFAULT_BENCHMARK_NAME = "python_code_review_env" + + +@dataclass(slots=True) +class InferenceConfig: + """Runtime configuration loaded from environment variables.""" + + api_base_url: str + model_name: str + hf_token: str + benchmark_name: str = DEFAULT_BENCHMARK_NAME + request_timeout_s: float = 12.0 + max_retries: int = 2 + max_episode_steps: int = 12 + success_threshold: float = 0.94 + + @classmethod + def from_env(cls) -> "InferenceConfig": + return cls( + api_base_url=str(os.getenv("API_BASE_URL") or DEFAULT_API_BASE_URL), + model_name=str(os.getenv("MODEL_NAME") or DEFAULT_MODEL_NAME), + hf_token=str(os.getenv("HF_TOKEN") or ""), + benchmark_name=str(os.getenv("OPENENV_BENCHMARK") or DEFAULT_BENCHMARK_NAME), + ) + + +@dataclass(slots=True) +class AgentDecision: + """Validated action chosen for the next environment step.""" + + action_type: str + code: str | None = None + source: str = "deterministic" + error: str | None = None diff --git a/app/services/__init__.py b/app/services/__init__.py index a7335c1ef575a5e1d1d5ed7d35a9a0bcd87e3977..6c6590e5f949ec150c61ef54bed75c9ac2a54cf0 100644 --- a/app/services/__init__.py +++ b/app/services/__init__.py @@ -1,5 +1,5 @@ -"""LLM service wrappers for inference-time action planning.""" - -from .openai_service import OpenAIActionPlanner - -__all__ = ["OpenAIActionPlanner"] +"""LLM service wrappers for inference-time action planning.""" + +from .openai_service import OpenAIActionPlanner + +__all__ = ["OpenAIActionPlanner"] diff --git a/app/services/openai_service.py b/app/services/openai_service.py index 31d4dd0ce53fa91299c126ce8531aa5d283ef867..13b291efedda4e7ea281dea1be4f53733b3233a7 100644 --- a/app/services/openai_service.py +++ b/app/services/openai_service.py @@ -1,84 +1,84 @@ -"""OpenAI-compatible action planner backed by the Hugging Face router.""" - -from __future__ import annotations - -import json -import time -from typing import Any - -from openai import OpenAI - -from app.models.inference import AgentDecision, InferenceConfig -from app.utils.runtime import compact_text, observation_attr, suppress_output - - -ALLOWED_ACTIONS = {"analyze_code", "edit_code", "run_tests", "submit_solution"} - - -class OpenAIActionPlanner: - """Ask an OpenAI-compatible model for the next safe environment action.""" - - def __init__(self, config: InferenceConfig) -> None: - self.config = config - self.client = OpenAI(base_url=config.api_base_url, api_key=config.hf_token) if config.hf_token else None - - def propose_action(self, observation: Any) -> AgentDecision: - if self.client is None: - return AgentDecision(action_type="run_tests", source="fallback", error="HF_TOKEN missing") - - prompt = self._build_prompt(observation) - for attempt in range(self.config.max_retries + 1): - try: - with suppress_output(): - response = self.client.chat.completions.create( - model=self.config.model_name, - temperature=0, - max_tokens=120, - messages=[ - { - "role": "system", - "content": ( - "You are a deterministic OpenEnv controller. " - "Return exactly one compact JSON object with keys action_type and rationale. " - "Allowed action_type values: analyze_code, run_tests, submit_solution. " - "Never emit markdown." - ), - }, - {"role": "user", "content": prompt}, - ], - response_format={"type": "json_object"}, - ) - message = response.choices[0].message.content or "" - return self._parse_action(message) - except Exception as exc: - if attempt >= self.config.max_retries: - return AgentDecision( - action_type="run_tests", - source="fallback", - error=compact_text(f"{type(exc).__name__}: {exc}", default="LLM failure"), - ) - time.sleep(0.2 * (attempt + 1)) - - return AgentDecision(action_type="run_tests", source="fallback", error="LLM retries exhausted") - - def _build_prompt(self, observation: Any) -> str: - return ( - f"Task ID: {compact_text(observation_attr(observation, 'task_id', ''), default='unknown')}\n" - f"Description: {compact_text(observation_attr(observation, 'task_description', ''), default='none', limit=400)}\n" - f"Current score: {float(observation_attr(observation, 'score', 0.01) or 0.01):.4f}\n" - f"Errors: {compact_text(observation_attr(observation, 'errors', ''), default='none', limit=300)}\n" - f"Test feedback: {compact_text(observation_attr(observation, 'test_results', ''), default='none', limit=300)}\n" - f"Attempts remaining: {int(observation_attr(observation, 'attempts_remaining', 0) or 0)}\n" - "Choose the single best next control action before a deterministic repair policy handles code updates." - ) - - def _parse_action(self, content: str) -> AgentDecision: - try: - payload = json.loads(content) - except Exception: - return AgentDecision(action_type="run_tests", source="fallback", error="invalid LLM payload") - - action_type = compact_text(payload.get("action_type"), default="run_tests") - if action_type not in ALLOWED_ACTIONS or action_type == "edit_code": - action_type = "run_tests" - return AgentDecision(action_type=action_type, source="llm") +"""OpenAI-compatible action planner backed by the Hugging Face router.""" + +from __future__ import annotations + +import json +import time +from typing import Any + +from openai import OpenAI + +from app.models.inference import AgentDecision, InferenceConfig +from app.utils.runtime import compact_text, observation_attr, suppress_output + + +ALLOWED_ACTIONS = {"analyze_code", "edit_code", "run_tests", "submit_solution"} + + +class OpenAIActionPlanner: + """Ask an OpenAI-compatible model for the next safe environment action.""" + + def __init__(self, config: InferenceConfig) -> None: + self.config = config + self.client = OpenAI(base_url=config.api_base_url, api_key=config.hf_token) if config.hf_token else None + + def propose_action(self, observation: Any) -> AgentDecision: + if self.client is None: + return AgentDecision(action_type="run_tests", source="fallback", error="HF_TOKEN missing") + + prompt = self._build_prompt(observation) + for attempt in range(self.config.max_retries + 1): + try: + with suppress_output(): + response = self.client.chat.completions.create( + model=self.config.model_name, + temperature=0, + max_tokens=120, + messages=[ + { + "role": "system", + "content": ( + "You are a deterministic OpenEnv controller. " + "Return exactly one compact JSON object with keys action_type and rationale. " + "Allowed action_type values: analyze_code, run_tests, submit_solution. " + "Never emit markdown." + ), + }, + {"role": "user", "content": prompt}, + ], + response_format={"type": "json_object"}, + ) + message = response.choices[0].message.content or "" + return self._parse_action(message) + except Exception as exc: + if attempt >= self.config.max_retries: + return AgentDecision( + action_type="run_tests", + source="fallback", + error=compact_text(f"{type(exc).__name__}: {exc}", default="LLM failure"), + ) + time.sleep(0.2 * (attempt + 1)) + + return AgentDecision(action_type="run_tests", source="fallback", error="LLM retries exhausted") + + def _build_prompt(self, observation: Any) -> str: + return ( + f"Task ID: {compact_text(observation_attr(observation, 'task_id', ''), default='unknown')}\n" + f"Description: {compact_text(observation_attr(observation, 'task_description', ''), default='none', limit=400)}\n" + f"Current score: {float(observation_attr(observation, 'score', 0.01) or 0.01):.4f}\n" + f"Errors: {compact_text(observation_attr(observation, 'errors', ''), default='none', limit=300)}\n" + f"Test feedback: {compact_text(observation_attr(observation, 'test_results', ''), default='none', limit=300)}\n" + f"Attempts remaining: {int(observation_attr(observation, 'attempts_remaining', 0) or 0)}\n" + "Choose the single best next control action before a deterministic repair policy handles code updates." + ) + + def _parse_action(self, content: str) -> AgentDecision: + try: + payload = json.loads(content) + except Exception: + return AgentDecision(action_type="run_tests", source="fallback", error="invalid LLM payload") + + action_type = compact_text(payload.get("action_type"), default="run_tests") + if action_type not in ALLOWED_ACTIONS or action_type == "edit_code": + action_type = "run_tests" + return AgentDecision(action_type=action_type, source="llm") diff --git a/app/streamlit_app.py b/app/streamlit_app.py index 59579549468833dafb20c4194e7002d4bfac4215..13012dc1d1f5ff893e12a7cb337b52eb12f3ab64 100644 --- a/app/streamlit_app.py +++ b/app/streamlit_app.py @@ -1,100 +1,100 @@ -"""Streamlit frontend for the multi-domain analyzer platform.""" - -from __future__ import annotations - -import streamlit as st - -from app.examples import EXAMPLES -from schemas.request import AnalyzeCodeRequest -from services.analysis_service import AnalysisService - - -analysis_service = AnalysisService() - - -def _analyze(code: str, context_window: str, traceback_text: str, domain_hint: str): - """Run the analysis service with validated request payloads.""" - - request = AnalyzeCodeRequest( - code=code, - context_window=context_window, - traceback_text=traceback_text, - domain_hint=domain_hint, # type: ignore[arg-type] - ) - return analysis_service.analyze(request) - - -def main() -> None: - """Render the Streamlit UI.""" - - st.set_page_config(page_title="Multi-Domain AI Code Analyzer", layout="wide") - st.title("Multi-Domain AI Code Analyzer & Improvement System") - st.caption("PyTorch-powered code review across DSA, Data Science, ML/DL, and Web backend code.") - - example_name = st.selectbox("Example input", list(EXAMPLES.keys())) - example = EXAMPLES[example_name] - auto_analyze = st.toggle("Real-time scoring", value=True) - - left, right = st.columns([1.2, 1.0]) - with left: - code = st.text_area("Code input", value=example["code"], height=420) - context_window = st.text_area("Context window", value=example["context_window"], height=100) - traceback_text = st.text_area("Optional traceback / runtime hint", value=example["traceback_text"], height=100) - domain_hint = st.selectbox("Domain hint", ["auto", "dsa", "data_science", "ml_dl", "web"], index=["auto", "dsa", "data_science", "ml_dl", "web"].index(example["domain_hint"])) - analyze_clicked = st.button("Analyze Code", type="primary") - - result = None - if code and (analyze_clicked or auto_analyze): - result = _analyze(code, context_window, traceback_text, domain_hint) - - with right: - if result is None: - st.info("Paste code or load an example to start analysis.") - else: - metric_cols = st.columns(4) - metric_cols[0].metric("Detected domain", result.detected_domain) - metric_cols[1].metric("ML score", f"{result.score_breakdown.ml_score:.0%}") - metric_cols[2].metric("Domain score", f"{result.score_breakdown.domain_score:.0%}") - metric_cols[3].metric("Reward", f"{result.score_breakdown.reward:.0%}") - st.bar_chart(result.domain_confidences) - st.caption(result.summary) - - if result is not None: - overview_tab, suggestions_tab, domain_tab, static_tab = st.tabs( - ["Overview", "Suggestions", "Domain Detail", "Static Analysis"] - ) - - with overview_tab: - st.subheader("Improvement Plan") - for step in result.improvement_plan: - st.write(f"- {step}") - st.subheader("Complexity") - st.write( - { - "time_complexity": result.static_analysis.time_complexity, - "space_complexity": result.static_analysis.space_complexity, - "cyclomatic_complexity": result.static_analysis.cyclomatic_complexity, - } - ) - - with suggestions_tab: - st.subheader("Suggestions") - for suggestion in result.domain_analysis.suggestions: - st.write(f"- {suggestion}") - if result.domain_analysis.issues: - st.subheader("Issues") - for issue in result.domain_analysis.issues: - st.write(f"- [{issue.severity}] {issue.title}: {issue.description}") - - with domain_tab: - st.subheader("Domain Highlights") - st.json(result.domain_analysis.highlights) - st.write(f"Domain score: {result.domain_analysis.domain_score:.0%}") - - with static_tab: - st.subheader("Static Analysis") - st.json(result.static_analysis.model_dump()) - - -if __name__ == "__main__": - main() +"""Streamlit frontend for the multi-domain analyzer platform.""" + +from __future__ import annotations + +import streamlit as st + +from app.examples import EXAMPLES +from schemas.request import AnalyzeCodeRequest +from services.analysis_service import AnalysisService + + +analysis_service = AnalysisService() + + +def _analyze(code: str, context_window: str, traceback_text: str, domain_hint: str): + """Run the analysis service with validated request payloads.""" + + request = AnalyzeCodeRequest( + code=code, + context_window=context_window, + traceback_text=traceback_text, + domain_hint=domain_hint, # type: ignore[arg-type] + ) + return analysis_service.analyze(request) + + +def main() -> None: + """Render the Streamlit UI.""" + + st.set_page_config(page_title="Multi-Domain AI Code Analyzer", layout="wide") + st.title("Multi-Domain AI Code Analyzer & Improvement System") + st.caption("PyTorch-powered code review across DSA, Data Science, ML/DL, and Web backend code.") + + example_name = st.selectbox("Example input", list(EXAMPLES.keys())) + example = EXAMPLES[example_name] + auto_analyze = st.toggle("Real-time scoring", value=True) + + left, right = st.columns([1.2, 1.0]) + with left: + code = st.text_area("Code input", value=example["code"], height=420) + context_window = st.text_area("Context window", value=example["context_window"], height=100) + traceback_text = st.text_area("Optional traceback / runtime hint", value=example["traceback_text"], height=100) + domain_hint = st.selectbox("Domain hint", ["auto", "dsa", "data_science", "ml_dl", "web"], index=["auto", "dsa", "data_science", "ml_dl", "web"].index(example["domain_hint"])) + analyze_clicked = st.button("Analyze Code", type="primary") + + result = None + if code and (analyze_clicked or auto_analyze): + result = _analyze(code, context_window, traceback_text, domain_hint) + + with right: + if result is None: + st.info("Paste code or load an example to start analysis.") + else: + metric_cols = st.columns(4) + metric_cols[0].metric("Detected domain", result.detected_domain) + metric_cols[1].metric("ML score", f"{result.score_breakdown.ml_score:.0%}") + metric_cols[2].metric("Domain score", f"{result.score_breakdown.domain_score:.0%}") + metric_cols[3].metric("Reward", f"{result.score_breakdown.reward:.0%}") + st.bar_chart(result.domain_confidences) + st.caption(result.summary) + + if result is not None: + overview_tab, suggestions_tab, domain_tab, static_tab = st.tabs( + ["Overview", "Suggestions", "Domain Detail", "Static Analysis"] + ) + + with overview_tab: + st.subheader("Improvement Plan") + for step in result.improvement_plan: + st.write(f"- {step}") + st.subheader("Complexity") + st.write( + { + "time_complexity": result.static_analysis.time_complexity, + "space_complexity": result.static_analysis.space_complexity, + "cyclomatic_complexity": result.static_analysis.cyclomatic_complexity, + } + ) + + with suggestions_tab: + st.subheader("Suggestions") + for suggestion in result.domain_analysis.suggestions: + st.write(f"- {suggestion}") + if result.domain_analysis.issues: + st.subheader("Issues") + for issue in result.domain_analysis.issues: + st.write(f"- [{issue.severity}] {issue.title}: {issue.description}") + + with domain_tab: + st.subheader("Domain Highlights") + st.json(result.domain_analysis.highlights) + st.write(f"Domain score: {result.domain_analysis.domain_score:.0%}") + + with static_tab: + st.subheader("Static Analysis") + st.json(result.static_analysis.model_dump()) + + +if __name__ == "__main__": + main() diff --git a/app/utils/__init__.py b/app/utils/__init__.py index 90078947c16b4f82a1ff0b83c78ac4b8e9001a28..d96f8c5f3e2145b34e24ef2c705fc9e5c60f5c7c 100644 --- a/app/utils/__init__.py +++ b/app/utils/__init__.py @@ -1,21 +1,21 @@ -"""Utility helpers shared by the inference runtime.""" - -from .runtime import ( - compact_text, - format_bool, - format_error, - format_reward, - observation_attr, - parse_task_ids, - suppress_output, -) - -__all__ = [ - "compact_text", - "format_bool", - "format_error", - "format_reward", - "observation_attr", - "parse_task_ids", - "suppress_output", -] +"""Utility helpers shared by the inference runtime.""" + +from .runtime import ( + compact_text, + format_bool, + format_error, + format_reward, + observation_attr, + parse_task_ids, + suppress_output, +) + +__all__ = [ + "compact_text", + "format_bool", + "format_error", + "format_reward", + "observation_attr", + "parse_task_ids", + "suppress_output", +] diff --git a/app/utils/runtime.py b/app/utils/runtime.py index 88d4da364e11a518adf6fa8c0c46ed4897de5012..20d816d07a48435cef3073d5c22a99ce32230c2d 100644 --- a/app/utils/runtime.py +++ b/app/utils/runtime.py @@ -1,95 +1,95 @@ -"""Formatting, parsing, and IO-suppression helpers for inference.""" - -from __future__ import annotations - -import io -from collections.abc import Iterable -from contextlib import contextmanager, redirect_stderr, redirect_stdout -from typing import Any, Iterator - -try: - from tasks import task_ids -except ImportError: # pragma: no cover - from python_env.tasks import task_ids # type: ignore[no-redef] - - -def compact_text( - value: Any, - *, - default: str = "", - limit: int = 240, - preserve_newlines: bool = False, -) -> str: - """Convert values into validator-safe text.""" - - if value is None: - return default - try: - text = str(value) - except Exception: - return default - if preserve_newlines: - text = text.strip() - else: - text = " ".join(text.split()) - return text[:limit] if text else default - - -def observation_attr(observation: Any, name: str, default: Any = None, *, preserve_newlines: bool = False) -> Any: - """Read an observation attribute without trusting the payload shape.""" - - if isinstance(observation, dict): - value = observation.get(name, default) - else: - value = getattr(observation, name, default) - if isinstance(value, str): - return compact_text( - value, - default=default if isinstance(default, str) else "", - preserve_newlines=preserve_newlines, - ) - return value - - -def format_bool(value: Any) -> str: - return "true" if bool(value) else "false" - - -def format_reward(value: Any) -> str: - try: - reward = float(value) - except Exception: - reward = 0.0 - return f"{reward:.2f}" - - -def format_error(value: Any) -> str: - text = compact_text(value, default="") - return text if text else "null" - - -def parse_task_ids() -> list[str]: - """Load stable task names with a deterministic fallback.""" - - try: - values = task_ids() - if isinstance(values, Iterable): - loaded = [compact_text(item, default="") for item in values] - loaded = [item for item in loaded if item] - if loaded: - return loaded - except Exception: - pass - return [ - "syntax_fix_invoice_totals", - "bug_fix_session_windows", - "optimization_rank_active_users", - ] - - -@contextmanager -def suppress_output() -> Iterator[None]: - """Silence libraries that write noisy logs to stdout or stderr.""" - - with redirect_stdout(io.StringIO()), redirect_stderr(io.StringIO()): - yield +"""Formatting, parsing, and IO-suppression helpers for inference.""" + +from __future__ import annotations + +import io +from collections.abc import Iterable +from contextlib import contextmanager, redirect_stderr, redirect_stdout +from typing import Any, Iterator + +try: + from tasks import task_ids +except ImportError: # pragma: no cover + from python_env.tasks import task_ids # type: ignore[no-redef] + + +def compact_text( + value: Any, + *, + default: str = "", + limit: int = 240, + preserve_newlines: bool = False, +) -> str: + """Convert values into validator-safe text.""" + + if value is None: + return default + try: + text = str(value) + except Exception: + return default + if preserve_newlines: + text = text.strip() + else: + text = " ".join(text.split()) + return text[:limit] if text else default + + +def observation_attr(observation: Any, name: str, default: Any = None, *, preserve_newlines: bool = False) -> Any: + """Read an observation attribute without trusting the payload shape.""" + + if isinstance(observation, dict): + value = observation.get(name, default) + else: + value = getattr(observation, name, default) + if isinstance(value, str): + return compact_text( + value, + default=default if isinstance(default, str) else "", + preserve_newlines=preserve_newlines, + ) + return value + + +def format_bool(value: Any) -> str: + return "true" if bool(value) else "false" + + +def format_reward(value: Any) -> str: + try: + reward = float(value) + except Exception: + reward = 0.0 + return f"{reward:.2f}" + + +def format_error(value: Any) -> str: + text = compact_text(value, default="") + return text if text else "null" + + +def parse_task_ids() -> list[str]: + """Load stable task names with a deterministic fallback.""" + + try: + values = task_ids() + if isinstance(values, Iterable): + loaded = [compact_text(item, default="") for item in values] + loaded = [item for item in loaded if item] + if loaded: + return loaded + except Exception: + pass + return [ + "syntax_fix_invoice_totals", + "bug_fix_session_windows", + "optimization_rank_active_users", + ] + + +@contextmanager +def suppress_output() -> Iterator[None]: + """Silence libraries that write noisy logs to stdout or stderr.""" + + with redirect_stdout(io.StringIO()), redirect_stderr(io.StringIO()): + yield diff --git a/client.py b/client.py index 0df35a7f5dfeea5508ab6ada6090b53dc302b486..7c960178e94c4090024df89d4a16a42561d342b6 100644 --- a/client.py +++ b/client.py @@ -7,11 +7,11 @@ from typing import Dict from openenv.core import EnvClient from openenv.core.client_types import StepResult -from .models import ( - PythonCodeReviewAction, - PythonCodeReviewObservation, - PythonCodeReviewState, -) +from .models import ( + PythonCodeReviewAction, + PythonCodeReviewObservation, + PythonCodeReviewState, +) class PythonCodeReviewEnv( diff --git a/graders/bug_fix.py b/graders/bug_fix.py index d43003247cb8adaa97b849b7054916e9d859aace..c4fc7750cb06569f9103d8b8e31da2caf2875668 100644 --- a/graders/bug_fix.py +++ b/graders/bug_fix.py @@ -3,10 +3,10 @@ from __future__ import annotations try: - from ..models import TaskGrade + from ..models import TaskGrade from ..tasks.catalog import ReviewTask except ImportError: - from models import TaskGrade + from models import TaskGrade from tasks.catalog import ReviewTask from .shared import ( diff --git a/graders/dispatch.py b/graders/dispatch.py index 43a02bef5b903cd94a570d6a5c56b6e301dcf544..6b4deb21bfafce14bc133439a8c2a61ad9ba3e0e 100644 --- a/graders/dispatch.py +++ b/graders/dispatch.py @@ -3,10 +3,10 @@ from __future__ import annotations try: - from ..models import TaskGrade + from ..models import TaskGrade from ..tasks.catalog import ReviewTask except ImportError: - from models import TaskGrade + from models import TaskGrade from tasks.catalog import ReviewTask from .bug_fix import grade_bug_fix_task diff --git a/graders/optimization.py b/graders/optimization.py index 94d0c08206740912dbb1fa32e31009d31424376f..440231f048a20a87bb82183060a1d453bac1dcc5 100644 --- a/graders/optimization.py +++ b/graders/optimization.py @@ -3,10 +3,10 @@ from __future__ import annotations try: - from ..models import TaskGrade + from ..models import TaskGrade from ..tasks.catalog import ReviewTask except ImportError: - from models import TaskGrade + from models import TaskGrade from tasks.catalog import ReviewTask from .shared import ( diff --git a/graders/shared.py b/graders/shared.py index 46ee1359d014c51bafa3438016b287065e7744cf..a59895988f525d8992b29ab78725bbc5f17f4dd9 100644 --- a/graders/shared.py +++ b/graders/shared.py @@ -2,20 +2,20 @@ from __future__ import annotations -import ast -import difflib -import math -import multiprocessing as mp -import os -import time -import traceback +import ast +import difflib +import math +import multiprocessing as mp +import os +import time +import traceback from typing import Any, Callable, Dict, List try: - from ..models import TaskGrade + from ..models import TaskGrade from ..tasks.catalog import CallCase, ReviewTask except ImportError: - from models import TaskGrade + from models import TaskGrade from tasks.catalog import CallCase, ReviewTask @@ -121,11 +121,11 @@ def _queue_worker( ) -def run_with_timeout( - worker: Callable[[Dict[str, Any]], Dict[str, Any]], - payload: Dict[str, Any], - timeout_s: float, -) -> Dict[str, Any]: +def run_with_timeout( + worker: Callable[[Dict[str, Any]], Dict[str, Any]], + payload: Dict[str, Any], + timeout_s: float, +) -> Dict[str, Any]: """Execute a worker in a subprocess and terminate on timeout.""" ctx = mp.get_context("spawn") @@ -146,31 +146,31 @@ def run_with_timeout( if not message["ok"]: return { "timed_out": False, - "error": f"{message['error']}\n{message['traceback']}", - } - return {"timed_out": False, "data": message["data"]} - - -def run_inline_with_timeout( - worker: Callable[[Dict[str, Any]], Dict[str, Any]], - payload: Dict[str, Any], - timeout_s: float, -) -> Dict[str, Any]: - """Fallback execution path for platforms where spawned workers are unreliable.""" - - started = time.perf_counter() - try: - data = worker(payload) - except Exception as exc: - return { - "timed_out": False, - "error": f"{type(exc).__name__}: {exc}\n{traceback.format_exc(limit=5)}", - } - - elapsed = time.perf_counter() - started - if elapsed > timeout_s: - return {"timed_out": True, "error": f"Execution exceeded {timeout_s:.1f}s timeout."} - return {"timed_out": False, "data": data} + "error": f"{message['error']}\n{message['traceback']}", + } + return {"timed_out": False, "data": message["data"]} + + +def run_inline_with_timeout( + worker: Callable[[Dict[str, Any]], Dict[str, Any]], + payload: Dict[str, Any], + timeout_s: float, +) -> Dict[str, Any]: + """Fallback execution path for platforms where spawned workers are unreliable.""" + + started = time.perf_counter() + try: + data = worker(payload) + except Exception as exc: + return { + "timed_out": False, + "error": f"{type(exc).__name__}: {exc}\n{traceback.format_exc(limit=5)}", + } + + elapsed = time.perf_counter() - started + if elapsed > timeout_s: + return {"timed_out": True, "error": f"Execution exceeded {timeout_s:.1f}s timeout."} + return {"timed_out": False, "data": data} def _execute_cases_worker(payload: Dict[str, Any]) -> Dict[str, Any]: @@ -375,7 +375,7 @@ def _benchmark_worker(payload: Dict[str, Any]) -> Dict[str, Any]: return {"baseline_seconds": baseline_seconds, "candidate_seconds": candidate_seconds} -def benchmark_candidate(task: ReviewTask, code: str, timeout_s: float) -> Dict[str, Any]: +def benchmark_candidate(task: ReviewTask, code: str, timeout_s: float) -> Dict[str, Any]: """Benchmark a candidate solution against the starter implementation.""" if not task.benchmark_config: @@ -389,10 +389,10 @@ def benchmark_candidate(task: ReviewTask, code: str, timeout_s: float) -> Dict[s "events": events, "iterations": task.benchmark_config.get("iterations", 5), } - if os.name == "nt": - result = run_inline_with_timeout(_benchmark_worker, payload, timeout_s=timeout_s) - else: - result = run_with_timeout(_benchmark_worker, payload, timeout_s=timeout_s) + if os.name == "nt": + result = run_inline_with_timeout(_benchmark_worker, payload, timeout_s=timeout_s) + else: + result = run_with_timeout(_benchmark_worker, payload, timeout_s=timeout_s) if result.get("timed_out"): return {"runtime_score": component_score(STRICT_SCORE_MIN), "timed_out": True, "details": result["error"]} if "error" in result: diff --git a/graders/syntax.py b/graders/syntax.py index 32c3437cf892d7cc50e8c3489891dc6e05418eba..350701f77c6e8059c050ab016d7df5fb06eab882 100644 --- a/graders/syntax.py +++ b/graders/syntax.py @@ -3,10 +3,10 @@ from __future__ import annotations try: - from ..models import TaskGrade + from ..models import TaskGrade from ..tasks.catalog import ReviewTask except ImportError: - from models import TaskGrade + from models import TaskGrade from tasks.catalog import ReviewTask from .shared import ( diff --git a/inference.py b/inference.py index beada78d444cc14cf9c210a6132b24699430c198..9ede6c47a468c19322eda425403a76ac266b41ea 100644 --- a/inference.py +++ b/inference.py @@ -1,12 +1,12 @@ -#!/usr/bin/env python3 -"""Root validator entrypoint.""" - -from __future__ import annotations - -import sys - -from app.env.runner import main - - -if __name__ == "__main__": - sys.exit(main()) +#!/usr/bin/env python3 +"""Root validator entrypoint.""" + +from __future__ import annotations + +import sys + +from app.env.runner import main + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/launch.py b/launch.py index c06c8d1cdf8c2a4a1dabf4cf54ca9534967d7212..71d10c43e0b3a6a05a767902d2a022f7662bdeb1 100644 --- a/launch.py +++ b/launch.py @@ -1,35 +1,35 @@ -"""Launch the FastAPI backend and Streamlit UI in one Docker container.""" - -from __future__ import annotations - -import subprocess -import sys - - -def main() -> int: - """Start the API backend in the background and keep Streamlit in the foreground.""" - - api_process = subprocess.Popen( - ["uvicorn", "api.main:app", "--host", "0.0.0.0", "--port", "8001"], - ) - try: - return subprocess.call( - [ - "streamlit", - "run", - "app/streamlit_app.py", - "--server.port", - "8000", - "--server.address", - "0.0.0.0", - "--server.headless", - "true", - ] - ) - finally: - api_process.terminate() - api_process.wait(timeout=10) - - -if __name__ == "__main__": - sys.exit(main()) +"""Launch the FastAPI backend and Streamlit UI in one Docker container.""" + +from __future__ import annotations + +import subprocess +import sys + + +def main() -> int: + """Start the API backend in the background and keep Streamlit in the foreground.""" + + api_process = subprocess.Popen( + ["uvicorn", "api.main:app", "--host", "0.0.0.0", "--port", "8001"], + ) + try: + return subprocess.call( + [ + "streamlit", + "run", + "app/streamlit_app.py", + "--server.port", + "8000", + "--server.address", + "0.0.0.0", + "--server.headless", + "true", + ] + ) + finally: + api_process.terminate() + api_process.wait(timeout=10) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/models.py b/models.py index 6002de2dd30c2ab339e1acf93c068d0160666960..5a83f60d050accc83ab05bb2bef8743c52237739 100644 --- a/models.py +++ b/models.py @@ -1,4 +1,4 @@ -"""Typed models for the python_code_review_env environment.""" +"""Typed models for the python_code_review_env environment.""" from __future__ import annotations @@ -23,22 +23,22 @@ class HistoryEntry(BaseModel): reward: float = Field(..., gt=0.0, lt=1.0, description="Reward returned for the step.") -class RewardDetails(BaseModel): - """Transparent reward decomposition for debugging and training.""" - - value: float = Field(..., gt=0.0, lt=1.0, description="Clamped net reward in (0.0, 1.0).") - syntax_reward: float = Field(default=0.0) - test_reward: float = Field(default=0.0) - correctness_bonus: float = Field(default=0.0) - quality_bonus: float = Field(default=0.0) - error_reduction_bonus: float = Field(default=0.0) - completion_bonus: float = Field(default=0.0) - runtime_bonus: float = Field(default=0.0) - progress_delta: float = Field(default=0.0) - invalid_action_penalty: float = Field(default=0.0) - timeout_penalty: float = Field(default=0.0) - regression_penalty: float = Field(default=0.0) - stagnation_penalty: float = Field(default=0.0) +class RewardDetails(BaseModel): + """Transparent reward decomposition for debugging and training.""" + + value: float = Field(..., gt=0.0, lt=1.0, description="Clamped net reward in (0.0, 1.0).") + syntax_reward: float = Field(default=0.0) + test_reward: float = Field(default=0.0) + correctness_bonus: float = Field(default=0.0) + quality_bonus: float = Field(default=0.0) + error_reduction_bonus: float = Field(default=0.0) + completion_bonus: float = Field(default=0.0) + runtime_bonus: float = Field(default=0.0) + progress_delta: float = Field(default=0.0) + invalid_action_penalty: float = Field(default=0.0) + timeout_penalty: float = Field(default=0.0) + regression_penalty: float = Field(default=0.0) + stagnation_penalty: float = Field(default=0.0) reason: str = Field(..., description="Human-readable reward explanation.") prev_score: float = Field(default=0.01, gt=0.0, lt=1.0) curr_score: float = Field(default=0.01, gt=0.0, lt=1.0) @@ -66,17 +66,17 @@ class PythonCodeReviewObservation(Observation): current_code: str = Field(..., description="Latest code under review.") errors: str = Field(default="", description="Syntax or execution errors.") test_results: str = Field(default="", description="Public test and benchmark feedback.") - visible_tests: List[str] = Field(default_factory=list) - history: List[HistoryEntry] = Field(default_factory=list) - attempts_remaining: int = Field(..., ge=0) - last_action_status: str = Field(default="") - last_action_error: Optional[str] = Field(default=None) - score: float = Field(..., gt=0.0, lt=1.0) - reward: float = Field(default=0.1, gt=0.0, lt=1.0) - done: bool = Field(default=False) - reward_details: RewardDetails = Field( - default_factory=lambda: RewardDetails(value=0.1, reason="Environment reset.") - ) + visible_tests: List[str] = Field(default_factory=list) + history: List[HistoryEntry] = Field(default_factory=list) + attempts_remaining: int = Field(..., ge=0) + last_action_status: str = Field(default="") + last_action_error: Optional[str] = Field(default=None) + score: float = Field(..., gt=0.0, lt=1.0) + reward: float = Field(default=0.1, gt=0.0, lt=1.0) + done: bool = Field(default=False) + reward_details: RewardDetails = Field( + default_factory=lambda: RewardDetails(value=0.1, reason="Environment reset.") + ) class PythonCodeReviewState(State): diff --git a/models/__init__.py b/models/__init__.py index b2134bd9b67926755e339a1f4abc2c67d31de5f9..e850debc4c529344baf4fdc31f9f9f5f46b953ed 100644 --- a/models/__init__.py +++ b/models/__init__.py @@ -1,66 +1,66 @@ -"""PyTorch-backed model wrappers plus OpenEnv schema exports.""" - -from __future__ import annotations - -import importlib.util -import sys -from pathlib import Path - -from .pytorch_model import PyTorchCodeAnalyzerModel - - -def _load_schema_module(): - schema_path = Path(__file__).resolve().parent.parent / "models.py" - spec = importlib.util.spec_from_file_location("_python_env_schema_models", schema_path) - if spec is None or spec.loader is None: # pragma: no cover - raise ImportError(f"Unable to load schema models from {schema_path}") - if spec.name in sys.modules: - return sys.modules[spec.name] - module = importlib.util.module_from_spec(spec) - sys.modules[spec.name] = module - spec.loader.exec_module(module) - for model_name in ( - "HistoryEntry", - "RewardDetails", - "PythonCodeReviewAction", - "PythonCodeReviewObservation", - "PythonCodeReviewState", - "TaskDescriptor", - "TaskSummary", - "TaskGrade", - "HealthResponse", - ): - getattr(module, model_name).model_rebuild() - return module - - -_schema_models = _load_schema_module() - -HealthResponse = _schema_models.HealthResponse -HistoryEntry = _schema_models.HistoryEntry -PythonAction = _schema_models.PythonAction -PythonCodeReviewAction = _schema_models.PythonCodeReviewAction -PythonCodeReviewObservation = _schema_models.PythonCodeReviewObservation -PythonCodeReviewState = _schema_models.PythonCodeReviewState -PythonObservation = _schema_models.PythonObservation -PythonState = _schema_models.PythonState -RewardDetails = _schema_models.RewardDetails -TaskDescriptor = _schema_models.TaskDescriptor -TaskGrade = _schema_models.TaskGrade -TaskSummary = _schema_models.TaskSummary - -__all__ = [ - "HealthResponse", - "HistoryEntry", - "PyTorchCodeAnalyzerModel", - "PythonAction", - "PythonCodeReviewAction", - "PythonCodeReviewObservation", - "PythonCodeReviewState", - "PythonObservation", - "PythonState", - "RewardDetails", - "TaskDescriptor", - "TaskGrade", - "TaskSummary", -] +"""PyTorch-backed model wrappers plus OpenEnv schema exports.""" + +from __future__ import annotations + +import importlib.util +import sys +from pathlib import Path + +from .pytorch_model import PyTorchCodeAnalyzerModel + + +def _load_schema_module(): + schema_path = Path(__file__).resolve().parent.parent / "models.py" + spec = importlib.util.spec_from_file_location("_python_env_schema_models", schema_path) + if spec is None or spec.loader is None: # pragma: no cover + raise ImportError(f"Unable to load schema models from {schema_path}") + if spec.name in sys.modules: + return sys.modules[spec.name] + module = importlib.util.module_from_spec(spec) + sys.modules[spec.name] = module + spec.loader.exec_module(module) + for model_name in ( + "HistoryEntry", + "RewardDetails", + "PythonCodeReviewAction", + "PythonCodeReviewObservation", + "PythonCodeReviewState", + "TaskDescriptor", + "TaskSummary", + "TaskGrade", + "HealthResponse", + ): + getattr(module, model_name).model_rebuild() + return module + + +_schema_models = _load_schema_module() + +HealthResponse = _schema_models.HealthResponse +HistoryEntry = _schema_models.HistoryEntry +PythonAction = _schema_models.PythonAction +PythonCodeReviewAction = _schema_models.PythonCodeReviewAction +PythonCodeReviewObservation = _schema_models.PythonCodeReviewObservation +PythonCodeReviewState = _schema_models.PythonCodeReviewState +PythonObservation = _schema_models.PythonObservation +PythonState = _schema_models.PythonState +RewardDetails = _schema_models.RewardDetails +TaskDescriptor = _schema_models.TaskDescriptor +TaskGrade = _schema_models.TaskGrade +TaskSummary = _schema_models.TaskSummary + +__all__ = [ + "HealthResponse", + "HistoryEntry", + "PyTorchCodeAnalyzerModel", + "PythonAction", + "PythonCodeReviewAction", + "PythonCodeReviewObservation", + "PythonCodeReviewState", + "PythonObservation", + "PythonState", + "RewardDetails", + "TaskDescriptor", + "TaskGrade", + "TaskSummary", +] diff --git a/models/pytorch_model.py b/models/pytorch_model.py index f3ff2e37177beaea1dc10b9b4a276d171bbfe112..d2b2f3b8d1a1d968c92d6dfe1ff63b4d70192282 100644 --- a/models/pytorch_model.py +++ b/models/pytorch_model.py @@ -1,149 +1,149 @@ -"""PyTorch + transformers model wrapper for multi-domain code scoring.""" - -from __future__ import annotations - -import hashlib -from typing import Dict, List, Sequence - -import torch -import torch.nn.functional as F - -try: - from transformers import AutoModel, AutoTokenizer -except Exception: - AutoModel = None # type: ignore[assignment] - AutoTokenizer = None # type: ignore[assignment] - - -DOMAIN_PROTOTYPES: Dict[str, List[str]] = { - "dsa": [ - "Binary search, hashmap optimization, recursion, dynamic programming, arrays, trees, graphs, stack, queue, complexity.", - "Competitive programming algorithm with loops, memoization, prefix sums, and asymptotic analysis.", - ], - "data_science": [ - "Pandas dataframe transformation, numpy vectorization, feature leakage, train test split, iterrows misuse.", - "Data cleaning pipeline using pandas, numpy, aggregation, joins, and vectorized operations.", - ], - "ml_dl": [ - "PyTorch model, training loop, optimizer, backward pass, eval mode, no_grad, loss function, dataloader.", - "Machine learning inference and training code with torch, sklearn, tensors, gradients, and model checkpoints.", - ], - "web": [ - "FastAPI endpoint, request validation, Pydantic models, async routes, API security, backend service design.", - "REST API backend with routers, dependency injection, input validation, serialization, and error handling.", - ], - "general": [ - "General Python utility code with readable structure, typing, tests, and maintainable abstractions.", - ], -} - -QUALITY_ANCHORS: Dict[str, List[str]] = { - "high": [ - "Readable typed Python code with validation, efficient algorithms, vectorized operations, safe inference, and clean API boundaries.", - "Production-ready code with small functions, docstrings, low complexity, and clear error handling.", - ], - "low": [ - "Brute-force nested loops, missing validation, unsafe input handling, missing eval mode, missing no_grad, and code smells.", - "Hard to maintain code with high complexity, repeated scans, mutable side effects, and unclear structure.", - ], -} - - -class _HashEmbeddingBackend: - """Torch-native fallback when pretrained weights cannot be loaded.""" - - def __init__(self, dimensions: int = 128) -> None: - self.dimensions = dimensions - self.model_id = "hashed-token-fallback" - self.backend_name = "hashed-token-fallback" - self.notes = ["Using hashed embeddings because pretrained transformer weights are unavailable."] - - def embed_texts(self, texts: Sequence[str]) -> torch.Tensor: - matrix = torch.zeros((len(texts), self.dimensions), dtype=torch.float32) - for row_index, text in enumerate(texts): - tokens = text.lower().split()[:512] - if not tokens: - matrix[row_index, 0] = 1.0 - continue - for token in tokens: - digest = hashlib.md5(token.encode("utf-8")).hexdigest() - bucket = int(digest[:8], 16) % self.dimensions - sign = -1.0 if int(digest[8:10], 16) % 2 else 1.0 - matrix[row_index, bucket] += sign - return F.normalize(matrix + 1e-6, dim=1) - - -class PyTorchCodeAnalyzerModel: - """Score code using pretrained transformer embeddings plus prototype similarity.""" - - def __init__(self, model_id: str = "huggingface/CodeBERTa-small-v1") -> None: - self.model_id = model_id - self.backend_name = model_id - self.notes: List[str] = [] - self._tokenizer = None - self._model = None - self._fallback = _HashEmbeddingBackend() - self._prototype_cache: Dict[str, torch.Tensor] = {} - - def _ensure_loaded(self) -> None: - if self._model is not None or self.notes: - return - if AutoTokenizer is None or AutoModel is None: - self.backend_name = self._fallback.backend_name - self.notes = list(self._fallback.notes) - return - try: - self._tokenizer = AutoTokenizer.from_pretrained(self.model_id) - self._model = AutoModel.from_pretrained(self.model_id) - self._model.eval() - self.notes.append(f"Loaded pretrained encoder `{self.model_id}`.") - except Exception as exc: - self.backend_name = self._fallback.backend_name - self.notes = list(self._fallback.notes) + [f"Pretrained load failed: {type(exc).__name__}: {exc}"] - - def _embed_texts(self, texts: Sequence[str]) -> torch.Tensor: - self._ensure_loaded() - if self._model is None or self._tokenizer is None: - return self._fallback.embed_texts(texts) - encoded = self._tokenizer(list(texts), padding=True, truncation=True, max_length=256, return_tensors="pt") - with torch.no_grad(): - outputs = self._model(**encoded) - hidden = outputs.last_hidden_state - mask = encoded["attention_mask"].unsqueeze(-1) - pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1) - return F.normalize(pooled, dim=1) - - def _prototype_matrix(self, bucket: str, texts: Sequence[str]) -> torch.Tensor: - if bucket not in self._prototype_cache: - self._prototype_cache[bucket] = self._embed_texts(texts) - return self._prototype_cache[bucket] - - def predict(self, code: str, context_window: str, static_summary: Dict[str, object]) -> Dict[str, object]: - """Predict domain probabilities and a model quality score.""" - - document = ( - f"Code:\n{code.strip()[:4000]}\n\n" - f"Context:\n{context_window.strip()[:1000]}\n\n" - f"Static hints:\n{static_summary}\n" - ) - candidate = self._embed_texts([document]) - - domain_scores: Dict[str, float] = {} - for domain, texts in DOMAIN_PROTOTYPES.items(): - matrix = self._prototype_matrix(f"domain:{domain}", texts) - similarity = torch.matmul(candidate, matrix.T).max().item() - domain_scores[domain] = round((similarity + 1.0) / 2.0, 4) - - high_matrix = self._prototype_matrix("quality:high", QUALITY_ANCHORS["high"]) - low_matrix = self._prototype_matrix("quality:low", QUALITY_ANCHORS["low"]) - high_similarity = torch.matmul(candidate, high_matrix.T).max().item() - low_similarity = torch.matmul(candidate, low_matrix.T).max().item() - ml_quality_score = torch.sigmoid(torch.tensor((high_similarity - low_similarity) * 4.0)).item() - - return { - "domain_scores": domain_scores, - "ml_quality_score": round(float(ml_quality_score), 4), - "backend_name": self.backend_name, - "model_id": self.model_id, - "notes": list(self.notes), - } +"""PyTorch + transformers model wrapper for multi-domain code scoring.""" + +from __future__ import annotations + +import hashlib +from typing import Dict, List, Sequence + +import torch +import torch.nn.functional as F + +try: + from transformers import AutoModel, AutoTokenizer +except Exception: + AutoModel = None # type: ignore[assignment] + AutoTokenizer = None # type: ignore[assignment] + + +DOMAIN_PROTOTYPES: Dict[str, List[str]] = { + "dsa": [ + "Binary search, hashmap optimization, recursion, dynamic programming, arrays, trees, graphs, stack, queue, complexity.", + "Competitive programming algorithm with loops, memoization, prefix sums, and asymptotic analysis.", + ], + "data_science": [ + "Pandas dataframe transformation, numpy vectorization, feature leakage, train test split, iterrows misuse.", + "Data cleaning pipeline using pandas, numpy, aggregation, joins, and vectorized operations.", + ], + "ml_dl": [ + "PyTorch model, training loop, optimizer, backward pass, eval mode, no_grad, loss function, dataloader.", + "Machine learning inference and training code with torch, sklearn, tensors, gradients, and model checkpoints.", + ], + "web": [ + "FastAPI endpoint, request validation, Pydantic models, async routes, API security, backend service design.", + "REST API backend with routers, dependency injection, input validation, serialization, and error handling.", + ], + "general": [ + "General Python utility code with readable structure, typing, tests, and maintainable abstractions.", + ], +} + +QUALITY_ANCHORS: Dict[str, List[str]] = { + "high": [ + "Readable typed Python code with validation, efficient algorithms, vectorized operations, safe inference, and clean API boundaries.", + "Production-ready code with small functions, docstrings, low complexity, and clear error handling.", + ], + "low": [ + "Brute-force nested loops, missing validation, unsafe input handling, missing eval mode, missing no_grad, and code smells.", + "Hard to maintain code with high complexity, repeated scans, mutable side effects, and unclear structure.", + ], +} + + +class _HashEmbeddingBackend: + """Torch-native fallback when pretrained weights cannot be loaded.""" + + def __init__(self, dimensions: int = 128) -> None: + self.dimensions = dimensions + self.model_id = "hashed-token-fallback" + self.backend_name = "hashed-token-fallback" + self.notes = ["Using hashed embeddings because pretrained transformer weights are unavailable."] + + def embed_texts(self, texts: Sequence[str]) -> torch.Tensor: + matrix = torch.zeros((len(texts), self.dimensions), dtype=torch.float32) + for row_index, text in enumerate(texts): + tokens = text.lower().split()[:512] + if not tokens: + matrix[row_index, 0] = 1.0 + continue + for token in tokens: + digest = hashlib.md5(token.encode("utf-8")).hexdigest() + bucket = int(digest[:8], 16) % self.dimensions + sign = -1.0 if int(digest[8:10], 16) % 2 else 1.0 + matrix[row_index, bucket] += sign + return F.normalize(matrix + 1e-6, dim=1) + + +class PyTorchCodeAnalyzerModel: + """Score code using pretrained transformer embeddings plus prototype similarity.""" + + def __init__(self, model_id: str = "huggingface/CodeBERTa-small-v1") -> None: + self.model_id = model_id + self.backend_name = model_id + self.notes: List[str] = [] + self._tokenizer = None + self._model = None + self._fallback = _HashEmbeddingBackend() + self._prototype_cache: Dict[str, torch.Tensor] = {} + + def _ensure_loaded(self) -> None: + if self._model is not None or self.notes: + return + if AutoTokenizer is None or AutoModel is None: + self.backend_name = self._fallback.backend_name + self.notes = list(self._fallback.notes) + return + try: + self._tokenizer = AutoTokenizer.from_pretrained(self.model_id) + self._model = AutoModel.from_pretrained(self.model_id) + self._model.eval() + self.notes.append(f"Loaded pretrained encoder `{self.model_id}`.") + except Exception as exc: + self.backend_name = self._fallback.backend_name + self.notes = list(self._fallback.notes) + [f"Pretrained load failed: {type(exc).__name__}: {exc}"] + + def _embed_texts(self, texts: Sequence[str]) -> torch.Tensor: + self._ensure_loaded() + if self._model is None or self._tokenizer is None: + return self._fallback.embed_texts(texts) + encoded = self._tokenizer(list(texts), padding=True, truncation=True, max_length=256, return_tensors="pt") + with torch.no_grad(): + outputs = self._model(**encoded) + hidden = outputs.last_hidden_state + mask = encoded["attention_mask"].unsqueeze(-1) + pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1) + return F.normalize(pooled, dim=1) + + def _prototype_matrix(self, bucket: str, texts: Sequence[str]) -> torch.Tensor: + if bucket not in self._prototype_cache: + self._prototype_cache[bucket] = self._embed_texts(texts) + return self._prototype_cache[bucket] + + def predict(self, code: str, context_window: str, static_summary: Dict[str, object]) -> Dict[str, object]: + """Predict domain probabilities and a model quality score.""" + + document = ( + f"Code:\n{code.strip()[:4000]}\n\n" + f"Context:\n{context_window.strip()[:1000]}\n\n" + f"Static hints:\n{static_summary}\n" + ) + candidate = self._embed_texts([document]) + + domain_scores: Dict[str, float] = {} + for domain, texts in DOMAIN_PROTOTYPES.items(): + matrix = self._prototype_matrix(f"domain:{domain}", texts) + similarity = torch.matmul(candidate, matrix.T).max().item() + domain_scores[domain] = round((similarity + 1.0) / 2.0, 4) + + high_matrix = self._prototype_matrix("quality:high", QUALITY_ANCHORS["high"]) + low_matrix = self._prototype_matrix("quality:low", QUALITY_ANCHORS["low"]) + high_similarity = torch.matmul(candidate, high_matrix.T).max().item() + low_similarity = torch.matmul(candidate, low_matrix.T).max().item() + ml_quality_score = torch.sigmoid(torch.tensor((high_similarity - low_similarity) * 4.0)).item() + + return { + "domain_scores": domain_scores, + "ml_quality_score": round(float(ml_quality_score), 4), + "backend_name": self.backend_name, + "model_id": self.model_id, + "notes": list(self.notes), + } diff --git a/pyproject.toml b/pyproject.toml index 8eebc1d97f89dc7c8e92fba2fb93b1ea2d97c559..a8db9e2976fcb90151c2b98e19903112d11bb63f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,16 +1,18 @@ -[build-system] -requires = ["setuptools>=68", "wheel"] -build-backend = "setuptools.build_meta" - +[build-system] +requires = ["setuptools>=68", "wheel"] +build-backend = "setuptools.build_meta" + [project] name = "openenv-python-code-review-env" version = "1.0.0" description = "TorchReview Copilot: AI-powered Python code triage with PyTorch and OpenEnv validation." readme = "README.md" requires-python = ">=3.10" + dependencies = [ "fastapi>=0.111.0", "gradio>=5.26.0", + "hf-xet>=1.4.3", "openai>=1.76.0", "openenv-core[core]>=0.2.2", "streamlit>=1.44.0", @@ -24,28 +26,13 @@ dev = [ "pytest>=8.0.0", "pytest-cov>=4.0.0", ] - -[project.scripts] -server = "python_env.server.app:main" - -[tool.setuptools] -include-package-data = true -packages = [ - "python_env", - "python_env.server", - "python_env.tasks", - "python_env.graders", - "python_env.api", - "python_env.app", - "python_env.app.agents", - "python_env.app.env", - "python_env.app.models", - "python_env.app.services", - "python_env.app.utils", - "python_env.analyzers", - "python_env.models", - "python_env.schemas", - "python_env.services", - "python_env.utils", -] -package-dir = { "python_env" = ".", "python_env.server" = "server", "python_env.tasks" = "tasks", "python_env.graders" = "graders", "python_env.api" = "api", "python_env.app" = "app", "python_env.app.agents" = "app/agents", "python_env.app.env" = "app/env", "python_env.app.models" = "app/models", "python_env.app.services" = "app/services", "python_env.app.utils" = "app/utils", "python_env.analyzers" = "analyzers", "python_env.models" = "models", "python_env.schemas" = "schemas", "python_env.services" = "services", "python_env.utils" = "utils" } + +[project.scripts] +server = "python_env.server.app:main" + +[tool.setuptools] +include-package-data = true + +[tool.setuptools.packages.find] +where = ["."] +include = ["*"] diff --git a/schemas/__init__.py b/schemas/__init__.py index e635325f1c40ea4e2797578f1fc3224f9548d1df..d2008615adec6aada77a9ada69cf8d3d8d5fb4c1 100644 --- a/schemas/__init__.py +++ b/schemas/__init__.py @@ -1,13 +1,13 @@ -"""Public schemas for the multi-domain analysis platform.""" - -from .request import AnalyzeCodeRequest -from .response import AnalyzeCodeResponse, AnalysisIssue, DomainAnalysis, ScoreBreakdown, StaticAnalysisSummary - -__all__ = [ - "AnalyzeCodeRequest", - "AnalyzeCodeResponse", - "AnalysisIssue", - "DomainAnalysis", - "ScoreBreakdown", - "StaticAnalysisSummary", -] +"""Public schemas for the multi-domain analysis platform.""" + +from .request import AnalyzeCodeRequest +from .response import AnalyzeCodeResponse, AnalysisIssue, DomainAnalysis, ScoreBreakdown, StaticAnalysisSummary + +__all__ = [ + "AnalyzeCodeRequest", + "AnalyzeCodeResponse", + "AnalysisIssue", + "DomainAnalysis", + "ScoreBreakdown", + "StaticAnalysisSummary", +] diff --git a/schemas/request.py b/schemas/request.py index c53252a73269901cb3bf98e8a10b2b5d2140ca66..32a906a29f9c5bfe51634343f0c70b024505593c 100644 --- a/schemas/request.py +++ b/schemas/request.py @@ -1,19 +1,19 @@ -"""Request schemas for code analysis endpoints and UI.""" - -from __future__ import annotations - -from typing import Literal - -from pydantic import BaseModel, Field - - -DomainHint = Literal["auto", "dsa", "data_science", "ml_dl", "web"] - - -class AnalyzeCodeRequest(BaseModel): - """Validated input payload for multi-domain code analysis.""" - - code: str = Field(..., min_length=1, description="Source code to analyze.") - context_window: str = Field(default="", max_length=2000, description="Optional repository or task context.") - traceback_text: str = Field(default="", max_length=2000, description="Optional runtime or test failure output.") - domain_hint: DomainHint = Field(default="auto", description="Optional domain override when auto detection is not desired.") +"""Request schemas for code analysis endpoints and UI.""" + +from __future__ import annotations + +from typing import Literal + +from pydantic import BaseModel, Field + + +DomainHint = Literal["auto", "dsa", "data_science", "ml_dl", "web"] + + +class AnalyzeCodeRequest(BaseModel): + """Validated input payload for multi-domain code analysis.""" + + code: str = Field(..., min_length=1, description="Source code to analyze.") + context_window: str = Field(default="", max_length=2000, description="Optional repository or task context.") + traceback_text: str = Field(default="", max_length=2000, description="Optional runtime or test failure output.") + domain_hint: DomainHint = Field(default="auto", description="Optional domain override when auto detection is not desired.") diff --git a/schemas/response.py b/schemas/response.py index 568543fa94d66642b2cf7c16c7f8e848709313df..ca56f139647df21db5ba709e124cda93a2226047 100644 --- a/schemas/response.py +++ b/schemas/response.py @@ -1,73 +1,73 @@ -"""Response schemas for the multi-domain analysis platform.""" - -from __future__ import annotations - -from typing import Dict, List, Literal - -from pydantic import BaseModel, Field - - -DomainType = Literal["dsa", "data_science", "ml_dl", "web", "general"] -Severity = Literal["low", "medium", "high"] - - -class AnalysisIssue(BaseModel): - """One detected issue or risk in the code snippet.""" - - title: str - severity: Severity - description: str - line_hint: int | None = None - - -class StaticAnalysisSummary(BaseModel): - """Language-agnostic static-analysis signals.""" - - syntax_valid: bool - syntax_error: str = "" - cyclomatic_complexity: int = Field(..., ge=1) - line_count: int = Field(..., ge=0) - max_loop_depth: int = Field(..., ge=0) - time_complexity: str = "Unknown" - space_complexity: str = "Unknown" - detected_imports: List[str] = Field(default_factory=list) - code_smells: List[str] = Field(default_factory=list) - - -class DomainAnalysis(BaseModel): - """Domain-specific analysis payload returned by an analyzer.""" - - domain: DomainType - domain_score: float = Field(..., ge=0.0, le=1.0) - issues: List[AnalysisIssue] = Field(default_factory=list) - suggestions: List[str] = Field(default_factory=list) - highlights: Dict[str, float | str] = Field(default_factory=dict) - - -class ScoreBreakdown(BaseModel): - """Reward inputs and final normalized score.""" - - ml_score: float = Field(..., ge=0.0, le=1.0) - domain_score: float = Field(..., ge=0.0, le=1.0) - lint_score: float = Field(..., ge=0.0, le=1.0) - complexity_penalty: float = Field(..., ge=0.0, le=1.0) - quality_signal: float = Field(..., ge=0.0, le=1.0) - error_reduction_signal: float = Field(..., ge=0.0, le=1.0) - completion_signal: float = Field(..., ge=0.0, le=1.0) - reward: float = Field(..., ge=0.0, le=1.0) - - -class AnalyzeCodeResponse(BaseModel): - """Top-level structured output for API and UI consumers.""" - - detected_domain: DomainType - domain_confidences: Dict[str, float] - score_breakdown: ScoreBreakdown - static_analysis: StaticAnalysisSummary - domain_analysis: DomainAnalysis - improvement_plan: List[str] = Field(default_factory=list) - model_backend: str - model_id: str - summary: str - context_window: str = "" - analysis_time_ms: float = Field(..., ge=0.0) +"""Response schemas for the multi-domain analysis platform.""" + +from __future__ import annotations + +from typing import Dict, List, Literal + +from pydantic import BaseModel, Field + + +DomainType = Literal["dsa", "data_science", "ml_dl", "web", "general"] +Severity = Literal["low", "medium", "high"] + + +class AnalysisIssue(BaseModel): + """One detected issue or risk in the code snippet.""" + + title: str + severity: Severity + description: str + line_hint: int | None = None + + +class StaticAnalysisSummary(BaseModel): + """Language-agnostic static-analysis signals.""" + + syntax_valid: bool + syntax_error: str = "" + cyclomatic_complexity: int = Field(..., ge=1) + line_count: int = Field(..., ge=0) + max_loop_depth: int = Field(..., ge=0) + time_complexity: str = "Unknown" + space_complexity: str = "Unknown" + detected_imports: List[str] = Field(default_factory=list) + code_smells: List[str] = Field(default_factory=list) + + +class DomainAnalysis(BaseModel): + """Domain-specific analysis payload returned by an analyzer.""" + + domain: DomainType + domain_score: float = Field(..., ge=0.0, le=1.0) + issues: List[AnalysisIssue] = Field(default_factory=list) + suggestions: List[str] = Field(default_factory=list) + highlights: Dict[str, float | str] = Field(default_factory=dict) + + +class ScoreBreakdown(BaseModel): + """Reward inputs and final normalized score.""" + + ml_score: float = Field(..., ge=0.0, le=1.0) + domain_score: float = Field(..., ge=0.0, le=1.0) + lint_score: float = Field(..., ge=0.0, le=1.0) + complexity_penalty: float = Field(..., ge=0.0, le=1.0) + quality_signal: float = Field(..., ge=0.0, le=1.0) + error_reduction_signal: float = Field(..., ge=0.0, le=1.0) + completion_signal: float = Field(..., ge=0.0, le=1.0) + reward: float = Field(..., ge=0.0, le=1.0) + + +class AnalyzeCodeResponse(BaseModel): + """Top-level structured output for API and UI consumers.""" + + detected_domain: DomainType + domain_confidences: Dict[str, float] + score_breakdown: ScoreBreakdown + static_analysis: StaticAnalysisSummary + domain_analysis: DomainAnalysis + improvement_plan: List[str] = Field(default_factory=list) + model_backend: str + model_id: str + summary: str + context_window: str = "" + analysis_time_ms: float = Field(..., ge=0.0) diff --git a/server/Dockerfile b/server/Dockerfile index c4c278e96861e0dfbb70a08e547135a427cc4833..1ee762e557a7e7782495b400c60ed5d4fa96d350 100644 --- a/server/Dockerfile +++ b/server/Dockerfile @@ -1,27 +1,27 @@ -FROM python:3.11-slim - -ENV PYTHONDONTWRITEBYTECODE=1 \ - PYTHONUNBUFFERED=1 \ - PYTHONUTF8=1 \ - PYTHONIOENCODING=utf-8 \ - PIP_NO_CACHE_DIR=1 \ - PIP_DISABLE_PIP_VERSION_CHECK=1 \ - ENABLE_GRADIO_DEMO=false - -WORKDIR /app - -COPY server/requirements.txt /tmp/requirements.txt - -RUN python -m pip install --upgrade pip && \ - pip install -r /tmp/requirements.txt - -COPY . /app - -RUN pip install --no-deps . - -EXPOSE 8000 - -HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ - CMD python -c "import urllib.request; urllib.request.urlopen('http://127.0.0.1:8000/health', timeout=3).read()" - -CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "8000"] +FROM python:3.11-slim + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + PYTHONUTF8=1 \ + PYTHONIOENCODING=utf-8 \ + PIP_NO_CACHE_DIR=1 \ + PIP_DISABLE_PIP_VERSION_CHECK=1 \ + ENABLE_GRADIO_DEMO=false + +WORKDIR /app + +COPY server/requirements.txt /tmp/requirements.txt + +RUN python -m pip install --upgrade pip && \ + pip install -r /tmp/requirements.txt + +COPY . /app + +RUN pip install --no-deps . + +EXPOSE 8000 + +HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ + CMD python -c "import urllib.request; urllib.request.urlopen('http://127.0.0.1:8000/health', timeout=3).read()" + +CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "8000", "--no-access-log"] diff --git a/server/app.py b/server/app.py index c31c67c04b0d146d09f08a777a9cd790349ffcb9..624a63ae7cdb5bea255bed764093f2c1c5e7fcee 100644 --- a/server/app.py +++ b/server/app.py @@ -1,80 +1,86 @@ -"""OpenEnv FastAPI entrypoint with optional Gradio mounting.""" - -from __future__ import annotations - -import os - -from fastapi import FastAPI - -try: - from openenv.core.env_server.http_server import create_app -except Exception as exc: # pragma: no cover - raise ImportError( - "openenv-core is required to run the API server. Install project dependencies first." - ) from exc - -try: - import gradio as gr -except Exception: - gr = None # type: ignore[assignment] - +"""OpenEnv FastAPI entrypoint with optional Gradio mounting.""" + +from __future__ import annotations + +import os + +from fastapi import FastAPI + +try: + from openenv.core.env_server.http_server import create_app +except Exception as exc: # pragma: no cover + raise ImportError( + "openenv-core is required to run the API server. Install project dependencies first." + ) from exc + +try: + import gradio as gr +except Exception: + gr = None # type: ignore[assignment] + try: from ..models import PythonCodeReviewAction, PythonCodeReviewObservation from .env import PythonCodeReviewEnvironment except ImportError: from models import PythonCodeReviewAction, PythonCodeReviewObservation from server.env import PythonCodeReviewEnvironment - - -def _gradio_enabled() -> bool: - for env_name in ("ENABLE_GRADIO_DEMO", "ENABLE_WEB_INTERFACE"): - if str(os.getenv(env_name, "")).strip().lower() in {"1", "true", "yes", "on"}: - return True - return False - - -def _max_concurrent_envs() -> int: - try: - return max(int(os.getenv("OPENENV_MAX_CONCURRENT_ENVS", "2")), 1) - except Exception: - return 2 - - + + +def _gradio_enabled() -> bool: + for env_name in ("ENABLE_GRADIO_DEMO", "ENABLE_WEB_INTERFACE"): + if str(os.getenv(env_name, "")).strip().lower() in {"1", "true", "yes", "on"}: + return True + return False + + +def _max_concurrent_envs() -> int: + try: + return max(int(os.getenv("OPENENV_MAX_CONCURRENT_ENVS", "2")), 1) + except Exception: + return 2 + + def build_application(): - """Compose the OpenEnv API with the Gradio demo frontend.""" - - api_app = create_app( - PythonCodeReviewEnvironment, - PythonCodeReviewAction, - PythonCodeReviewObservation, - env_name="python_code_review_env", - max_concurrent_envs=_max_concurrent_envs(), - ) + """Compose the OpenEnv API with the Gradio demo frontend.""" + + api_app = create_app( + PythonCodeReviewEnvironment, + PythonCodeReviewAction, + PythonCodeReviewObservation, + env_name="python_code_review_env", + max_concurrent_envs=_max_concurrent_envs(), + ) served_app = api_app if gr is not None and _gradio_enabled(): try: - from .demo import build_demo + from .demo import CSS, build_demo except ImportError: - from server.demo import build_demo - served_app = gr.mount_gradio_app(api_app, build_demo(), path="/") - - wrapper_app = FastAPI(title="python_code_review_env", version="1.0.0") - - @wrapper_app.get("/health", include_in_schema=False) - def _health() -> dict[str, str]: - return {"status": "ok"} - - wrapper_app.mount("/", served_app) - return wrapper_app - - -app = build_application() + from server.demo import CSS, build_demo + served_app = gr.mount_gradio_app( + api_app, + build_demo(), + path="/", + theme=gr.themes.Soft(primary_hue="orange", secondary_hue="amber"), + css=CSS, + ) + + wrapper_app = FastAPI(title="python_code_review_env", version="1.0.0") + @wrapper_app.get("/health", include_in_schema=False) + def _health() -> dict[str, str]: + return {"status": "ok"} -def main(host: str = "0.0.0.0", port: int = 8000) -> None: - import uvicorn + wrapper_app.mount("/", served_app) + return wrapper_app - uvicorn.run(app, host=host, port=port) + +app = build_application() + + +def main(host: str = "0.0.0.0", port: int = 8000) -> None: + import uvicorn + + uvicorn.run(app, host=host, port=port, access_log=False) if __name__ == "__main__": diff --git a/server/demo.py b/server/demo.py index 4e45cc392ee511b58df8bb1a66f5897b063647ed..98d4d8f80ee8b4ec173a98c6d0b09d81d9dfd2e5 100644 --- a/server/demo.py +++ b/server/demo.py @@ -1,441 +1,441 @@ -"""Gradio UI for TorchReview Copilot.""" - -from __future__ import annotations - -from html import escape - -import gradio as gr - -try: - from ..triage import get_default_engine -except ImportError: - from triage import get_default_engine - - -CSS = """ -:root { - --paper: #f6f1e8; - --ink: #162521; - --accent: #d95d39; - --panel: #fffdf8; - --border: #d6c4b8; - --muted: #5f6f67; - --good: #2d7d62; - --warn: #b76516; - --high: #b23a48; -} - -body, .gradio-container { - background: - radial-gradient(circle at top left, rgba(247, 197, 159, 0.35), transparent 35%), - linear-gradient(135deg, #f9f6ef 0%, #efe5d3 100%); - color: var(--ink); - font-family: Georgia, "Times New Roman", serif; -} - -.gradio-container { - max-width: 1260px !important; -} - -.hero-card, -.metric-card, -.subtle-card { - background: rgba(255, 253, 248, 0.95); - border: 1px solid var(--border); - border-radius: 20px; - box-shadow: 0 16px 40px rgba(22, 37, 33, 0.08); -} - -.hero-card { - padding: 28px 30px; - margin-bottom: 12px; -} - -.metric-card, -.subtle-card { - padding: 20px 22px; -} - -.eyebrow { - text-transform: uppercase; - letter-spacing: 0.12em; - font-size: 12px; - color: var(--accent); - margin-bottom: 10px; -} - -.hero-title { - font-size: 44px; - line-height: 1.05; - margin: 0 0 10px; -} - -.hero-copy { - margin: 0; - font-size: 18px; - line-height: 1.55; - color: var(--muted); -} - -.summary-title { - display: flex; - justify-content: space-between; - gap: 12px; - align-items: center; - margin-bottom: 14px; -} - -.pill { - display: inline-block; - padding: 6px 12px; - border-radius: 999px; - font-size: 12px; - text-transform: uppercase; - letter-spacing: 0.08em; - background: #efe5d3; -} - -.pill.low { color: var(--good); } -.pill.medium { color: var(--warn); } -.pill.high { color: var(--high); } - -.summary-grid { - display: grid; - grid-template-columns: repeat(2, minmax(0, 1fr)); - gap: 12px; - margin-top: 16px; -} - -.summary-stat { - background: #fff7ef; - border-radius: 14px; - padding: 12px 14px; - border: 1px solid rgba(214, 196, 184, 0.8); -} - -.summary-stat strong { - display: block; - font-size: 12px; - text-transform: uppercase; - letter-spacing: 0.08em; - color: var(--muted); - margin-bottom: 6px; -} - -.radar-wrap { - display: grid; - gap: 12px; -} - -.bar { - display: grid; - gap: 6px; -} - -.bar-head { - display: flex; - justify-content: space-between; - font-size: 13px; - color: var(--muted); -} - -.bar-track { - width: 100%; - height: 12px; - background: #f2e5d6; - border-radius: 999px; - overflow: hidden; -} - -.bar-fill { - height: 100%; - border-radius: 999px; -} - -.matched-box { - background: #fff7ef; - border: 1px solid rgba(214, 196, 184, 0.8); - border-radius: 16px; - padding: 14px; -} - -.how-grid { - display: grid; - grid-template-columns: repeat(4, minmax(0, 1fr)); - gap: 12px; -} - -.how-step { - background: rgba(255, 253, 248, 0.9); - border: 1px solid var(--border); - border-radius: 18px; - padding: 16px; -} - -@media (max-width: 900px) { - .hero-title { - font-size: 34px; - } - - .summary-grid, - .how-grid { - grid-template-columns: 1fr; - } -} -""" - - -def _default_outputs() -> tuple[str, str, str, str, str]: - return ( - "
Awaiting Analysis

Paste Python code, add an optional traceback, or load one of the built-in examples.

", - "
Live Triage Radar

Confidence bars will appear after the first analysis run.

", - "### Improvement Plan\nAnalyze a sample to generate syntax, edge-case, and scalability recommendations.", - "### Known Pattern Match\nThe nearest OpenEnv task will be highlighted here after inference runs.", - "### Model Notes\nBackend and extracted signal details will appear here.", - ) - - -def _summary_html(result) -> str: - issue = escape(result.issue_label.title()) - summary = escape(result.summary) - next_action = escape(result.suggested_next_action) - return f""" -
-
-
-
TorchReview Verdict
-

{issue} Issue

-
- {escape(result.repair_risk)} repair risk -
-

{summary}

-
-
- Reward Score - {result.reward_score:.0%} -
-
- ML Quality - {result.ml_quality_score:.0%} -
-
- Matched Pattern - {escape(result.matched_pattern.title)} -
-
- Inference Backend - {escape(result.model_backend)} -
-
- Lint Score - {result.lint_score:.0%} -
-
- Complexity Penalty - {result.complexity_penalty:.0%} -
-
- Next Action - {next_action} -
-
-
- """ - - -def _radar_html(result) -> str: - colors = { - "syntax": "#d95d39", - "logic": "#4f772d", - "performance": "#355070", - } - bars = [] - for label, score in result.confidence_scores.items(): - bars.append( - f""" -
-
{escape(label.title())}{score:.0%}
-
-
-
-
- """ - ) - return f""" -
-
Live Triage Radar
- {''.join(bars)} -
- Nearest Known Pattern: {escape(result.matched_pattern.title)}
- {escape(result.matched_pattern.summary)} -
-
- """ - - -def _plan_markdown(result) -> str: - plan_lines = "\n".join(f"{index + 1}. {step}" for index, step in enumerate(result.repair_plan)) - return ( - "### Improvement Plan\n" - f"**Primary issue:** `{result.issue_label}`\n\n" - f"{plan_lines}\n\n" - f"**Suggested next action:** {result.suggested_next_action}" - ) - - -def _match_markdown(result) -> str: - return ( - "### Known Pattern Match\n" - f"**Task:** `{result.matched_pattern.task_id}` \n" - f"**Title:** {result.matched_pattern.title} \n" - f"**Why it matched:** {result.matched_pattern.rationale} \n" - f"**Similarity:** {result.matched_pattern.similarity:.0%}" - ) - - -def _model_markdown(result) -> str: - signal_lines = "\n".join( - f"- `{signal.name}` -> {signal.value} ({signal.impact}, weight {signal.weight:.2f}): {signal.evidence}" - for signal in result.extracted_signals - ) or "- No strong static signals were extracted." - notes = "\n".join(f"- {item}" for item in result.inference_notes) or "- No additional backend notes." - return ( - "### Model Notes\n" - f"- **Model backend:** `{result.model_backend}`\n" - f"- **Model id:** `{result.model_id}`\n" - f"- **Analysis time:** `{result.analysis_time_ms:.2f} ms`\n\n" - "### Reward Formula\n" - f"- `reward = (0.5 x {result.ml_quality_score:.2f}) + (0.3 x {result.lint_score:.2f}) - (0.2 x {result.complexity_penalty:.2f})`\n" - f"- **Final reward:** `{result.reward_score:.2f}`\n\n" - "### Extracted Signals\n" - f"{signal_lines}\n\n" - "### Backend Notes\n" - f"{notes}" - ) - - -def analyze_inputs(code: str, traceback_text: str, context_window: str) -> tuple[str, str, str, str, str]: - """Run the triage engine and format outputs for the Gradio UI.""" - - result = get_default_engine().triage(code or "", traceback_text or "", context_window or "") - return ( - _summary_html(result), - _radar_html(result), - _plan_markdown(result), - _match_markdown(result), - _model_markdown(result), - ) - - -def load_example(example_key: str) -> tuple[str, str, str, str, str, str, str, str, str]: - """Populate the UI from a built-in example and immediately analyze it.""" - - example = get_default_engine().example_map()[example_key] - outputs = analyze_inputs(example.code, example.traceback_text, example.context_window) - header = ( - f"### Example Scenario\n" - f"**{example.title}** \n" - f"{example.summary} \n" - f"Label target: `{example.label}`" - ) - return (example.code, example.traceback_text, example.context_window, header, *outputs) - - -def build_demo() -> gr.Blocks: - """Create the TorchReview Copilot Gradio application.""" - - examples = get_default_engine().example_map() - first_example = next(iter(examples.values())) - - with gr.Blocks(theme=gr.themes.Soft(primary_hue="orange", secondary_hue="amber"), css=CSS, title="TorchReview Copilot") as demo: - gr.HTML( - """ -
-
Meta PyTorch OpenEnv Hackathon Demo
-

TorchReview Copilot

-

- AI-powered code review and improvement system using PyTorch to score code quality, surface bugs, - and generate a three-step improvement plan. OpenEnv stays underneath as the deterministic validation engine. -

-
- """ - ) - - with gr.Row(): - with gr.Column(scale=6): - example_choice = gr.Radio( - choices=[(item.title, item.key) for item in examples.values()], - value=first_example.key, - label="Try a built-in failure scenario", - info="Switching examples updates the Live Triage Radar immediately.", - ) - example_header = gr.Markdown() - code_input = gr.Code( - value=first_example.code, - language="python", - lines=18, - label="Python code under review", - ) - traceback_input = gr.Textbox( - value=first_example.traceback_text, - lines=7, - label="Optional traceback / failing test output", - placeholder="Paste stack traces, assertion failures, or benchmark notes here.", - ) - context_input = gr.Textbox( - value=first_example.context_window, - lines=4, - label="Context window", - placeholder="Describe expected behavior, constraints, or repository context.", - ) - with gr.Row(): - analyze_button = gr.Button("Analyze & Score Code", variant="primary") - clear_button = gr.Button("Clear Inputs", variant="secondary") - - with gr.Column(scale=5): - summary_html = gr.HTML() - radar_html = gr.HTML() - plan_markdown = gr.Markdown() - match_markdown = gr.Markdown() - model_markdown = gr.Markdown() - - gr.HTML( - """ -
-
How It Works
-
-
Input
Code plus optional traceback or benchmark signal.
-
Processing
Static checks extract parser, lint, complexity, and runtime clues.
-
Model
CodeBERTa embeddings run through PyTorch and score code quality against known OpenEnv patterns.
-
Output
Confidence radar, reward score, and a three-step improvement plan.
-
-
- """ - ) - - example_choice.change( - fn=load_example, - inputs=example_choice, - outputs=[code_input, traceback_input, context_input, example_header, summary_html, radar_html, plan_markdown, match_markdown, model_markdown], - show_progress="hidden", - ) - analyze_button.click( - fn=analyze_inputs, - inputs=[code_input, traceback_input, context_input], - outputs=[summary_html, radar_html, plan_markdown, match_markdown, model_markdown], - show_progress="minimal", - ) - clear_button.click( - fn=lambda: ("", "", "", "### Example Scenario\nChoose a built-in example or paste custom code.", *_default_outputs()), - inputs=None, - outputs=[code_input, traceback_input, context_input, example_header, summary_html, radar_html, plan_markdown, match_markdown, model_markdown], - show_progress="hidden", - ) - demo.load( - fn=load_example, - inputs=example_choice, - outputs=[code_input, traceback_input, context_input, example_header, summary_html, radar_html, plan_markdown, match_markdown, model_markdown], - show_progress="hidden", - ) - - return demo +"""Gradio UI for TorchReview Copilot.""" + +from __future__ import annotations + +from html import escape + +import gradio as gr + +try: + from ..triage import get_default_engine +except ImportError: + from triage import get_default_engine + + +CSS = """ +:root { + --paper: #f6f1e8; + --ink: #162521; + --accent: #d95d39; + --panel: #fffdf8; + --border: #d6c4b8; + --muted: #5f6f67; + --good: #2d7d62; + --warn: #b76516; + --high: #b23a48; +} + +body, .gradio-container { + background: + radial-gradient(circle at top left, rgba(247, 197, 159, 0.35), transparent 35%), + linear-gradient(135deg, #f9f6ef 0%, #efe5d3 100%); + color: var(--ink); + font-family: Georgia, "Times New Roman", serif; +} + +.gradio-container { + max-width: 1260px !important; +} + +.hero-card, +.metric-card, +.subtle-card { + background: rgba(255, 253, 248, 0.95); + border: 1px solid var(--border); + border-radius: 20px; + box-shadow: 0 16px 40px rgba(22, 37, 33, 0.08); +} + +.hero-card { + padding: 28px 30px; + margin-bottom: 12px; +} + +.metric-card, +.subtle-card { + padding: 20px 22px; +} + +.eyebrow { + text-transform: uppercase; + letter-spacing: 0.12em; + font-size: 12px; + color: var(--accent); + margin-bottom: 10px; +} + +.hero-title { + font-size: 44px; + line-height: 1.05; + margin: 0 0 10px; +} + +.hero-copy { + margin: 0; + font-size: 18px; + line-height: 1.55; + color: var(--muted); +} + +.summary-title { + display: flex; + justify-content: space-between; + gap: 12px; + align-items: center; + margin-bottom: 14px; +} + +.pill { + display: inline-block; + padding: 6px 12px; + border-radius: 999px; + font-size: 12px; + text-transform: uppercase; + letter-spacing: 0.08em; + background: #efe5d3; +} + +.pill.low { color: var(--good); } +.pill.medium { color: var(--warn); } +.pill.high { color: var(--high); } + +.summary-grid { + display: grid; + grid-template-columns: repeat(2, minmax(0, 1fr)); + gap: 12px; + margin-top: 16px; +} + +.summary-stat { + background: #fff7ef; + border-radius: 14px; + padding: 12px 14px; + border: 1px solid rgba(214, 196, 184, 0.8); +} + +.summary-stat strong { + display: block; + font-size: 12px; + text-transform: uppercase; + letter-spacing: 0.08em; + color: var(--muted); + margin-bottom: 6px; +} + +.radar-wrap { + display: grid; + gap: 12px; +} + +.bar { + display: grid; + gap: 6px; +} + +.bar-head { + display: flex; + justify-content: space-between; + font-size: 13px; + color: var(--muted); +} + +.bar-track { + width: 100%; + height: 12px; + background: #f2e5d6; + border-radius: 999px; + overflow: hidden; +} + +.bar-fill { + height: 100%; + border-radius: 999px; +} + +.matched-box { + background: #fff7ef; + border: 1px solid rgba(214, 196, 184, 0.8); + border-radius: 16px; + padding: 14px; +} + +.how-grid { + display: grid; + grid-template-columns: repeat(4, minmax(0, 1fr)); + gap: 12px; +} + +.how-step { + background: rgba(255, 253, 248, 0.9); + border: 1px solid var(--border); + border-radius: 18px; + padding: 16px; +} + +@media (max-width: 900px) { + .hero-title { + font-size: 34px; + } + + .summary-grid, + .how-grid { + grid-template-columns: 1fr; + } +} +""" + + +def _default_outputs() -> tuple[str, str, str, str, str]: + return ( + "
Awaiting Analysis

Paste Python code, add an optional traceback, or load one of the built-in examples.

", + "
Live Triage Radar

Confidence bars will appear after the first analysis run.

", + "### Improvement Plan\nAnalyze a sample to generate syntax, edge-case, and scalability recommendations.", + "### Known Pattern Match\nThe nearest OpenEnv task will be highlighted here after inference runs.", + "### Model Notes\nBackend and extracted signal details will appear here.", + ) + + +def _summary_html(result) -> str: + issue = escape(result.issue_label.title()) + summary = escape(result.summary) + next_action = escape(result.suggested_next_action) + return f""" +
+
+
+
TorchReview Verdict
+

{issue} Issue

+
+ {escape(result.repair_risk)} repair risk +
+

{summary}

+
+
+ Reward Score + {result.reward_score:.0%} +
+
+ ML Quality + {result.ml_quality_score:.0%} +
+
+ Matched Pattern + {escape(result.matched_pattern.title)} +
+
+ Inference Backend + {escape(result.model_backend)} +
+
+ Lint Score + {result.lint_score:.0%} +
+
+ Complexity Penalty + {result.complexity_penalty:.0%} +
+
+ Next Action + {next_action} +
+
+
+ """ + + +def _radar_html(result) -> str: + colors = { + "syntax": "#d95d39", + "logic": "#4f772d", + "performance": "#355070", + } + bars = [] + for label, score in result.confidence_scores.items(): + bars.append( + f""" +
+
{escape(label.title())}{score:.0%}
+
+
+
+
+ """ + ) + return f""" +
+
Live Triage Radar
+ {''.join(bars)} +
+ Nearest Known Pattern: {escape(result.matched_pattern.title)}
+ {escape(result.matched_pattern.summary)} +
+
+ """ + + +def _plan_markdown(result) -> str: + plan_lines = "\n".join(f"{index + 1}. {step}" for index, step in enumerate(result.repair_plan)) + return ( + "### Improvement Plan\n" + f"**Primary issue:** `{result.issue_label}`\n\n" + f"{plan_lines}\n\n" + f"**Suggested next action:** {result.suggested_next_action}" + ) + + +def _match_markdown(result) -> str: + return ( + "### Known Pattern Match\n" + f"**Task:** `{result.matched_pattern.task_id}` \n" + f"**Title:** {result.matched_pattern.title} \n" + f"**Why it matched:** {result.matched_pattern.rationale} \n" + f"**Similarity:** {result.matched_pattern.similarity:.0%}" + ) + + +def _model_markdown(result) -> str: + signal_lines = "\n".join( + f"- `{signal.name}` -> {signal.value} ({signal.impact}, weight {signal.weight:.2f}): {signal.evidence}" + for signal in result.extracted_signals + ) or "- No strong static signals were extracted." + notes = "\n".join(f"- {item}" for item in result.inference_notes) or "- No additional backend notes." + return ( + "### Model Notes\n" + f"- **Model backend:** `{result.model_backend}`\n" + f"- **Model id:** `{result.model_id}`\n" + f"- **Analysis time:** `{result.analysis_time_ms:.2f} ms`\n\n" + "### Reward Formula\n" + f"- `reward = (0.5 x {result.ml_quality_score:.2f}) + (0.3 x {result.lint_score:.2f}) - (0.2 x {result.complexity_penalty:.2f})`\n" + f"- **Final reward:** `{result.reward_score:.2f}`\n\n" + "### Extracted Signals\n" + f"{signal_lines}\n\n" + "### Backend Notes\n" + f"{notes}" + ) + + +def analyze_inputs(code: str, traceback_text: str, context_window: str) -> tuple[str, str, str, str, str]: + """Run the triage engine and format outputs for the Gradio UI.""" + + result = get_default_engine().triage(code or "", traceback_text or "", context_window or "") + return ( + _summary_html(result), + _radar_html(result), + _plan_markdown(result), + _match_markdown(result), + _model_markdown(result), + ) + + +def load_example(example_key: str) -> tuple[str, str, str, str, str, str, str, str, str]: + """Populate the UI from a built-in example and immediately analyze it.""" + + example = get_default_engine().example_map()[example_key] + outputs = analyze_inputs(example.code, example.traceback_text, example.context_window) + header = ( + f"### Example Scenario\n" + f"**{example.title}** \n" + f"{example.summary} \n" + f"Label target: `{example.label}`" + ) + return (example.code, example.traceback_text, example.context_window, header, *outputs) + + +def build_demo() -> gr.Blocks: + """Create the TorchReview Copilot Gradio application.""" + + examples = get_default_engine().example_map() + first_example = next(iter(examples.values())) + + with gr.Blocks(title="TorchReview Copilot") as demo: + gr.HTML( + """ +
+
Meta PyTorch OpenEnv Hackathon Demo
+

TorchReview Copilot

+

+ AI-powered code review and improvement system using PyTorch to score code quality, surface bugs, + and generate a three-step improvement plan. OpenEnv stays underneath as the deterministic validation engine. +

+
+ """ + ) + + with gr.Row(): + with gr.Column(scale=6): + example_choice = gr.Radio( + choices=[(item.title, item.key) for item in examples.values()], + value=first_example.key, + label="Try a built-in failure scenario", + info="Switching examples updates the Live Triage Radar immediately.", + ) + example_header = gr.Markdown() + code_input = gr.Code( + value=first_example.code, + language="python", + lines=18, + label="Python code under review", + ) + traceback_input = gr.Textbox( + value=first_example.traceback_text, + lines=7, + label="Optional traceback / failing test output", + placeholder="Paste stack traces, assertion failures, or benchmark notes here.", + ) + context_input = gr.Textbox( + value=first_example.context_window, + lines=4, + label="Context window", + placeholder="Describe expected behavior, constraints, or repository context.", + ) + with gr.Row(): + analyze_button = gr.Button("Analyze & Score Code", variant="primary") + clear_button = gr.Button("Clear Inputs", variant="secondary") + + with gr.Column(scale=5): + summary_html = gr.HTML() + radar_html = gr.HTML() + plan_markdown = gr.Markdown() + match_markdown = gr.Markdown() + model_markdown = gr.Markdown() + + gr.HTML( + """ +
+
How It Works
+
+
Input
Code plus optional traceback or benchmark signal.
+
Processing
Static checks extract parser, lint, complexity, and runtime clues.
+
Model
CodeBERTa embeddings run through PyTorch and score code quality against known OpenEnv patterns.
+
Output
Confidence radar, reward score, and a three-step improvement plan.
+
+
+ """ + ) + + example_choice.change( + fn=load_example, + inputs=example_choice, + outputs=[code_input, traceback_input, context_input, example_header, summary_html, radar_html, plan_markdown, match_markdown, model_markdown], + show_progress="hidden", + ) + analyze_button.click( + fn=analyze_inputs, + inputs=[code_input, traceback_input, context_input], + outputs=[summary_html, radar_html, plan_markdown, match_markdown, model_markdown], + show_progress="minimal", + ) + clear_button.click( + fn=lambda: ("", "", "", "### Example Scenario\nChoose a built-in example or paste custom code.", *_default_outputs()), + inputs=None, + outputs=[code_input, traceback_input, context_input, example_header, summary_html, radar_html, plan_markdown, match_markdown, model_markdown], + show_progress="hidden", + ) + demo.load( + fn=load_example, + inputs=example_choice, + outputs=[code_input, traceback_input, context_input, example_header, summary_html, radar_html, plan_markdown, match_markdown, model_markdown], + show_progress="hidden", + ) + + return demo diff --git a/server/env.py b/server/env.py index 121360770582cea646bb68a214487ab05b78d881..01130809937836e4d3be669f9dd5b7040f4e3543 100644 --- a/server/env.py +++ b/server/env.py @@ -11,24 +11,24 @@ from openenv.core.env_server.types import EnvironmentMetadata try: from ..graders import grade_task from ..graders.shared import component_score, safe_ratio, strict_score - from ..models import ( - HistoryEntry, - PythonCodeReviewAction, - PythonCodeReviewObservation, - PythonCodeReviewState, - RewardDetails, + from ..models import ( + HistoryEntry, + PythonCodeReviewAction, + PythonCodeReviewObservation, + PythonCodeReviewState, + RewardDetails, TaskGrade, ) from ..tasks import ReviewTask, list_tasks, select_task except ImportError: from graders import grade_task from graders.shared import component_score, safe_ratio, strict_score - from models import ( - HistoryEntry, - PythonCodeReviewAction, - PythonCodeReviewObservation, - PythonCodeReviewState, - RewardDetails, + from models import ( + HistoryEntry, + PythonCodeReviewAction, + PythonCodeReviewObservation, + PythonCodeReviewState, + RewardDetails, TaskGrade, ) from tasks import ReviewTask, list_tasks, select_task @@ -56,17 +56,17 @@ class PythonCodeReviewEnvironment( SUPPORTS_CONCURRENT_SESSIONS: bool = True - def __init__(self, verbose: bool = False, **_: Any) -> None: - super().__init__() - self.verbose = verbose - self._task: ReviewTask = list_tasks()[0] - self._current_code: str = self._task.starter_code - self._history: list[HistoryEntry] = [] - self._last_reward = RewardDetails(value=0.1, reason="Environment initialized.") - self._last_action_error: str | None = None - self._current_grade = _empty_grade() - self._state = PythonCodeReviewState(episode_id=str(uuid4()), step_count=0) - self.reset() + def __init__(self, verbose: bool = False, **_: Any) -> None: + super().__init__() + self.verbose = verbose + self._task: ReviewTask = list_tasks()[0] + self._current_code: str = self._task.starter_code + self._history: list[HistoryEntry] = [] + self._last_reward = RewardDetails(value=0.1, reason="Environment initialized.") + self._last_action_error: str | None = None + self._current_grade = _empty_grade() + self._state = PythonCodeReviewState(episode_id=str(uuid4()), step_count=0) + self.reset() def reset( self, @@ -74,17 +74,17 @@ class PythonCodeReviewEnvironment( episode_id: Optional[str] = None, **kwargs: Any, ) -> PythonCodeReviewObservation: - task_id = kwargs.get("task_id") - self._task = select_task(seed=seed, task_id=task_id) - self._current_code = self._task.starter_code - self._history = [] - self._last_action_error = None - self._last_reward = RewardDetails(value=0.1, reason="Environment reset.") - self._current_grade, self._last_action_error = self._safe_grade_task( - self._task, - self._current_code, - include_hidden=False, - ) + task_id = kwargs.get("task_id") + self._task = select_task(seed=seed, task_id=task_id) + self._current_code = self._task.starter_code + self._history = [] + self._last_action_error = None + self._last_reward = RewardDetails(value=0.1, reason="Environment reset.") + self._current_grade, self._last_action_error = self._safe_grade_task( + self._task, + self._current_code, + include_hidden=False, + ) self._state = PythonCodeReviewState( episode_id=episode_id or str(uuid4()), @@ -143,22 +143,22 @@ class PythonCodeReviewEnvironment( ) return observation, reward.value, observation.done, {"task_id": observation.task_id, "score": observation.score} - previous_grade = self._current_grade - status = "" - invalid_action = False - code_changed = False - use_hidden_grading = False - action_error: str | None = None - - if action.action_type == "edit_code": - if not action.code or not action.code.strip(): - invalid_action = True - status = "edit_code requires a non-empty code payload." - action_error = status - else: - code_changed = action.code != self._current_code - self._current_code = action.code - status = "Updated working copy from agent patch." + previous_grade = self._current_grade + status = "" + invalid_action = False + code_changed = False + use_hidden_grading = False + action_error: str | None = None + + if action.action_type == "edit_code": + if not action.code or not action.code.strip(): + invalid_action = True + status = "edit_code requires a non-empty code payload." + action_error = status + else: + code_changed = action.code != self._current_code + self._current_code = action.code + status = "Updated working copy from agent patch." elif action.action_type == "submit_solution": if action.code is not None and action.code.strip(): code_changed = action.code != self._current_code @@ -169,30 +169,30 @@ class PythonCodeReviewEnvironment( status = "Executed public validation suite." elif action.action_type == "analyze_code": status = "Generated static review summary." - else: # pragma: no cover - invalid_action = True - status = f"Unsupported action_type: {action.action_type}" - action_error = status + else: # pragma: no cover + invalid_action = True + status = f"Unsupported action_type: {action.action_type}" + action_error = status self._state.step_count += 1 - if invalid_action: - current_grade = previous_grade - else: - current_grade, grade_error = self._safe_grade_task( - self._task, - self._current_code, - include_hidden=use_hidden_grading, - timeout_s=timeout_s or 3.0, - ) - if grade_error: - action_error = grade_error - status = f"{status} Grading fallback used." - if action.action_type == "analyze_code": - status = self._analysis_status(current_grade) - elif action.action_type == "run_tests": - status = self._run_tests_status(current_grade, use_hidden_grading) - elif action.action_type == "submit_solution": + if invalid_action: + current_grade = previous_grade + else: + current_grade, grade_error = self._safe_grade_task( + self._task, + self._current_code, + include_hidden=use_hidden_grading, + timeout_s=timeout_s or 3.0, + ) + if grade_error: + action_error = grade_error + status = f"{status} Grading fallback used." + if action.action_type == "analyze_code": + status = self._analysis_status(current_grade) + elif action.action_type == "run_tests": + status = self._run_tests_status(current_grade, use_hidden_grading) + elif action.action_type == "submit_solution": status = self._submission_status(current_grade) done = use_hidden_grading or self._state.step_count >= self._task.max_steps @@ -217,11 +217,11 @@ class PythonCodeReviewEnvironment( reward=reward_details.value, ) ) - - self._current_grade = current_grade - self._last_reward = reward_details - self._last_action_error = action_error - attempts_remaining = max(self._task.max_steps - self._state.step_count, 0) + + self._current_grade = current_grade + self._last_reward = reward_details + self._last_action_error = action_error + attempts_remaining = max(self._task.max_steps - self._state.step_count, 0) self._state.task_id = self._task.task_id self._state.difficulty = self._task.difficulty @@ -234,19 +234,19 @@ class PythonCodeReviewEnvironment( self._state.score = current_grade.score self._state.done = done - observation = self._build_observation( - grade=current_grade, - status=status, - reward_details=reward_details, - ) - return observation, reward_details.value, observation.done, { - "task_id": observation.task_id, - "score": observation.score, - "done": observation.done, - "attempts_remaining": observation.attempts_remaining, - "last_action_status": observation.last_action_status, - "last_action_error": observation.last_action_error, - } + observation = self._build_observation( + grade=current_grade, + status=status, + reward_details=reward_details, + ) + return observation, reward_details.value, observation.done, { + "task_id": observation.task_id, + "score": observation.score, + "done": observation.done, + "attempts_remaining": observation.attempts_remaining, + "last_action_status": observation.last_action_status, + "last_action_error": observation.last_action_error, + } @property def state(self) -> PythonCodeReviewState: @@ -268,22 +268,22 @@ class PythonCodeReviewEnvironment( current_code=self._current_code, errors=self._format_errors(grade), test_results=self._format_test_results(grade), - visible_tests=list(self._task.visible_tests), - history=list(self._history), - attempts_remaining=self._state.attempts_remaining, - last_action_status=status, - last_action_error=self._last_action_error, - score=grade.score, - reward=reward_details.value, - done=self._state.done, - reward_details=reward_details, - metadata={ - "benchmark": "python_code_review_env", - "goal": self._task.goal, - "repo_summary": self._task.repo_summary, - "changed_files": self._task.changed_files, - "available_files": self._task.available_files, - "grade_details": grade.details, + visible_tests=list(self._task.visible_tests), + history=list(self._history), + attempts_remaining=self._state.attempts_remaining, + last_action_status=status, + last_action_error=self._last_action_error, + score=grade.score, + reward=reward_details.value, + done=self._state.done, + reward_details=reward_details, + metadata={ + "benchmark": "python_code_review_env", + "goal": self._task.goal, + "repo_summary": self._task.repo_summary, + "changed_files": self._task.changed_files, + "available_files": self._task.available_files, + "grade_details": grade.details, }, ) @@ -298,43 +298,43 @@ class PythonCodeReviewEnvironment( code_changed: bool, final_submission: bool, ) -> RewardDetails: - prev_score = previous_grade.score - curr_score = current_grade.score - prev_rate = safe_ratio(previous_grade.tests_passed, previous_grade.tests_total) - curr_rate = safe_ratio(current_grade.tests_passed, current_grade.tests_total) - prev_runtime = previous_grade.runtime_score - curr_runtime = current_grade.runtime_score - prev_compile_error = bool(str(previous_grade.details.get("compile_error", "")).strip()) - curr_compile_error = bool(str(current_grade.details.get("compile_error", "")).strip()) - - syntax_reward = 0.14 if previous_grade.syntax_score < 0.9 and current_grade.syntax_score >= 0.9 else 0.0 - test_reward = round(max(curr_rate - prev_rate, 0.0) * 0.28, 3) - progress_delta = round(max(curr_score - prev_score, 0.0) * 0.3, 3) - quality_bonus = round(max(current_grade.quality_score - previous_grade.quality_score, 0.0) * 0.12, 3) - runtime_bonus = round(max(curr_runtime - prev_runtime, 0.0) * 0.08, 3) - error_reduction_bonus = 0.1 if prev_compile_error and not curr_compile_error else 0.0 - completion_bonus = 0.14 if final_submission and curr_rate >= 0.999 and curr_score >= 0.94 else 0.0 - correctness_bonus = 0.12 if final_submission and curr_score >= 0.94 and prev_score < 0.94 else 0.0 - - invalid_action_penalty = round((0.04 + (0.08 * (1.0 - prev_score))) if invalid_action else 0.0, 3) - timeout_penalty = round((0.06 + (0.08 * max(curr_runtime, prev_runtime))) if timed_out else 0.0, 3) - regression_penalty = round(max(prev_score - curr_score, 0.0) * 0.25, 3) - stagnation_penalty = round((0.02 + (0.05 * prev_score)) if action.action_type == "edit_code" and not code_changed else 0.0, 3) - - raw_value = ( - 0.32 * curr_score - + syntax_reward - + test_reward - + progress_delta - + quality_bonus - + error_reduction_bonus - + completion_bonus - + runtime_bonus - + correctness_bonus - - invalid_action_penalty - - timeout_penalty - - regression_penalty - - stagnation_penalty + prev_score = previous_grade.score + curr_score = current_grade.score + prev_rate = safe_ratio(previous_grade.tests_passed, previous_grade.tests_total) + curr_rate = safe_ratio(current_grade.tests_passed, current_grade.tests_total) + prev_runtime = previous_grade.runtime_score + curr_runtime = current_grade.runtime_score + prev_compile_error = bool(str(previous_grade.details.get("compile_error", "")).strip()) + curr_compile_error = bool(str(current_grade.details.get("compile_error", "")).strip()) + + syntax_reward = 0.14 if previous_grade.syntax_score < 0.9 and current_grade.syntax_score >= 0.9 else 0.0 + test_reward = round(max(curr_rate - prev_rate, 0.0) * 0.28, 3) + progress_delta = round(max(curr_score - prev_score, 0.0) * 0.3, 3) + quality_bonus = round(max(current_grade.quality_score - previous_grade.quality_score, 0.0) * 0.12, 3) + runtime_bonus = round(max(curr_runtime - prev_runtime, 0.0) * 0.08, 3) + error_reduction_bonus = 0.1 if prev_compile_error and not curr_compile_error else 0.0 + completion_bonus = 0.14 if final_submission and curr_rate >= 0.999 and curr_score >= 0.94 else 0.0 + correctness_bonus = 0.12 if final_submission and curr_score >= 0.94 and prev_score < 0.94 else 0.0 + + invalid_action_penalty = round((0.04 + (0.08 * (1.0 - prev_score))) if invalid_action else 0.0, 3) + timeout_penalty = round((0.06 + (0.08 * max(curr_runtime, prev_runtime))) if timed_out else 0.0, 3) + regression_penalty = round(max(prev_score - curr_score, 0.0) * 0.25, 3) + stagnation_penalty = round((0.02 + (0.05 * prev_score)) if action.action_type == "edit_code" and not code_changed else 0.0, 3) + + raw_value = ( + 0.32 * curr_score + + syntax_reward + + test_reward + + progress_delta + + quality_bonus + + error_reduction_bonus + + completion_bonus + + runtime_bonus + + correctness_bonus + - invalid_action_penalty + - timeout_penalty + - regression_penalty + - stagnation_penalty ) value = _reward_value(raw_value) @@ -345,16 +345,16 @@ class PythonCodeReviewEnvironment( reason_parts.append("public test progress") if progress_delta: reason_parts.append("overall score improved") - if quality_bonus: - reason_parts.append("code quality improved") - if error_reduction_bonus: - reason_parts.append("errors removed") - if completion_bonus: - reason_parts.append("task completed") - if runtime_bonus: - reason_parts.append("runtime improved") - if correctness_bonus: - reason_parts.append("full correctness bonus") + if quality_bonus: + reason_parts.append("code quality improved") + if error_reduction_bonus: + reason_parts.append("errors removed") + if completion_bonus: + reason_parts.append("task completed") + if runtime_bonus: + reason_parts.append("runtime improved") + if correctness_bonus: + reason_parts.append("full correctness bonus") if invalid_action_penalty: reason_parts.append("invalid action penalty") if timeout_penalty: @@ -368,48 +368,48 @@ class PythonCodeReviewEnvironment( return RewardDetails( value=value, - syntax_reward=syntax_reward, - test_reward=test_reward, - correctness_bonus=correctness_bonus, - quality_bonus=quality_bonus, - error_reduction_bonus=error_reduction_bonus, - completion_bonus=completion_bonus, - runtime_bonus=runtime_bonus, - progress_delta=progress_delta, - invalid_action_penalty=invalid_action_penalty, - timeout_penalty=timeout_penalty, - regression_penalty=regression_penalty, - stagnation_penalty=stagnation_penalty, + syntax_reward=syntax_reward, + test_reward=test_reward, + correctness_bonus=correctness_bonus, + quality_bonus=quality_bonus, + error_reduction_bonus=error_reduction_bonus, + completion_bonus=completion_bonus, + runtime_bonus=runtime_bonus, + progress_delta=progress_delta, + invalid_action_penalty=invalid_action_penalty, + timeout_penalty=timeout_penalty, + regression_penalty=regression_penalty, + stagnation_penalty=stagnation_penalty, reason=", ".join(reason_parts), prev_score=prev_score, curr_score=curr_score, code_changed=code_changed, ) - def _format_errors(self, grade: TaskGrade) -> str: - compile_error = str(grade.details.get("compile_error", "")).strip() - if compile_error: - return compile_error - return "Code parses successfully." - - def _safe_grade_task( - self, - task: ReviewTask, - code: str, - *, - include_hidden: bool, - timeout_s: float = 3.0, - ) -> tuple[TaskGrade, str | None]: - try: - return ( - grade_task(task, code, include_hidden=include_hidden, timeout_s=timeout_s), - None, - ) - except Exception as exc: # pragma: no cover - return _empty_grade(), f"{type(exc).__name__}: {exc}" - - def _format_test_results(self, grade: TaskGrade) -> str: - parts = [grade.details.get("test_summary", "No test feedback available.")] + def _format_errors(self, grade: TaskGrade) -> str: + compile_error = str(grade.details.get("compile_error", "")).strip() + if compile_error: + return compile_error + return "Code parses successfully." + + def _safe_grade_task( + self, + task: ReviewTask, + code: str, + *, + include_hidden: bool, + timeout_s: float = 3.0, + ) -> tuple[TaskGrade, str | None]: + try: + return ( + grade_task(task, code, include_hidden=include_hidden, timeout_s=timeout_s), + None, + ) + except Exception as exc: # pragma: no cover + return _empty_grade(), f"{type(exc).__name__}: {exc}" + + def _format_test_results(self, grade: TaskGrade) -> str: + parts = [grade.details.get("test_summary", "No test feedback available.")] benchmark = grade.details.get("benchmark") if isinstance(benchmark, dict): parts.append( diff --git a/server/requirements.txt b/server/requirements.txt index f18e480e278b0d86cabfc279d871a6fcdf715203..c26841ff879cdda2d9a862166d3abf928384c87d 100644 --- a/server/requirements.txt +++ b/server/requirements.txt @@ -1,8 +1,8 @@ -openenv-core[core]>=0.2.2 -fastapi>=0.111.0 -gradio>=5.26.0 -uvicorn>=0.30.0 -openai>=1.76.0 -streamlit>=1.44.0 -torch>=2.2.0 -transformers>=4.45.0 +openenv-core[core]>=0.2.2 +fastapi>=0.111.0 +gradio>=5.26.0 +uvicorn>=0.30.0 +openai>=1.76.0 +streamlit>=1.44.0 +torch>=2.2.0 +transformers>=4.45.0 diff --git a/services/__init__.py b/services/__init__.py index f550466fcabc6cf41f476d66419384c4a1faaa22..411796a65cc40502bd32c5069c45edd05c8c0d95 100644 --- a/services/__init__.py +++ b/services/__init__.py @@ -1,7 +1,7 @@ -"""Service layer for orchestrating analysis, suggestions, and rewards.""" - -from .analysis_service import AnalysisService -from .reward_service import RewardService -from .suggestion_service import SuggestionService - -__all__ = ["AnalysisService", "RewardService", "SuggestionService"] +"""Service layer for orchestrating analysis, suggestions, and rewards.""" + +from .analysis_service import AnalysisService +from .reward_service import RewardService +from .suggestion_service import SuggestionService + +__all__ = ["AnalysisService", "RewardService", "SuggestionService"] diff --git a/services/analysis_service.py b/services/analysis_service.py index f7ce4fd3289d90a528128ac09aae9683242856bb..a15e1e20a7d4759acc2b5e37eadeb85c72cb51ac 100644 --- a/services/analysis_service.py +++ b/services/analysis_service.py @@ -1,139 +1,139 @@ -"""Orchestration layer for multi-domain code analysis.""" - -from __future__ import annotations - -import time -from typing import Any, Callable, Dict - -from analyzers import analyze_data_science_code, analyze_dsa_code, analyze_ml_code, analyze_web_code -from models import PyTorchCodeAnalyzerModel -from schemas.request import AnalyzeCodeRequest -from schemas.response import AnalyzeCodeResponse, DomainAnalysis, StaticAnalysisSummary -from services.reward_service import RewardService -from services.suggestion_service import SuggestionService -from utils import estimate_complexity, parse_code_structure - - -def _lint_score(parsed: Dict[str, Any]) -> float: - """Convert structural smells into a normalized lint-style score.""" - - score = 1.0 - if not parsed.get("syntax_valid", True): - score -= 0.45 - score -= min(parsed.get("long_lines", 0), 5) * 0.03 - if parsed.get("tabs_used"): - score -= 0.1 - if parsed.get("trailing_whitespace_lines"): - score -= 0.05 - if parsed.get("docstring_ratio", 0.0) == 0.0 and parsed.get("function_names"): - score -= 0.08 - return round(max(0.0, min(1.0, score)), 4) - - -class AnalysisService: - """End-to-end analysis pipeline shared by API and UI.""" - - def __init__(self) -> None: - self._model: PyTorchCodeAnalyzerModel | None = None - self.reward_service = RewardService() - self.suggestion_service = SuggestionService() - self._analyzers: Dict[str, Callable[[str, Dict[str, Any], Dict[str, Any]], DomainAnalysis]] = { - "dsa": analyze_dsa_code, - "data_science": analyze_data_science_code, - "ml_dl": analyze_ml_code, - "web": analyze_web_code, - } - - @property - def model(self) -> PyTorchCodeAnalyzerModel: - if self._model is None: - self._model = PyTorchCodeAnalyzerModel() - return self._model - - def _heuristic_domain_scores(self, parsed: Dict[str, Any], code: str) -> Dict[str, float]: - """Derive domain priors from imports and syntax-level hints.""" - - scores = { - "dsa": 0.2 + (0.15 if parsed.get("uses_recursion") else 0.0) + (0.15 if parsed.get("max_loop_depth", 0) >= 1 else 0.0), - "data_science": 0.2 + (0.35 if parsed.get("uses_pandas") or parsed.get("uses_numpy") else 0.0), - "ml_dl": 0.2 + (0.35 if parsed.get("uses_torch") or parsed.get("uses_sklearn") else 0.0), - "web": 0.2 + (0.35 if parsed.get("uses_fastapi") or parsed.get("uses_flask") else 0.0) + (0.1 if parsed.get("route_decorators") else 0.0), - "general": 0.2, - } - if "fastapi" in code.lower(): - scores["web"] += 0.1 - if "pandas" in code.lower() or "numpy" in code.lower(): - scores["data_science"] += 0.1 - if "torch" in code.lower(): - scores["ml_dl"] += 0.1 - if "while" in code or "for" in code: - scores["dsa"] += 0.05 - return {key: round(min(value, 0.99), 4) for key, value in scores.items()} - - def analyze(self, request: AnalyzeCodeRequest) -> AnalyzeCodeResponse: - """Run the complete multi-domain analysis pipeline.""" - - started = time.perf_counter() - parsed = parse_code_structure(request.code) - complexity = estimate_complexity(parsed, request.code) - model_prediction = self.model.predict(request.code, request.context_window, parsed) - heuristic_scores = self._heuristic_domain_scores(parsed, request.code) - - combined_scores = {} - for domain, heuristic_score in heuristic_scores.items(): - model_score = float(model_prediction["domain_scores"].get(domain, 0.2)) - combined_scores[domain] = round((0.6 * model_score) + (0.4 * heuristic_score), 4) - - detected_domain = request.domain_hint if request.domain_hint != "auto" else max(combined_scores, key=combined_scores.get) - analyzer = self._analyzers.get(detected_domain) - domain_analysis = ( - analyzer(request.code, parsed, complexity) - if analyzer is not None - else DomainAnalysis( - domain="general", - domain_score=0.6, - issues=[], - suggestions=["Add stronger domain-specific context for deeper analysis."], - highlights={}, - ) - ) - - lint_score = _lint_score(parsed) - score_breakdown = self.reward_service.compute( - ml_score=float(model_prediction["ml_quality_score"]), - domain_score=domain_analysis.domain_score, - lint_score=lint_score, - complexity_penalty=float(complexity["complexity_penalty"]), - ) - static_analysis = StaticAnalysisSummary( - syntax_valid=bool(parsed["syntax_valid"]), - syntax_error=str(parsed["syntax_error"]), - cyclomatic_complexity=int(complexity["cyclomatic_complexity"]), - line_count=int(parsed["line_count"]), - max_loop_depth=int(parsed["max_loop_depth"]), - time_complexity=str(complexity["time_complexity"]), - space_complexity=str(complexity["space_complexity"]), - detected_imports=list(parsed["imports"]), - code_smells=list(parsed["code_smells"]), - ) - improvement_plan = self.suggestion_service.build_improvement_plan( - domain_analysis=domain_analysis, - static_analysis=static_analysis, - ) - summary = ( - f"Detected `{detected_domain}` code with a model score of {score_breakdown.ml_score:.0%}, " - f"domain score {score_breakdown.domain_score:.0%}, and final reward {score_breakdown.reward:.0%}." - ) - return AnalyzeCodeResponse( - detected_domain=detected_domain, # type: ignore[arg-type] - domain_confidences=combined_scores, - score_breakdown=score_breakdown, - static_analysis=static_analysis, - domain_analysis=domain_analysis, - improvement_plan=improvement_plan, - model_backend=str(model_prediction["backend_name"]), - model_id=str(model_prediction["model_id"]), - summary=summary, - context_window=request.context_window, - analysis_time_ms=round((time.perf_counter() - started) * 1000.0, 2), - ) +"""Orchestration layer for multi-domain code analysis.""" + +from __future__ import annotations + +import time +from typing import Any, Callable, Dict + +from analyzers import analyze_data_science_code, analyze_dsa_code, analyze_ml_code, analyze_web_code +from models import PyTorchCodeAnalyzerModel +from schemas.request import AnalyzeCodeRequest +from schemas.response import AnalyzeCodeResponse, DomainAnalysis, StaticAnalysisSummary +from services.reward_service import RewardService +from services.suggestion_service import SuggestionService +from utils import estimate_complexity, parse_code_structure + + +def _lint_score(parsed: Dict[str, Any]) -> float: + """Convert structural smells into a normalized lint-style score.""" + + score = 1.0 + if not parsed.get("syntax_valid", True): + score -= 0.45 + score -= min(parsed.get("long_lines", 0), 5) * 0.03 + if parsed.get("tabs_used"): + score -= 0.1 + if parsed.get("trailing_whitespace_lines"): + score -= 0.05 + if parsed.get("docstring_ratio", 0.0) == 0.0 and parsed.get("function_names"): + score -= 0.08 + return round(max(0.0, min(1.0, score)), 4) + + +class AnalysisService: + """End-to-end analysis pipeline shared by API and UI.""" + + def __init__(self) -> None: + self._model: PyTorchCodeAnalyzerModel | None = None + self.reward_service = RewardService() + self.suggestion_service = SuggestionService() + self._analyzers: Dict[str, Callable[[str, Dict[str, Any], Dict[str, Any]], DomainAnalysis]] = { + "dsa": analyze_dsa_code, + "data_science": analyze_data_science_code, + "ml_dl": analyze_ml_code, + "web": analyze_web_code, + } + + @property + def model(self) -> PyTorchCodeAnalyzerModel: + if self._model is None: + self._model = PyTorchCodeAnalyzerModel() + return self._model + + def _heuristic_domain_scores(self, parsed: Dict[str, Any], code: str) -> Dict[str, float]: + """Derive domain priors from imports and syntax-level hints.""" + + scores = { + "dsa": 0.2 + (0.15 if parsed.get("uses_recursion") else 0.0) + (0.15 if parsed.get("max_loop_depth", 0) >= 1 else 0.0), + "data_science": 0.2 + (0.35 if parsed.get("uses_pandas") or parsed.get("uses_numpy") else 0.0), + "ml_dl": 0.2 + (0.35 if parsed.get("uses_torch") or parsed.get("uses_sklearn") else 0.0), + "web": 0.2 + (0.35 if parsed.get("uses_fastapi") or parsed.get("uses_flask") else 0.0) + (0.1 if parsed.get("route_decorators") else 0.0), + "general": 0.2, + } + if "fastapi" in code.lower(): + scores["web"] += 0.1 + if "pandas" in code.lower() or "numpy" in code.lower(): + scores["data_science"] += 0.1 + if "torch" in code.lower(): + scores["ml_dl"] += 0.1 + if "while" in code or "for" in code: + scores["dsa"] += 0.05 + return {key: round(min(value, 0.99), 4) for key, value in scores.items()} + + def analyze(self, request: AnalyzeCodeRequest) -> AnalyzeCodeResponse: + """Run the complete multi-domain analysis pipeline.""" + + started = time.perf_counter() + parsed = parse_code_structure(request.code) + complexity = estimate_complexity(parsed, request.code) + model_prediction = self.model.predict(request.code, request.context_window, parsed) + heuristic_scores = self._heuristic_domain_scores(parsed, request.code) + + combined_scores = {} + for domain, heuristic_score in heuristic_scores.items(): + model_score = float(model_prediction["domain_scores"].get(domain, 0.2)) + combined_scores[domain] = round((0.6 * model_score) + (0.4 * heuristic_score), 4) + + detected_domain = request.domain_hint if request.domain_hint != "auto" else max(combined_scores, key=combined_scores.get) + analyzer = self._analyzers.get(detected_domain) + domain_analysis = ( + analyzer(request.code, parsed, complexity) + if analyzer is not None + else DomainAnalysis( + domain="general", + domain_score=0.6, + issues=[], + suggestions=["Add stronger domain-specific context for deeper analysis."], + highlights={}, + ) + ) + + lint_score = _lint_score(parsed) + score_breakdown = self.reward_service.compute( + ml_score=float(model_prediction["ml_quality_score"]), + domain_score=domain_analysis.domain_score, + lint_score=lint_score, + complexity_penalty=float(complexity["complexity_penalty"]), + ) + static_analysis = StaticAnalysisSummary( + syntax_valid=bool(parsed["syntax_valid"]), + syntax_error=str(parsed["syntax_error"]), + cyclomatic_complexity=int(complexity["cyclomatic_complexity"]), + line_count=int(parsed["line_count"]), + max_loop_depth=int(parsed["max_loop_depth"]), + time_complexity=str(complexity["time_complexity"]), + space_complexity=str(complexity["space_complexity"]), + detected_imports=list(parsed["imports"]), + code_smells=list(parsed["code_smells"]), + ) + improvement_plan = self.suggestion_service.build_improvement_plan( + domain_analysis=domain_analysis, + static_analysis=static_analysis, + ) + summary = ( + f"Detected `{detected_domain}` code with a model score of {score_breakdown.ml_score:.0%}, " + f"domain score {score_breakdown.domain_score:.0%}, and final reward {score_breakdown.reward:.0%}." + ) + return AnalyzeCodeResponse( + detected_domain=detected_domain, # type: ignore[arg-type] + domain_confidences=combined_scores, + score_breakdown=score_breakdown, + static_analysis=static_analysis, + domain_analysis=domain_analysis, + improvement_plan=improvement_plan, + model_backend=str(model_prediction["backend_name"]), + model_id=str(model_prediction["model_id"]), + summary=summary, + context_window=request.context_window, + analysis_time_ms=round((time.perf_counter() - started) * 1000.0, 2), + ) diff --git a/services/reward_service.py b/services/reward_service.py index b2f5607ee66241da0012f5c93d93157b60d1d85e..f413e36de211244c4ba8f65992df032f803db80f 100644 --- a/services/reward_service.py +++ b/services/reward_service.py @@ -1,38 +1,38 @@ -"""Reward shaping logic for RL-ready code analysis scores.""" - -from __future__ import annotations - -from schemas.response import ScoreBreakdown - - -class RewardService: - """Compute reward scores from model, domain, lint, and complexity signals.""" - - def compute(self, *, ml_score: float, domain_score: float, lint_score: float, complexity_penalty: float) -> ScoreBreakdown: - """Apply dynamic reward shaping based on quality, errors, and completion.""" - - quality_signal = max(0.0, min(1.0, (0.45 * ml_score) + (0.3 * domain_score) + (0.25 * lint_score))) - error_reduction_signal = max(0.0, min(1.0, lint_score - (0.6 * complexity_penalty))) - completion_signal = max(0.0, min(1.0, (ml_score + domain_score + lint_score) / 3.0)) - reward = max( - 0.0, - min( - 1.0, - (0.35 * quality_signal) - + (0.25 * completion_signal) - + (0.2 * error_reduction_signal) - + (0.1 * ml_score) - + (0.1 * domain_score) - - (0.15 * complexity_penalty), - ), - ) - return ScoreBreakdown( - ml_score=round(ml_score, 4), - domain_score=round(domain_score, 4), - lint_score=round(lint_score, 4), - complexity_penalty=round(complexity_penalty, 4), - quality_signal=round(quality_signal, 4), - error_reduction_signal=round(error_reduction_signal, 4), - completion_signal=round(completion_signal, 4), - reward=round(reward, 4), - ) +"""Reward shaping logic for RL-ready code analysis scores.""" + +from __future__ import annotations + +from schemas.response import ScoreBreakdown + + +class RewardService: + """Compute reward scores from model, domain, lint, and complexity signals.""" + + def compute(self, *, ml_score: float, domain_score: float, lint_score: float, complexity_penalty: float) -> ScoreBreakdown: + """Apply dynamic reward shaping based on quality, errors, and completion.""" + + quality_signal = max(0.0, min(1.0, (0.45 * ml_score) + (0.3 * domain_score) + (0.25 * lint_score))) + error_reduction_signal = max(0.0, min(1.0, lint_score - (0.6 * complexity_penalty))) + completion_signal = max(0.0, min(1.0, (ml_score + domain_score + lint_score) / 3.0)) + reward = max( + 0.0, + min( + 1.0, + (0.35 * quality_signal) + + (0.25 * completion_signal) + + (0.2 * error_reduction_signal) + + (0.1 * ml_score) + + (0.1 * domain_score) + - (0.15 * complexity_penalty), + ), + ) + return ScoreBreakdown( + ml_score=round(ml_score, 4), + domain_score=round(domain_score, 4), + lint_score=round(lint_score, 4), + complexity_penalty=round(complexity_penalty, 4), + quality_signal=round(quality_signal, 4), + error_reduction_signal=round(error_reduction_signal, 4), + completion_signal=round(completion_signal, 4), + reward=round(reward, 4), + ) diff --git a/services/suggestion_service.py b/services/suggestion_service.py index 2c5683754cc2ed4a9bb9f630d30e5ffea24a9b72..fe6d314d2e57283c0864b7b27127d1ce6c010c1b 100644 --- a/services/suggestion_service.py +++ b/services/suggestion_service.py @@ -1,28 +1,28 @@ -"""Suggestion and improvement-plan generation for analyzed code.""" - -from __future__ import annotations - -from schemas.response import DomainAnalysis, StaticAnalysisSummary - - -class SuggestionService: - """Build high-signal improvement steps from analysis output.""" - - def build_improvement_plan(self, *, domain_analysis: DomainAnalysis, static_analysis: StaticAnalysisSummary) -> list[str]: - """Return a compact three-step plan optimized for developer action.""" - - primary_issue = ( - domain_analysis.issues[0].description - if domain_analysis.issues - else "Stabilize correctness first and keep the public behavior explicit." - ) - - step_one = f"Step 1 - Correctness and safety: {primary_issue}" - step_two = "Step 2 - Edge cases: test empty inputs, boundary values, malformed payloads, and failure-mode behavior explicitly." - step_three = "Step 3 - Scalability: reduce repeated scans, lower cyclomatic complexity, and benchmark the path on realistic input sizes." - - if domain_analysis.suggestions: - step_three = f"{step_three} Priority hint: {domain_analysis.suggestions[0]}" - if not static_analysis.syntax_valid: - step_one = f"Step 1 - Correctness and safety: fix the syntax error first ({static_analysis.syntax_error})." - return [step_one, step_two, step_three] +"""Suggestion and improvement-plan generation for analyzed code.""" + +from __future__ import annotations + +from schemas.response import DomainAnalysis, StaticAnalysisSummary + + +class SuggestionService: + """Build high-signal improvement steps from analysis output.""" + + def build_improvement_plan(self, *, domain_analysis: DomainAnalysis, static_analysis: StaticAnalysisSummary) -> list[str]: + """Return a compact three-step plan optimized for developer action.""" + + primary_issue = ( + domain_analysis.issues[0].description + if domain_analysis.issues + else "Stabilize correctness first and keep the public behavior explicit." + ) + + step_one = f"Step 1 - Correctness and safety: {primary_issue}" + step_two = "Step 2 - Edge cases: test empty inputs, boundary values, malformed payloads, and failure-mode behavior explicitly." + step_three = "Step 3 - Scalability: reduce repeated scans, lower cyclomatic complexity, and benchmark the path on realistic input sizes." + + if domain_analysis.suggestions: + step_three = f"{step_three} Priority hint: {domain_analysis.suggestions[0]}" + if not static_analysis.syntax_valid: + step_one = f"Step 1 - Correctness and safety: fix the syntax error first ({static_analysis.syntax_error})." + return [step_one, step_two, step_three] diff --git a/triage.py b/triage.py index 755f4d82cb3a79b44504a8a36ed1e3307de12d98..632647d3ad3428eb80ca3ecc5d3fc54e24d909a4 100644 --- a/triage.py +++ b/triage.py @@ -1,473 +1,473 @@ -"""PyTorch-backed triage pipeline for TorchReview Copilot.""" - -from __future__ import annotations - -import ast -import hashlib -import os -import re -import time -from functools import lru_cache -from typing import List, Sequence - -import torch -import torch.nn.functional as F - -try: - from transformers import AutoModel, AutoTokenizer -except Exception: - AutoModel = None # type: ignore[assignment] - AutoTokenizer = None # type: ignore[assignment] - -try: - from .triage_catalog import build_examples, build_prototypes - from .triage_models import ( - IssueLabel, - PrototypeMatch, - TriageExample, - TriagePrototype, - TriageResult, - TriageSignal, - ) -except ImportError: - from triage_catalog import build_examples, build_prototypes - from triage_models import ( - IssueLabel, - PrototypeMatch, - TriageExample, - TriagePrototype, - TriageResult, - TriageSignal, - ) - - -MODEL_ID = os.getenv("TRIAGE_MODEL_ID", "huggingface/CodeBERTa-small-v1") -MODEL_MAX_LENGTH = int(os.getenv("TRIAGE_MODEL_MAX_LENGTH", "256")) -LABELS: tuple[IssueLabel, ...] = ("syntax", "logic", "performance") - - -class _LoopDepthVisitor(ast.NodeVisitor): - """Track the maximum loop nesting depth in a code snippet.""" - - def __init__(self) -> None: - self.depth = 0 - self.max_depth = 0 - - def _visit_loop(self, node: ast.AST) -> None: - self.depth += 1 - self.max_depth = max(self.max_depth, self.depth) - self.generic_visit(node) - self.depth -= 1 - - def visit_For(self, node: ast.For) -> None: # noqa: N802 - self._visit_loop(node) - - def visit_While(self, node: ast.While) -> None: # noqa: N802 - self._visit_loop(node) - - def visit_comprehension(self, node: ast.comprehension) -> None: # noqa: N802 - self._visit_loop(node) - - -class HashingEmbeddingBackend: - """Deterministic torch-native fallback when pretrained weights are unavailable.""" - - def __init__(self, dimensions: int = 96) -> None: - self.dimensions = dimensions - self.model_id = "hashed-token-fallback" - self.backend_name = "hashed-token-fallback" - self.notes = ["Using hashed torch embeddings because pretrained weights are unavailable."] - - def embed_texts(self, texts: Sequence[str]) -> torch.Tensor: - rows = torch.zeros((len(texts), self.dimensions), dtype=torch.float32) - for row_index, text in enumerate(texts): - tokens = re.findall(r"[A-Za-z_]+|\d+|==|!=|<=|>=|\S", text.lower())[:512] - if not tokens: - rows[row_index, 0] = 1.0 - continue - for token in tokens: - digest = hashlib.md5(token.encode("utf-8")).hexdigest() - bucket = int(digest[:8], 16) % self.dimensions - sign = -1.0 if int(digest[8:10], 16) % 2 else 1.0 - rows[row_index, bucket] += sign - return F.normalize(rows + 1e-6, dim=1) - - -class TransformersEmbeddingBackend: - """Mean-pool CodeBERTa embeddings via torch + transformers.""" - - def __init__(self, model_id: str = MODEL_ID, force_fallback: bool = False) -> None: - self.model_id = model_id - self.force_fallback = force_fallback - self.backend_name = model_id - self.notes: List[str] = [] - self._fallback = HashingEmbeddingBackend() - self._tokenizer = None - self._model = None - self._load_error = "" - if force_fallback: - self.backend_name = self._fallback.backend_name - self.notes = list(self._fallback.notes) - - def _ensure_loaded(self) -> None: - if self.force_fallback or self._model is not None or self._load_error: - return - if AutoTokenizer is None or AutoModel is None: - self._load_error = "transformers is not installed." - else: - try: - self._tokenizer = AutoTokenizer.from_pretrained(self.model_id) - self._model = AutoModel.from_pretrained(self.model_id) - self._model.eval() - self.notes.append(f"Loaded pretrained encoder `{self.model_id}` for inference.") - except Exception as exc: - self._load_error = f"{type(exc).__name__}: {exc}" - - if self._load_error: - self.backend_name = self._fallback.backend_name - self.notes = list(self._fallback.notes) + [f"Pretrained load failed: {self._load_error}"] - - def embed_texts(self, texts: Sequence[str]) -> torch.Tensor: - self._ensure_loaded() - if self._model is None or self._tokenizer is None: - return self._fallback.embed_texts(texts) - - encoded = self._tokenizer( - list(texts), - padding=True, - truncation=True, - max_length=MODEL_MAX_LENGTH, - return_tensors="pt", - ) - with torch.no_grad(): - outputs = self._model(**encoded) - hidden_state = outputs.last_hidden_state - mask = encoded["attention_mask"].unsqueeze(-1) - pooled = (hidden_state * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1) - return F.normalize(pooled, dim=1) - - -def _sanitize_text(value: str) -> str: - text = (value or "").strip() - return text[:4000] - - -def _safe_softmax(scores: dict[IssueLabel, float]) -> dict[str, float]: - tensor = torch.tensor([scores[label] for label in LABELS], dtype=torch.float32) - probabilities = torch.softmax(tensor * 4.0, dim=0) - return {label: round(float(probabilities[index]), 4) for index, label in enumerate(LABELS)} - - -def _loop_depth(code: str) -> int: - try: - tree = ast.parse(code) - except SyntaxError: - return 0 - visitor = _LoopDepthVisitor() - visitor.visit(tree) - return visitor.max_depth - - -def _repair_risk(label: IssueLabel, confidence: float, signal_count: int) -> str: - base = {"syntax": 0.25, "logic": 0.55, "performance": 0.7}[label] - if confidence < 0.55: - base += 0.12 - if signal_count >= 4: - base += 0.08 - if base < 0.4: - return "low" - if base < 0.72: - return "medium" - return "high" - - -def _clamp_unit(value: float) -> float: - return round(max(0.0, min(1.0, float(value))), 4) - - -def _lint_score(code: str) -> float: - stripped_lines = [line.rstrip("\n") for line in code.splitlines()] - if not stripped_lines: - return 0.2 - - score = 1.0 - if any(len(line) > 88 for line in stripped_lines): - score -= 0.15 - if any(line.rstrip() != line for line in stripped_lines): - score -= 0.1 - if any("\t" in line for line in stripped_lines): - score -= 0.1 - try: - tree = ast.parse(code) - functions = [node for node in tree.body if isinstance(node, ast.FunctionDef)] - if functions and not ast.get_docstring(functions[0]): - score -= 0.08 - except SyntaxError: - score -= 0.45 - return _clamp_unit(score) - - -def _complexity_penalty(code: str) -> float: - try: - tree = ast.parse(code) - except SyntaxError: - return 0.95 - branch_nodes = sum(isinstance(node, (ast.If, ast.For, ast.While, ast.Try, ast.Match)) for node in ast.walk(tree)) - loop_depth = _loop_depth(code) - penalty = 0.1 + min(branch_nodes, 8) * 0.07 + min(loop_depth, 4) * 0.12 - return _clamp_unit(penalty) - - -class CodeTriageEngine: - """Combine static signals with PyTorch embeddings to classify code issues.""" - - def __init__( - self, - *, - backend: TransformersEmbeddingBackend | HashingEmbeddingBackend | None = None, - prototypes: Sequence[TriagePrototype] | None = None, - examples: Sequence[TriageExample] | None = None, - ) -> None: - self.backend = backend or TransformersEmbeddingBackend() - self.prototypes = list(prototypes or build_prototypes()) - self.examples = list(examples or build_examples()) - self._prototype_matrix: torch.Tensor | None = None - self._reference_code_matrix: torch.Tensor | None = None - - def example_map(self) -> dict[str, TriageExample]: - """Return UI examples keyed by task id.""" - - return {example.key: example for example in self.examples} - - def _build_document(self, code: str, traceback_text: str) -> str: - trace = _sanitize_text(traceback_text) or "No traceback supplied." - snippet = _sanitize_text(code) or "# No code supplied." - return f"Candidate code:\n{snippet}\n\nObserved failure:\n{trace}\n" - - def _build_review_document(self, code: str, traceback_text: str, context_window: str) -> str: - context = _sanitize_text(context_window) or "No additional context window supplied." - return ( - f"{self._build_document(code, traceback_text)}\n" - f"Context window:\n{context}\n" - ) - - def _prototype_embeddings(self) -> torch.Tensor: - if self._prototype_matrix is None: - reference_texts = [prototype.reference_text for prototype in self.prototypes] - self._prototype_matrix = self.backend.embed_texts(reference_texts) - return self._prototype_matrix - - def _reference_code_embeddings(self) -> torch.Tensor: - if self._reference_code_matrix is None: - reference_codes = [prototype.reference_code for prototype in self.prototypes] - self._reference_code_matrix = self.backend.embed_texts(reference_codes) - return self._reference_code_matrix - - def _extract_signals(self, code: str, traceback_text: str) -> tuple[list[TriageSignal], dict[IssueLabel, float], list[str]]: - trace = (traceback_text or "").lower() - heuristic_scores: dict[IssueLabel, float] = {label: 0.15 for label in LABELS} - signals: list[TriageSignal] = [] - notes: list[str] = [] - - try: - ast.parse(code) - signals.append( - TriageSignal( - name="syntax_parse", - value="passes", - impact="syntax", - weight=0.1, - evidence="Python AST parsing succeeded.", - ) - ) - heuristic_scores["logic"] += 0.05 - except SyntaxError as exc: - evidence = f"{exc.msg} at line {exc.lineno}" - signals.append( - TriageSignal( - name="syntax_parse", - value="fails", - impact="syntax", - weight=0.95, - evidence=evidence, - ) - ) - heuristic_scores["syntax"] += 0.85 - notes.append(f"Parser failure detected: {evidence}") - - if any(token in trace for token in ("syntaxerror", "indentationerror", "expected ':'")): - signals.append( - TriageSignal( - name="traceback_keyword", - value="syntaxerror", - impact="syntax", - weight=0.8, - evidence="Traceback contains a parser error.", - ) - ) - heuristic_scores["syntax"] += 0.55 - - if any(token in trace for token in ("assertionerror", "expected:", "actual:", "boundary", "missing", "incorrect")): - signals.append( - TriageSignal( - name="test_failure_signal", - value="assertion-style failure", - impact="logic", - weight=0.7, - evidence="Failure text points to behavioral mismatch instead of parser issues.", - ) - ) - heuristic_scores["logic"] += 0.55 - - if any(token in trace for token in ("timeout", "benchmark", "slow", "latency", "performance", "profiler")): - signals.append( - TriageSignal( - name="performance_trace", - value="latency regression", - impact="performance", - weight=0.85, - evidence="Traceback mentions benchmark or latency pressure.", - ) - ) - heuristic_scores["performance"] += 0.7 - - loop_depth = _loop_depth(code) - if loop_depth >= 2: - signals.append( - TriageSignal( - name="loop_depth", - value=str(loop_depth), - impact="performance", - weight=0.65, - evidence="Nested iteration increases runtime risk on larger fixtures.", - ) - ) - heuristic_scores["performance"] += 0.35 - - if "Counter(" in code or "defaultdict(" in code or "set(" in code: - heuristic_scores["performance"] += 0.05 - - if "return sessions" in code and "sessions.append" not in code: - signals.append( - TriageSignal( - name="state_update_gap", - value="possible missing final append", - impact="logic", - weight=0.45, - evidence="A collection is returned without an obvious final state flush.", - ) - ) - heuristic_scores["logic"] += 0.18 - - return signals, heuristic_scores, notes - - def _nearest_match(self, embedding: torch.Tensor) -> tuple[TriagePrototype, float, dict[str, float]]: - similarities = torch.matmul(embedding, self._prototype_embeddings().T)[0] - indexed_scores = { - self.prototypes[index].task_id: round(float((similarities[index] + 1.0) / 2.0), 4) - for index in range(len(self.prototypes)) - } - best_index = int(torch.argmax(similarities).item()) - best_prototype = self.prototypes[best_index] - best_similarity = float((similarities[best_index] + 1.0) / 2.0) - return best_prototype, best_similarity, indexed_scores - - def _repair_plan(self, label: IssueLabel, matched: TriagePrototype, context_window: str) -> list[str]: - context = _sanitize_text(context_window) - step_one = { - "syntax": "Step 1 - Syntax checking and bug fixes: resolve the parser break before touching behavior, then align the function with the expected contract.", - "logic": "Step 1 - Syntax checking and bug fixes: confirm the code parses cleanly, then patch the failing branch or state update causing the incorrect result.", - "performance": "Step 1 - Syntax checking and bug fixes: keep the implementation correct first, then isolate the slow section without changing external behavior.", - }[label] - step_two = ( - "Step 2 - Edge case handling: verify empty input, boundary values, missing fields, and final-state flush behavior " - f"against the known pattern `{matched.title}`." - ) - step_three = ( - "Step 3 - Scalability of code: remove repeated full scans, prefer linear-time data structures, " - "and benchmark the path on a production-like fixture." - ) - if context: - step_two = f"{step_two} Context window to preserve: {context}" - return [step_one, step_two, step_three] - - def _reference_quality_score(self, code: str, matched: TriagePrototype) -> float: - candidate = self.backend.embed_texts([_sanitize_text(code) or "# empty"]) - match_index = next(index for index, prototype in enumerate(self.prototypes) if prototype.task_id == matched.task_id) - reference = self._reference_code_embeddings()[match_index : match_index + 1] - score = float(torch.matmul(candidate, reference.T)[0][0].item()) - return _clamp_unit((score + 1.0) / 2.0) - - def triage(self, code: str, traceback_text: str = "", context_window: str = "") -> TriageResult: - """Run the full triage pipeline on code plus optional failure context.""" - - started = time.perf_counter() - document = self._build_review_document(code, traceback_text, context_window) - signals, heuristic_scores, notes = self._extract_signals(code, traceback_text) - - candidate_embedding = self.backend.embed_texts([document]) - matched, matched_similarity, prototype_scores = self._nearest_match(candidate_embedding) - - label_similarity = {label: 0.18 for label in LABELS} - for prototype in self.prototypes: - label_similarity[prototype.label] = max( - label_similarity[prototype.label], - prototype_scores[prototype.task_id], - ) - - combined_scores = { - label: 0.72 * label_similarity[label] + 0.28 * heuristic_scores[label] - for label in LABELS - } - confidence_scores = _safe_softmax(combined_scores) - issue_label = max(LABELS, key=lambda label: confidence_scores[label]) - top_confidence = confidence_scores[issue_label] - - top_signal = signals[0].evidence if signals else "Model similarity dominated the decision." - ml_quality_score = self._reference_quality_score(code, matched) - lint_score = _lint_score(code) - complexity_penalty = _complexity_penalty(code) - reward_score = _clamp_unit((0.5 * ml_quality_score) + (0.3 * lint_score) - (0.2 * complexity_penalty)) - summary = ( - f"Detected a {issue_label} issue with {top_confidence:.0%} confidence. " - f"The closest known failure pattern is `{matched.title}`, which indicates {matched.summary.lower()}. " - f"Predicted quality score is {ml_quality_score:.0%} with an RL-ready reward of {reward_score:.0%}." - ) - suggested_next_action = { - "syntax": "Fix the parser error first, then rerun validation before changing behavior.", - "logic": "Step through the smallest failing case and confirm the final branch/update behavior.", - "performance": "Replace repeated full-list scans with a linear-time aggregation strategy, then benchmark it.", - }[issue_label] - - return TriageResult( - issue_label=issue_label, - confidence_scores=confidence_scores, - repair_risk=_repair_risk(issue_label, top_confidence, len(signals)), - ml_quality_score=ml_quality_score, - lint_score=lint_score, - complexity_penalty=complexity_penalty, - reward_score=reward_score, - summary=summary, - matched_pattern=PrototypeMatch( - task_id=matched.task_id, - title=matched.title, - label=matched.label, - similarity=round(matched_similarity, 4), - summary=matched.summary, - rationale=top_signal, - ), - repair_plan=self._repair_plan(issue_label, matched, context_window), - suggested_next_action=suggested_next_action, - extracted_signals=signals, - model_backend=self.backend.backend_name, - model_id=self.backend.model_id, - inference_notes=list(self.backend.notes) + notes, - analysis_time_ms=round((time.perf_counter() - started) * 1000.0, 2), - ) - - -@lru_cache(maxsize=1) -def get_default_engine() -> CodeTriageEngine: - """Return a cached triage engine for the running process.""" - - return CodeTriageEngine() +"""PyTorch-backed triage pipeline for TorchReview Copilot.""" + +from __future__ import annotations + +import ast +import hashlib +import os +import re +import time +from functools import lru_cache +from typing import List, Sequence + +import torch +import torch.nn.functional as F + +try: + from transformers import AutoModel, AutoTokenizer +except Exception: + AutoModel = None # type: ignore[assignment] + AutoTokenizer = None # type: ignore[assignment] + +try: + from .triage_catalog import build_examples, build_prototypes + from .triage_models import ( + IssueLabel, + PrototypeMatch, + TriageExample, + TriagePrototype, + TriageResult, + TriageSignal, + ) +except ImportError: + from triage_catalog import build_examples, build_prototypes + from triage_models import ( + IssueLabel, + PrototypeMatch, + TriageExample, + TriagePrototype, + TriageResult, + TriageSignal, + ) + + +MODEL_ID = os.getenv("TRIAGE_MODEL_ID", "huggingface/CodeBERTa-small-v1") +MODEL_MAX_LENGTH = int(os.getenv("TRIAGE_MODEL_MAX_LENGTH", "256")) +LABELS: tuple[IssueLabel, ...] = ("syntax", "logic", "performance") + + +class _LoopDepthVisitor(ast.NodeVisitor): + """Track the maximum loop nesting depth in a code snippet.""" + + def __init__(self) -> None: + self.depth = 0 + self.max_depth = 0 + + def _visit_loop(self, node: ast.AST) -> None: + self.depth += 1 + self.max_depth = max(self.max_depth, self.depth) + self.generic_visit(node) + self.depth -= 1 + + def visit_For(self, node: ast.For) -> None: # noqa: N802 + self._visit_loop(node) + + def visit_While(self, node: ast.While) -> None: # noqa: N802 + self._visit_loop(node) + + def visit_comprehension(self, node: ast.comprehension) -> None: # noqa: N802 + self._visit_loop(node) + + +class HashingEmbeddingBackend: + """Deterministic torch-native fallback when pretrained weights are unavailable.""" + + def __init__(self, dimensions: int = 96) -> None: + self.dimensions = dimensions + self.model_id = "hashed-token-fallback" + self.backend_name = "hashed-token-fallback" + self.notes = ["Using hashed torch embeddings because pretrained weights are unavailable."] + + def embed_texts(self, texts: Sequence[str]) -> torch.Tensor: + rows = torch.zeros((len(texts), self.dimensions), dtype=torch.float32) + for row_index, text in enumerate(texts): + tokens = re.findall(r"[A-Za-z_]+|\d+|==|!=|<=|>=|\S", text.lower())[:512] + if not tokens: + rows[row_index, 0] = 1.0 + continue + for token in tokens: + digest = hashlib.md5(token.encode("utf-8")).hexdigest() + bucket = int(digest[:8], 16) % self.dimensions + sign = -1.0 if int(digest[8:10], 16) % 2 else 1.0 + rows[row_index, bucket] += sign + return F.normalize(rows + 1e-6, dim=1) + + +class TransformersEmbeddingBackend: + """Mean-pool CodeBERTa embeddings via torch + transformers.""" + + def __init__(self, model_id: str = MODEL_ID, force_fallback: bool = False) -> None: + self.model_id = model_id + self.force_fallback = force_fallback + self.backend_name = model_id + self.notes: List[str] = [] + self._fallback = HashingEmbeddingBackend() + self._tokenizer = None + self._model = None + self._load_error = "" + if force_fallback: + self.backend_name = self._fallback.backend_name + self.notes = list(self._fallback.notes) + + def _ensure_loaded(self) -> None: + if self.force_fallback or self._model is not None or self._load_error: + return + if AutoTokenizer is None or AutoModel is None: + self._load_error = "transformers is not installed." + else: + try: + self._tokenizer = AutoTokenizer.from_pretrained(self.model_id) + self._model = AutoModel.from_pretrained(self.model_id) + self._model.eval() + self.notes.append(f"Loaded pretrained encoder `{self.model_id}` for inference.") + except Exception as exc: + self._load_error = f"{type(exc).__name__}: {exc}" + + if self._load_error: + self.backend_name = self._fallback.backend_name + self.notes = list(self._fallback.notes) + [f"Pretrained load failed: {self._load_error}"] + + def embed_texts(self, texts: Sequence[str]) -> torch.Tensor: + self._ensure_loaded() + if self._model is None or self._tokenizer is None: + return self._fallback.embed_texts(texts) + + encoded = self._tokenizer( + list(texts), + padding=True, + truncation=True, + max_length=MODEL_MAX_LENGTH, + return_tensors="pt", + ) + with torch.no_grad(): + outputs = self._model(**encoded) + hidden_state = outputs.last_hidden_state + mask = encoded["attention_mask"].unsqueeze(-1) + pooled = (hidden_state * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1) + return F.normalize(pooled, dim=1) + + +def _sanitize_text(value: str) -> str: + text = (value or "").strip() + return text[:4000] + + +def _safe_softmax(scores: dict[IssueLabel, float]) -> dict[str, float]: + tensor = torch.tensor([scores[label] for label in LABELS], dtype=torch.float32) + probabilities = torch.softmax(tensor * 4.0, dim=0) + return {label: round(float(probabilities[index]), 4) for index, label in enumerate(LABELS)} + + +def _loop_depth(code: str) -> int: + try: + tree = ast.parse(code) + except SyntaxError: + return 0 + visitor = _LoopDepthVisitor() + visitor.visit(tree) + return visitor.max_depth + + +def _repair_risk(label: IssueLabel, confidence: float, signal_count: int) -> str: + base = {"syntax": 0.25, "logic": 0.55, "performance": 0.7}[label] + if confidence < 0.55: + base += 0.12 + if signal_count >= 4: + base += 0.08 + if base < 0.4: + return "low" + if base < 0.72: + return "medium" + return "high" + + +def _clamp_unit(value: float) -> float: + return round(max(0.0, min(1.0, float(value))), 4) + + +def _lint_score(code: str) -> float: + stripped_lines = [line.rstrip("\n") for line in code.splitlines()] + if not stripped_lines: + return 0.2 + + score = 1.0 + if any(len(line) > 88 for line in stripped_lines): + score -= 0.15 + if any(line.rstrip() != line for line in stripped_lines): + score -= 0.1 + if any("\t" in line for line in stripped_lines): + score -= 0.1 + try: + tree = ast.parse(code) + functions = [node for node in tree.body if isinstance(node, ast.FunctionDef)] + if functions and not ast.get_docstring(functions[0]): + score -= 0.08 + except SyntaxError: + score -= 0.45 + return _clamp_unit(score) + + +def _complexity_penalty(code: str) -> float: + try: + tree = ast.parse(code) + except SyntaxError: + return 0.95 + branch_nodes = sum(isinstance(node, (ast.If, ast.For, ast.While, ast.Try, ast.Match)) for node in ast.walk(tree)) + loop_depth = _loop_depth(code) + penalty = 0.1 + min(branch_nodes, 8) * 0.07 + min(loop_depth, 4) * 0.12 + return _clamp_unit(penalty) + + +class CodeTriageEngine: + """Combine static signals with PyTorch embeddings to classify code issues.""" + + def __init__( + self, + *, + backend: TransformersEmbeddingBackend | HashingEmbeddingBackend | None = None, + prototypes: Sequence[TriagePrototype] | None = None, + examples: Sequence[TriageExample] | None = None, + ) -> None: + self.backend = backend or TransformersEmbeddingBackend() + self.prototypes = list(prototypes or build_prototypes()) + self.examples = list(examples or build_examples()) + self._prototype_matrix: torch.Tensor | None = None + self._reference_code_matrix: torch.Tensor | None = None + + def example_map(self) -> dict[str, TriageExample]: + """Return UI examples keyed by task id.""" + + return {example.key: example for example in self.examples} + + def _build_document(self, code: str, traceback_text: str) -> str: + trace = _sanitize_text(traceback_text) or "No traceback supplied." + snippet = _sanitize_text(code) or "# No code supplied." + return f"Candidate code:\n{snippet}\n\nObserved failure:\n{trace}\n" + + def _build_review_document(self, code: str, traceback_text: str, context_window: str) -> str: + context = _sanitize_text(context_window) or "No additional context window supplied." + return ( + f"{self._build_document(code, traceback_text)}\n" + f"Context window:\n{context}\n" + ) + + def _prototype_embeddings(self) -> torch.Tensor: + if self._prototype_matrix is None: + reference_texts = [prototype.reference_text for prototype in self.prototypes] + self._prototype_matrix = self.backend.embed_texts(reference_texts) + return self._prototype_matrix + + def _reference_code_embeddings(self) -> torch.Tensor: + if self._reference_code_matrix is None: + reference_codes = [prototype.reference_code for prototype in self.prototypes] + self._reference_code_matrix = self.backend.embed_texts(reference_codes) + return self._reference_code_matrix + + def _extract_signals(self, code: str, traceback_text: str) -> tuple[list[TriageSignal], dict[IssueLabel, float], list[str]]: + trace = (traceback_text or "").lower() + heuristic_scores: dict[IssueLabel, float] = {label: 0.15 for label in LABELS} + signals: list[TriageSignal] = [] + notes: list[str] = [] + + try: + ast.parse(code) + signals.append( + TriageSignal( + name="syntax_parse", + value="passes", + impact="syntax", + weight=0.1, + evidence="Python AST parsing succeeded.", + ) + ) + heuristic_scores["logic"] += 0.05 + except SyntaxError as exc: + evidence = f"{exc.msg} at line {exc.lineno}" + signals.append( + TriageSignal( + name="syntax_parse", + value="fails", + impact="syntax", + weight=0.95, + evidence=evidence, + ) + ) + heuristic_scores["syntax"] += 0.85 + notes.append(f"Parser failure detected: {evidence}") + + if any(token in trace for token in ("syntaxerror", "indentationerror", "expected ':'")): + signals.append( + TriageSignal( + name="traceback_keyword", + value="syntaxerror", + impact="syntax", + weight=0.8, + evidence="Traceback contains a parser error.", + ) + ) + heuristic_scores["syntax"] += 0.55 + + if any(token in trace for token in ("assertionerror", "expected:", "actual:", "boundary", "missing", "incorrect")): + signals.append( + TriageSignal( + name="test_failure_signal", + value="assertion-style failure", + impact="logic", + weight=0.7, + evidence="Failure text points to behavioral mismatch instead of parser issues.", + ) + ) + heuristic_scores["logic"] += 0.55 + + if any(token in trace for token in ("timeout", "benchmark", "slow", "latency", "performance", "profiler")): + signals.append( + TriageSignal( + name="performance_trace", + value="latency regression", + impact="performance", + weight=0.85, + evidence="Traceback mentions benchmark or latency pressure.", + ) + ) + heuristic_scores["performance"] += 0.7 + + loop_depth = _loop_depth(code) + if loop_depth >= 2: + signals.append( + TriageSignal( + name="loop_depth", + value=str(loop_depth), + impact="performance", + weight=0.65, + evidence="Nested iteration increases runtime risk on larger fixtures.", + ) + ) + heuristic_scores["performance"] += 0.35 + + if "Counter(" in code or "defaultdict(" in code or "set(" in code: + heuristic_scores["performance"] += 0.05 + + if "return sessions" in code and "sessions.append" not in code: + signals.append( + TriageSignal( + name="state_update_gap", + value="possible missing final append", + impact="logic", + weight=0.45, + evidence="A collection is returned without an obvious final state flush.", + ) + ) + heuristic_scores["logic"] += 0.18 + + return signals, heuristic_scores, notes + + def _nearest_match(self, embedding: torch.Tensor) -> tuple[TriagePrototype, float, dict[str, float]]: + similarities = torch.matmul(embedding, self._prototype_embeddings().T)[0] + indexed_scores = { + self.prototypes[index].task_id: round(float((similarities[index] + 1.0) / 2.0), 4) + for index in range(len(self.prototypes)) + } + best_index = int(torch.argmax(similarities).item()) + best_prototype = self.prototypes[best_index] + best_similarity = float((similarities[best_index] + 1.0) / 2.0) + return best_prototype, best_similarity, indexed_scores + + def _repair_plan(self, label: IssueLabel, matched: TriagePrototype, context_window: str) -> list[str]: + context = _sanitize_text(context_window) + step_one = { + "syntax": "Step 1 - Syntax checking and bug fixes: resolve the parser break before touching behavior, then align the function with the expected contract.", + "logic": "Step 1 - Syntax checking and bug fixes: confirm the code parses cleanly, then patch the failing branch or state update causing the incorrect result.", + "performance": "Step 1 - Syntax checking and bug fixes: keep the implementation correct first, then isolate the slow section without changing external behavior.", + }[label] + step_two = ( + "Step 2 - Edge case handling: verify empty input, boundary values, missing fields, and final-state flush behavior " + f"against the known pattern `{matched.title}`." + ) + step_three = ( + "Step 3 - Scalability of code: remove repeated full scans, prefer linear-time data structures, " + "and benchmark the path on a production-like fixture." + ) + if context: + step_two = f"{step_two} Context window to preserve: {context}" + return [step_one, step_two, step_three] + + def _reference_quality_score(self, code: str, matched: TriagePrototype) -> float: + candidate = self.backend.embed_texts([_sanitize_text(code) or "# empty"]) + match_index = next(index for index, prototype in enumerate(self.prototypes) if prototype.task_id == matched.task_id) + reference = self._reference_code_embeddings()[match_index : match_index + 1] + score = float(torch.matmul(candidate, reference.T)[0][0].item()) + return _clamp_unit((score + 1.0) / 2.0) + + def triage(self, code: str, traceback_text: str = "", context_window: str = "") -> TriageResult: + """Run the full triage pipeline on code plus optional failure context.""" + + started = time.perf_counter() + document = self._build_review_document(code, traceback_text, context_window) + signals, heuristic_scores, notes = self._extract_signals(code, traceback_text) + + candidate_embedding = self.backend.embed_texts([document]) + matched, matched_similarity, prototype_scores = self._nearest_match(candidate_embedding) + + label_similarity = {label: 0.18 for label in LABELS} + for prototype in self.prototypes: + label_similarity[prototype.label] = max( + label_similarity[prototype.label], + prototype_scores[prototype.task_id], + ) + + combined_scores = { + label: 0.72 * label_similarity[label] + 0.28 * heuristic_scores[label] + for label in LABELS + } + confidence_scores = _safe_softmax(combined_scores) + issue_label = max(LABELS, key=lambda label: confidence_scores[label]) + top_confidence = confidence_scores[issue_label] + + top_signal = signals[0].evidence if signals else "Model similarity dominated the decision." + ml_quality_score = self._reference_quality_score(code, matched) + lint_score = _lint_score(code) + complexity_penalty = _complexity_penalty(code) + reward_score = _clamp_unit((0.5 * ml_quality_score) + (0.3 * lint_score) - (0.2 * complexity_penalty)) + summary = ( + f"Detected a {issue_label} issue with {top_confidence:.0%} confidence. " + f"The closest known failure pattern is `{matched.title}`, which indicates {matched.summary.lower()}. " + f"Predicted quality score is {ml_quality_score:.0%} with an RL-ready reward of {reward_score:.0%}." + ) + suggested_next_action = { + "syntax": "Fix the parser error first, then rerun validation before changing behavior.", + "logic": "Step through the smallest failing case and confirm the final branch/update behavior.", + "performance": "Replace repeated full-list scans with a linear-time aggregation strategy, then benchmark it.", + }[issue_label] + + return TriageResult( + issue_label=issue_label, + confidence_scores=confidence_scores, + repair_risk=_repair_risk(issue_label, top_confidence, len(signals)), + ml_quality_score=ml_quality_score, + lint_score=lint_score, + complexity_penalty=complexity_penalty, + reward_score=reward_score, + summary=summary, + matched_pattern=PrototypeMatch( + task_id=matched.task_id, + title=matched.title, + label=matched.label, + similarity=round(matched_similarity, 4), + summary=matched.summary, + rationale=top_signal, + ), + repair_plan=self._repair_plan(issue_label, matched, context_window), + suggested_next_action=suggested_next_action, + extracted_signals=signals, + model_backend=self.backend.backend_name, + model_id=self.backend.model_id, + inference_notes=list(self.backend.notes) + notes, + analysis_time_ms=round((time.perf_counter() - started) * 1000.0, 2), + ) + + +@lru_cache(maxsize=1) +def get_default_engine() -> CodeTriageEngine: + """Return a cached triage engine for the running process.""" + + return CodeTriageEngine() diff --git a/triage_catalog.py b/triage_catalog.py index 07d44522c002a6f71da1ed120a66c18fdd75a7b5..e62fea7e39d082f996f0578de965d2f90a390c3d 100644 --- a/triage_catalog.py +++ b/triage_catalog.py @@ -1,134 +1,134 @@ -"""Curated prototypes and example inputs for TorchReview Copilot.""" - -from __future__ import annotations - -from typing import Dict, List - -try: - from .triage_models import IssueLabel, TriageExample, TriagePrototype - from .tasks import list_tasks -except ImportError: - from triage_models import IssueLabel, TriageExample, TriagePrototype - from tasks import list_tasks - - -TASK_KIND_TO_LABEL: Dict[str, IssueLabel] = { - "syntax_fix": "syntax", - "bug_fix": "logic", - "optimization": "performance", -} - -TRACEBACK_BY_TASK_ID: Dict[str, str] = { - "syntax_fix_invoice_totals": ( - "Traceback (most recent call last):\n" - " File \"services/billing/reconciliation.py\", line 3\n" - " for record in records\n" - " ^\n" - "SyntaxError: expected ':'" - ), - "bug_fix_session_windows": ( - "AssertionError: collapse_sessions([{'minute': 1}, {'minute': 3}, {'minute': 8}], 4)\n" - "Expected: [(1, 3), (8, 8)]\n" - "Actual: [(1, 8)]\n" - "Boundary handling merges the final session instead of starting a new one." - ), - "optimization_rank_active_users": ( - "BenchmarkWarning: rank_active_users exceeded the 450ms budget on a nightly export fixture.\n" - "Profiler hint: repeated scans over the full event list and nested loops dominate runtime." - ), -} - -SUMMARY_BY_TASK_ID: Dict[str, str] = { - "syntax_fix_invoice_totals": "Broken parser state in a billing helper blocks reconciliation jobs.", - "bug_fix_session_windows": "Session-boundary logic fails on inclusive idle-timeout edges.", - "optimization_rank_active_users": "A nightly ranking job is correct on small fixtures but too slow at production scale.", -} - -CONTEXT_BY_TASK_ID: Dict[str, str] = { - "syntax_fix_invoice_totals": ( - "Context window: this helper runs in an end-of-day billing reconciliation job. " - "Keep the public function signature intact and restore correct totals for mixed integer/string inputs." - ), - "bug_fix_session_windows": ( - "Context window: this function groups sorted product analytics events into sessions for retention dashboards. " - "Boundary behavior must stay deterministic because downstream reports depend on it." - ), - "optimization_rank_active_users": ( - "Context window: this pipeline feeds a nightly export on a small CPU instance. " - "Maintain identical output ordering while improving scalability on larger event volumes." - ), -} - - -def _prototype_text( - task_id: str, - title: str, - description: str, - repo_summary: str, - goal: str, - visible_tests: List[str], - starter_code: str, - traceback_text: str, -) -> str: - visible = "\n".join(f"- {item}" for item in visible_tests) or "- none" - return ( - f"Title: {title}\n" - f"Problem: {description}\n" - f"Repo context: {repo_summary}\n" - f"Goal: {goal}\n" - f"Observed failure:\n{traceback_text}\n" - f"Visible checks:\n{visible}\n" - f"Candidate code:\n{starter_code}\n" - f"Task id: {task_id}\n" - ) - - -def build_examples() -> List[TriageExample]: - """Create stable UI examples from the task catalog.""" - - examples: List[TriageExample] = [] - for task in list_tasks(): - label = TASK_KIND_TO_LABEL[task.task_kind] - examples.append( - TriageExample( - key=task.task_id, - title=task.title, - label=label, - summary=SUMMARY_BY_TASK_ID[task.task_id], - code=task.starter_code, - traceback_text=TRACEBACK_BY_TASK_ID[task.task_id], - context_window=CONTEXT_BY_TASK_ID[task.task_id], - task_id=task.task_id, - ) - ) - return examples - - -def build_prototypes() -> List[TriagePrototype]: - """Build canonical triage prototypes from the OpenEnv tasks.""" - - prototypes: List[TriagePrototype] = [] - for task in list_tasks(): - traceback_text = TRACEBACK_BY_TASK_ID[task.task_id] - prototypes.append( - TriagePrototype( - task_id=task.task_id, - title=task.title, - label=TASK_KIND_TO_LABEL[task.task_kind], - summary=SUMMARY_BY_TASK_ID[task.task_id], - reference_text=_prototype_text( - task.task_id, - task.title, - task.task_description, - task.repo_summary, - task.goal, - list(task.visible_tests), - task.reference_code, - traceback_text, - ), - starter_code=task.starter_code, - reference_code=task.reference_code, - traceback_text=traceback_text, - ) - ) - return prototypes +"""Curated prototypes and example inputs for TorchReview Copilot.""" + +from __future__ import annotations + +from typing import Dict, List + +try: + from .triage_models import IssueLabel, TriageExample, TriagePrototype + from .tasks import list_tasks +except ImportError: + from triage_models import IssueLabel, TriageExample, TriagePrototype + from tasks import list_tasks + + +TASK_KIND_TO_LABEL: Dict[str, IssueLabel] = { + "syntax_fix": "syntax", + "bug_fix": "logic", + "optimization": "performance", +} + +TRACEBACK_BY_TASK_ID: Dict[str, str] = { + "syntax_fix_invoice_totals": ( + "Traceback (most recent call last):\n" + " File \"services/billing/reconciliation.py\", line 3\n" + " for record in records\n" + " ^\n" + "SyntaxError: expected ':'" + ), + "bug_fix_session_windows": ( + "AssertionError: collapse_sessions([{'minute': 1}, {'minute': 3}, {'minute': 8}], 4)\n" + "Expected: [(1, 3), (8, 8)]\n" + "Actual: [(1, 8)]\n" + "Boundary handling merges the final session instead of starting a new one." + ), + "optimization_rank_active_users": ( + "BenchmarkWarning: rank_active_users exceeded the 450ms budget on a nightly export fixture.\n" + "Profiler hint: repeated scans over the full event list and nested loops dominate runtime." + ), +} + +SUMMARY_BY_TASK_ID: Dict[str, str] = { + "syntax_fix_invoice_totals": "Broken parser state in a billing helper blocks reconciliation jobs.", + "bug_fix_session_windows": "Session-boundary logic fails on inclusive idle-timeout edges.", + "optimization_rank_active_users": "A nightly ranking job is correct on small fixtures but too slow at production scale.", +} + +CONTEXT_BY_TASK_ID: Dict[str, str] = { + "syntax_fix_invoice_totals": ( + "Context window: this helper runs in an end-of-day billing reconciliation job. " + "Keep the public function signature intact and restore correct totals for mixed integer/string inputs." + ), + "bug_fix_session_windows": ( + "Context window: this function groups sorted product analytics events into sessions for retention dashboards. " + "Boundary behavior must stay deterministic because downstream reports depend on it." + ), + "optimization_rank_active_users": ( + "Context window: this pipeline feeds a nightly export on a small CPU instance. " + "Maintain identical output ordering while improving scalability on larger event volumes." + ), +} + + +def _prototype_text( + task_id: str, + title: str, + description: str, + repo_summary: str, + goal: str, + visible_tests: List[str], + starter_code: str, + traceback_text: str, +) -> str: + visible = "\n".join(f"- {item}" for item in visible_tests) or "- none" + return ( + f"Title: {title}\n" + f"Problem: {description}\n" + f"Repo context: {repo_summary}\n" + f"Goal: {goal}\n" + f"Observed failure:\n{traceback_text}\n" + f"Visible checks:\n{visible}\n" + f"Candidate code:\n{starter_code}\n" + f"Task id: {task_id}\n" + ) + + +def build_examples() -> List[TriageExample]: + """Create stable UI examples from the task catalog.""" + + examples: List[TriageExample] = [] + for task in list_tasks(): + label = TASK_KIND_TO_LABEL[task.task_kind] + examples.append( + TriageExample( + key=task.task_id, + title=task.title, + label=label, + summary=SUMMARY_BY_TASK_ID[task.task_id], + code=task.starter_code, + traceback_text=TRACEBACK_BY_TASK_ID[task.task_id], + context_window=CONTEXT_BY_TASK_ID[task.task_id], + task_id=task.task_id, + ) + ) + return examples + + +def build_prototypes() -> List[TriagePrototype]: + """Build canonical triage prototypes from the OpenEnv tasks.""" + + prototypes: List[TriagePrototype] = [] + for task in list_tasks(): + traceback_text = TRACEBACK_BY_TASK_ID[task.task_id] + prototypes.append( + TriagePrototype( + task_id=task.task_id, + title=task.title, + label=TASK_KIND_TO_LABEL[task.task_kind], + summary=SUMMARY_BY_TASK_ID[task.task_id], + reference_text=_prototype_text( + task.task_id, + task.title, + task.task_description, + task.repo_summary, + task.goal, + list(task.visible_tests), + task.reference_code, + traceback_text, + ), + starter_code=task.starter_code, + reference_code=task.reference_code, + traceback_text=traceback_text, + ) + ) + return prototypes diff --git a/triage_models.py b/triage_models.py index 3b8e905806867dd8945a968ec24d841bd4e72db0..8ecc3a345adbe22292294654a77eea9f87796667 100644 --- a/triage_models.py +++ b/triage_models.py @@ -1,79 +1,79 @@ -"""Typed models for TorchReview Copilot outputs and examples.""" - -from __future__ import annotations - -from typing import Dict, List, Literal - -from pydantic import BaseModel, Field - - -IssueLabel = Literal["syntax", "logic", "performance"] -RiskLevel = Literal["low", "medium", "high"] - - -class TriageSignal(BaseModel): - """One extracted signal used during issue classification.""" - - name: str - value: str - impact: Literal["syntax", "logic", "performance", "mixed"] = "mixed" - weight: float = Field(..., ge=0.0, le=1.0) - evidence: str = "" - - -class PrototypeMatch(BaseModel): - """Nearest known bug pattern from the built-in task catalog.""" - - task_id: str - title: str - label: IssueLabel - similarity: float = Field(..., ge=0.0, le=1.0) - summary: str - rationale: str - - -class TriageExample(BaseModel): - """Example payload exposed in the demo UI.""" - - key: str - title: str - label: IssueLabel - summary: str - code: str - traceback_text: str - context_window: str - task_id: str - - -class TriagePrototype(BaseModel): - """Canonical issue-pattern representation embedded by the triage engine.""" - - task_id: str - title: str - label: IssueLabel - summary: str - reference_text: str - starter_code: str - reference_code: str - traceback_text: str - - -class TriageResult(BaseModel): - """Structured output produced by the triage pipeline.""" - - issue_label: IssueLabel - confidence_scores: Dict[str, float] - repair_risk: RiskLevel - ml_quality_score: float = Field(..., ge=0.0, le=1.0) - lint_score: float = Field(..., ge=0.0, le=1.0) - complexity_penalty: float = Field(..., ge=0.0, le=1.0) - reward_score: float = Field(..., ge=0.0, le=1.0) - summary: str - matched_pattern: PrototypeMatch - repair_plan: List[str] - suggested_next_action: str - extracted_signals: List[TriageSignal] = Field(default_factory=list) - model_backend: str - model_id: str - inference_notes: List[str] = Field(default_factory=list) - analysis_time_ms: float = Field(..., ge=0.0) +"""Typed models for TorchReview Copilot outputs and examples.""" + +from __future__ import annotations + +from typing import Dict, List, Literal + +from pydantic import BaseModel, Field + + +IssueLabel = Literal["syntax", "logic", "performance"] +RiskLevel = Literal["low", "medium", "high"] + + +class TriageSignal(BaseModel): + """One extracted signal used during issue classification.""" + + name: str + value: str + impact: Literal["syntax", "logic", "performance", "mixed"] = "mixed" + weight: float = Field(..., ge=0.0, le=1.0) + evidence: str = "" + + +class PrototypeMatch(BaseModel): + """Nearest known bug pattern from the built-in task catalog.""" + + task_id: str + title: str + label: IssueLabel + similarity: float = Field(..., ge=0.0, le=1.0) + summary: str + rationale: str + + +class TriageExample(BaseModel): + """Example payload exposed in the demo UI.""" + + key: str + title: str + label: IssueLabel + summary: str + code: str + traceback_text: str + context_window: str + task_id: str + + +class TriagePrototype(BaseModel): + """Canonical issue-pattern representation embedded by the triage engine.""" + + task_id: str + title: str + label: IssueLabel + summary: str + reference_text: str + starter_code: str + reference_code: str + traceback_text: str + + +class TriageResult(BaseModel): + """Structured output produced by the triage pipeline.""" + + issue_label: IssueLabel + confidence_scores: Dict[str, float] + repair_risk: RiskLevel + ml_quality_score: float = Field(..., ge=0.0, le=1.0) + lint_score: float = Field(..., ge=0.0, le=1.0) + complexity_penalty: float = Field(..., ge=0.0, le=1.0) + reward_score: float = Field(..., ge=0.0, le=1.0) + summary: str + matched_pattern: PrototypeMatch + repair_plan: List[str] + suggested_next_action: str + extracted_signals: List[TriageSignal] = Field(default_factory=list) + model_backend: str + model_id: str + inference_notes: List[str] = Field(default_factory=list) + analysis_time_ms: float = Field(..., ge=0.0) diff --git a/utils/__init__.py b/utils/__init__.py index 0121832ece22b25bcafa896d94b50ae4587d1b99..4bc736a197907087eadf9bfaf47d737ca460d64b 100644 --- a/utils/__init__.py +++ b/utils/__init__.py @@ -1,6 +1,6 @@ -"""Utility helpers for AST parsing and complexity scoring.""" - -from .ast_parser import parse_code_structure -from .complexity import estimate_complexity - -__all__ = ["parse_code_structure", "estimate_complexity"] +"""Utility helpers for AST parsing and complexity scoring.""" + +from .ast_parser import parse_code_structure +from .complexity import estimate_complexity + +__all__ = ["parse_code_structure", "estimate_complexity"] diff --git a/utils/ast_parser.py b/utils/ast_parser.py index d0eb1e80bf7adcde7e8017f6bffefecc5aa7882c..c0b7d9dbfb3fc3d051b712be1011864ac777cab2 100644 --- a/utils/ast_parser.py +++ b/utils/ast_parser.py @@ -1,144 +1,144 @@ -"""Static parsing helpers for multi-domain Python code analysis.""" - -from __future__ import annotations - -import ast -from typing import Any, Dict, List - - -class _LoopDepthVisitor(ast.NodeVisitor): - """Collect loop nesting depth for a parsed Python module.""" - - def __init__(self) -> None: - self.depth = 0 - self.max_depth = 0 - - def _visit_loop(self, node: ast.AST) -> None: - self.depth += 1 - self.max_depth = max(self.max_depth, self.depth) - self.generic_visit(node) - self.depth -= 1 - - def visit_For(self, node: ast.For) -> None: # noqa: N802 - self._visit_loop(node) - - def visit_While(self, node: ast.While) -> None: # noqa: N802 - self._visit_loop(node) - - def visit_comprehension(self, node: ast.comprehension) -> None: # noqa: N802 - self._visit_loop(node) - - -def parse_code_structure(code: str) -> Dict[str, Any]: - """Parse Python code into reusable structural signals.""" - - summary: Dict[str, Any] = { - "syntax_valid": True, - "syntax_error": "", - "imports": [], - "function_names": [], - "class_names": [], - "loop_count": 0, - "branch_count": 0, - "max_loop_depth": 0, - "line_count": len(code.splitlines()), - "long_lines": 0, - "tabs_used": "\t" in code, - "trailing_whitespace_lines": 0, - "uses_numpy": False, - "uses_pandas": False, - "uses_torch": False, - "uses_sklearn": False, - "uses_fastapi": False, - "uses_flask": False, - "uses_pydantic": False, - "uses_recursion": False, - "calls_eval": False, - "calls_no_grad": False, - "calls_backward": False, - "calls_optimizer_step": False, - "route_decorators": [], - "docstring_ratio": 0.0, - "code_smells": [], - } - - lines = code.splitlines() - summary["long_lines"] = sum(1 for line in lines if len(line) > 88) - summary["trailing_whitespace_lines"] = sum(1 for line in lines if line.rstrip() != line) - - try: - tree = ast.parse(code) - except SyntaxError as exc: - summary["syntax_valid"] = False - summary["syntax_error"] = f"{exc.msg} (line {exc.lineno})" - summary["code_smells"].append("Code does not parse.") - return summary - - visitor = _LoopDepthVisitor() - visitor.visit(tree) - summary["max_loop_depth"] = visitor.max_depth - - functions = [node for node in tree.body if isinstance(node, ast.FunctionDef)] - summary["function_names"] = [node.name for node in functions] - summary["class_names"] = [node.name for node in tree.body if isinstance(node, ast.ClassDef)] - summary["docstring_ratio"] = ( - sum(1 for node in functions if ast.get_docstring(node)) / len(functions) - if functions - else 0.0 - ) - - imports: List[str] = [] - for node in ast.walk(tree): - if isinstance(node, ast.Import): - imports.extend(alias.name.split(".")[0] for alias in node.names) - elif isinstance(node, ast.ImportFrom) and node.module: - imports.append(node.module.split(".")[0]) - elif isinstance(node, (ast.For, ast.While, ast.comprehension)): - summary["loop_count"] += 1 - elif isinstance(node, (ast.If, ast.Try, ast.Match)): - summary["branch_count"] += 1 - elif isinstance(node, ast.Call) and isinstance(node.func, ast.Attribute): - attr = node.func.attr - if attr == "eval": - summary["calls_eval"] = True - elif attr == "backward": - summary["calls_backward"] = True - elif attr == "step": - summary["calls_optimizer_step"] = True - elif isinstance(node, ast.Call) and isinstance(node.func, ast.Name) and node.func.id == "print": - summary["code_smells"].append("Debug print statements are present.") - elif isinstance(node, ast.With): - if any(isinstance(item.context_expr, ast.Call) and isinstance(item.context_expr.func, ast.Attribute) and item.context_expr.func.attr == "no_grad" for item in node.items): - summary["calls_no_grad"] = True - - import_set = sorted(set(imports)) - summary["imports"] = import_set - summary["uses_numpy"] = "numpy" in import_set or "np" in code - summary["uses_pandas"] = "pandas" in import_set or "pd" in code - summary["uses_torch"] = "torch" in import_set - summary["uses_sklearn"] = "sklearn" in import_set - summary["uses_fastapi"] = "fastapi" in import_set - summary["uses_flask"] = "flask" in import_set - summary["uses_pydantic"] = "pydantic" in import_set or "BaseModel" in code - - for node in functions: - for child in ast.walk(node): - if isinstance(child, ast.Call) and isinstance(child.func, ast.Name) and child.func.id == node.name: - summary["uses_recursion"] = True - - for node in ast.walk(tree): - if isinstance(node, ast.FunctionDef): - for decorator in node.decorator_list: - if isinstance(decorator, ast.Call) and isinstance(decorator.func, ast.Attribute): - summary["route_decorators"].append(decorator.func.attr) - elif isinstance(decorator, ast.Attribute): - summary["route_decorators"].append(decorator.attr) - - if summary["long_lines"]: - summary["code_smells"].append("Long lines reduce readability.") - if summary["tabs_used"]: - summary["code_smells"].append("Tabs detected; prefer spaces for consistency.") - if summary["trailing_whitespace_lines"]: - summary["code_smells"].append("Trailing whitespace found.") - - return summary +"""Static parsing helpers for multi-domain Python code analysis.""" + +from __future__ import annotations + +import ast +from typing import Any, Dict, List + + +class _LoopDepthVisitor(ast.NodeVisitor): + """Collect loop nesting depth for a parsed Python module.""" + + def __init__(self) -> None: + self.depth = 0 + self.max_depth = 0 + + def _visit_loop(self, node: ast.AST) -> None: + self.depth += 1 + self.max_depth = max(self.max_depth, self.depth) + self.generic_visit(node) + self.depth -= 1 + + def visit_For(self, node: ast.For) -> None: # noqa: N802 + self._visit_loop(node) + + def visit_While(self, node: ast.While) -> None: # noqa: N802 + self._visit_loop(node) + + def visit_comprehension(self, node: ast.comprehension) -> None: # noqa: N802 + self._visit_loop(node) + + +def parse_code_structure(code: str) -> Dict[str, Any]: + """Parse Python code into reusable structural signals.""" + + summary: Dict[str, Any] = { + "syntax_valid": True, + "syntax_error": "", + "imports": [], + "function_names": [], + "class_names": [], + "loop_count": 0, + "branch_count": 0, + "max_loop_depth": 0, + "line_count": len(code.splitlines()), + "long_lines": 0, + "tabs_used": "\t" in code, + "trailing_whitespace_lines": 0, + "uses_numpy": False, + "uses_pandas": False, + "uses_torch": False, + "uses_sklearn": False, + "uses_fastapi": False, + "uses_flask": False, + "uses_pydantic": False, + "uses_recursion": False, + "calls_eval": False, + "calls_no_grad": False, + "calls_backward": False, + "calls_optimizer_step": False, + "route_decorators": [], + "docstring_ratio": 0.0, + "code_smells": [], + } + + lines = code.splitlines() + summary["long_lines"] = sum(1 for line in lines if len(line) > 88) + summary["trailing_whitespace_lines"] = sum(1 for line in lines if line.rstrip() != line) + + try: + tree = ast.parse(code) + except SyntaxError as exc: + summary["syntax_valid"] = False + summary["syntax_error"] = f"{exc.msg} (line {exc.lineno})" + summary["code_smells"].append("Code does not parse.") + return summary + + visitor = _LoopDepthVisitor() + visitor.visit(tree) + summary["max_loop_depth"] = visitor.max_depth + + functions = [node for node in tree.body if isinstance(node, ast.FunctionDef)] + summary["function_names"] = [node.name for node in functions] + summary["class_names"] = [node.name for node in tree.body if isinstance(node, ast.ClassDef)] + summary["docstring_ratio"] = ( + sum(1 for node in functions if ast.get_docstring(node)) / len(functions) + if functions + else 0.0 + ) + + imports: List[str] = [] + for node in ast.walk(tree): + if isinstance(node, ast.Import): + imports.extend(alias.name.split(".")[0] for alias in node.names) + elif isinstance(node, ast.ImportFrom) and node.module: + imports.append(node.module.split(".")[0]) + elif isinstance(node, (ast.For, ast.While, ast.comprehension)): + summary["loop_count"] += 1 + elif isinstance(node, (ast.If, ast.Try, ast.Match)): + summary["branch_count"] += 1 + elif isinstance(node, ast.Call) and isinstance(node.func, ast.Attribute): + attr = node.func.attr + if attr == "eval": + summary["calls_eval"] = True + elif attr == "backward": + summary["calls_backward"] = True + elif attr == "step": + summary["calls_optimizer_step"] = True + elif isinstance(node, ast.Call) and isinstance(node.func, ast.Name) and node.func.id == "print": + summary["code_smells"].append("Debug print statements are present.") + elif isinstance(node, ast.With): + if any(isinstance(item.context_expr, ast.Call) and isinstance(item.context_expr.func, ast.Attribute) and item.context_expr.func.attr == "no_grad" for item in node.items): + summary["calls_no_grad"] = True + + import_set = sorted(set(imports)) + summary["imports"] = import_set + summary["uses_numpy"] = "numpy" in import_set or "np" in code + summary["uses_pandas"] = "pandas" in import_set or "pd" in code + summary["uses_torch"] = "torch" in import_set + summary["uses_sklearn"] = "sklearn" in import_set + summary["uses_fastapi"] = "fastapi" in import_set + summary["uses_flask"] = "flask" in import_set + summary["uses_pydantic"] = "pydantic" in import_set or "BaseModel" in code + + for node in functions: + for child in ast.walk(node): + if isinstance(child, ast.Call) and isinstance(child.func, ast.Name) and child.func.id == node.name: + summary["uses_recursion"] = True + + for node in ast.walk(tree): + if isinstance(node, ast.FunctionDef): + for decorator in node.decorator_list: + if isinstance(decorator, ast.Call) and isinstance(decorator.func, ast.Attribute): + summary["route_decorators"].append(decorator.func.attr) + elif isinstance(decorator, ast.Attribute): + summary["route_decorators"].append(decorator.attr) + + if summary["long_lines"]: + summary["code_smells"].append("Long lines reduce readability.") + if summary["tabs_used"]: + summary["code_smells"].append("Tabs detected; prefer spaces for consistency.") + if summary["trailing_whitespace_lines"]: + summary["code_smells"].append("Trailing whitespace found.") + + return summary diff --git a/utils/complexity.py b/utils/complexity.py index 02890c2bf4f9cc791c8d6b49321a1de963eb60e5..caedc01cd124f2b34d23e007f2189a87cb643c09 100644 --- a/utils/complexity.py +++ b/utils/complexity.py @@ -1,37 +1,37 @@ -"""Complexity heuristics for DSA-style and general Python code.""" - -from __future__ import annotations - -from typing import Any, Dict - - -def estimate_complexity(parsed: Dict[str, Any], code: str) -> Dict[str, Any]: - """Estimate cyclomatic complexity and rough Big-O heuristics.""" - - cyclomatic = 1 + int(parsed.get("branch_count", 0)) - loop_depth = int(parsed.get("max_loop_depth", 0)) - uses_recursion = bool(parsed.get("uses_recursion", False)) - - if loop_depth >= 3: - time_complexity = "O(n^3)" - elif loop_depth == 2: - time_complexity = "O(n^2)" - elif "sorted(" in code or ".sort(" in code: - time_complexity = "O(n log n)" - elif loop_depth == 1 or uses_recursion: - time_complexity = "O(n)" - else: - time_complexity = "O(1)" - - if "append(" in code or "list(" in code or "dict(" in code or "set(" in code: - space_complexity = "O(n)" - else: - space_complexity = "O(1)" - - complexity_penalty = min(0.99, 0.08 + (cyclomatic * 0.04) + (loop_depth * 0.12)) - return { - "cyclomatic_complexity": cyclomatic, - "time_complexity": time_complexity, - "space_complexity": space_complexity, - "complexity_penalty": round(complexity_penalty, 4), - } +"""Complexity heuristics for DSA-style and general Python code.""" + +from __future__ import annotations + +from typing import Any, Dict + + +def estimate_complexity(parsed: Dict[str, Any], code: str) -> Dict[str, Any]: + """Estimate cyclomatic complexity and rough Big-O heuristics.""" + + cyclomatic = 1 + int(parsed.get("branch_count", 0)) + loop_depth = int(parsed.get("max_loop_depth", 0)) + uses_recursion = bool(parsed.get("uses_recursion", False)) + + if loop_depth >= 3: + time_complexity = "O(n^3)" + elif loop_depth == 2: + time_complexity = "O(n^2)" + elif "sorted(" in code or ".sort(" in code: + time_complexity = "O(n log n)" + elif loop_depth == 1 or uses_recursion: + time_complexity = "O(n)" + else: + time_complexity = "O(1)" + + if "append(" in code or "list(" in code or "dict(" in code or "set(" in code: + space_complexity = "O(n)" + else: + space_complexity = "O(1)" + + complexity_penalty = min(0.99, 0.08 + (cyclomatic * 0.04) + (loop_depth * 0.12)) + return { + "cyclomatic_complexity": cyclomatic, + "time_complexity": time_complexity, + "space_complexity": space_complexity, + "complexity_penalty": round(complexity_penalty, 4), + } diff --git a/uv.lock b/uv.lock index 9f921ba57c357348792600bd2226732da9b3f110..42d01f29c9e4b116a6b793d639f3d98c9c173cfc 100644 --- a/uv.lock +++ b/uv.lock @@ -1926,6 +1926,7 @@ source = { editable = "." } dependencies = [ { name = "fastapi" }, { name = "gradio" }, + { name = "hf-xet" }, { name = "openai" }, { name = "openenv-core", extra = ["core"] }, { name = "streamlit" }, @@ -1944,6 +1945,7 @@ dev = [ requires-dist = [ { name = "fastapi", specifier = ">=0.111.0" }, { name = "gradio", specifier = ">=5.26.0" }, + { name = "hf-xet", specifier = ">=1.4.3" }, { name = "openai", specifier = ">=1.76.0" }, { name = "openenv-core", extras = ["core"], specifier = ">=0.2.2" }, { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0.0" },