uvpatel7271 committed on
Commit
9159c06
·
1 Parent(s): 3ba9e4a

added code modularity

Browse files
analyzers/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Domain-specific analyzers for multi-domain code understanding."""
2
+
3
+ from .dsa_analyzer import analyze_dsa_code
4
+ from .ds_analyzer import analyze_data_science_code
5
+ from .ml_analyzer import analyze_ml_code
6
+ from .web_analyzer import analyze_web_code
7
+
8
+ __all__ = [
9
+ "analyze_dsa_code",
10
+ "analyze_data_science_code",
11
+ "analyze_ml_code",
12
+ "analyze_web_code",
13
+ ]
analyzers/ds_analyzer.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Analyzer for data-science oriented Python code."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Dict
6
+
7
+ from schemas.response import AnalysisIssue, DomainAnalysis
8
+
9
+
10
def analyze_data_science_code(code: str, parsed: Dict[str, Any], complexity: Dict[str, Any]) -> DomainAnalysis:
    """Inspect pandas and numpy code for vectorization and leakage concerns.

    Returns a DomainAnalysis whose score starts at a neutral baseline and is
    reduced for each detected anti-pattern, floored at 0.05.
    """

    problems: list = []
    advice: list = []
    rating = 0.72

    # Row-wise iteration is the most common pandas performance trap.
    iterates_rows = "iterrows(" in code or "itertuples(" in code
    if iterates_rows:
        problems.append(
            AnalysisIssue(
                title="Row-wise dataframe iteration detected",
                severity="medium",
                description="Looping through dataframe rows is usually slower and less scalable than vectorized operations.",
            )
        )
        advice.append("Use vectorized pandas or numpy expressions instead of row-wise iteration.")
        rating -= 0.18

    if "inplace=True" in code:
        advice.append("Avoid inplace mutation to keep data pipelines easier to reason about and test.")
        rating -= 0.05

    # Fitting transforms without an explicit split is a classic leakage smell.
    if "fit_transform(" in code and "train_test_split" not in code:
        problems.append(
            AnalysisIssue(
                title="Potential data leakage risk",
                severity="high",
                description="Feature transforms appear before an explicit train/test split.",
            )
        )
        advice.append("Split train and validation data before fitting stateful preprocessing steps.")
        rating -= 0.2

    if not advice:
        advice.append("Add schema assumptions and null-handling checks for production data quality.")

    return DomainAnalysis(
        domain="data_science",
        domain_score=max(0.05, round(rating, 4)),
        issues=problems,
        suggestions=advice,
        highlights={
            "vectorization_risk": float(iterates_rows),
            "time_complexity": complexity["time_complexity"],
            "uses_pandas": float(parsed.get("uses_pandas", False)),
        },
    )
analyzers/dsa_analyzer.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Analyzer for DSA and competitive-programming style Python code."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Dict
6
+
7
+ from schemas.response import AnalysisIssue, DomainAnalysis
8
+
9
+
10
def analyze_dsa_code(code: str, parsed: Dict[str, Any], complexity: Dict[str, Any]) -> DomainAnalysis:
    """Inspect algorithmic code for brute-force patterns and efficiency risks.

    Scores start at a neutral baseline and drop for each detected risk; the
    final score is floored at 0.05.
    """

    problems: list = []
    advice: list = []
    rating = 0.7

    # Depth >= 2 usually means the input is rescanned once per element.
    if parsed.get("max_loop_depth", 0) >= 2:
        problems.append(
            AnalysisIssue(
                title="Nested loops suggest brute-force behavior",
                severity="medium",
                description="The implementation scans the input multiple times, which is often avoidable in DSA problems.",
            )
        )
        advice.append("Consider replacing nested scans with a hashmap, prefix table, or sorted search strategy.")
        rating -= 0.15

    if parsed.get("uses_recursion"):
        advice.append("Verify recursion depth and add memoization or iterative conversion if the input size can grow.")
        rating -= 0.05

    # Sorting is flagged for review but carries no score penalty.
    if "sorted(" in code or ".sort(" in code:
        advice.append("Sorting is acceptable here, but validate whether a direct O(n) pass can remove the sort.")

    if not advice:
        advice.append("Document the intended time complexity and add edge-case checks for empty input and duplicates.")

    return DomainAnalysis(
        domain="dsa",
        domain_score=max(0.05, round(rating, 4)),
        issues=problems,
        suggestions=advice,
        highlights={
            "time_complexity": complexity["time_complexity"],
            "space_complexity": complexity["space_complexity"],
            "max_loop_depth": float(parsed.get("max_loop_depth", 0)),
        },
    )
analyzers/ml_analyzer.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Analyzer for machine-learning and deep-learning code."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Dict
6
+
7
+ from schemas.response import AnalysisIssue, DomainAnalysis
8
+
9
+
10
def analyze_ml_code(code: str, parsed: Dict[str, Any], complexity: Dict[str, Any]) -> DomainAnalysis:
    """Inspect training and inference logic for common ML / DL mistakes.

    The score starts at a neutral baseline and drops per detected risk,
    floored at 0.05.
    """

    problems: list = []
    advice: list = []
    rating = 0.74

    mentions_torch = "torch" in code
    looks_like_inference = "predict" in code.lower()

    # Inference without eval() leaves dropout / batch-norm in training mode.
    if mentions_torch and looks_like_inference and "model.eval()" not in code:
        problems.append(
            AnalysisIssue(
                title="Inference path may be missing eval mode",
                severity="high",
                description="Inference code should place the model in eval mode before prediction.",
            )
        )
        advice.append("Call model.eval() before inference to disable training-time behavior such as dropout.")
        rating -= 0.18

    if mentions_torch and looks_like_inference and "no_grad" not in code:
        advice.append("Wrap inference in torch.no_grad() to reduce memory usage and avoid unnecessary gradient tracking.")
        rating -= 0.12

    # Gradients computed but never applied hints at an incomplete training loop.
    if parsed.get("calls_backward") and not parsed.get("calls_optimizer_step"):
        problems.append(
            AnalysisIssue(
                title="Backward pass without optimizer step",
                severity="medium",
                description="Gradients are computed, but the optimizer step is not obvious in the snippet.",
            )
        )
        advice.append("Ensure optimizer.step() and optimizer.zero_grad() are placed correctly in the training loop.")
        rating -= 0.12

    if "CrossEntropyLoss" in code and "softmax(" in code:
        advice.append("CrossEntropyLoss expects raw logits; remove the explicit softmax before the loss when possible.")
        rating -= 0.05

    if not advice:
        advice.append("Add explicit train/eval mode transitions and log validation metrics during training.")

    return DomainAnalysis(
        domain="ml_dl",
        domain_score=max(0.05, round(rating, 4)),
        issues=problems,
        suggestions=advice,
        highlights={
            "uses_torch": float(parsed.get("uses_torch", False)),
            "has_eval_mode": float("model.eval()" in code),
            "has_no_grad": float("no_grad" in code),
            "time_complexity": complexity["time_complexity"],
        },
    )
analyzers/web_analyzer.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Analyzer for FastAPI and backend web-service code."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Dict
6
+
7
+ from schemas.response import AnalysisIssue, DomainAnalysis
8
+
9
+
10
def analyze_web_code(code: str, parsed: Dict[str, Any], complexity: Dict[str, Any]) -> DomainAnalysis:
    """Inspect API code for validation, routing, and backend safety concerns.

    The score starts at a neutral baseline and drops per detected risk,
    floored at 0.05.
    """

    problems: list = []
    advice: list = []
    rating = 0.76

    handlers = set(parsed.get("route_decorators", []))

    # Routes without a Pydantic layer accept unvalidated payloads.
    if handlers and not parsed.get("uses_pydantic"):
        problems.append(
            AnalysisIssue(
                title="Request validation model is missing",
                severity="high",
                description="Route handlers appear present, but no obvious Pydantic validation layer was detected.",
            )
        )
        advice.append("Add Pydantic request and response models for strict validation and type-safe contracts.")
        rating -= 0.2

    if handlers & {"get", "post", "put", "delete"} and "async def" not in code:
        advice.append("Prefer async FastAPI endpoints when the route performs I/O or awaits downstream services.")
        rating -= 0.08

    if "request.json()" in code or "request.body()" in code:
        advice.append("Validate raw request payloads before use; avoid trusting unchecked JSON input.")
        rating -= 0.08

    if not advice:
        advice.append("Add domain-specific response models and centralize dependency injection for cleaner API structure.")

    return DomainAnalysis(
        domain="web",
        domain_score=max(0.05, round(rating, 4)),
        issues=problems,
        suggestions=advice,
        highlights={
            "route_count": float(len(handlers)),
            "uses_validation": float(parsed.get("uses_pydantic", False)),
            "time_complexity": complexity["time_complexity"],
        },
    )
models/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """PyTorch-backed model wrappers for the analyzer platform."""
2
+
3
+ from .pytorch_model import PyTorchCodeAnalyzerModel
4
+
5
+ __all__ = ["PyTorchCodeAnalyzerModel"]
models/pytorch_model.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """PyTorch + transformers model wrapper for multi-domain code scoring."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import hashlib
6
+ from typing import Dict, List, Sequence
7
+
8
+ import torch
9
+ import torch.nn.functional as F
10
+
11
+ try:
12
+ from transformers import AutoModel, AutoTokenizer
13
+ except Exception:
14
+ AutoModel = None # type: ignore[assignment]
15
+ AutoTokenizer = None # type: ignore[assignment]
16
+
17
+
18
# Reference sentences embedded once per process (cached in the model's
# prototype cache); a code document is assigned to the domain whose
# prototype embedding it is most cosine-similar to.
DOMAIN_PROTOTYPES: Dict[str, List[str]] = {
    "dsa": [
        "Binary search, hashmap optimization, recursion, dynamic programming, arrays, trees, graphs, stack, queue, complexity.",
        "Competitive programming algorithm with loops, memoization, prefix sums, and asymptotic analysis.",
    ],
    "data_science": [
        "Pandas dataframe transformation, numpy vectorization, feature leakage, train test split, iterrows misuse.",
        "Data cleaning pipeline using pandas, numpy, aggregation, joins, and vectorized operations.",
    ],
    "ml_dl": [
        "PyTorch model, training loop, optimizer, backward pass, eval mode, no_grad, loss function, dataloader.",
        "Machine learning inference and training code with torch, sklearn, tensors, gradients, and model checkpoints.",
    ],
    "web": [
        "FastAPI endpoint, request validation, Pydantic models, async routes, API security, backend service design.",
        "REST API backend with routers, dependency injection, input validation, serialization, and error handling.",
    ],
    "general": [
        "General Python utility code with readable structure, typing, tests, and maintainable abstractions.",
    ],
}

# Anchor sentences describing clearly good and clearly bad code; the model's
# quality score is a sigmoid of the similarity margin between the two buckets.
QUALITY_ANCHORS: Dict[str, List[str]] = {
    "high": [
        "Readable typed Python code with validation, efficient algorithms, vectorized operations, safe inference, and clean API boundaries.",
        "Production-ready code with small functions, docstrings, low complexity, and clear error handling.",
    ],
    "low": [
        "Brute-force nested loops, missing validation, unsafe input handling, missing eval mode, missing no_grad, and code smells.",
        "Hard to maintain code with high complexity, repeated scans, mutable side effects, and unclear structure.",
    ],
}
50
+
51
+
52
class _HashEmbeddingBackend:
    """Torch-native fallback when pretrained weights cannot be loaded.

    Produces deterministic, L2-normalized hashed bag-of-tokens embeddings so
    the rest of the pipeline can run without the transformers dependency.
    """

    def __init__(self, dimensions: int = 128) -> None:
        # Fixed identifiers so callers can report which backend produced scores.
        self.dimensions = dimensions
        self.model_id = "hashed-token-fallback"
        self.backend_name = "hashed-token-fallback"
        self.notes = ["Using hashed embeddings because pretrained transformer weights are unavailable."]

    def embed_texts(self, texts: Sequence[str]) -> torch.Tensor:
        """Return one normalized embedding row per input text."""
        rows = torch.zeros((len(texts), self.dimensions), dtype=torch.float32)
        for row, text in enumerate(texts):
            # Cap token count so pathological inputs stay cheap to hash.
            words = text.lower().split()[:512]
            if not words:
                # Give empty inputs a deterministic non-zero direction.
                rows[row, 0] = 1.0
                continue
            for word in words:
                fingerprint = hashlib.md5(word.encode("utf-8")).hexdigest()
                slot = int(fingerprint[:8], 16) % self.dimensions
                weight = -1.0 if int(fingerprint[8:10], 16) % 2 else 1.0
                rows[row, slot] += weight
        # The small epsilon keeps all-zero rows normalizable.
        return F.normalize(rows + 1e-6, dim=1)
74
+
75
+
76
class PyTorchCodeAnalyzerModel:
    """Score code using pretrained transformer embeddings plus prototype similarity.

    Embeds the code (plus context and static hints) once, then compares it by
    cosine similarity against per-domain prototype sentences and high/low
    quality anchors. Falls back to hashed embeddings when transformers or the
    pretrained weights are unavailable.
    """

    def __init__(self, model_id: str = "huggingface/CodeBERTa-small-v1") -> None:
        # The encoder is loaded lazily on first use; until then backend_name
        # optimistically reports the configured model id.
        self.model_id = model_id
        self.backend_name = model_id
        self.notes: List[str] = []
        self._tokenizer = None
        self._model = None
        # Used whenever the transformer path is unavailable.
        self._fallback = _HashEmbeddingBackend()
        # Prototype embeddings are computed once per bucket key and reused.
        self._prototype_cache: Dict[str, torch.Tensor] = {}

    def _ensure_loaded(self) -> None:
        """Attempt the pretrained load once; record the outcome in `notes`."""
        # Non-empty notes mean a previous attempt already resolved the backend,
        # so load failures are not retried on every call.
        if self._model is not None or self.notes:
            return
        if AutoTokenizer is None or AutoModel is None:
            # transformers is not importable at all: switch to the fallback.
            self.backend_name = self._fallback.backend_name
            self.notes = list(self._fallback.notes)
            return
        try:
            self._tokenizer = AutoTokenizer.from_pretrained(self.model_id)
            self._model = AutoModel.from_pretrained(self.model_id)
            self._model.eval()
            self.notes.append(f"Loaded pretrained encoder `{self.model_id}`.")
        except Exception as exc:
            # Download/compatibility failure: keep running on the fallback and
            # surface the reason to callers via notes.
            self.backend_name = self._fallback.backend_name
            self.notes = list(self._fallback.notes) + [f"Pretrained load failed: {type(exc).__name__}: {exc}"]

    def _embed_texts(self, texts: Sequence[str]) -> torch.Tensor:
        """Return one L2-normalized embedding row per input text."""
        self._ensure_loaded()
        if self._model is None or self._tokenizer is None:
            return self._fallback.embed_texts(texts)
        encoded = self._tokenizer(list(texts), padding=True, truncation=True, max_length=256, return_tensors="pt")
        with torch.no_grad():
            outputs = self._model(**encoded)
        hidden = outputs.last_hidden_state
        # Mean-pool over real (unpadded) tokens only; clamp avoids div-by-zero.
        mask = encoded["attention_mask"].unsqueeze(-1)
        pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
        return F.normalize(pooled, dim=1)

    def _prototype_matrix(self, bucket: str, texts: Sequence[str]) -> torch.Tensor:
        """Embed and cache the prototype sentences for one bucket key."""
        if bucket not in self._prototype_cache:
            self._prototype_cache[bucket] = self._embed_texts(texts)
        return self._prototype_cache[bucket]

    def predict(self, code: str, context_window: str, static_summary: Dict[str, object]) -> Dict[str, object]:
        """Predict domain probabilities and a model quality score."""

        # Concatenate code, caller context, and static hints into one document,
        # truncated to keep tokenization bounded.
        document = (
            f"Code:\n{code.strip()[:4000]}\n\n"
            f"Context:\n{context_window.strip()[:1000]}\n\n"
            f"Static hints:\n{static_summary}\n"
        )
        candidate = self._embed_texts([document])

        # Best cosine similarity per domain, rescaled from [-1, 1] to [0, 1].
        domain_scores: Dict[str, float] = {}
        for domain, texts in DOMAIN_PROTOTYPES.items():
            matrix = self._prototype_matrix(f"domain:{domain}", texts)
            similarity = torch.matmul(candidate, matrix.T).max().item()
            domain_scores[domain] = round((similarity + 1.0) / 2.0, 4)

        # Quality is a sigmoid of the margin between the high- and low-quality
        # anchors; the factor 4.0 sharpens the margin before squashing.
        high_matrix = self._prototype_matrix("quality:high", QUALITY_ANCHORS["high"])
        low_matrix = self._prototype_matrix("quality:low", QUALITY_ANCHORS["low"])
        high_similarity = torch.matmul(candidate, high_matrix.T).max().item()
        low_similarity = torch.matmul(candidate, low_matrix.T).max().item()
        ml_quality_score = torch.sigmoid(torch.tensor((high_similarity - low_similarity) * 4.0)).item()

        return {
            "domain_scores": domain_scores,
            "ml_quality_score": round(float(ml_quality_score), 4),
            "backend_name": self.backend_name,
            "model_id": self.model_id,
            "notes": list(self.notes),
        }
schemas/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Public schemas for the multi-domain analysis platform."""
2
+
3
+ from .request import AnalyzeCodeRequest
4
+ from .response import AnalyzeCodeResponse, AnalysisIssue, DomainAnalysis, ScoreBreakdown, StaticAnalysisSummary
5
+
6
+ __all__ = [
7
+ "AnalyzeCodeRequest",
8
+ "AnalyzeCodeResponse",
9
+ "AnalysisIssue",
10
+ "DomainAnalysis",
11
+ "ScoreBreakdown",
12
+ "StaticAnalysisSummary",
13
+ ]
schemas/request.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Request schemas for code analysis endpoints and UI."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Literal
6
+
7
+ from pydantic import BaseModel, Field
8
+
9
+
10
# Accepted values for the optional domain override; "auto" lets the service
# pick the highest-confidence domain itself.
DomainHint = Literal["auto", "dsa", "data_science", "ml_dl", "web"]


class AnalyzeCodeRequest(BaseModel):
    """Validated input payload for multi-domain code analysis."""

    # Source code is the only required field; all context is optional.
    code: str = Field(..., min_length=1, description="Source code to analyze.")
    context_window: str = Field(default="", max_length=2000, description="Optional repository or task context.")
    traceback_text: str = Field(default="", max_length=2000, description="Optional runtime or test failure output.")
    domain_hint: DomainHint = Field(default="auto", description="Optional domain override when auto detection is not desired.")
schemas/response.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Response schemas for the multi-domain analysis platform."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Dict, List, Literal
6
+
7
+ from pydantic import BaseModel, Field
8
+
9
+
10
+ DomainType = Literal["dsa", "data_science", "ml_dl", "web", "general"]
11
+ Severity = Literal["low", "medium", "high"]
12
+
13
+
14
class AnalysisIssue(BaseModel):
    """One detected issue or risk in the code snippet."""

    title: str  # short human-readable headline
    severity: Severity  # one of "low" | "medium" | "high"
    description: str  # one-sentence explanation of the risk
    line_hint: int | None = None  # optional line reference when known
21
+
22
+
23
class StaticAnalysisSummary(BaseModel):
    """Language-agnostic static-analysis signals."""

    syntax_valid: bool  # False when the snippet failed to parse
    syntax_error: str = ""  # parser message; empty when syntax_valid is True
    cyclomatic_complexity: int = Field(..., ge=1)  # 1 + branch count heuristic
    line_count: int = Field(..., ge=0)
    max_loop_depth: int = Field(..., ge=0)  # deepest loop/comprehension nesting
    time_complexity: str = "Unknown"  # heuristic Big-O label, e.g. "O(n^2)"
    space_complexity: str = "Unknown"  # heuristic Big-O label
    detected_imports: List[str] = Field(default_factory=list)  # top-level module names
    code_smells: List[str] = Field(default_factory=list)  # human-readable smell notes
35
+
36
+
37
class DomainAnalysis(BaseModel):
    """Domain-specific analysis payload returned by an analyzer."""

    domain: DomainType  # which analyzer produced this payload
    domain_score: float = Field(..., ge=0.0, le=1.0)  # higher is better
    issues: List[AnalysisIssue] = Field(default_factory=list)
    suggestions: List[str] = Field(default_factory=list)  # actionable advice, ordered by priority
    highlights: Dict[str, float | str] = Field(default_factory=dict)  # key metrics for display
45
+
46
+
47
class ScoreBreakdown(BaseModel):
    """Reward inputs and final normalized score."""

    ml_score: float = Field(..., ge=0.0, le=1.0)  # model-derived quality estimate
    domain_score: float = Field(..., ge=0.0, le=1.0)  # from the domain analyzer
    lint_score: float = Field(..., ge=0.0, le=1.0)  # structural cleanliness
    complexity_penalty: float = Field(..., ge=0.0, le=1.0)  # subtracted from reward
    reward: float = Field(..., ge=0.0, le=1.0)  # clamped weighted combination
55
+
56
+
57
class AnalyzeCodeResponse(BaseModel):
    """Top-level structured output for API and UI consumers."""

    detected_domain: DomainType  # winning domain after hint/model/heuristic merge
    domain_confidences: Dict[str, float]  # per-domain combined confidence
    score_breakdown: ScoreBreakdown
    static_analysis: StaticAnalysisSummary
    domain_analysis: DomainAnalysis
    improvement_plan: List[str] = Field(default_factory=list)  # ordered action steps
    model_backend: str  # backend that actually produced embeddings
    model_id: str  # configured encoder identifier
    summary: str  # one-line human-readable result
    context_window: str = ""  # echoed back from the request
    analysis_time_ms: float = Field(..., ge=0.0)  # wall-clock pipeline duration
services/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ """Service layer for orchestrating analysis, suggestions, and rewards."""
2
+
3
+ from .analysis_service import AnalysisService
4
+ from .reward_service import RewardService
5
+ from .suggestion_service import SuggestionService
6
+
7
+ __all__ = ["AnalysisService", "RewardService", "SuggestionService"]
services/analysis_service.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Orchestration layer for multi-domain code analysis."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import time
6
+ from typing import Any, Callable, Dict
7
+
8
+ from analyzers import analyze_data_science_code, analyze_dsa_code, analyze_ml_code, analyze_web_code
9
+ from models import PyTorchCodeAnalyzerModel
10
+ from schemas.request import AnalyzeCodeRequest
11
+ from schemas.response import AnalyzeCodeResponse, DomainAnalysis, StaticAnalysisSummary
12
+ from services.reward_service import RewardService
13
+ from services.suggestion_service import SuggestionService
14
+ from utils import estimate_complexity, parse_code_structure
15
+
16
+
17
def _lint_score(parsed: Dict[str, Any]) -> float:
    """Convert structural smells into a normalized lint-style score in [0, 1]."""

    rating = 1.0
    # Unparseable code dominates every other penalty.
    if not parsed.get("syntax_valid", True):
        rating -= 0.45
    # Up to five long lines are penalized; beyond that the cap applies.
    rating -= min(parsed.get("long_lines", 0), 5) * 0.03
    if parsed.get("tabs_used"):
        rating -= 0.1
    if parsed.get("trailing_whitespace_lines"):
        rating -= 0.05
    # Functions exist but none of them carries a docstring.
    if parsed.get("function_names") and parsed.get("docstring_ratio", 0.0) == 0.0:
        rating -= 0.08
    return round(max(0.0, min(1.0, rating)), 4)
31
+
32
+
33
class AnalysisService:
    """End-to-end analysis pipeline shared by API and UI.

    Combines model-based domain scoring with import/syntax heuristics, runs
    the matching domain analyzer, and assembles the full structured response.
    """

    def __init__(self) -> None:
        self.model = PyTorchCodeAnalyzerModel()
        self.reward_service = RewardService()
        self.suggestion_service = SuggestionService()
        # Dispatch table from detected domain to its analyzer. "general" has
        # no entry on purpose and falls back to a generic DomainAnalysis.
        self._analyzers: Dict[str, Callable[[str, Dict[str, Any], Dict[str, Any]], DomainAnalysis]] = {
            "dsa": analyze_dsa_code,
            "data_science": analyze_data_science_code,
            "ml_dl": analyze_ml_code,
            "web": analyze_web_code,
        }

    def _heuristic_domain_scores(self, parsed: Dict[str, Any], code: str) -> Dict[str, float]:
        """Derive domain priors from imports and syntax-level hints."""

        scores = {
            "dsa": 0.2 + (0.15 if parsed.get("uses_recursion") else 0.0) + (0.15 if parsed.get("max_loop_depth", 0) >= 1 else 0.0),
            "data_science": 0.2 + (0.35 if parsed.get("uses_pandas") or parsed.get("uses_numpy") else 0.0),
            "ml_dl": 0.2 + (0.35 if parsed.get("uses_torch") or parsed.get("uses_sklearn") else 0.0),
            "web": 0.2 + (0.35 if parsed.get("uses_fastapi") or parsed.get("uses_flask") else 0.0) + (0.1 if parsed.get("route_decorators") else 0.0),
            "general": 0.2,
        }
        # Textual mentions act as weaker secondary signals.
        lowered = code.lower()
        if "fastapi" in lowered:
            scores["web"] += 0.1
        if "pandas" in lowered or "numpy" in lowered:
            scores["data_science"] += 0.1
        if "torch" in lowered:
            scores["ml_dl"] += 0.1
        if "while" in code or "for" in code:
            scores["dsa"] += 0.05
        return {key: round(min(value, 0.99), 4) for key, value in scores.items()}

    def analyze(self, request: AnalyzeCodeRequest) -> AnalyzeCodeResponse:
        """Run the complete multi-domain analysis pipeline."""

        started = time.perf_counter()
        parsed = parse_code_structure(request.code)
        complexity = estimate_complexity(parsed, request.code)
        model_prediction = self.model.predict(request.code, request.context_window, parsed)
        heuristic_scores = self._heuristic_domain_scores(parsed, request.code)

        # Blend model similarity with heuristic priors (model-weighted).
        combined_scores: Dict[str, float] = {}
        for domain, heuristic_score in heuristic_scores.items():
            model_score = float(model_prediction["domain_scores"].get(domain, 0.2))
            combined_scores[domain] = round((0.6 * model_score) + (0.4 * heuristic_score), 4)

        # An explicit hint wins over auto-detection.
        detected_domain = request.domain_hint if request.domain_hint != "auto" else max(combined_scores, key=combined_scores.get)

        # BUG FIX: the previous dispatch passed a contradictory default to
        # `.get(...)` that was dead code (the `in self._analyzers` check made
        # it unreachable) and would have routed non-registered domains to the
        # web analyzer. Dispatch explicitly instead.
        analyzer = self._analyzers.get(detected_domain)
        if analyzer is not None:
            domain_analysis = analyzer(request.code, parsed, complexity)
        else:
            domain_analysis = DomainAnalysis(
                domain="general",
                domain_score=0.6,
                issues=[],
                suggestions=["Add stronger domain-specific context for deeper analysis."],
                highlights={},
            )

        lint_score = _lint_score(parsed)
        score_breakdown = self.reward_service.compute(
            ml_score=float(model_prediction["ml_quality_score"]),
            domain_score=domain_analysis.domain_score,
            lint_score=lint_score,
            complexity_penalty=float(complexity["complexity_penalty"]),
        )
        static_analysis = StaticAnalysisSummary(
            syntax_valid=bool(parsed["syntax_valid"]),
            syntax_error=str(parsed["syntax_error"]),
            cyclomatic_complexity=int(complexity["cyclomatic_complexity"]),
            line_count=int(parsed["line_count"]),
            max_loop_depth=int(parsed["max_loop_depth"]),
            time_complexity=str(complexity["time_complexity"]),
            space_complexity=str(complexity["space_complexity"]),
            detected_imports=list(parsed["imports"]),
            code_smells=list(parsed["code_smells"]),
        )
        improvement_plan = self.suggestion_service.build_improvement_plan(
            domain_analysis=domain_analysis,
            static_analysis=static_analysis,
        )
        summary = (
            f"Detected `{detected_domain}` code with a model score of {score_breakdown.ml_score:.0%}, "
            f"domain score {score_breakdown.domain_score:.0%}, and final reward {score_breakdown.reward:.0%}."
        )
        return AnalyzeCodeResponse(
            detected_domain=detected_domain,  # type: ignore[arg-type]
            domain_confidences=combined_scores,
            score_breakdown=score_breakdown,
            static_analysis=static_analysis,
            domain_analysis=domain_analysis,
            improvement_plan=improvement_plan,
            model_backend=str(model_prediction["backend_name"]),
            model_id=str(model_prediction["model_id"]),
            summary=summary,
            context_window=request.context_window,
            analysis_time_ms=round((time.perf_counter() - started) * 1000.0, 2),
        )
services/reward_service.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Reward shaping logic for RL-ready code analysis scores."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from schemas.response import ScoreBreakdown
6
+
7
+
8
class RewardService:
    """Compute reward scores from model, domain, lint, and complexity signals."""

    def compute(self, *, ml_score: float, domain_score: float, lint_score: float, complexity_penalty: float) -> ScoreBreakdown:
        """Apply the weighted reward formula and clamp the result to [0, 1]."""

        # Model quality dominates; domain and lint contribute equally; the
        # complexity penalty is subtracted.
        raw = (0.4 * ml_score) + (0.2 * domain_score) + (0.2 * lint_score) - (0.2 * complexity_penalty)
        final_reward = max(0.0, min(1.0, raw))
        return ScoreBreakdown(
            ml_score=round(ml_score, 4),
            domain_score=round(domain_score, 4),
            lint_score=round(lint_score, 4),
            complexity_penalty=round(complexity_penalty, 4),
            reward=round(final_reward, 4),
        )
services/suggestion_service.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Suggestion and improvement-plan generation for analyzed code."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from schemas.response import DomainAnalysis, StaticAnalysisSummary
6
+
7
+
8
class SuggestionService:
    """Build high-signal improvement steps from analysis output."""

    def build_improvement_plan(self, *, domain_analysis: DomainAnalysis, static_analysis: StaticAnalysisSummary) -> list[str]:
        """Return a compact three-step plan optimized for developer action."""

        # Step 1 leads with the most urgent correctness concern: a syntax
        # error trumps any domain issue, which trumps the generic advice.
        if not static_analysis.syntax_valid:
            first = f"Step 1 - Correctness and safety: fix the syntax error first ({static_analysis.syntax_error})."
        elif domain_analysis.issues:
            first = f"Step 1 - Correctness and safety: {domain_analysis.issues[0].description}"
        else:
            first = "Step 1 - Correctness and safety: Stabilize correctness first and keep the public behavior explicit."

        second = "Step 2 - Edge cases: test empty inputs, boundary values, malformed payloads, and failure-mode behavior explicitly."
        third = "Step 3 - Scalability: reduce repeated scans, lower cyclomatic complexity, and benchmark the path on realistic input sizes."
        # The analyzer's top suggestion is appended as a priority hint.
        if domain_analysis.suggestions:
            third = f"{third} Priority hint: {domain_analysis.suggestions[0]}"
        return [first, second, third]
utils/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ """Utility helpers for AST parsing and complexity scoring."""
2
+
3
+ from .ast_parser import parse_code_structure
4
+ from .complexity import estimate_complexity
5
+
6
+ __all__ = ["parse_code_structure", "estimate_complexity"]
utils/ast_parser.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Static parsing helpers for multi-domain Python code analysis."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import ast
6
+ from typing import Any, Dict, List
7
+
8
+
9
class _LoopDepthVisitor(ast.NodeVisitor):
    """Collect loop nesting depth for a parsed Python module.

    After `visit(tree)`, `max_depth` holds the deepest observed nesting of
    for/while statements and comprehension generators.
    """

    def __init__(self) -> None:
        # Current nesting level and the deepest level observed so far.
        self.depth = 0
        self.max_depth = 0

    def _visit_loop(self, node: ast.AST) -> None:
        """Descend through a loop construct while tracking nesting depth."""
        self.depth += 1
        if self.depth > self.max_depth:
            self.max_depth = self.depth
        self.generic_visit(node)
        self.depth -= 1

    def visit_For(self, node: ast.For) -> None:  # noqa: N802
        self._visit_loop(node)

    def visit_While(self, node: ast.While) -> None:  # noqa: N802
        self._visit_loop(node)

    def visit_comprehension(self, node: ast.comprehension) -> None:  # noqa: N802
        # Each comprehension generator counts as one loop level.
        self._visit_loop(node)
+
31
+
32
+ def parse_code_structure(code: str) -> Dict[str, Any]:
33
+ """Parse Python code into reusable structural signals."""
34
+
35
+ summary: Dict[str, Any] = {
36
+ "syntax_valid": True,
37
+ "syntax_error": "",
38
+ "imports": [],
39
+ "function_names": [],
40
+ "class_names": [],
41
+ "loop_count": 0,
42
+ "branch_count": 0,
43
+ "max_loop_depth": 0,
44
+ "line_count": len(code.splitlines()),
45
+ "long_lines": 0,
46
+ "tabs_used": "\t" in code,
47
+ "trailing_whitespace_lines": 0,
48
+ "uses_numpy": False,
49
+ "uses_pandas": False,
50
+ "uses_torch": False,
51
+ "uses_sklearn": False,
52
+ "uses_fastapi": False,
53
+ "uses_flask": False,
54
+ "uses_pydantic": False,
55
+ "uses_recursion": False,
56
+ "calls_eval": False,
57
+ "calls_no_grad": False,
58
+ "calls_backward": False,
59
+ "calls_optimizer_step": False,
60
+ "route_decorators": [],
61
+ "docstring_ratio": 0.0,
62
+ "code_smells": [],
63
+ }
64
+
65
+ lines = code.splitlines()
66
+ summary["long_lines"] = sum(1 for line in lines if len(line) > 88)
67
+ summary["trailing_whitespace_lines"] = sum(1 for line in lines if line.rstrip() != line)
68
+
69
+ try:
70
+ tree = ast.parse(code)
71
+ except SyntaxError as exc:
72
+ summary["syntax_valid"] = False
73
+ summary["syntax_error"] = f"{exc.msg} (line {exc.lineno})"
74
+ summary["code_smells"].append("Code does not parse.")
75
+ return summary
76
+
77
+ visitor = _LoopDepthVisitor()
78
+ visitor.visit(tree)
79
+ summary["max_loop_depth"] = visitor.max_depth
80
+
81
+ functions = [node for node in tree.body if isinstance(node, ast.FunctionDef)]
82
+ summary["function_names"] = [node.name for node in functions]
83
+ summary["class_names"] = [node.name for node in tree.body if isinstance(node, ast.ClassDef)]
84
+ summary["docstring_ratio"] = (
85
+ sum(1 for node in functions if ast.get_docstring(node)) / len(functions)
86
+ if functions
87
+ else 0.0
88
+ )
89
+
90
+ imports: List[str] = []
91
+ for node in ast.walk(tree):
92
+ if isinstance(node, ast.Import):
93
+ imports.extend(alias.name.split(".")[0] for alias in node.names)
94
+ elif isinstance(node, ast.ImportFrom) and node.module:
95
+ imports.append(node.module.split(".")[0])
96
+ elif isinstance(node, (ast.For, ast.While, ast.comprehension)):
97
+ summary["loop_count"] += 1
98
+ elif isinstance(node, (ast.If, ast.Try, ast.Match)):
99
+ summary["branch_count"] += 1
100
+ elif isinstance(node, ast.Call) and isinstance(node.func, ast.Attribute):
101
+ attr = node.func.attr
102
+ if attr == "eval":
103
+ summary["calls_eval"] = True
104
+ elif attr == "backward":
105
+ summary["calls_backward"] = True
106
+ elif attr == "step":
107
+ summary["calls_optimizer_step"] = True
108
+ elif isinstance(node, ast.Call) and isinstance(node.func, ast.Name) and node.func.id == "print":
109
+ summary["code_smells"].append("Debug print statements are present.")
110
+ elif isinstance(node, ast.With):
111
+ if any(isinstance(item.context_expr, ast.Call) and isinstance(item.context_expr.func, ast.Attribute) and item.context_expr.func.attr == "no_grad" for item in node.items):
112
+ summary["calls_no_grad"] = True
113
+
114
+ import_set = sorted(set(imports))
115
+ summary["imports"] = import_set
116
+ summary["uses_numpy"] = "numpy" in import_set or "np" in code
117
+ summary["uses_pandas"] = "pandas" in import_set or "pd" in code
118
+ summary["uses_torch"] = "torch" in import_set
119
+ summary["uses_sklearn"] = "sklearn" in import_set
120
+ summary["uses_fastapi"] = "fastapi" in import_set
121
+ summary["uses_flask"] = "flask" in import_set
122
+ summary["uses_pydantic"] = "pydantic" in import_set or "BaseModel" in code
123
+
124
+ for node in functions:
125
+ for child in ast.walk(node):
126
+ if isinstance(child, ast.Call) and isinstance(child.func, ast.Name) and child.func.id == node.name:
127
+ summary["uses_recursion"] = True
128
+
129
+ for node in ast.walk(tree):
130
+ if isinstance(node, ast.FunctionDef):
131
+ for decorator in node.decorator_list:
132
+ if isinstance(decorator, ast.Call) and isinstance(decorator.func, ast.Attribute):
133
+ summary["route_decorators"].append(decorator.func.attr)
134
+ elif isinstance(decorator, ast.Attribute):
135
+ summary["route_decorators"].append(decorator.attr)
136
+
137
+ if summary["long_lines"]:
138
+ summary["code_smells"].append("Long lines reduce readability.")
139
+ if summary["tabs_used"]:
140
+ summary["code_smells"].append("Tabs detected; prefer spaces for consistency.")
141
+ if summary["trailing_whitespace_lines"]:
142
+ summary["code_smells"].append("Trailing whitespace found.")
143
+
144
+ return summary
utils/complexity.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Complexity heuristics for DSA-style and general Python code."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Dict
6
+
7
+
8
def estimate_complexity(parsed: Dict[str, Any], code: str) -> Dict[str, Any]:
    """Estimate cyclomatic complexity and rough Big-O heuristics.

    The labels are intentionally coarse: loop nesting dominates, then a sort
    marker, then a single loop or recursion, else constant time.
    """

    branches = int(parsed.get("branch_count", 0))
    nesting = int(parsed.get("max_loop_depth", 0))
    recursive = bool(parsed.get("uses_recursion", False))
    cyclomatic = 1 + branches

    if nesting >= 3:
        big_o = "O(n^3)"
    elif nesting == 2:
        big_o = "O(n^2)"
    elif "sorted(" in code or ".sort(" in code:
        big_o = "O(n log n)"
    elif nesting == 1 or recursive:
        big_o = "O(n)"
    else:
        big_o = "O(1)"

    # Any container-building marker implies linear auxiliary space.
    allocates = any(marker in code for marker in ("append(", "list(", "dict(", "set("))
    space = "O(n)" if allocates else "O(1)"

    penalty = min(0.99, 0.08 + (cyclomatic * 0.04) + (nesting * 0.12))
    return {
        "cyclomatic_complexity": cyclomatic,
        "time_complexity": big_o,
        "space_complexity": space,
        "complexity_penalty": round(penalty, 4),
    }
+ }