Spaces:

Samarthrr
/

revcode-ai-engine

Sleeping

App Files Community

Samarthrr committed 28 days ago

Commit

879b56d

verified ·

1 Parent(s): e740563

Update app.py

Browse files

Files changed (1) hide show

app.py +109 -164

app.py CHANGED Viewed

@@ -1,9 +1,8 @@
 import ast
 import torch
 import torch.nn as nn
-from fastapi import FastAPI, HTTPException, BackgroundTasks
 from pydantic import BaseModel
-from typing import Optional
 from transformers import (
     T5ForConditionalGeneration,
     RobertaTokenizer,
@@ -12,225 +11,163 @@ from transformers import (
 )
 import pandas as pd
 import os
-import threading
-# Import the training function
-from train_engine import train_on_devign
-app = FastAPI(title="Revcode AI ULTRA Orchestrator")
-# Global training status
-training_lock = threading.Lock()
-is_training = False
-# ---------------------------------------------------------
-# 1. DATA MODELS
-# ---------------------------------------------------------
-class CodeInput(BaseModel):
-    code: str
-    filename: Optional[str] = "snippet.js"
 # ---------------------------------------------------------
-# 2. ADVANCED SECURITY SCANNER (CodeBERT-Devign + XAI)
 # ---------------------------------------------------------
 class DeepVulnerabilityScanner:
     def __init__(self):
-        # We check if a locally trained model exists, otherwise use the base
-        local_model = "./trained_model"
-        if os.path.exists(local_model):
-            self.model_name = local_model
-            self.tokenizer_name = local_model
-            print(f"Loading Locally Trained Security Scanner ({self.model_name})...")
-        else:
-            self.model_name = "mahdin70/codebert-devign-code-vulnerability-detector"
-            self.tokenizer_name = "microsoft/codebert-base"
-            print(f"Loading SOTA Security Scanner ({self.model_name})...")
-        self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name)
-        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name)
-        self.model.eval()
     def scan(self, code: str) -> dict:
-        inputs = self.tokenizer(code, return_tensors="pt", truncation=True, padding=True, max_length=512)
         with torch.no_grad():
             logits = self.model(**inputs).logits
         probs = torch.softmax(logits, dim=1)
         vuln_prob = probs[0][1].item()
-        reasoning = "Analyzing code logic for Devign-pattern vulnerabilities."
-        if vuln_prob > 0.9:
-            reasoning = "CRITICAL: High-confidence fingerprint of a known vulnerability pattern (e.g., Buffer Overflow, Improper Sanitization)."
-        elif vuln_prob > 0.5:
-            reasoning = "WARNING: Code semantics mirror dangerous patterns found in the Devign security dataset."
-        elif vuln_prob < 0.1:
-            reasoning = "SAFE: Code logic is clean of any recognized vulnerability fingerprints."
         return {
             "is_vulnerable": vuln_prob > 0.5,
             "risk_score": round(vuln_prob * 100, 2),
-            "verdict": "VULNERABLE" if vuln_prob > 0.5 else "SECURE",
-            "reasoning": reasoning
         }
 # ---------------------------------------------------------
-# 3. STRUCTURAL SCANNER (Mini-Semgrep)
-# ---------------------------------------------------------
-class StructuralScanner:
-    @staticmethod
-    def scan_patterns(code: str, filename: str) -> list:
-        findings = []
-        if "os.system(" in code or "subprocess.Popen(..., shell=True)" in code:
-            findings.append({
-                "type": "Security",
-                "title": "Command Injection Risk",
-                "reasoning": "Detected use of shell=True or os.system which can lead to Remote Code Execution."
-            })
-        if "pickle.load" in code or "yaml.load(..., Loader=None)" in code:
-             findings.append({
-                "type": "Security",
-                "title": "Insecure Deserialization",
-                "reasoning": "Insecure loading of serialized data can lead to arbitrary code execution."
-            })
-        if "Password =" in code or "API_KEY =" in code:
-            findings.append({
-                "type": "Compliance",
-                "title": "Hardcoded Secret",
-                "reasoning": "Sensitive credentials found in source code. Use environment variables instead."
-            })
-        return findings
-# ---------------------------------------------------------
-# 4. AUTOMATED REPAIR ENGINE (The "Surgeon" + Context)
 # ---------------------------------------------------------
 class AutomatedRepairEngine:
     def __init__(self):
-        print("Loading Repair Engine (CodeT5+)...")
         self.model_name = "Salesforce/codet5p-220m"
-        self.tokenizer = RobertaTokenizer.from_pretrained(self.model_name)
-        self.model = T5ForConditionalGeneration.from_pretrained(self.model_name)
-        self.model.eval()
-    def repair(self, buggy_code: str, filename: str) -> str:
-        prompt = f"Fix the security vulnerability in this {filename} file: {buggy_code}"
-        inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
         with torch.no_grad():
             outputs = self.model.generate(
                 **inputs,
-                max_length=512,
                 num_beams=5,
                 temperature=0.7,
                 early_stopping=True
             )
         return self.tokenizer.decode(outputs[0], skip_special_tokens=True)
 # ---------------------------------------------------------
-# 5. ARCHITECTURAL GUARDRAILS
 # ---------------------------------------------------------
-class Guardrails:
     @staticmethod
-    def validate(code: str):
         try:
             ast.parse(code)
-            return True, "Valid"
-        except Exception as e:
-            return False, f"Syntax analysis failed: {str(e)}"
 # ---------------------------------------------------------
-# 6. GLOBAL HANDLERS
 # ---------------------------------------------------------
-scanner = None
-repairer = None
-struct_scanner = StructuralScanner()
-guardrails = Guardrails()
-def get_scanner(force_reload=False):
-    global scanner
-    if scanner is None or force_reload:
-        scanner = DeepVulnerabilityScanner()
-    return scanner
 def get_repairer():
-    global repairer
-    if repairer is None:
-        repairer = AutomatedRepairEngine()
-    return repairer
-# ---------------------------------------------------------
-# 7. TRAINING WRAPPER
-# ---------------------------------------------------------
-def run_training():
-    global is_training
-    with training_lock:
-        is_training = True
-    try:
-        print("--- STARTING BACKGROUND TRAINING CYCLE ---")
-        train_on_devign(output_dir="./trained_model")
-        print("--- TRAINING CYCLE COMPLETED. RELOADING SCANNER ---")
-        get_scanner(force_reload=True)
-    finally:
-        with training_lock:
-            is_training = False
 # ---------------------------------------------------------
-# 8. API ENDPOINTS
 # ---------------------------------------------------------
-@app.get("/")
-async def health():
-    return {
-        "status": "Revcode AI ULTRA Orchestrator Operational",
-        "is_training": is_training,
-        "features": ["XAI", "Structural-Scan", "Context-Injection", "Auto-Train"]
-    }
-@app.post("/train")
-async def trigger_training(background_tasks: BackgroundTasks):
-    global is_training
-    if is_training:
-        return {"status": "error", "message": "Training already in progress."}
-    background_tasks.add_task(run_training)
-    return {"status": "success", "message": "Training started in background."}
 @app.post("/analyze")
 async def analyze_security(data: CodeInput):
-    eng = get_scanner()
-    res = eng.scan(data.code)
-    structural_findings = struct_scanner.scan_patterns(data.code, data.filename)
-    if structural_findings:
-        res["is_vulnerable"] = True
-        res["reasoning"] += " | Structural rules flagged: " + ", ".join([f['title'] for f in structural_findings])
-        res["verdict"] = "CRITICAL_VULNERABILITY"
     return {
-        "is_vulnerable": res["is_vulnerable"],
-        "confidence": res["risk_score"],
-        "verdict": res["verdict"],
-        "reasoning": res["reasoning"],
-        "structural_findings": structural_findings,
-        "is_training": is_training,
-        "provider": "DeepScanner-ULTRA"
     }
 @app.post("/fix")
 async def fix_code(data: CodeInput):
-    rep = get_repairer()
-    suggestion = rep.repair(data.code, data.filename)
-    is_valid, msg = guardrails.validate(suggestion)
-    return {
-        "suggestion": suggestion,
-        "guardrail_status": "PASSED" if is_valid else "FAILED",
-        "guardrail_msg": msg,
-        "context_applied": data.filename
-    }
-@app.post("/verify")
-async def verify_fix(data: CodeInput):
-    is_valid, msg = guardrails.validate(data.code)
     return {
-        "is_valid": is_valid,
-        "message": msg,
-        "status": "PASSED" if is_valid else "WARNING"
     }
 @app.post("/feedback")
@@ -239,3 +176,11 @@ async def store_feedback(data: dict):
     df = pd.DataFrame([data])
     df.to_csv(feedback_file, mode='a', header=not os.path.exists(feedback_file), index=False)
     return {"status": "Feedback stored for retraining"}

 import ast
 import torch
 import torch.nn as nn
+from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
 from transformers import (
     T5ForConditionalGeneration,
     RobertaTokenizer,
 )
 import pandas as pd
 import os
+app = FastAPI(title="Revcode AI Strong Orchestrator")
 # ---------------------------------------------------------
+# 1. ADVANCED SECURITY SCANNER (The "Brain")
 # ---------------------------------------------------------
 class DeepVulnerabilityScanner:
     def __init__(self):
+        print("Loading Deep Security Scanner (DistilRoBERTa)...")
+        self.model_name = "distilroberta-base"
+        try:
+            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+            self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name, num_labels=2)
+            self.model.eval()
+        except Exception as e:
+            print(f"Failed to load Deep Scanner: {e}")
+            self.model = None
     def scan(self, code: str) -> dict:
+        if not self.model:
+            return {"is_vulnerable": False, "risk_score": 0.0, "details": "Scanner unavailable"}
+        inputs = self.tokenizer(code, return_tensors="pt", truncation=True, max_length=512)
         with torch.no_grad():
             logits = self.model(**inputs).logits
         probs = torch.softmax(logits, dim=1)
         vuln_prob = probs[0][1].item()
         return {
             "is_vulnerable": vuln_prob > 0.5,
             "risk_score": round(vuln_prob * 100, 2),
+            "details": "Potential vulnerability detected in code logic." if vuln_prob > 0.5 else "Clean code."
         }
 # ---------------------------------------------------------
+# 2. AUTOMATED REPAIR ENGINE (The "Surgeon")
 # ---------------------------------------------------------
 class AutomatedRepairEngine:
     def __init__(self):
+        print("Loading Repair Engine (CodeT5)...")
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.model_name = "Salesforce/codet5p-220m"
+        try:
+            self.tokenizer = RobertaTokenizer.from_pretrained(self.model_name)
+            self.model = T5ForConditionalGeneration.from_pretrained(self.model_name).to(self.device)
+            self.model.eval()
+        except Exception as e:
+            print(f"Failed to load Repair Engine: {e}")
+            self.model = None
+    def repair(self, buggy_code: str) -> str:
+        if not self.model:
+            return buggy_code
+        prompt = f"Fix the security vulnerability: {buggy_code}"
+        inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=256).to(self.device)
         with torch.no_grad():
             outputs = self.model.generate(
                 **inputs,
+                max_length=256,
                 num_beams=5,
                 temperature=0.7,
                 early_stopping=True
             )
         return self.tokenizer.decode(outputs[0], skip_special_tokens=True)
 # ---------------------------------------------------------
+# 3. SYNTAX & LOGIC VALIDATOR (The "Quality Control")
 # ---------------------------------------------------------
+class CodeValidator:
     @staticmethod
+    def is_syntax_valid(code: str) -> bool:
         try:
             ast.parse(code)
+            return True
+        except Exception:
+            return False
+    @staticmethod
+    def check_security_patterns(code: str) -> list:
+        issues = []
+        dangerous_calls = ["eval(", "exec(", "os.system(", "subprocess.call("]
+        for call in dangerous_calls:
+            if call in code:
+                issues.append(f"Dangerous call found: {call}")
+        return issues
 # ---------------------------------------------------------
+# 4. GLOBAL HANDLERS (Lazy Loading)
 # ---------------------------------------------------------
+_scanner = None
+_repairer = None
+def get_scanner():
+    global _scanner
+    if _scanner is None:
+        _scanner = DeepVulnerabilityScanner()
+    return _scanner
 def get_repairer():
+    global _repairer
+    if _repairer is None:
+        _repairer = AutomatedRepairEngine()
+    return _repairer
 # ---------------------------------------------------------
+# 5. API ENDPOINTS
 # ---------------------------------------------------------
+class CodeInput(BaseModel):
+    code: str
 @app.post("/analyze")
 async def analyze_security(data: CodeInput):
+    scanner = get_scanner()
+    result = scanner.scan(data.code)
     return {
+        "is_vulnerable": result["is_vulnerable"],
+        "confidence": result["risk_score"],
+        "verdict": "VULNERABLE" if result["is_vulnerable"] else "SECURE",
+        "details": result["details"],
+        "provider": "DistilRoBERTa-Strong"
     }
 @app.post("/fix")
 async def fix_code(data: CodeInput):
+    repairer = get_repairer()
+    validator = CodeValidator()
+    # ML Repair
+    suggestion = repairer.repair(data.code)
+    # Validation Loop
+    status = "PASSED"
+    msg = "Valid syntax"
+    if not validator.is_syntax_valid(suggestion):
+        status = "FAILED"
+        msg = "Repair generated invalid syntax"
+        # Heuristic fallback (from user's logic)
+        suggestion = data.code.replace("eval(", "safe_eval(")
+    # Final Security Pattern Check
+    final_issues = validator.check_security_patterns(suggestion)
+    if final_issues:
+        for issue in final_issues:
+            call_name = issue.split(": ")[1]
+            suggestion = suggestion.replace(call_name, f"# BLOCKED_{call_name.replace('(', '')}")
+        msg += f" | Blocked {len(final_issues)} dangerous calls"
     return {
+        "suggestion": suggestion,
+        "guardrail_status": status,
+        "guardrail_msg": msg
     }
 @app.post("/feedback")
     df = pd.DataFrame([data])
     df.to_csv(feedback_file, mode='a', header=not os.path.exists(feedback_file), index=False)
     return {"status": "Feedback stored for retraining"}
+@app.get("/")
+async def health():
+    return {"status": "Revcode AI Strong Engine is alive"}
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=8000)