MUHAMMADSAADAMIN committed on
Commit
bec7d28
·
verified ·
1 Parent(s): f5d0342

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. Dockerfile +9 -9
  2. README.md +13 -7
  3. app.py +165 -71
  4. requirements.txt +1 -1
Dockerfile CHANGED
@@ -1,14 +1,14 @@
1
- FROM python:3.9
2
-
3
- RUN useradd -m -u 1000 user
4
- USER user
5
- ENV PATH="/home/user/.local/bin:$PATH"
6
 
7
  WORKDIR /app
8
 
9
- COPY --chown=user ./requirements.txt requirements.txt
10
- RUN pip install --no-cache-dir --upgrade -r requirements.txt
 
 
 
 
11
 
12
- COPY --chown=user . /app
13
 
14
- CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 
1
+ FROM python:3.11-slim
 
 
 
 
2
 
3
  WORKDIR /app
4
 
5
+ RUN apt-get update && apt-get install -y gcc && rm -rf /var/lib/apt/lists/*
6
+
7
+ COPY requirements.txt .
8
+ RUN pip install --no-cache-dir -r requirements.txt
9
+
10
+ COPY app.py .
11
 
12
+ EXPOSE 7860
13
 
14
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,11 +1,17 @@
1
  ---
2
- title: Polyguard Api
3
- emoji: 💻
4
- colorFrom: pink
5
- colorTo: gray
6
  sdk: docker
7
- pinned: false
8
- short_description: PolyGuard code security analyzer API
9
  ---
10
 
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
1
  ---
2
+ title: PolyGuard API
3
+ emoji: 🛡️
4
+ colorFrom: blue
5
+ colorTo: green
6
  sdk: docker
7
+ app_port: 7860
8
+ pinned: true
9
  ---
10
 
11
+ # PolyGuard API
12
+
13
+ Code vulnerability scanner powered by fine-tuned CodeBERT.
14
+
15
+ - **Model**: MUHAMMADSAADAMIN/PolyGuard
16
+ - **Docs**: /docs
17
+ - **Analyze**: POST /analyze
app.py CHANGED
@@ -1,99 +1,193 @@
1
- import torch
2
- from transformers import AutoTokenizer, AutoModelForSequenceClassification
3
  from fastapi import FastAPI
4
  from fastapi.middleware.cors import CORSMiddleware
5
  from pydantic import BaseModel
 
 
 
6
 
7
- MODEL_NAME = "MUHAMMADSAADAMIN/polyguard-model"
8
- tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
9
- model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
 
 
10
  model.eval()
 
11
 
12
- suggestions = {
13
- "sqli": "Use parameterized queries instead of building SQL strings manually.",
14
- "xss": "Sanitize all user inputs before rendering them to the page.",
15
- "secrets": "Never hardcode API keys or passwords. Use environment variables instead.",
16
- "crypto": "Avoid MD5 and SHA1. Use SHA256 or bcrypt for hashing.",
17
- "memory": "Always check buffer sizes before copying data in C/C++.",
18
- "auth": "Always verify user permissions before returning sensitive data.",
19
- }
20
-
21
- language_tips = {
22
  "python": [
23
- "Use list comprehensions instead of for loops where possible.",
24
- "Use f-strings for string formatting instead of .format() or %.",
25
- "Use 'with open()' for file handling instead of open/close.",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  ],
27
  "javascript": [
28
- "Use const and let instead of var.",
29
- "Use async/await instead of nested callbacks.",
30
- "Always use === instead of == for comparisons.",
 
 
 
 
 
31
  ],
32
- "java": [
33
- "Use try-with-resources for handling streams and connections.",
34
- "Use StringBuilder instead of String concatenation in loops.",
 
 
35
  ],
36
- "go": [
37
- "Always handle errors explicitly.",
38
- "Use goroutines for concurrency instead of threads.",
 
 
 
39
  ],
40
  }
41
 
42
- app = FastAPI(title="PolyGuard API")
43
-
44
- app.add_middleware(
45
- CORSMiddleware,
46
- allow_origins=["*"],
47
- allow_methods=["*"],
48
- allow_headers=["*"],
49
- )
50
 
51
- class CodeRequest(BaseModel):
52
- code: str
53
- language: str = "python"
 
 
 
 
 
54
 
55
- @app.get("/")
56
- def home():
57
- return {"status": "PolyGuard API is running!"}
58
 
59
- @app.post("/analyze")
60
- def analyze(request: CodeRequest):
61
- inputs = tokenizer(
62
- request.code,
63
- return_tensors="pt",
64
- truncation=True,
65
- max_length=256,
66
- padding=True
67
- )
68
  with torch.no_grad():
69
- outputs = model(**inputs)
 
 
 
 
 
 
 
70
 
71
- probs = torch.softmax(outputs.logits, dim=1)
72
- clean_conf = probs[0][0].item()
73
- vuln_conf = probs[0][1].item()
74
- score = round(clean_conf * 10, 1)
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
- if score >= 8:
77
- risk = "low"
78
- elif score >= 5:
79
- risk = "medium"
80
  else:
81
- risk = "high"
 
 
82
 
83
- findings = []
84
- if vuln_conf > 0.4:
85
- findings.append(suggestions["sqli"])
86
- if vuln_conf > 0.6:
87
- findings.append(suggestions["xss"])
88
 
89
- tips = language_tips.get(request.language.lower(), ["Keep learning!"])
 
 
90
 
91
  return {
92
  "score": score,
93
  "risk": risk,
94
- "verdict": "CLEAN" if score >= 7 else "VULNERABLE",
95
- "clean_confidence": round(clean_conf * 100, 1),
96
- "vuln_confidence": round(vuln_conf * 100, 1),
97
  "findings": findings,
98
- "tips": tips,
99
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os, re, random
3
  from fastapi import FastAPI
4
  from fastapi.middleware.cors import CORSMiddleware
5
  from pydantic import BaseModel
6
+ from typing import List
7
+ import torch
8
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
9
 
10
+ # ── Load model from HF Hub ─────────────────────────────────
11
# Hugging Face Hub repository that holds both the tokenizer and the
# fine-tuned sequence-classification weights.
MODEL_ID = "MUHAMMADSAADAMIN/PolyGuard"
print(f"Loading model: {MODEL_ID}")
# Loaded once at import time so every request reuses the same objects.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
# Switch to inference mode; the model is never trained in this service.
model.eval()
print("✓ Model ready")
17
 
18
+ # ── Vulnerability rules ────────────────────────────────────
19
# Regex-based vulnerability rules, keyed by lower-case language name.
# Each rule is a (pattern, message) pair: the pattern is searched in the
# submitted code and the message is reported to the caller when it matches.
#
# The eval() rules are anchored so that look-alike names are not flagged:
#   * Python uses a look-behind that rejects a preceding word character or
#     dot, so ``ast.literal_eval(`` and method calls such as
#     ``model.eval()`` do not match.
#   * JavaScript and PHP use a word boundary, so ``my_eval(`` does not
#     match while ``window.eval(`` still does.
VULN_RULES = {
    "python": [
        (r"execute\s*\(\s*[f'\"].*?\{", "Use parameterized queries instead of building SQL strings manually."),
        (r"execute\s*\(\s*\".*?%", "Use parameterized queries instead of building SQL strings manually."),
        (r"(?<![\w.])eval\s*\(", "Avoid eval() — it executes arbitrary code and is a critical security risk."),
        (r"exec\s*\(", "Avoid exec() — it executes arbitrary code and is a critical security risk."),
        (r"pickle\.loads?\s*\(", "Avoid pickle.load() on untrusted data — it can execute arbitrary code."),
        (r"subprocess.*shell\s*=\s*True", "Never use shell=True in subprocess — use a list of arguments instead."),
        (r"os\.system\s*\(", "Avoid os.system() — use subprocess with a list of arguments instead."),
        (r"hashlib\.md5\s*\(", "MD5 is cryptographically broken — use SHA-256 or stronger."),
        (r"hashlib\.sha1\s*\(", "SHA-1 is weak — use SHA-256 or stronger."),
        (r"random\.(random|randint|choice)\s*\(", "Use secrets module instead of random for security-sensitive operations."),
        (r"open\s*\(.*['\"]w['\"]", "Validate file paths before writing to prevent path traversal attacks."),
        (r"request\.(args|form|json)\[", "Validate and sanitize all user input before use."),
        (r"render_template_string\s*\(", "Avoid render_template_string with user input — use template files instead."),
        (r"yaml\.load\s*\(", "Use yaml.safe_load() instead of yaml.load() to prevent code execution."),
        (r"SSL_VERIFY\s*=\s*False|verify\s*=\s*False", "Never disable SSL verification in production."),
        (r"password\s*=\s*['\"][^'\"]{1,20}['\"]", "Hardcoded password detected — use environment variables instead."),
        (r"secret\s*=\s*['\"][^'\"]{1,20}['\"]", "Hardcoded secret detected — use environment variables instead."),
        (r"api_key\s*=\s*['\"][^'\"]+['\"]", "Hardcoded API key detected — use environment variables instead."),
    ],
    "javascript": [
        (r"\beval\s*\(", "Avoid eval() — it executes arbitrary code and is a critical security risk."),
        (r"innerHTML\s*=", "Avoid innerHTML — use textContent or DOMPurify to prevent XSS."),
        (r"document\.write\s*\(", "Avoid document.write() — it can lead to XSS vulnerabilities."),
        (r"dangerouslySetInnerHTML", "Avoid dangerouslySetInnerHTML — sanitize content with DOMPurify first."),
        (r"localStorage\.setItem.*password", "Never store passwords or secrets in localStorage."),
        (r"Math\.random\s*\(", "Use crypto.getRandomValues() instead of Math.random() for security tokens."),
        (r"http://", "Use HTTPS instead of HTTP for all external requests."),
        (r"password\s*=\s*['\"][^'\"]+['\"]", "Hardcoded password detected — use environment variables instead."),
    ],
    "sql": [
        (r"'\s*\+\s*", "String concatenation in SQL is vulnerable to injection — use parameterized queries."),
        (r"GRANT ALL", "Avoid GRANT ALL — apply principle of least privilege."),
        (r"DROP TABLE", "Dangerous DDL statement detected — ensure proper access controls."),
        (r"xp_cmdshell", "xp_cmdshell is a critical security risk — disable it on the server."),
    ],
    "php": [
        (r"mysql_query\s*\(", "mysql_* functions are deprecated — use PDO or mysqli with prepared statements."),
        (r"\$_GET\[|\$_POST\[|\$_REQUEST\[", "Sanitize all user input from $_GET/$_POST/$_REQUEST before use."),
        (r"\beval\s*\(", "Avoid eval() — it executes arbitrary code."),
        (r"system\s*\(|exec\s*\(", "Avoid system()/exec() with user input — use escapeshellarg()."),
        (r"md5\s*\(", "MD5 is not suitable for password hashing — use password_hash() instead."),
    ],
}
64
 
65
+ CODE_TIPS = {
66
+ "python": ["Use list comprehensions instead of for loops.", "Use f-strings for string formatting.", "Use with open() for file handling.", "Add type hints to function signatures.", "Use logging instead of print() in production.", "Use dataclasses or Pydantic instead of plain dicts."],
67
+ "javascript": ["Use const and let instead of var.", "Use async/await instead of callback chains.", "Use strict equality (===) instead of ==.", "Prefer arrow functions for concise syntax."],
68
+ "sql": ["Always use parameterized queries.", "Add indexes on frequently queried columns.", "Use EXPLAIN to analyze query performance."],
69
+ "php": ["Use Composer for dependency management.", "Enable strict_types=1 at the top of files.", "Use prepared statements for all database queries."],
70
+ }
 
 
71
 
72
+ SMART_TIPS = {
73
+ "sql_injection": "Use parameterized queries e.g. cursor.execute(query, params) to prevent SQL injection.",
74
+ "code_execution": "Replace eval()/exec() with ast.literal_eval() for safe expression parsing.",
75
+ "hardcoded_secrets": "Store secrets in environment variables and load with os.environ.get(KEY).",
76
+ "weak_crypto": "Replace MD5/SHA1 with hashlib.sha256() or bcrypt for password hashing.",
77
+ "xss": "Sanitize output with DOMPurify or use textContent instead of innerHTML.",
78
+ "command_injection": "Pass arguments as a list to subprocess.run() and never use shell=True.",
79
+ }
80
 
81
+ SUPPORTED_LANGUAGES = list(VULN_RULES.keys()) + ["java", "c", "cpp", "go", "ruby", "rust"]
 
 
82
 
83
+ def analyze_code(code: str, language: str):
84
+ language = language.lower().strip()
85
+ inputs = tokenizer(code, return_tensors="pt", truncation=True, max_length=512, padding=True)
 
 
 
 
 
 
86
  with torch.no_grad():
87
+ logits = model(**inputs).logits
88
+
89
+ probs = torch.softmax(logits, dim=1).squeeze().tolist()
90
+ if not isinstance(probs, list):
91
+ probs = [probs, 1 - probs]
92
+
93
+ clean_conf = round(probs[0] * 100, 1)
94
+ vuln_conf = round(probs[1] * 100, 1)
95
 
96
+ findings, matched_categories = [], set()
97
+ for pattern, message in VULN_RULES.get(language, []):
98
+ if re.search(pattern, code, re.IGNORECASE):
99
+ if message not in findings:
100
+ findings.append(message)
101
+ if "sql" in message.lower() or "query" in message.lower():
102
+ matched_categories.add("sql_injection")
103
+ elif "eval" in message.lower() or "exec" in message.lower():
104
+ matched_categories.add("code_execution")
105
+ elif any(w in message.lower() for w in ["password", "secret", "api_key"]):
106
+ matched_categories.add("hardcoded_secrets")
107
+ elif any(w in message.lower() for w in ["md5", "sha1", "hash"]):
108
+ matched_categories.add("weak_crypto")
109
+ elif "xss" in message.lower() or "innerhtml" in message.lower():
110
+ matched_categories.add("xss")
111
+ elif "shell" in message.lower() or "subprocess" in message.lower():
112
+ matched_categories.add("command_injection")
113
 
114
+ model_uncertain = abs(vuln_conf - clean_conf) < 15.0
115
+ if model_uncertain and len(findings) == 0:
116
+ score = 0.0
 
117
  else:
118
+ finding_penalty = min(len(findings) * 0.8, 4.0)
119
+ base_score = (vuln_conf / 100) * 6.0 if not model_uncertain else 0.0
120
+ score = round(min(base_score + finding_penalty, 10.0), 1)
121
 
122
+ if score >= 7.0: risk, verdict = "critical", "VULNERABLE"
123
+ elif score >= 4.5: risk, verdict = "high", "VULNERABLE"
124
+ elif score >= 2.5: risk, verdict = "medium", "REVIEW NEEDED"
125
+ elif score >= 1.0: risk, verdict = "low", "MOSTLY SAFE"
126
+ else: risk, verdict = "safe", "CLEAN"
127
 
128
+ tips = [SMART_TIPS[c] for c in matched_categories if c in SMART_TIPS]
129
+ general = CODE_TIPS.get(language, ["Follow security best practices."])
130
+ tips += random.sample(general, min(max(0, 3 - len(tips)), len(general)))
131
 
132
  return {
133
  "score": score,
134
  "risk": risk,
135
+ "verdict": verdict,
136
+ "clean_confidence": clean_conf,
137
+ "vuln_confidence": vuln_conf,
138
  "findings": findings,
139
+ "tips": tips[:3],
140
+ "language": language,
141
+ "lines_analyzed": len(code.splitlines()),
142
+ }
143
+
144
+ # ── FastAPI app ────────────────────────────────────────────
145
app = FastAPI(
    title="PolyGuard API",
    description="Code vulnerability scanner powered by CodeBERT fine-tuned on CrossVUL.",
    version="3.0.0",
)

# The API is open to any origin. Credentials stay disabled because a
# wildcard origin must not be combined with allow_credentials=True: the
# FastAPI/Starlette CORS documentation requires explicit origins, methods and
# headers in that case, and the endpoints defined in this file do not read
# cookies or auth headers.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
158
+
159
+ class AnalyzeRequest(BaseModel):
160
+ code: str
161
+ language: str
162
+
163
+ class SnippetItem(BaseModel):
164
+ code: str
165
+ language: str
166
+
167
+ class BatchRequest(BaseModel):
168
+ snippets: List[SnippetItem]
169
+
170
+ @app.get("/")
171
+ def index():
172
+ return {
173
+ "service": "PolyGuard β€” Code Vulnerability Scanner",
174
+ "version": "3.0.0",
175
+ "model": MODEL_ID,
176
+ "status": "running",
177
+ "docs": "/docs",
178
+ "supported_languages": SUPPORTED_LANGUAGES,
179
+ "endpoints": ["/health", "/analyze", "/analyze_batch"],
180
+ }
181
+
182
+ @app.get("/health")
183
+ def health():
184
+ return {"status": "ok", "model": MODEL_ID, "supported_languages": SUPPORTED_LANGUAGES}
185
+
186
+ @app.post("/analyze")
187
+ def analyze(req: AnalyzeRequest):
188
+ return analyze_code(req.code, req.language)
189
+
190
+ @app.post("/analyze_batch")
191
+ def analyze_batch(req: BatchRequest):
192
+ results = [analyze_code(s.code, s.language) for s in req.snippets]
193
+ return {"count": len(results), "results": results}
requirements.txt CHANGED
@@ -2,4 +2,4 @@ fastapi
2
  uvicorn
3
  transformers
4
  torch
5
- pydantic
 
2
  uvicorn
3
  transformers
4
  torch
5
+ pydantic