CodeCommunity committed on
Commit
1716602
·
verified ·
1 Parent(s): fdae303

Update app/services/reviewer_service.py

Browse files
Files changed (1) hide show
  1. app/services/reviewer_service.py +81 -53
app/services/reviewer_service.py CHANGED
@@ -1,12 +1,11 @@
1
  import json
2
  import logging
3
- import re # For better cleanup
4
-
5
  from app.core.model_loader import llm_engine
6
 
7
  logger = logging.getLogger(__name__)
8
 
9
-
10
class AIReviewerService:
    """Runs LLM-based security reviews over batches of source files."""

    def __init__(self):
        pass

    def review_batch_code(self, files: list) -> list:
        """Review every file in `files` and return one result dict per file.

        Each input object is expected to expose `.fileName` and `.content`
        (assumed from usage here — confirm against the caller). Output order
        follows input order; a failed batch yields empty fallback entries
        rather than raising.
        """
        reviews = []

        # Batches of 5 keep us inside the free-tier request budget (15 RPM):
        # one request covers five files.
        chunk_size = 5

        for offset in range(0, len(files), chunk_size):
            chunk = files[offset : offset + chunk_size]
            names = []
            bundle = ""

            for item in chunk:
                # Minified source keeps the prompt small (comments/blanks gone).
                source = item.content or ""
                # Cap each file at 6k chars so five files fit the context window.
                compact = self._minify_code(source[:6000], item.fileName)

                bundle += f"\n--- FILE: {item.fileName} ---\n{compact}\n"
                names.append(item.fileName)

            prompt = f"""
            Analyze {len(chunk)} files:
            {bundle}

            Task: Detect severe security/logic issues.
            Output JSON array (1 obj/file):
            [{{"fileName": "path", "vulnerabilities": [{{"type": "SQLi", "line": 10, "description": "text"}}], "metrics": {{"complexity": 1-10, "maintainability": 1-10}}}}]
            """
            try:
                # 8k output tokens comfortably covers five files.
                raw = llm_engine.generate(prompt, max_tokens=8192)
                parsed = self._parse_json(raw)

                # Index the model output by file name for validation below.
                by_name = {entry.get('fileName'): entry for entry in parsed if isinstance(entry, dict)}

                for name in names:
                    if name in by_name:
                        entry = by_name[name]
                        # Guarantee the keys downstream consumers rely on.
                        entry.setdefault("vulnerabilities", [])
                        entry.setdefault("metrics", {})
                        reviews.append(entry)
                    else:
                        # The model omitted this file from its JSON list.
                        reviews.append({"fileName": name, "vulnerabilities": [], "metrics": {}})

            except Exception as e:
                logger.error(f"Batch error: {e}")
                for name in names:
                    reviews.append(
                        {"fileName": name, "vulnerabilities": [], "metrics": {}}
                    )

        return reviews

    def _minify_code(self, code: str, filename: str) -> str:
        """
        Aggressively shrink `code` before it goes into a prompt.

        Discards blank lines, full-line and inline comments, and
        logging/print statements — high token cost, little review value.
        NOTE(review): the inline-comment split is a heuristic; it can
        truncate string literals containing ' #' or ' //'.
        """
        kept = []

        # Pick the comment convention from the file extension.
        is_python = filename.endswith('.py')
        is_js_style = filename.endswith(('.js', '.ts', '.jsx', '.tsx', '.java', '.c', '.cpp', '.cs', '.go', '.rs', '.php'))

        for raw in code.split('\n'):
            trimmed = raw.strip()

            # Blank line.
            if not trimmed:
                continue

            # Full-line comment.
            if is_python and trimmed.startswith('#'):
                continue
            if is_js_style and trimmed.startswith(('//', '/*', '*')):
                continue

            # Logging/print noise (high token cost, low security relevance).
            if ('console.log' in trimmed or 'print(' in trimmed
                    or 'logger.' in trimmed or 'System.out.print' in trimmed):
                continue

            # Inline comments; the ' //' / ' #' guards keep URLs like http:// intact.
            line = raw
            if is_python and ' #' in line:
                line = line.split(' #', 1)[0]

            if is_js_style and ' //' in line:
                line = line.split(' //', 1)[0]

            # The inline strip may have emptied the line.
            if not line.strip():
                continue

            kept.append(line.rstrip())

        return '\n'.join(kept)

    def _parse_json(self, text: str):
        """Best-effort conversion of an LLM reply into a JSON value ([] on failure)."""
        text = text.strip()
        if not text:
            return []

        # Chop junk the model printed before the first '[' / after the last ']'.
        text = re.sub(r"^[^[]*\[", "[", text)
        text = re.sub(r"\][^]]*$", "]", text)
        # Also drop markdown code fences.
        text = text.replace("```json", "").replace("```", "").strip()

        try:
            return json.loads(text)
        except json.JSONDecodeError as e:
            logger.warning(f"JSON error: {e} | Raw start: {text[:200]}...")
            return []
 
 
 
 
1
  import json
2
  import logging
3
+ import re
4
+ from typing import List, Dict
5
  from app.core.model_loader import llm_engine
6
 
7
  logger = logging.getLogger(__name__)
8
 
 
9
class AIReviewerService:
    """Batch code reviewer backed by the shared LLM engine.

    Sends files to the model in batches of five and returns one dict per
    file with `fileName`, `vulnerabilities`, `improvement_suggestions`,
    and `metrics` keys, suitable for JSON serialization to the client.
    """

    def __init__(self):
        pass

    def review_batch_code(self, files: list) -> list:
        """Review `files` (objects exposing `.fileName` and `.content` —
        assumed from usage here; confirm against the caller).

        Returns exactly one result dict per input file, in input order.
        Never raises: a failed batch yields zeroed fallback entries.
        """
        results = []

        # Process in batches of 5 to stay within Gemini Free Tier limits (15 RPM)
        batch_size = 5

        for i in range(0, len(files), batch_size):
            batch = files[i : i + batch_size]
            combined_code = ""
            file_names = []

            for f in batch:
                # Minify code to save tokens
                raw_content = f.content or ""
                # Limit to 6k chars per file to stay within context window safely
                minified_content = self._minify_code(raw_content[:6000], f.fileName)

                combined_code += f"\n--- FILE: {f.fileName} ---\n{minified_content}\n"
                file_names.append(f.fileName)

            prompt = f"""
Analyze the following {len(batch)} source code files for security vulnerabilities and code quality.
{combined_code}

Task:
1. Detect severe security/logic issues (Vulnerabilities).
2. If a vulnerability is found, provide a concise 'suggestion' on how to fix it.
3. If NO vulnerabilities are found in a file, provide a list of 'improvement_suggestions' (clean code, performance, or architecture tips).
4. Provide metrics for complexity and maintainability (scale 1-10).

Output a JSON array (exactly 1 object per file):
[
  {{
    "fileName": "exact/path/from/header",
    "vulnerabilities": [
      {{
        "type": "SQLi/Logic/etc",
        "line": 10,
        "description": "Short explanation",
        "suggestion": "Specific code fix"
      }}
    ],
    "improvement_suggestions": ["Tip 1", "Tip 2"],
    "metrics": {{"complexity": 3, "maintainability": 8}}
  }}
]
"""
            # BUGFIX: keep the try narrow — only the LLM call and parsing can
            # legitimately fail. Previously the try also wrapped result
            # normalization, so one malformed model entry (e.g. a string in
            # "vulnerabilities") aborted the batch AND appended fallbacks for
            # files already added, producing duplicate results.
            batch_failed = False
            try:
                # 8k output tokens for the batch analysis
                response_text = llm_engine.generate(prompt, max_tokens=8192)
                batch_results = self._parse_json(response_text)
            except Exception as e:
                logger.error(f"Batch processing error: {e}")
                batch_results = []
                batch_failed = True

            # Map results by fileName for easy lookup
            processed_map = {item.get('fileName'): item for item in batch_results if isinstance(item, dict)}

            for fn in file_names:
                if fn in processed_map:
                    results.append(self._normalize_result(processed_map[fn]))
                else:
                    # Batch failure or the AI skipped this file in its response.
                    results.append(self._fallback_result(fn, batch_failed))

        return results

    @staticmethod
    def _normalize_result(res: dict) -> dict:
        """Ensure all required keys exist (prevents client serialization errors)
        and drop malformed vulnerability entries so they cannot raise later."""
        res.setdefault("improvement_suggestions", [])

        # Replace missing/non-dict metrics with neutral mid-scale values.
        if not isinstance(res.get("metrics"), dict):
            res["metrics"] = {"complexity": 5, "maintainability": 5}

        vulns = res.get("vulnerabilities")
        if not isinstance(vulns, list):
            vulns = []
        cleaned = []
        for vuln in vulns:
            # Skip non-dict entries (e.g. bare strings the model sometimes emits).
            if not isinstance(vuln, dict):
                continue
            # Ensure every vulnerability has a suggestion field.
            vuln.setdefault("suggestion", "Review the implementation logic for improved safety.")
            cleaned.append(vuln)
        res["vulnerabilities"] = cleaned
        return res

    @staticmethod
    def _fallback_result(file_name: str, failed: bool) -> dict:
        """Placeholder result for a file the model skipped (clean defaults)
        or whose batch call failed entirely (zeroed metrics)."""
        if failed:
            return {
                "fileName": file_name,
                "vulnerabilities": [],
                "improvement_suggestions": [],
                "metrics": {"complexity": 0, "maintainability": 0},
            }
        return {
            "fileName": file_name,
            "vulnerabilities": [],
            "improvement_suggestions": ["No immediate improvements identified."],
            "metrics": {"complexity": 1, "maintainability": 10},
        }

    def _minify_code(self, code: str, filename: str) -> str:
        """
        Removes comments, empty lines, and logs to optimize token usage.

        Heuristic and line-based: it can truncate string literals that
        contain ' #' (Python) or ' //' (C-style), and it does not track
        multi-line /* */ bodies — an accepted trade-off for prompt size.
        """
        lines = code.split('\n')
        cleaned_lines = []

        is_python = filename.endswith('.py')
        # Full C-style family: restores the .cs/.go/.rs/.php coverage that was
        # dropped in this revision, and keeps the newly added .kt.
        is_js_style = filename.endswith((
            '.js', '.ts', '.jsx', '.tsx', '.java', '.kt',
            '.c', '.cpp', '.cs', '.go', '.rs', '.php',
        ))

        for line in lines:
            stripped = line.strip()

            if not stripped:
                continue

            # Skip full-line comments
            if is_python and stripped.startswith('#'):
                continue
            if is_js_style and stripped.startswith(('//', '/*', '*')):
                continue

            # Skip common logging (high token cost, low security relevance)
            if any(log in stripped for log in ('console.log', 'print(', 'logger.', 'Log.d', 'Log.e')):
                continue

            # Strip inline comments; ' #' / ' //' guards keep URLs like http:// intact
            if is_python and ' #' in line:
                line = line.split(' #', 1)[0]
            if is_js_style and ' //' in line:
                line = line.split(' //', 1)[0]

            # Drop lines emptied by the inline-comment strip
            if not line.strip():
                continue

            cleaned_lines.append(line.rstrip())

        return '\n'.join(cleaned_lines)

    def _parse_json(self, text: str):
        """
        Clean and parse the LLM response; always returns a list.

        A JSON array is returned as-is, a lone JSON object is wrapped in a
        one-element list (previously such responses were silently dropped
        downstream), and anything unparseable yields [].
        """
        text = text.strip()
        if not text:
            return []

        # BUGFIX: strip markdown fences BEFORE locating the array, so
        # '```json' prefixes cannot confuse the boundary search (the old
        # regex trimming ran first and produced invalid JSON when trailing
        # junk contained ']').
        cleaned = text.replace("```json", "").replace("```", "").strip()
        if not cleaned:
            return []

        # Candidate payloads: the outermost [...] span (drops surrounding
        # prose), then the whole cleaned text as a fallback.
        candidates = []
        start, end = cleaned.find('['), cleaned.rfind(']')
        if start != -1 and end > start:
            candidates.append(cleaned[start:end + 1])
        candidates.append(cleaned)

        for candidate in candidates:
            try:
                data = json.loads(candidate)
            except json.JSONDecodeError:
                continue
            if isinstance(data, list):
                return data
            if isinstance(data, dict):
                # Model returned a single object instead of an array.
                return [data]

        logger.warning(f"JSON Decode Error: unparseable response | Raw start: {cleaned[:200]}")
        return []


# Instantiate the service
service = AIReviewerService()