# Source: Hugging Face Space file "app.py" by SpencerCPurdy (commit f795275, "Create app.py").
# AI-Powered Code Review Assistant
# Author: Spencer Purdy
# Description: An intelligent code review tool that analyzes Python and JavaScript code
# for best practices, security vulnerabilities, and performance improvements using
# CodeT5 and advanced pattern analysis.
# Import required libraries
import subprocess
import sys
import re
import ast
import json
from typing import List, Dict, Tuple
# Install required packages if not already installed
def install_packages():
    """Pip-install each third-party package the application depends on.

    Packages are installed one at a time with the interpreter that is running
    this script; ``subprocess.check_call`` raises ``CalledProcessError`` if any
    installation fails.
    """
    pip_install = [sys.executable, '-m', 'pip', 'install']
    for requirement in ('gradio', 'transformers', 'torch', 'sentencepiece'):
        # '-q' keeps pip's output quiet.
        subprocess.check_call(pip_install + [requirement, '-q'])
# Import the third-party stack; if any piece is missing, install everything
# via install_packages() and import again.
try:
    import gradio as gr
    import torch
    from transformers import RobertaTokenizer, T5ForConditionalGeneration
except ImportError:
    print("Installing required packages...")
    install_packages()
    import gradio as gr
    import torch
    from transformers import RobertaTokenizer, T5ForConditionalGeneration
# Load the CodeT5 tokenizer and weights once, at import time, so every
# analysis request reuses the same model instance.
print("Loading CodeT5 model...")
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-base')
model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-base')
model = model.to(device)
# Inference only: put the model in evaluation mode.
model.eval()
print(f"Model loaded successfully on {device}!")
class CodeAnalyzer:
    """
    Main class for analyzing code quality, security, and performance.

    Combines regular-expression heuristics, Python AST inspection and a
    best-effort CodeT5 summary to report issues in Python and JavaScript code.
    Every reported issue is a dict with the keys 'type', 'severity',
    'message' and 'line' (1-based; 0 means "no specific line").
    """

    # Both plain and async definitions count as functions for the AST checks.
    _FUNCTION_NODES = (ast.FunctionDef, ast.AsyncFunctionDef)
    # What parsing user-supplied source can raise: syntax errors, null bytes
    # (ValueError on older Pythons) and pathologically deep nesting.
    _PARSE_ERRORS = (SyntaxError, ValueError, RecursionError)
    # Security findings that are reported with 'high' severity.
    _HIGH_SEVERITY = frozenset({'sql_injection', 'command_injection', 'eval_usage'})
    # Whole-word conditional keywords; a substring count would also hit
    # identifiers such as "notify" or "diff" and count "elif" twice.
    _CONDITIONAL_KEYWORD = re.compile(r'\b(?:if|elif|else)\b')
    # Functions longer than this many lines are reported.
    _MAX_FUNCTION_LINES = 20

    def __init__(self):
        """Initialize analyzer with predefined patterns for security, performance, and best practices"""
        # Security vulnerability patterns for each language
        self.security_patterns = {
            'python': {
                'sql_injection': r'(execute|executemany)\s*\(\s*["\'].*%[sd].*["\'].*%',
                'command_injection': r'(os\.system|subprocess\.call|subprocess\.run)\s*\([^)]*\+[^)]*\)',
                'eval_usage': r'\beval\s*\(',
                'pickle_usage': r'pickle\.(load|loads)\s*\(',
                'hardcoded_secrets': r'(password|api_key|secret|token)\s*=\s*["\'][^"\']+["\']',
                'weak_random': r'random\.(random|randint|choice)\s*\(',
            },
            'javascript': {
                'eval_usage': r'\beval\s*\(',
                'innerHTML_xss': r'\.innerHTML\s*=',
                'sql_injection': r'query\s*\(\s*["\'].*\+.*["\']',
                'hardcoded_secrets': r'(password|apiKey|secret|token)\s*=\s*["\'][^"\']+["\']',
                # Loose equality only: the look-arounds keep === and !== from matching.
                'weak_comparison': r'(?<![=!])==(?!=)\s*(null|undefined)',
                'unsafe_regex': r'new\s+RegExp\s*\([^)]*\+[^)]*\)',
            }
        }
        # Performance issue patterns
        self.performance_patterns = {
            'python': {
                'nested_loops': r'for\s+.*:\s*\n\s*for\s+.*:',
                'string_concatenation': r'["\'].*["\']\s*\+\s*["\'].*["\']',
                'list_comprehension_opportunity': r'for\s+.*:\s*\n\s*.*\.append\(',
                'inefficient_contains': r'if\s+.*\s+in\s+.*list\(',
            },
            'javascript': {
                'nested_loops': r'for\s*\([^)]*\)\s*{\s*for\s*\(',
                'dom_in_loop': r'for\s*\([^)]*\)\s*{[^}]*document\.(getElementById|querySelector)',
                'string_concatenation': r'["\'].*["\']\s*\+\s*["\'].*["\']',
                'inefficient_array_method': r'\.(forEach|map|filter)\s*\([^)]*\)\s*\.(forEach|map|filter)',
            }
        }
        # Best practice checkers (mix of regex patterns and callable functions)
        self.best_practices = {
            'python': {
                'missing_docstring': self._check_missing_docstring,
                'long_functions': self._check_long_functions,
                'naming_convention': self._check_python_naming,
                'unused_variables': self._check_unused_variables,
            },
            'javascript': {
                'var_usage': r'\bvar\s+',
                'missing_semicolon': r'[^;]\s*\n\s*(let|const|return|if|for|while)',
                'console_log': r'console\.(log|error|warn)\(',
                'naming_convention': self._check_js_naming,
            }
        }

    def analyze_code(self, code: str, language: str) -> Dict[str, List[Dict]]:
        """
        Main analysis function that runs all checks on the provided code

        Args:
            code: Source code string to analyze
            language: Programming language ('python' or 'javascript'); any
                other value skips the language-specific checks

        Returns:
            Dictionary with categorized issues: security, performance, best_practices, ai_suggestions
        """
        results = {
            'security': [],
            'performance': [],
            'best_practices': [],
            'ai_suggestions': []
        }

        # Security analysis (case-insensitive regex patterns)
        for issue_name, pattern in self.security_patterns.get(language, {}).items():
            if not isinstance(pattern, str):
                continue
            severity = 'high' if issue_name in self._HIGH_SEVERITY else 'medium'
            issue = self._pattern_issue(
                code, issue_name, pattern, re.IGNORECASE,
                severity, self._get_security_message(issue_name)
            )
            if issue:
                results['security'].append(issue)

        # Performance analysis
        for issue_name, pattern in self.performance_patterns.get(language, {}).items():
            if not isinstance(pattern, str):
                continue
            issue = self._pattern_issue(
                code, issue_name, pattern, re.MULTILINE,
                'medium', self._get_performance_message(issue_name)
            )
            if issue:
                results['performance'].append(issue)

        # Best practices analysis (each checker is a regex or a callable)
        for issue_name, checker in self.best_practices.get(language, {}).items():
            if callable(checker):
                results['best_practices'].extend(checker(code))
            elif isinstance(checker, str):
                issue = self._pattern_issue(
                    code, issue_name, checker, 0,
                    'low', self._get_best_practice_message(issue_name)
                )
                if issue:
                    results['best_practices'].append(issue)

        # Heuristic and CodeT5-based suggestions
        ai_suggestions = self._get_ai_suggestions(code, language)
        if ai_suggestions:
            results['ai_suggestions'] = ai_suggestions
        return results

    @staticmethod
    def _line_at(code: str, offset: int) -> int:
        """Return the 1-based number of the line containing character *offset*."""
        return code.count('\n', 0, offset) + 1

    def _pattern_issue(self, code: str, issue_name: str, pattern: str, flags: int,
                       severity: str, message: str):
        """
        Build the issue dict for the first match of *pattern*, or return None.

        The line number is taken from the very match that triggered the
        issue, so detection and location can never disagree about flags.
        """
        match = re.search(pattern, code, flags)
        if match is None:
            return None
        return {
            'type': issue_name.replace('_', ' ').title(),
            'severity': severity,
            'message': message,
            'line': self._line_at(code, match.start())
        }

    @staticmethod
    def _info(kind: str, message: str) -> Dict:
        """Build an informational suggestion that is not tied to a line."""
        return {'type': kind, 'severity': 'info', 'message': message, 'line': 0}

    def _check_missing_docstring(self, code: str) -> List[Dict]:
        """
        Check Python code for functions and classes missing docstrings
        Uses AST parsing to analyze code structure; returns [] if the code
        cannot be parsed.
        """
        try:
            tree = ast.parse(code)
        except self._PARSE_ERRORS:
            # Unparseable source: skip this check
            return []
        documented_kinds = self._FUNCTION_NODES + (ast.ClassDef,)
        return [
            {
                'type': 'Missing Docstring',
                'severity': 'low',
                'message': f'Function/Class "{node.name}" should have a docstring explaining its purpose',
                'line': node.lineno
            }
            for node in ast.walk(tree)
            if isinstance(node, documented_kinds) and not ast.get_docstring(node)
        ]

    def _check_long_functions(self, code: str) -> List[Dict]:
        """
        Identify functions that exceed recommended length (20 lines)
        Long functions are harder to understand and maintain.
        Returns [] if the code cannot be parsed.
        """
        try:
            tree = ast.parse(code)
        except self._PARSE_ERRORS:
            return []
        issues = []
        for node in ast.walk(tree):
            if not isinstance(node, self._FUNCTION_NODES):
                continue
            # Both line numbers are inclusive, hence the +1.
            func_lines = node.end_lineno - node.lineno + 1
            if func_lines > self._MAX_FUNCTION_LINES:
                issues.append({
                    'type': 'Long Function',
                    'severity': 'medium',
                    'message': f'Function "{node.name}" is {func_lines} lines long. Consider breaking it into smaller functions.',
                    'line': node.lineno
                })
        return issues

    def _check_python_naming(self, code: str) -> List[Dict]:
        """Check for Python naming convention violations (should use snake_case)"""
        # camelCase name followed by '=' but not '==' (a comparison is not an assignment)
        camel_case_pattern = r'\b[a-z]+[A-Z]\w*\s*=(?!=)'
        return [
            {
                'type': 'Naming Convention',
                'severity': 'low',
                'message': 'Use snake_case for variable names in Python (e.g., my_variable instead of myVariable)',
                'line': self._line_at(code, match.start())
            }
            for match in re.finditer(camel_case_pattern, code)
        ]

    def _check_js_naming(self, code: str) -> List[Dict]:
        """Check for JavaScript naming convention violations (should use camelCase)"""
        snake_case_pattern = r'(let|const|var)\s+[a-z]+_[a-z]+\s*='
        return [
            {
                'type': 'Naming Convention',
                'severity': 'low',
                'message': 'Use camelCase for variable names in JavaScript (e.g., myVariable instead of my_variable)',
                'line': self._line_at(code, match.start())
            }
            for match in re.finditer(snake_case_pattern, code)
        ]

    def _check_unused_variables(self, code: str) -> List[Dict]:
        """
        Detect variables that are assigned but never used in Python code
        Unused variables can indicate dead code or incomplete refactoring.

        Only plain ``name = value`` assignments are tracked, names are compared
        file-wide (no scope analysis) and ``_`` is ignored. Issues are ordered
        by the line of the earliest assignment. Returns [] if the code cannot
        be parsed.
        """
        try:
            tree = ast.parse(code)
        except self._PARSE_ERRORS:
            return []
        first_assigned = {}  # variable name -> earliest assignment line
        used_vars = set()
        for node in ast.walk(tree):
            if isinstance(node, ast.Assign):
                for target in node.targets:
                    if isinstance(target, ast.Name):
                        # ast.walk is breadth-first, so keep the minimum explicitly.
                        first_assigned[target.id] = min(
                            first_assigned.get(target.id, node.lineno), node.lineno
                        )
            elif isinstance(node, ast.Name) and isinstance(node.ctx, ast.Load):
                used_vars.add(node.id)
        unused = sorted(
            (line, name)
            for name, line in first_assigned.items()
            if name != '_' and name not in used_vars
        )
        return [
            {
                'type': 'Unused Variable',
                'severity': 'low',
                'message': f'Variable "{name}" is defined but never used',
                'line': line
            }
            for line, name in unused
        ]

    def _find_line_number(self, code: str, pattern: str,
                          flags: int = re.MULTILINE | re.IGNORECASE) -> int:
        """
        Find the line number where a regex pattern first matches

        Returns the 1-based line of the first match, or 0 when there is none.
        Pass the same *flags* used for detection to get a consistent answer.
        """
        match = re.search(pattern, code, flags)
        if match:
            return self._line_at(code, match.start())
        return 0

    def _get_security_message(self, issue_type: str) -> str:
        """Return detailed explanation for security issues"""
        messages = {
            'sql_injection': 'Potential SQL injection vulnerability. Use parameterized queries or prepared statements instead of string concatenation.',
            'command_injection': 'Potential command injection. Never use user input directly in system commands. Sanitize and validate all inputs.',
            'eval_usage': 'Using eval() is dangerous and can lead to code injection. Consider using ast.literal_eval() or alternative approaches.',
            'pickle_usage': 'Pickle can execute arbitrary code during deserialization. Only unpickle data from trusted sources.',
            'hardcoded_secrets': 'Hardcoded credentials detected. Use environment variables or secure configuration management.',
            'weak_random': 'Using non-cryptographic randomness for security. Use secrets module for tokens, passwords, or security-sensitive operations.',
            'innerHTML_xss': 'Setting innerHTML with user data can lead to XSS attacks. Use textContent or properly sanitize input.',
            'weak_comparison': 'Use === instead of == to avoid JavaScript type coercion issues.',
            'unsafe_regex': 'Dynamic regex creation can lead to ReDoS attacks. Validate and escape user input carefully.',
        }
        return messages.get(issue_type, 'Security issue detected.')

    def _get_performance_message(self, issue_type: str) -> str:
        """Return detailed explanation for performance issues"""
        messages = {
            'nested_loops': 'Nested loops can have O(n²) complexity. Consider using more efficient algorithms or data structures like sets or dictionaries.',
            'string_concatenation': 'String concatenation in loops is inefficient. Use join() in Python or template literals in JavaScript.',
            'list_comprehension_opportunity': 'This loop pattern could be replaced with a more efficient and readable list comprehension.',
            'inefficient_contains': 'Checking membership in a list is O(n). Consider using a set for O(1) lookups if checking multiple times.',
            'dom_in_loop': 'DOM manipulation inside loops causes reflows. Batch operations or use DocumentFragment for better performance.',
            'inefficient_array_method': 'Chaining array methods creates intermediate arrays. Consider combining operations or using a single reduce().',
        }
        return messages.get(issue_type, 'Performance issue detected.')

    def _get_best_practice_message(self, issue_type: str) -> str:
        """Return detailed explanation for best practice violations"""
        messages = {
            'var_usage': 'Use let or const instead of var for block scoping and to prevent hoisting issues.',
            'missing_semicolon': 'Missing semicolon. While JavaScript has ASI, explicit semicolons prevent potential errors.',
            'console_log': 'Remove console statements before production deployment or use a proper logging library.',
        }
        return messages.get(issue_type, 'Best practice violation detected.')

    def _get_ai_suggestions(self, code: str, language: str) -> List[Dict]:
        """
        Generate high-level improvement suggestions for the code

        Combines language-specific heuristics, a best-effort CodeT5 summary
        and general readability/complexity checks. Never raises: on an
        unexpected error a note is printed and the suggestions gathered so
        far are returned.
        """
        suggestions = []
        try:
            if language == 'python':
                suggestions.extend(self._python_suggestions(code))
            elif language == 'javascript':
                suggestions.extend(self._javascript_suggestions(code))
            suggestions.extend(self._general_suggestions(code))
        except Exception as e:
            # Don't fail the entire analysis if suggestion generation fails
            print(f"AI suggestion generation note: {e}")
        return suggestions

    def _summarize_with_model(self, code: str, language: str) -> str:
        """
        Ask the module-level CodeT5 model to summarize the first 200 characters of *code*.

        Returns the decoded text, or '' when the model is unavailable or
        generation fails, so callers can keep their heuristic suggestions.
        """
        # NOTE(review): 'Salesforce/codet5-base' looks like the pre-trained
        # checkpoint rather than a summarization fine-tune - confirm whether a
        # summarization checkpoint was intended.
        try:
            prompt = f"summarize {language}: {code[:200]}"
            inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True).to(device)
            with torch.no_grad():
                # Beam search without sampling; 'temperature' was dropped because
                # it only applies when do_sample=True.
                outputs = model.generate(
                    inputs.input_ids,
                    max_length=60,
                    num_beams=4,
                    early_stopping=True
                )
            return tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
        except Exception as e:
            # Best-effort boundary around the third-party model
            print(f"AI suggestion generation note: {e}")
            return ''

    def _python_suggestions(self, code: str) -> List[Dict]:
        """Return Python-specific suggestions (structure-based and idiom-based)."""
        found = []
        try:
            tree = ast.parse(code)
        except self._PARSE_ERRORS:
            tree = None
        if tree is not None:
            functions = [
                node.name for node in ast.walk(tree)
                if isinstance(node, self._FUNCTION_NODES)
            ]
            if functions:
                summary = self._summarize_with_model(code, 'python')
                if summary:
                    found.append(self._info(
                        'Code Summary',
                        f'CodeT5 summary of the submitted code: {summary}'
                    ))
                lowered = code.lower()
                if 'database' in lowered or 'query' in lowered:
                    found.append(self._info(
                        'Architecture Suggestion',
                        'Consider implementing a data access layer or using an ORM like SQLAlchemy to abstract database operations and prevent SQL injection.'
                    ))
                if len(functions) > 3:
                    found.append(self._info(
                        'Code Organization',
                        f'This code contains {len(functions)} functions. Consider organizing related functions into classes for better code organization.'
                    ))
        # Idiom checks work on the raw text, so they run even if parsing failed
        if 'for i in range(len(' in code:
            found.append(self._info(
                'Pythonic Code',
                'Use enumerate() for index-value iteration: for i, item in enumerate(list) instead of range(len())'
            ))
        if re.search(r'except\s*:', code):
            found.append(self._info(
                'Error Handling',
                'Avoid bare except clauses. Specify exception types for better error handling.'
            ))
        return found

    def _javascript_suggestions(self, code: str) -> List[Dict]:
        """Return JavaScript-specific suggestions."""
        found = []
        lowered = code.lower()
        if 'callback' in lowered and 'function' in code:
            found.append(self._info(
                'Modern JavaScript',
                'Consider using Promises or async/await instead of callbacks for better readability and error handling.'
            ))
        if code.count('getElementById') > 2:
            found.append(self._info(
                'Performance Tip',
                'Cache DOM references when accessing the same element multiple times to improve performance.'
            ))
        summary = self._summarize_with_model(code, 'javascript')
        if summary:
            found.append(self._info(
                'Code Summary',
                f'CodeT5 summary of the submitted code: {summary}'
            ))
        if 'array' in lowered or 'foreach' in lowered:
            found.append(self._info(
                'Functional Programming',
                'Consider using functional array methods (map, filter, reduce) for cleaner and more expressive code.'
            ))
        return found

    def _general_suggestions(self, code: str) -> List[Dict]:
        """Return language-independent readability and complexity suggestions."""
        found = []
        # default=0 keeps blank input from raising on an empty sequence
        longest = max((len(line) for line in code.split('\n') if line.strip()), default=0)
        if longest > 100:
            found.append(self._info(
                'Code Readability',
                'Some lines exceed 100 characters. Consider breaking long lines for better readability.'
            ))
        if len(self._CONDITIONAL_KEYWORD.findall(code)) > 5:
            found.append(self._info(
                'Complexity Warning',
                'High conditional complexity detected. Consider refactoring using early returns or extracting complex logic into separate functions.'
            ))
        return found
def _format_issue(marker: str, issue: Dict) -> List[str]:
    """Render one issue as two report lines: a headline and an indented message."""
    line = issue.get('line')
    # 0 (or a missing value) means the check could not attribute a line.
    location = f" (Line {line})" if line else ""
    return [
        f"{marker} **{issue['type']}**{location}",
        f" → {issue['message']}\n",
    ]


def format_results(results: Dict[str, List[Dict]]) -> str:
    """
    Format analysis results into a readable markdown report
    Uses emojis and formatting for clear visual hierarchy

    Args:
        results: Mapping of category ('security', 'performance',
            'best_practices', 'ai_suggestions') to a list of issue dicts with
            'type', 'severity', 'message' and 'line'. Missing categories are
            treated as empty.

    Returns:
        Markdown text: one section per non-empty category followed by a
        summary, or a single success line when nothing was found.
    """
    if not any(results.values()):
        return "✅ **Excellent!** Your code looks great - no significant issues found."

    security = results.get('security', [])
    performance = results.get('performance', [])
    best_practices = results.get('best_practices', [])
    ai_suggestions = results.get('ai_suggestions', [])
    output = []

    # Security section (highest priority)
    if security:
        output.append("## 🔒 Security Issues")
        output.append("*These require immediate attention:*\n")
        for issue in security:
            marker = "🔴" if issue['severity'] == 'high' else "🟡"
            output.extend(_format_issue(marker, issue))

    # Performance section
    if performance:
        output.append("## ⚡ Performance Issues")
        output.append("*Optimize these for better efficiency:*\n")
        for issue in performance:
            output.extend(_format_issue("🟡", issue))

    # Best practices section
    if best_practices:
        output.append("## 📝 Best Practices")
        output.append("*Follow these for cleaner, more maintainable code:*\n")
        for issue in best_practices:
            output.extend(_format_issue("🔵", issue))

    # AI suggestions section
    if ai_suggestions:
        output.append("## 🤖 AI-Powered Insights")
        output.append("*Advanced suggestions from CodeT5 analysis:*\n")
        for suggestion in ai_suggestions:
            output.append(f"💡 **{suggestion['type']}**: {suggestion['message']}\n")

    # Summary statistics
    total_issues = sum(len(v) for v in results.values())
    high_severity = sum(1 for issue in security if issue['severity'] == 'high')
    output.append("---")
    output.append(f"**📊 Summary**: {total_issues} total suggestions found")
    if high_severity > 0:
        output.append(f"**⚠️ Critical**: {high_severity} high-severity security issues need immediate attention!")
    return "\n".join(output)
def analyze_code_handler(code: str, language: str) -> str:
    """
    Main handler function called by Gradio interface
    Coordinates the analysis and formats results for display

    Args:
        code: Source text from the editor; None or blank text yields a prompt
            to enter code.
        language: Selected language label, matched case-insensitively against
            'python' and 'javascript'.

    Returns:
        A markdown report, or a short markdown message when the input is
        empty, the language is unsupported, or the analysis raised.
    """
    # The editor may hand over None instead of an empty string.
    if not code or not code.strip():
        return "❗ Please enter some code to analyze."

    # Validate language selection before doing any work
    language_key = (language or "").lower()
    if language_key not in ('python', 'javascript'):
        return "⚠️ Currently supporting Python and JavaScript. More languages coming soon!"

    try:
        analyzer = CodeAnalyzer()
        results = analyzer.analyze_code(code, language_key)
        return format_results(results)
    except Exception as e:
        # UI boundary: report the failure in the output pane instead of raising
        return f"❌ An error occurred during analysis: {str(e)}\n\nPlease check your code syntax and try again."
def create_interface():
    """
    Create and configure the Gradio web interface
    Provides an intuitive UI for code analysis with examples

    Returns:
        The assembled gr.Blocks application (not yet launched). The
        "Analyze Code" button is wired to analyze_code_handler.
    """
    # Example code snippets demonstrating various issues
    python_example = '''def process_user_data(user_id):
    # Fetch user data from database
    query = "SELECT * FROM users WHERE id = " + user_id
    result = db.execute(query)
    password = "admin123"
    data = []
    for row in result:
        data.append(row)
    # Process each item
    for i in range(len(data)):
        if data[i]['status'] == True:
            print(data[i])
    return data'''

    javascript_example = '''function fetchUserData(userId) {
    var apiKey = "sk-1234567890abcdef";
    // Get user element
    for (var i = 0; i < users.length; i++) {
        document.getElementById('user-' + i).innerHTML = users[i].name;
    }
    // Check user status
    if (userStatus == null) {
        console.log("User not found");
    }
    var query = "SELECT * FROM users WHERE id = " + userId;
    return db.query(query)
}'''

    # Markdown is kept flush-left so no line is rendered as an indented code
    # block; blank lines separate headings, paragraphs and lists.
    header_markdown = """
# 🤖 AI-Powered Code Review Assistant

**Instantly analyze your code for security vulnerabilities, performance issues, and best practices!**

This tool uses advanced pattern matching and the CodeT5 AI model to provide comprehensive code analysis for Python and JavaScript.

### ✨ Features

- 🔒 **Security Analysis**: Detect SQL injection, XSS, hardcoded secrets, and more
- ⚡ **Performance Optimization**: Identify inefficient patterns and algorithms
- 📝 **Best Practices**: Ensure clean, maintainable code following language conventions
- 🤖 **AI Insights**: Get intelligent suggestions powered by CodeT5 transformer model
"""

    footer_markdown = """
---

### 🎯 How to Use

1. **Paste** your Python or JavaScript code in the editor
2. **Select** the appropriate programming language
3. **Click** "Analyze Code" to run the analysis
4. **Review** the categorized feedback and improve your code!

### 💡 Tips

- The tool works best with complete functions or code blocks
- Line numbers help you quickly locate issues in your code
- Security issues (🔴) should be fixed immediately
- Use the AI insights for high-level code improvements

---

👨‍💻 **Created by Spencer Purdy** | Computer Science @ Auburn University

[GitHub](https://github.com/spencercpurdy) | [LinkedIn](https://linkedin.com/in/spencerpurdy) | [Hugging Face](https://huggingface.co/spencercpurdy)
"""

    # Build Gradio interface with custom theme
    with gr.Blocks(title="AI Code Review Assistant", theme=gr.themes.Soft()) as interface:
        # Header section
        gr.Markdown(header_markdown)

        # Main content area with two columns
        with gr.Row():
            # Left column - Input
            with gr.Column():
                code_input = gr.Code(
                    label="📝 Enter your code here",
                    language="python",
                    lines=15,
                    value=python_example
                )
                language_select = gr.Radio(
                    choices=["Python", "JavaScript"],
                    value="Python",
                    label="🔤 Select Language"
                )
                analyze_btn = gr.Button("🔍 Analyze Code", variant="primary", size="lg")
                # Example section
                gr.Examples(
                    examples=[
                        [python_example, "Python"],
                        [javascript_example, "JavaScript"]
                    ],
                    inputs=[code_input, language_select],
                    label="📚 Try These Examples"
                )
            # Right column - Output
            with gr.Column():
                output = gr.Markdown(
                    label="📊 Analysis Results",
                    value="*Your analysis results will appear here...*"
                )

        # Footer with instructions and attribution
        gr.Markdown(footer_markdown)

        # Connect the analyze button to the handler function
        analyze_btn.click(
            fn=analyze_code_handler,
            inputs=[code_input, language_select],
            outputs=output
        )
    return interface
# Script entry point: build the UI and serve it.
if __name__ == "__main__":
    interface = create_interface()
    # share=True requests a public link; debug=True enables debug mode.
    interface.launch(share=True, debug=True)