Spaces:

SpencerCPurdy
/

AI-Powered-Code-Review-Assistant

Running

App Files Files Community

AI-Powered-Code-Review-Assistant / app.py

SpencerCPurdy

Create app.py

f795275 verified 19 days ago

raw

history blame contribute delete

29.7 kB

	# AI-Powered Code Review Assistant
	# Author: Spencer Purdy
	# Description: An intelligent code review tool that analyzes Python and JavaScript code
	# for best practices, security vulnerabilities, and performance improvements using
	# CodeT5 and advanced pattern analysis.

	# Import required libraries
	import subprocess
	import sys
	import re
	import ast
	import json
	from typing import List, Dict, Tuple

	# Bootstrap helper: pip-install the third-party dependencies on demand
	def install_packages():
	    """Install each third-party package the app needs, one quiet pip call per package.

	    Uses the running interpreter (``sys.executable -m pip``) so the packages
	    land in the active environment. Raises ``subprocess.CalledProcessError``
	    if any pip invocation exits non-zero.
	    """
	    pip_install = [sys.executable, '-m', 'pip', 'install']
	    for requirement in ['gradio', 'transformers', 'torch', 'sentencepiece']:
	        subprocess.check_call(pip_install + [requirement, '-q'])

	# Third-party imports. If any of them is missing, install the dependency
	# set once and then repeat the same imports.
	try:
	    import gradio as gr
	    from transformers import RobertaTokenizer, T5ForConditionalGeneration
	    import torch
	except ImportError:
	    print("Installing required packages...")
	    install_packages()
	    import gradio as gr
	    from transformers import RobertaTokenizer, T5ForConditionalGeneration
	    import torch

	# Load the CodeT5 tokenizer and model once at import time and place the
	# model on the GPU when one is available, otherwise on the CPU.
	print("Loading CodeT5 model...")
	if torch.cuda.is_available():
	    device = torch.device("cuda")
	else:
	    device = torch.device("cpu")
	tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-base')
	model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-base')
	model = model.to(device)
	model.eval()  # inference mode
	print(f"Model loaded successfully on {device}!")

	class CodeAnalyzer:
	    """
	    Static analyzer for Python and JavaScript snippets.

	    Findings come from three sources: regular-expression tables (security,
	    performance and some JavaScript best practices), AST-based checkers for
	    Python, and keyword heuristics reported as "AI suggestions".

	    Every finding is a dict with the keys 'type', 'severity', 'message' and
	    'line' (1-based; 0 means the location is unknown).
	    """

	    # Security findings reported with 'high' severity; all others are 'medium'.
	    HIGH_SEVERITY_SECURITY = frozenset({'sql_injection', 'command_injection', 'eval_usage'})

	    # Functions spanning more than this many source lines are reported as long.
	    MAX_FUNCTION_LINES = 20

	    # AST node types treated as "functions" by the Python checkers.
	    _FUNCTION_NODES = (ast.FunctionDef, ast.AsyncFunctionDef)

	    def __init__(self):
	        """Build the per-language pattern tables and the best-practice checker registry."""

	        # NOTE: alternation must be a bare `|` -- an escaped `\|` matches a
	        # literal pipe character and silently disables the pattern.

	        # Security vulnerability patterns for each language
	        self.security_patterns = {
	            'python': {
	                'sql_injection': r'(execute|executemany)\s*\(\s*["\'].*%[sd].*["\'].*%',
	                'command_injection': r'(os\.system|subprocess\.call|subprocess\.run)\s*\([^)]*\+[^)]*\)',
	                'eval_usage': r'\beval\s*\(',
	                'pickle_usage': r'pickle\.(load|loads)\s*\(',
	                'hardcoded_secrets': r'(password|api_key|secret|token)\s*=\s*["\'][^"\']+["\']',
	                'weak_random': r'random\.(random|randint|choice)\s*\(',
	            },
	            'javascript': {
	                'eval_usage': r'\beval\s*\(',
	                'innerHTML_xss': r'\.innerHTML\s*=',
	                'sql_injection': r'query\s*\(\s*["\'].*\+.*["\']',
	                'hardcoded_secrets': r'(password|apiKey|secret|token)\s*=\s*["\'][^"\']+["\']',
	                # The look-arounds keep `===` and `!==` from being reported.
	                'weak_comparison': r'(?<![=!])==(?!=)\s*(null|undefined)',
	                'unsafe_regex': r'new\s+RegExp\s*\([^)]*\+[^)]*\)',
	            },
	        }

	        # Performance issue patterns
	        self.performance_patterns = {
	            'python': {
	                'nested_loops': r'for\s+.*:\s*\n\s*for\s+.*:',
	                'string_concatenation': r'["\'].*["\']\s*\+\s*["\'].*["\']',
	                'list_comprehension_opportunity': r'for\s+.*:\s*\n\s*.*\.append\(',
	                'inefficient_contains': r'if\s+.*\s+in\s+.*list\(',
	            },
	            'javascript': {
	                'nested_loops': r'for\s*\([^)]*\)\s*\{\s*for\s*\(',
	                'dom_in_loop': r'for\s*\([^)]*\)\s*\{[^}]*document\.(getElementById|querySelector)',
	                'string_concatenation': r'["\'].*["\']\s*\+\s*["\'].*["\']',
	                'inefficient_array_method': r'\.(forEach|map|filter)\s*\([^)]*\)\s*\.(forEach|map|filter)',
	            },
	        }

	        # Best practice checkers (mix of regex patterns and callables that
	        # take the code string and return a list of findings)
	        self.best_practices = {
	            'python': {
	                'missing_docstring': self._check_missing_docstring,
	                'long_functions': self._check_long_functions,
	                'naming_convention': self._check_python_naming,
	                'unused_variables': self._check_unused_variables,
	            },
	            'javascript': {
	                'var_usage': r'\bvar\s+',
	                'missing_semicolon': r'[^;]\s*\n\s*(let|const|return|if|for|while)',
	                'console_log': r'console\.(log|error|warn)\(',
	                'naming_convention': self._check_js_naming,
	            },
	        }

	    def analyze_code(self, code: str, language: str) -> Dict[str, List[Dict]]:
	        """
	        Run every check for `language` on `code`.

	        Args:
	            code: Source code string to analyze
	            language: Programming language key ('python' or 'javascript');
	                any other key yields no pattern or checker findings

	        Returns:
	            Dictionary with categorized findings under the keys 'security',
	            'performance', 'best_practices' and 'ai_suggestions'
	        """
	        results = {
	            'security': [],
	            'performance': [],
	            'best_practices': [],
	            'ai_suggestions': [],
	        }

	        # Security: case-insensitive search, at most one finding per pattern.
	        for issue_name, pattern in self.security_patterns.get(language, {}).items():
	            if not isinstance(pattern, str):
	                continue
	            match = re.search(pattern, code, re.IGNORECASE)
	            if match:
	                severity = 'high' if issue_name in self.HIGH_SEVERITY_SECURITY else 'medium'
	                results['security'].append({
	                    'type': issue_name.replace('_', ' ').title(),
	                    'severity': severity,
	                    'message': self._get_security_message(issue_name),
	                    # Use the match that triggered the finding, so the line
	                    # always agrees with the flags used for detection.
	                    'line': self._line_of(code, match.start()),
	                })

	        # Performance
	        for issue_name, pattern in self.performance_patterns.get(language, {}).items():
	            if not isinstance(pattern, str):
	                continue
	            match = re.search(pattern, code, re.MULTILINE)
	            if match:
	                results['performance'].append({
	                    'type': issue_name.replace('_', ' ').title(),
	                    'severity': 'medium',
	                    'message': self._get_performance_message(issue_name),
	                    'line': self._line_of(code, match.start()),
	                })

	        # Best practices: each entry is either a checker callable or a regex.
	        for issue_name, checker in self.best_practices.get(language, {}).items():
	            if callable(checker):
	                results['best_practices'].extend(checker(code))
	            elif isinstance(checker, str):
	                match = re.search(checker, code)
	                if match:
	                    results['best_practices'].append({
	                        'type': issue_name.replace('_', ' ').title(),
	                        'severity': 'low',
	                        'message': self._get_best_practice_message(issue_name),
	                        'line': self._line_of(code, match.start()),
	                    })

	        # Heuristic / model-assisted suggestions
	        ai_suggestions = self._get_ai_suggestions(code, language)
	        if ai_suggestions:
	            results['ai_suggestions'] = ai_suggestions

	        return results

	    @staticmethod
	    def _line_of(code: str, position: int) -> int:
	        """Return the 1-based line number of character offset `position` in `code`."""
	        return code.count('\n', 0, position) + 1

	    @staticmethod
	    def _parse_python(code: str):
	        """Return the AST of `code`, or None when it cannot be parsed.

	        The AST-based checks are best-effort: unparsable input simply yields
	        no findings instead of failing the whole analysis.
	        """
	        try:
	            return ast.parse(code)
	        except (SyntaxError, ValueError, RecursionError, MemoryError):
	            return None

	    def _check_missing_docstring(self, code: str) -> List[Dict]:
	        """Report Python functions and classes that have no docstring."""
	        tree = self._parse_python(code)
	        if tree is None:
	            return []

	        documented_kinds = self._FUNCTION_NODES + (ast.ClassDef,)
	        issues = []
	        for node in ast.walk(tree):
	            if isinstance(node, documented_kinds) and not ast.get_docstring(node):
	                issues.append({
	                    'type': 'Missing Docstring',
	                    'severity': 'low',
	                    'message': f'Function/Class "{node.name}" should have a docstring explaining its purpose',
	                    'line': node.lineno,
	                })
	        return issues

	    def _check_long_functions(self, code: str) -> List[Dict]:
	        """Report Python functions spanning more than MAX_FUNCTION_LINES source lines."""
	        tree = self._parse_python(code)
	        if tree is None:
	            return []

	        issues = []
	        for node in ast.walk(tree):
	            if not isinstance(node, self._FUNCTION_NODES):
	                continue
	            # Both bounds are inclusive, hence the +1.
	            func_lines = node.end_lineno - node.lineno + 1
	            if func_lines > self.MAX_FUNCTION_LINES:
	                issues.append({
	                    'type': 'Long Function',
	                    'severity': 'medium',
	                    'message': f'Function "{node.name}" is {func_lines} lines long. Consider breaking it into smaller functions.',
	                    'line': node.lineno,
	                })
	        return issues

	    def _check_python_naming(self, code: str) -> List[Dict]:
	        """Report camelCase names on the left of an assignment (Python prefers snake_case)."""
	        # `=(?!=)` keeps comparisons such as `myValue == 3` from being reported.
	        camel_case_pattern = r'\b[a-z]+[A-Z]\w*\s*=(?!=)'
	        return [
	            {
	                'type': 'Naming Convention',
	                'severity': 'low',
	                'message': 'Use snake_case for variable names in Python (e.g., my_variable instead of myVariable)',
	                'line': self._line_of(code, match.start()),
	            }
	            for match in re.finditer(camel_case_pattern, code)
	        ]

	    def _check_js_naming(self, code: str) -> List[Dict]:
	        """Report snake_case variable declarations (JavaScript prefers camelCase)."""
	        # One or more `_segment` groups, so `my_long_name` is caught as well.
	        snake_case_pattern = r'\b(?:let|const|var)\s+[a-z]+(?:_[a-z0-9]+)+\s*='
	        return [
	            {
	                'type': 'Naming Convention',
	                'severity': 'low',
	                'message': 'Use camelCase for variable names in JavaScript (e.g., myVariable instead of my_variable)',
	                'line': self._line_of(code, match.start()),
	            }
	            for match in re.finditer(snake_case_pattern, code)
	        ]

	    def _check_unused_variables(self, code: str) -> List[Dict]:
	        """
	        Report names that are assigned with a plain `name = ...` statement but
	        never read anywhere in the snippet.

	        The comparison is by name only (no scope analysis), so this is a
	        heuristic. Findings are ordered by the line of the first assignment.
	        """
	        tree = self._parse_python(code)
	        if tree is None:
	            return []

	        first_assignment = {}  # name -> earliest line where it is assigned
	        used_vars = set()
	        for node in ast.walk(tree):
	            if isinstance(node, ast.Assign):
	                for target in node.targets:
	                    if isinstance(target, ast.Name):
	                        earliest = first_assignment.get(target.id, target.lineno)
	                        first_assignment[target.id] = min(earliest, target.lineno)
	            elif isinstance(node, ast.Name) and isinstance(node.ctx, ast.Load):
	                used_vars.add(node.id)

	        unused = [
	            (line, name)
	            for name, line in first_assignment.items()
	            if name not in used_vars and name != '_'  # `_` is the throwaway name
	        ]
	        return [
	            {
	                'type': 'Unused Variable',
	                'severity': 'low',
	                'message': f'Variable "{name}" is defined but never used',
	                'line': line,
	            }
	            for line, name in sorted(unused)
	        ]

	    def _find_line_number(self, code: str, pattern: str, flags: int = re.MULTILINE | re.IGNORECASE) -> int:
	        """Return the 1-based line of the first match of `pattern` in `code`, or 0 if none."""
	        match = re.search(pattern, code, flags)
	        if match:
	            return self._line_of(code, match.start())
	        return 0

	    def _get_security_message(self, issue_type: str) -> str:
	        """Return the explanation for a security issue key (generic text for unknown keys)."""
	        messages = {
	            'sql_injection': 'Potential SQL injection vulnerability. Use parameterized queries or prepared statements instead of string concatenation.',
	            'command_injection': 'Potential command injection. Never use user input directly in system commands. Sanitize and validate all inputs.',
	            'eval_usage': 'Using eval() is dangerous and can lead to code injection. Consider using ast.literal_eval() or alternative approaches.',
	            'pickle_usage': 'Pickle can execute arbitrary code during deserialization. Only unpickle data from trusted sources.',
	            'hardcoded_secrets': 'Hardcoded credentials detected. Use environment variables or secure configuration management.',
	            'weak_random': 'Using non-cryptographic randomness for security. Use secrets module for tokens, passwords, or security-sensitive operations.',
	            'innerHTML_xss': 'Setting innerHTML with user data can lead to XSS attacks. Use textContent or properly sanitize input.',
	            'weak_comparison': 'Use === instead of == to avoid JavaScript type coercion issues.',
	            'unsafe_regex': 'Dynamic regex creation can lead to ReDoS attacks. Validate and escape user input carefully.',
	        }
	        return messages.get(issue_type, 'Security issue detected.')

	    def _get_performance_message(self, issue_type: str) -> str:
	        """Return the explanation for a performance issue key (generic text for unknown keys)."""
	        messages = {
	            'nested_loops': 'Nested loops can have O(n²) complexity. Consider using more efficient algorithms or data structures like sets or dictionaries.',
	            'string_concatenation': 'String concatenation in loops is inefficient. Use join() in Python or template literals in JavaScript.',
	            'list_comprehension_opportunity': 'This loop pattern could be replaced with a more efficient and readable list comprehension.',
	            'inefficient_contains': 'Checking membership in a list is O(n). Consider using a set for O(1) lookups if checking multiple times.',
	            'dom_in_loop': 'DOM manipulation inside loops causes reflows. Batch operations or use DocumentFragment for better performance.',
	            'inefficient_array_method': 'Chaining array methods creates intermediate arrays. Consider combining operations or using a single reduce().',
	        }
	        return messages.get(issue_type, 'Performance issue detected.')

	    def _get_best_practice_message(self, issue_type: str) -> str:
	        """Return the explanation for a best-practice key (generic text for unknown keys)."""
	        messages = {
	            'var_usage': 'Use let or const instead of var for block scoping and to prevent hoisting issues.',
	            'missing_semicolon': 'Missing semicolon. While JavaScript has ASI, explicit semicolons prevent potential errors.',
	            'console_log': 'Remove console statements before production deployment or use a proper logging library.',
	        }
	        return messages.get(issue_type, 'Best practice violation detected.')

	    @staticmethod
	    def _info(suggestion_type: str, message: str) -> Dict:
	        """Build an 'info'-severity suggestion without a specific line."""
	        return {'type': suggestion_type, 'severity': 'info', 'message': message, 'line': 0}

	    def _summarize_with_codet5(self, code: str, language: str) -> str:
	        """
	        Ask the module-level CodeT5 model for a short summary of `code`.

	        Best-effort: any failure (model not loaded, generation error) is
	        logged and an empty string is returned, so the heuristic suggestions
	        are never lost because of the model.
	        """
	        try:
	            prompt = f"summarize {language}: {code[:200]}"
	            inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True).to(device)
	            with torch.no_grad():
	                # Beam search is deterministic; `temperature` only applies
	                # when sampling is enabled, so it is not passed here.
	                outputs = model.generate(
	                    inputs.input_ids,
	                    max_length=60,
	                    num_beams=4,
	                    early_stopping=True,
	                )
	            return tokenizer.decode(outputs[0], skip_special_tokens=True)
	        except Exception as e:
	            print(f"AI suggestion generation note: {e}")
	            return ''

	    def _python_suggestions(self, code: str) -> List[Dict]:
	        """Return the Python-specific heuristic suggestions for `code`."""
	        suggestions = []

	        tree = self._parse_python(code)
	        if tree is not None:
	            functions = [node.name for node in ast.walk(tree) if isinstance(node, self._FUNCTION_NODES)]
	            if functions:
	                # NOTE(review): the generated summary is not used by any
	                # suggestion yet; the call is kept so the model is still
	                # exercised as before -- decide whether to surface or drop it.
	                self._summarize_with_codet5(code, 'python')

	                lowered = code.lower()
	                if 'database' in lowered or 'query' in lowered:
	                    suggestions.append(self._info(
	                        'Architecture Suggestion',
	                        'Consider implementing a data access layer or using an ORM like SQLAlchemy to abstract database operations and prevent SQL injection.',
	                    ))

	            if len(functions) > 3:
	                suggestions.append(self._info(
	                    'Code Organization',
	                    f'This code contains {len(functions)} functions. Consider organizing related functions into classes for better code organization.',
	                ))

	        # Text-based checks; these work even when the snippet does not parse.
	        if 'for i in range(len(' in code:
	            suggestions.append(self._info(
	                'Pythonic Code',
	                'Use enumerate() for index-value iteration: for i, item in enumerate(list) instead of range(len())',
	            ))

	        if re.search(r'except\s*:', code):
	            suggestions.append(self._info(
	                'Error Handling',
	                'Avoid bare except clauses. Specify exception types for better error handling.',
	            ))

	        return suggestions

	    def _javascript_suggestions(self, code: str) -> List[Dict]:
	        """Return the JavaScript-specific heuristic suggestions for `code`."""
	        suggestions = []
	        lowered = code.lower()

	        if 'callback' in lowered and 'function' in code:
	            suggestions.append(self._info(
	                'Modern JavaScript',
	                'Consider using Promises or async/await instead of callbacks for better readability and error handling.',
	            ))

	        if code.count('getElementById') > 2:
	            suggestions.append(self._info(
	                'Performance Tip',
	                'Cache DOM references when accessing the same element multiple times to improve performance.',
	            ))

	        # NOTE(review): as in the Python path, the summary is not used yet.
	        self._summarize_with_codet5(code, 'javascript')

	        if 'array' in lowered or 'foreach' in lowered:
	            suggestions.append(self._info(
	                'Functional Programming',
	                'Consider using functional array methods (map, filter, reduce) for cleaner and more expressive code.',
	            ))

	        return suggestions

	    def _general_suggestions(self, code: str) -> List[Dict]:
	        """Return the language-independent readability and complexity suggestions."""
	        suggestions = []

	        # default=0 covers snippets that contain only blank lines.
	        longest_line = max((len(line) for line in code.split('\n') if line.strip()), default=0)
	        if longest_line > 100:
	            suggestions.append(self._info(
	                'Code Readability',
	                'Some lines exceed 100 characters. Consider breaking long lines for better readability.',
	            ))

	        # Count whole keywords only, so `elif` is counted once and
	        # identifiers such as `notify` are not counted at all.
	        conditional_count = len(re.findall(r'\b(?:if|elif|else)\b', code))
	        if conditional_count > 5:
	            suggestions.append(self._info(
	                'Complexity Warning',
	                'High conditional complexity detected. Consider refactoring using early returns or extracting complex logic into separate functions.',
	            ))

	        return suggestions

	    def _get_ai_suggestions(self, code: str, language: str) -> List[Dict]:
	        """
	        Collect high-level improvement suggestions for `code`.

	        Combines language-specific heuristics (which also invoke the CodeT5
	        model) with language-independent readability and complexity checks.
	        Never raises: an unexpected error is logged and the suggestions
	        gathered so far are returned.
	        """
	        suggestions = []

	        try:
	            if language == 'python':
	                suggestions.extend(self._python_suggestions(code))
	            elif language == 'javascript':
	                suggestions.extend(self._javascript_suggestions(code))

	            # General suggestions for both languages
	            suggestions.extend(self._general_suggestions(code))
	        except Exception as e:
	            # Don't fail the entire analysis if suggestion generation fails
	            print(f"AI suggestion generation note: {e}")

	        return suggestions

	def format_results(results: Dict[str, List[Dict]]) -> str:
	    """
	    Render categorized analysis results as a markdown report.

	    `results` must contain the keys 'security', 'performance',
	    'best_practices' and 'ai_suggestions'. Sections appear in that order and
	    empty sections are omitted; a summary footer closes the report. When
	    every category is empty a single success line is returned instead.
	    """
	    if not any(results.values()):
	        return "✅ Excellent! Your code looks great - no significant issues found."

	    report = []

	    def add_located_section(heading, intro, issues, marker_for):
	        """Append a section whose entries carry a line number."""
	        report.extend([heading, intro])
	        for entry in issues:
	            report.append(f"{marker_for(entry)} {entry['type']} (Line {entry['line']})")
	            report.append(f" → {entry['message']}\n")

	    # Security first: it is the highest-priority category
	    security_issues = results['security']
	    if security_issues:
	        add_located_section(
	            "## 🔒 Security Issues",
	            "These require immediate attention:\n",
	            security_issues,
	            lambda entry: "🔴" if entry['severity'] == 'high' else "🟡",
	        )

	    performance_issues = results['performance']
	    if performance_issues:
	        add_located_section(
	            "## ⚡ Performance Issues",
	            "Optimize these for better efficiency:\n",
	            performance_issues,
	            lambda entry: "🟡",
	        )

	    practice_issues = results['best_practices']
	    if practice_issues:
	        add_located_section(
	            "## 📝 Best Practices",
	            "Follow these for cleaner, more maintainable code:\n",
	            practice_issues,
	            lambda entry: "🔵",
	        )

	    # AI suggestions are listed without a line reference
	    ai_items = results['ai_suggestions']
	    if ai_items:
	        report.append("## 🤖 AI-Powered Insights")
	        report.append("Advanced suggestions from CodeT5 analysis:\n")
	        report.extend(f"💡 {item['type']}: {item['message']}\n" for item in ai_items)

	    # Footer: overall count plus a warning when high-severity findings exist
	    total_issues = sum(map(len, results.values()))
	    high_severity = len([item for item in results['security'] if item['severity'] == 'high'])

	    report.append("---")
	    report.append(f"📊 Summary: {total_issues} total suggestions found")
	    if high_severity > 0:
	        report.append(f"⚠️ Critical: {high_severity} high-severity security issues need immediate attention!")

	    return "\n".join(report)

	def analyze_code_handler(code: str, language: str) -> str:
	    """
	    Gradio callback: validate the inputs, run the analysis and format the report.

	    Args:
	        code: Contents of the code editor. An empty editor may be delivered
	            as None or as an empty/whitespace-only string; both are treated
	            as "no code".
	        language: Label chosen in the language selector (e.g. "Python");
	            matched case-insensitively. None is treated as unsupported.

	    Returns:
	        A markdown string: the analysis report, or a short message when the
	        input is empty, the language is unsupported, or the analysis fails.
	    """
	    if not code or not code.strip():
	        return "❗ Please enter some code to analyze."

	    # Validate the language before doing any work
	    language_key = (language or "").lower()
	    if language_key not in ('python', 'javascript'):
	        return "⚠️ Currently supporting Python and JavaScript. More languages coming soon!"

	    try:
	        analyzer = CodeAnalyzer()
	        results = analyzer.analyze_code(code, language_key)
	        return format_results(results)
	    except Exception as e:
	        # UI boundary: report the failure in the output panel instead of raising
	        return f"❌ An error occurred during analysis: {str(e)}\n\nPlease check your code syntax and try again."

	def create_interface():
	    """
	    Build the Gradio Blocks UI for the code review assistant.

	    Layout: a header, a two-column row (code editor, language selector,
	    analyze button and examples on the left; the markdown report on the
	    right) and a footer with usage notes. The analyze button is wired to
	    `analyze_code_handler`.

	    Returns:
	        The configured `gr.Blocks` instance (not yet launched).
	    """

	    # Example snippets demonstrating various issues. They are assembled line
	    # by line so their indentation is explicit: the Python example must be
	    # valid Python, otherwise the AST-based checks silently skip it.
	    python_example = "\n".join([
	        'def process_user_data(user_id):',
	        '    # Fetch user data from database',
	        '    query = "SELECT * FROM users WHERE id = " + user_id',
	        '    result = db.execute(query)',
	        '',
	        '    password = "admin123"',
	        '',
	        '    data = []',
	        '    for row in result:',
	        '        data.append(row)',
	        '',
	        '    # Process each item',
	        '    for i in range(len(data)):',
	        "        if data[i]['status'] == True:",
	        '            print(data[i])',
	        '',
	        '    return data',
	    ])

	    javascript_example = "\n".join([
	        'function fetchUserData(userId) {',
	        '    var apiKey = "sk-1234567890abcdef";',
	        '',
	        '    // Get user element',
	        '    for (var i = 0; i < users.length; i++) {',
	        "        document.getElementById('user-' + i).innerHTML = users[i].name;",
	        '    }',
	        '',
	        '    // Check user status',
	        '    if (userStatus == null) {',
	        '        console.log("User not found");',
	        '    }',
	        '',
	        '    var query = "SELECT * FROM users WHERE id = " + userId;',
	        '    return db.query(query)',
	        '}',
	    ])

	    # Build Gradio interface with custom theme
	    with gr.Blocks(title="AI Code Review Assistant", theme=gr.themes.Soft()) as interface:
	        # Header section
	        gr.Markdown("""
	# 🤖 AI-Powered Code Review Assistant

	Instantly analyze your code for security vulnerabilities, performance issues, and best practices!

	This tool uses advanced pattern matching and the CodeT5 AI model to provide comprehensive code analysis for Python and JavaScript.

	### ✨ Features
	- 🔒 Security Analysis: Detect SQL injection, XSS, hardcoded secrets, and more
	- ⚡ Performance Optimization: Identify inefficient patterns and algorithms
	- 📝 Best Practices: Ensure clean, maintainable code following language conventions
	- 🤖 AI Insights: Get intelligent suggestions powered by CodeT5 transformer model
	""")

	        # Main content area with two columns
	        with gr.Row():
	            # Left column - Input
	            with gr.Column():
	                code_input = gr.Code(
	                    label="📝 Enter your code here",
	                    language="python",
	                    lines=15,
	                    value=python_example
	                )

	                language_select = gr.Radio(
	                    choices=["Python", "JavaScript"],
	                    value="Python",
	                    label="🔤 Select Language"
	                )

	                analyze_btn = gr.Button("🔍 Analyze Code", variant="primary", size="lg")

	                # Example section
	                gr.Examples(
	                    examples=[
	                        [python_example, "Python"],
	                        [javascript_example, "JavaScript"]
	                    ],
	                    inputs=[code_input, language_select],
	                    label="📚 Try These Examples"
	                )

	            # Right column - Output
	            with gr.Column():
	                output = gr.Markdown(
	                    label="📊 Analysis Results",
	                    value="Your analysis results will appear here..."
	                )

	        # Footer with instructions and attribution.
	        # The separators are plain `|`: `\\|` in a normal string literal is an
	        # invalid escape sequence.
	        gr.Markdown("""
	---
	### 🎯 How to Use
	1. Paste your Python or JavaScript code in the editor
	2. Select the appropriate programming language
	3. Click "Analyze Code" to run the analysis
	4. Review the categorized feedback and improve your code!

	### 💡 Tips
	- The tool works best with complete functions or code blocks
	- Line numbers help you quickly locate issues in your code
	- Security issues (🔴) should be fixed immediately
	- Use the AI insights for high-level code improvements

	---
	👨‍💻 Created by Spencer Purdy | Computer Science @ Auburn University
	[GitHub](https://github.com/spencercpurdy) | [LinkedIn](https://linkedin.com/in/spencerpurdy) | [Hugging Face](https://huggingface.co/spencercpurdy)
	""")

	        # Connect the analyze button to the handler function
	        analyze_btn.click(
	            fn=analyze_code_handler,
	            inputs=[code_input, language_select],
	            outputs=output
	        )

	    return interface

	# Script entry point
	if __name__ == "__main__":
	    # Build the UI and start the server right away, in debug mode and with
	    # a public share link enabled.
	    create_interface().launch(debug=True, share=True)