Spaces:

SpencerCPurdy
/

AI-Powered-Code-Review-Assistant

Running

File size: 29,670 Bytes

f795275

# AI-Powered Code Review Assistant
# Author: Spencer Purdy
# Description: An intelligent code review tool that analyzes Python and JavaScript code
# for best practices, security vulnerabilities, and performance improvements using
# CodeT5 and advanced pattern analysis.

# Import required libraries
import subprocess
import sys
import re
import ast
import json
from typing import List, Dict, Tuple

# Install required packages if not already installed
def install_packages():
    """
    Install the third-party dependencies of this app with pip.

    Each package is installed by its own quiet (`-q`) pip invocation, run with
    the interpreter that is executing this script. A failing pip run raises
    subprocess.CalledProcessError and stops the remaining installs.
    """
    pip_install = [sys.executable, '-m', 'pip', 'install']
    for requirement in ('gradio', 'transformers', 'torch', 'sentencepiece'):
        subprocess.check_call(pip_install + [requirement, '-q'])

# Try importing the third-party dependencies; if any of them is missing,
# install the whole dependency list with pip and retry the imports once.
try:
    import gradio as gr
    from transformers import RobertaTokenizer, T5ForConditionalGeneration
    import torch
except ImportError:
    print("Installing required packages...")
    install_packages()
    # Second attempt: an ImportError raised here is not caught and aborts startup.
    import gradio as gr
    from transformers import RobertaTokenizer, T5ForConditionalGeneration
    import torch

# Initialize the CodeT5 model for AI-powered code analysis.
# NOTE: this runs at import time, so merely importing this module loads the
# 'Salesforce/codet5-base' tokenizer and model before anything else happens.
print("Loading CodeT5 model...")
# Use the GPU when torch reports CUDA as available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-base')
model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-base').to(device)
# Put the model in evaluation mode; this file only calls model.generate() on it.
model.eval()
print(f"Model loaded successfully on {device}!")

class CodeAnalyzer:
    """
    Analyze Python and JavaScript source for security, performance and style issues.

    Three rule tables drive the analysis: regex patterns for security and
    performance problems, and a mix of regex patterns and checker methods for
    best practices. A fourth category, "AI suggestions", is produced by keyword
    heuristics plus a CodeT5 summarization call.

    Every reported issue is a dict with the keys 'type', 'severity', 'message'
    and 'line'; a 'line' of 0 means the location is unknown.
    """

    # Security findings reported with 'high' severity; every other one is 'medium'.
    HIGH_SEVERITY_ISSUES = frozenset({'sql_injection', 'command_injection', 'eval_usage'})

    # A function spanning more source lines than this is reported as too long.
    MAX_FUNCTION_LINES = 20

    # AST node types treated as function definitions (sync and async).
    _FUNCTION_NODES = (ast.FunctionDef, ast.AsyncFunctionDef)

    def __init__(self):
        """Initialize analyzer with predefined patterns for security, performance, and best practices"""

        # Security vulnerability patterns for each language
        self.security_patterns = {
            'python': {
                # `[sd]` (not `[s|d]`): inside a character class `|` is a literal pipe.
                'sql_injection': r'(execute|executemany)\s*\(\s*["\'].*%[sd].*["\'].*%',
                'command_injection': r'(os\.system|subprocess\.call|subprocess\.run)\s*\([^)]*\+[^)]*\)',
                'eval_usage': r'\beval\s*\(',
                'pickle_usage': r'pickle\.(load|loads)\s*\(',
                'hardcoded_secrets': r'(password|api_key|secret|token)\s*=\s*["\'][^"\']+["\']',
                'weak_random': r'random\.(random|randint|choice)\s*\(',
            },
            'javascript': {
                'eval_usage': r'\beval\s*\(',
                'innerHTML_xss': r'\.innerHTML\s*=',
                'sql_injection': r'query\s*\(\s*["\'].*\+.*["\']',
                'hardcoded_secrets': r'(password|apiKey|secret|token)\s*=\s*["\'][^"\']+["\']',
                'weak_comparison': r'==\s*(null|undefined)',
                'unsafe_regex': r'new\s+RegExp\s*\([^)]*\+[^)]*\)',
            }
        }

        # Performance issue patterns
        self.performance_patterns = {
            'python': {
                'nested_loops': r'for\s+.*:\s*\n\s*for\s+.*:',
                'string_concatenation': r'["\'].*["\']\s*\+\s*["\'].*["\']',
                'list_comprehension_opportunity': r'for\s+.*:\s*\n\s*.*\.append\(',
                'inefficient_contains': r'if\s+.*\s+in\s+.*list\(',
            },
            'javascript': {
                'nested_loops': r'for\s*\([^)]*\)\s*{\s*for\s*\(',
                'dom_in_loop': r'for\s*\([^)]*\)\s*{[^}]*document\.(getElementById|querySelector)',
                'string_concatenation': r'["\'].*["\']\s*\+\s*["\'].*["\']',
                'inefficient_array_method': r'\.(forEach|map|filter)\s*\([^)]*\)\s*\.(forEach|map|filter)',
            }
        }

        # Best practice checkers (mix of regex patterns and callable functions)
        self.best_practices = {
            'python': {
                'missing_docstring': self._check_missing_docstring,
                'long_functions': self._check_long_functions,
                'naming_convention': self._check_python_naming,
                'unused_variables': self._check_unused_variables,
            },
            'javascript': {
                'var_usage': r'\bvar\s+',
                'missing_semicolon': r'[^;]\s*\n\s*(let|const|return|if|for|while)',
                'console_log': r'console\.(log|error|warn)\(',
                'naming_convention': self._check_js_naming,
            }
        }

    def analyze_code(self, code: str, language: str) -> Dict[str, List[Dict]]:
        """
        Main analysis function that runs all checks on the provided code

        Args:
            code: Source code string to analyze
            language: Programming language ('python' or 'javascript'); any other
                value yields no rule-based findings

        Returns:
            Dictionary with categorized issues: security, performance, best_practices, ai_suggestions
        """
        results = {
            'security': [],
            'performance': [],
            'best_practices': [],
            'ai_suggestions': []
        }

        # Each regex is searched once, and the reported line comes from that same
        # search, so detection and location always use identical flags.

        # Security patterns are matched case-insensitively
        for issue_name, pattern in self.security_patterns.get(language, {}).items():
            if not isinstance(pattern, str):
                continue
            line = self._find_line_number(code, pattern, re.IGNORECASE)
            if line:
                results['security'].append({
                    'type': issue_name.replace('_', ' ').title(),
                    'severity': 'high' if issue_name in self.HIGH_SEVERITY_ISSUES else 'medium',
                    'message': self._get_security_message(issue_name),
                    'line': line
                })

        # Performance patterns are matched case-sensitively
        for issue_name, pattern in self.performance_patterns.get(language, {}).items():
            if not isinstance(pattern, str):
                continue
            line = self._find_line_number(code, pattern, re.MULTILINE)
            if line:
                results['performance'].append({
                    'type': issue_name.replace('_', ' ').title(),
                    'severity': 'medium',
                    'message': self._get_performance_message(issue_name),
                    'line': line
                })

        # Best practices: either a checker method or a plain (flag-less) regex
        for issue_name, checker in self.best_practices.get(language, {}).items():
            if callable(checker):
                results['best_practices'].extend(checker(code))
            elif isinstance(checker, str):
                line = self._find_line_number(code, checker, 0)
                if line:
                    results['best_practices'].append({
                        'type': issue_name.replace('_', ' ').title(),
                        'severity': 'low',
                        'message': self._get_best_practice_message(issue_name),
                        'line': line
                    })

        # Heuristic / CodeT5-based suggestions (always a list, possibly empty)
        results['ai_suggestions'] = self._get_ai_suggestions(code, language)

        return results

    @staticmethod
    def _parse_python(code: str):
        """
        Parse Python source into an AST.

        Returns the module node, or None when the text cannot be parsed
        (invalid syntax, embedded null bytes, or nesting too deep for the parser),
        so AST-based checks can be skipped instead of failing the whole analysis.
        """
        try:
            return ast.parse(code)
        except (SyntaxError, ValueError, RecursionError):
            return None

    def _check_missing_docstring(self, code: str) -> List[Dict]:
        """
        Check Python code for functions and classes missing docstrings
        Uses AST parsing to analyze code structure; returns [] for unparsable code
        """
        tree = self._parse_python(code)
        if tree is None:
            return []

        documentable = self._FUNCTION_NODES + (ast.ClassDef,)
        return [
            {
                'type': 'Missing Docstring',
                'severity': 'low',
                'message': f'Function/Class "{node.name}" should have a docstring explaining its purpose',
                'line': node.lineno
            }
            for node in ast.walk(tree)
            if isinstance(node, documentable) and not ast.get_docstring(node)
        ]

    def _check_long_functions(self, code: str) -> List[Dict]:
        """
        Identify functions that exceed the recommended length (MAX_FUNCTION_LINES)
        Long functions are harder to understand and maintain
        """
        tree = self._parse_python(code)
        if tree is None:
            return []

        issues = []
        for node in ast.walk(tree):
            if not isinstance(node, self._FUNCTION_NODES):
                continue
            # Both line numbers are inclusive, hence the +1.
            func_lines = node.end_lineno - node.lineno + 1
            if func_lines > self.MAX_FUNCTION_LINES:
                issues.append({
                    'type': 'Long Function',
                    'severity': 'medium',
                    'message': f'Function "{node.name}" is {func_lines} lines long. Consider breaking it into smaller functions.',
                    'line': node.lineno
                })
        return issues

    def _check_python_naming(self, code: str) -> List[Dict]:
        """Check for Python naming convention violations (should use snake_case)"""
        # camelCase name followed by a single '='; the lookahead keeps '=='
        # comparisons from being reported as assignments.
        camel_case_pattern = r'\b[a-z]+[A-Z]\w*\s*=(?!=)'
        return [
            {
                'type': 'Naming Convention',
                'severity': 'low',
                'message': 'Use snake_case for variable names in Python (e.g., my_variable instead of myVariable)',
                'line': code.count('\n', 0, match.start()) + 1
            }
            for match in re.finditer(camel_case_pattern, code)
        ]

    def _check_js_naming(self, code: str) -> List[Dict]:
        """Check for JavaScript naming convention violations (should use camelCase)"""
        # Declaration of a lowercase name containing one or more underscores
        # (digits allowed), e.g. `let user_id_2 = ...`.
        snake_case_pattern = r'\b(?:let|const|var)\s+[a-z][a-z0-9]*(?:_[a-z0-9]+)+\s*='
        return [
            {
                'type': 'Naming Convention',
                'severity': 'low',
                'message': 'Use camelCase for variable names in JavaScript (e.g., myVariable instead of my_variable)',
                'line': code.count('\n', 0, match.start()) + 1
            }
            for match in re.finditer(snake_case_pattern, code)
        ]

    def _check_unused_variables(self, code: str) -> List[Dict]:
        """
        Detect variables that are assigned but never used in Python code
        Unused variables can indicate dead code or incomplete refactoring

        Only simple `name = value` assignments are considered, and usage is
        checked by name across the whole source (scopes are not distinguished).
        Issues are ordered by the line of the variable's first assignment.
        """
        tree = self._parse_python(code)
        if tree is None:
            return []

        first_assignment = {}  # variable name -> lowest line where it is assigned
        used_vars = set()

        # Walk AST to find assignments and variable usage
        for node in ast.walk(tree):
            if isinstance(node, ast.Assign):
                for target in node.targets:
                    if isinstance(target, ast.Name):
                        known = first_assignment.get(target.id, node.lineno)
                        first_assignment[target.id] = min(known, node.lineno)
            elif isinstance(node, ast.Name) and isinstance(node.ctx, ast.Load):
                used_vars.add(node.id)

        # '_' is the conventional throwaway name and is never reported
        unused = [
            (line, name)
            for name, line in first_assignment.items()
            if name != '_' and name not in used_vars
        ]
        return [
            {
                'type': 'Unused Variable',
                'severity': 'low',
                'message': f'Variable "{name}" is defined but never used',
                'line': line
            }
            for line, name in sorted(unused)
        ]

    def _find_line_number(self, code: str, pattern: str, flags: int = re.MULTILINE | re.IGNORECASE) -> int:
        """
        Find the line number where a regex pattern first matches

        Args:
            code: Text to search
            pattern: Regular expression source
            flags: `re` flags for the search; pass the same flags that were used
                to detect the issue so both agree on the match

        Returns:
            1-based line number of the first match, or 0 when nothing matches
        """
        match = re.search(pattern, code, flags)
        if match:
            return code.count('\n', 0, match.start()) + 1
        return 0

    def _get_security_message(self, issue_type: str) -> str:
        """Return detailed explanation for security issues"""
        messages = {
            'sql_injection': 'Potential SQL injection vulnerability. Use parameterized queries or prepared statements instead of string concatenation.',
            'command_injection': 'Potential command injection. Never use user input directly in system commands. Sanitize and validate all inputs.',
            'eval_usage': 'Using eval() is dangerous and can lead to code injection. Consider using ast.literal_eval() or alternative approaches.',
            'pickle_usage': 'Pickle can execute arbitrary code during deserialization. Only unpickle data from trusted sources.',
            'hardcoded_secrets': 'Hardcoded credentials detected. Use environment variables or secure configuration management.',
            'weak_random': 'Using non-cryptographic randomness for security. Use secrets module for tokens, passwords, or security-sensitive operations.',
            'innerHTML_xss': 'Setting innerHTML with user data can lead to XSS attacks. Use textContent or properly sanitize input.',
            'weak_comparison': 'Use === instead of == to avoid JavaScript type coercion issues.',
            'unsafe_regex': 'Dynamic regex creation can lead to ReDoS attacks. Validate and escape user input carefully.',
        }
        return messages.get(issue_type, 'Security issue detected.')

    def _get_performance_message(self, issue_type: str) -> str:
        """Return detailed explanation for performance issues"""
        messages = {
            'nested_loops': 'Nested loops can have O(n²) complexity. Consider using more efficient algorithms or data structures like sets or dictionaries.',
            'string_concatenation': 'String concatenation in loops is inefficient. Use join() in Python or template literals in JavaScript.',
            'list_comprehension_opportunity': 'This loop pattern could be replaced with a more efficient and readable list comprehension.',
            'inefficient_contains': 'Checking membership in a list is O(n). Consider using a set for O(1) lookups if checking multiple times.',
            'dom_in_loop': 'DOM manipulation inside loops causes reflows. Batch operations or use DocumentFragment for better performance.',
            'inefficient_array_method': 'Chaining array methods creates intermediate arrays. Consider combining operations or using a single reduce().',
        }
        return messages.get(issue_type, 'Performance issue detected.')

    def _get_best_practice_message(self, issue_type: str) -> str:
        """Return detailed explanation for best practice violations"""
        messages = {
            'var_usage': 'Use let or const instead of var for block scoping and to prevent hoisting issues.',
            'missing_semicolon': 'Missing semicolon. While JavaScript has ASI, explicit semicolons prevent potential errors.',
            'console_log': 'Remove console statements before production deployment or use a proper logging library.',
        }
        return messages.get(issue_type, 'Best practice violation detected.')

    @staticmethod
    def _suggestion(kind: str, message: str) -> Dict:
        """Build one informational suggestion entry (no specific line)."""
        return {'type': kind, 'severity': 'info', 'message': message, 'line': 0}

    def _summarize_with_codet5(self, code: str, language: str) -> str:
        """
        Ask the module-level CodeT5 model to summarize the first 200 characters of code.

        Best effort: any failure (model not loaded, inference error) is logged
        with print() and an empty string is returned.
        """
        try:
            prompt = f"summarize {language}: {code[:200]}"
            inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True).to(device)

            with torch.no_grad():
                outputs = model.generate(
                    inputs.input_ids,
                    max_length=60,
                    num_beams=4,
                    early_stopping=True,
                    temperature=0.7
                )

            return tokenizer.decode(outputs[0], skip_special_tokens=True)
        except Exception as e:
            print(f"AI suggestion generation note: {e}")
            return ""

    def _python_suggestions(self, code: str) -> List[Dict]:
        """Collect the Python-specific suggestions (structure-based first, then textual)."""
        suggestions = []

        tree = self._parse_python(code)
        if tree is not None:
            functions = [node.name for node in ast.walk(tree) if isinstance(node, self._FUNCTION_NODES)]
            if functions:
                # NOTE(review): the generated summary is not used by any suggestion
                # below; the call is kept only to preserve the existing behaviour.
                self._summarize_with_codet5(code, 'python')

                lowered = code.lower()
                if 'database' in lowered or 'query' in lowered:
                    suggestions.append(self._suggestion(
                        'Architecture Suggestion',
                        'Consider implementing a data access layer or using an ORM like SQLAlchemy to abstract database operations and prevent SQL injection.'
                    ))

                if len(functions) > 3:
                    suggestions.append(self._suggestion(
                        'Code Organization',
                        f'This code contains {len(functions)} functions. Consider organizing related functions into classes for better code organization.'
                    ))

        # Textual checks run even when the source could not be parsed
        if 'for i in range(len(' in code:
            suggestions.append(self._suggestion(
                'Pythonic Code',
                'Use enumerate() for index-value iteration: for i, item in enumerate(list) instead of range(len())'
            ))

        if re.search(r'except\s*:', code):
            suggestions.append(self._suggestion(
                'Error Handling',
                'Avoid bare except clauses. Specify exception types for better error handling.'
            ))

        return suggestions

    def _javascript_suggestions(self, code: str) -> List[Dict]:
        """Collect the JavaScript-specific suggestions."""
        suggestions = []
        lowered = code.lower()

        if 'callback' in lowered and 'function' in code:
            suggestions.append(self._suggestion(
                'Modern JavaScript',
                'Consider using Promises or async/await instead of callbacks for better readability and error handling.'
            ))

        if code.count('getElementById') > 2:
            suggestions.append(self._suggestion(
                'Performance Tip',
                'Cache DOM references when accessing the same element multiple times to improve performance.'
            ))

        # NOTE(review): as in the Python path, the summary itself is not used.
        self._summarize_with_codet5(code, 'javascript')

        if 'array' in lowered or 'foreach' in lowered:
            suggestions.append(self._suggestion(
                'Functional Programming',
                'Consider using functional array methods (map, filter, reduce) for cleaner and more expressive code.'
            ))

        return suggestions

    def _general_suggestions(self, code: str) -> List[Dict]:
        """Collect the language-independent readability and complexity suggestions."""
        suggestions = []

        # default=0 keeps whitespace-only input from raising on an empty max()
        longest = max((len(line) for line in code.split('\n') if line.strip()), default=0)
        if longest > 100:
            suggestions.append(self._suggestion(
                'Code Readability',
                'Some lines exceed 100 characters. Consider breaking long lines for better readability.'
            ))

        # Count whole keywords only, so identifiers such as "notify" or "diff"
        # do not count and "elif" is counted once.
        if len(re.findall(r'\b(?:if|elif|else)\b', code)) > 5:
            suggestions.append(self._suggestion(
                'Complexity Warning',
                'High conditional complexity detected. Consider refactoring using early returns or extracting complex logic into separate functions.'
            ))

        return suggestions

    def _get_ai_suggestions(self, code: str, language: str) -> List[Dict]:
        """
        Generate high-level code improvement suggestions

        The suggestions come from keyword and structure heuristics; the CodeT5
        model is invoked along the way, but a model failure never removes the
        heuristic suggestions.

        Args:
            code: Source code string to analyze
            language: 'python' or 'javascript'; other values only get the
                language-independent suggestions

        Returns:
            List of suggestion dicts with severity 'info' and line 0
        """
        suggestions = []

        try:
            if language == 'python':
                suggestions.extend(self._python_suggestions(code))
            elif language == 'javascript':
                suggestions.extend(self._javascript_suggestions(code))

            # General suggestions for both languages
            suggestions.extend(self._general_suggestions(code))
        except Exception as e:
            # Don't fail the entire analysis if AI suggestions fail
            print(f"AI suggestion generation note: {e}")

        return suggestions

def format_results(results: Dict[str, List[Dict]]) -> str:
    """
    Format analysis results into a readable markdown report
    Uses emojis and formatting for clear visual hierarchy

    Args:
        results: Mapping of category name ('security', 'performance',
            'best_practices', 'ai_suggestions') to a list of issue dicts with
            the keys 'type', 'severity', 'message' and 'line'. Missing or empty
            categories are simply skipped.

    Returns:
        Markdown text: one section per non-empty category followed by a summary,
        or a single success line when there are no issues at all.
    """
    if not any(results.values()):
        return "✅ **Excellent!** Your code looks great - no significant issues found."

    def location(issue: Dict) -> str:
        # A line of 0 (or a missing line) means "unknown", so no location is shown
        # instead of a misleading "(Line 0)".
        line = issue.get('line')
        return f" (Line {line})" if line else ""

    # (category key, heading, intro line, default bullet marker) in display order;
    # security comes first because it has the highest priority.
    sections = (
        ('security', "## 🔒 Security Issues", "*These require immediate attention:*\n", "🟡"),
        ('performance', "## ⚡ Performance Issues", "*Optimize these for better efficiency:*\n", "🟡"),
        ('best_practices', "## 📝 Best Practices", "*Follow these for cleaner, more maintainable code:*\n", "🔵"),
    )

    output = []

    for key, heading, intro, default_marker in sections:
        issues = results.get(key) or []
        if not issues:
            continue
        output.append(heading)
        output.append(intro)
        for issue in issues:
            # Only high-severity security findings get the red marker
            is_critical = key == 'security' and issue['severity'] == 'high'
            marker = "🔴" if is_critical else default_marker
            output.append(f"{marker} **{issue['type']}**{location(issue)}")
            output.append(f"   → {issue['message']}\n")

    # AI suggestions section (never carries a line number)
    ai_suggestions = results.get('ai_suggestions') or []
    if ai_suggestions:
        output.append("## 🤖 AI-Powered Insights")
        output.append("*Advanced suggestions from CodeT5 analysis:*\n")
        for suggestion in ai_suggestions:
            output.append(f"💡 **{suggestion['type']}**: {suggestion['message']}\n")

    # Summary statistics
    total_issues = sum(len(v) for v in results.values() if v)
    high_severity = sum(1 for v in results.get('security') or [] if v['severity'] == 'high')

    output.append("---")
    output.append(f"**📊 Summary**: {total_issues} total suggestions found")
    if high_severity > 0:
        output.append(f"**⚠️ Critical**: {high_severity} high-severity security issues need immediate attention!")

    return "\n".join(output)

def analyze_code_handler(code: str, language: str) -> str:
    """
    Main handler function called by Gradio interface
    Coordinates the analysis and formats results for display

    Args:
        code: Source text from the editor; None, empty or whitespace-only text
            is rejected with a prompt to enter code
        language: Language label from the UI (matched case-insensitively);
            anything other than Python or JavaScript is rejected

    Returns:
        Markdown report, or a one-line message describing why no analysis ran.
        This function never raises: analysis errors are returned as text.
    """
    # Guard against None as well as blank input
    if not code or not code.strip():
        return "❗ Please enter some code to analyze."

    # Validate language selection before doing any work
    language_key = (language or "").lower()
    if language_key not in ('python', 'javascript'):
        return "⚠️ Currently supporting Python and JavaScript. More languages coming soon!"

    try:
        # Create the analyzer inside the guard so a setup failure is reported too
        analyzer = CodeAnalyzer()
        results = analyzer.analyze_code(code, language_key)
        # Format and return results
        return format_results(results)
    except Exception as e:
        return f"❌ An error occurred during analysis: {str(e)}\n\nPlease check your code syntax and try again."

def create_interface():
    """
    Create and configure the Gradio web interface
    Provides an intuitive UI for code analysis with examples

    The layout is a header, a two-column row (code editor, language selector,
    analyze button and examples on the left; the results panel on the right)
    and a footer. Clicking the analyze button calls analyze_code_handler with
    the editor content and the selected language.

    Returns:
        The gr.Blocks interface, built but not yet launched.
    """
    
    # Example code snippets demonstrating various issues
    # (string-built SQL, a hardcoded password, an append loop, range(len(...)))
    python_example = '''def process_user_data(user_id):
    # Fetch user data from database
    query = "SELECT * FROM users WHERE id = " + user_id
    result = db.execute(query)
    
    password = "admin123"
    
    data = []
    for row in result:
        data.append(row)
    
    # Process each item
    for i in range(len(data)):
        if data[i]['status'] == True:
            print(data[i])
    
    return data'''
    
    # JavaScript counterpart (var usage, hardcoded API key, innerHTML in a loop,
    # == null comparison, console.log, string-built SQL)
    javascript_example = '''function fetchUserData(userId) {
    var apiKey = "sk-1234567890abcdef";
    
    // Get user element
    for (var i = 0; i < users.length; i++) {
        document.getElementById('user-' + i).innerHTML = users[i].name;
    }
    
    // Check user status
    if (userStatus == null) {
        console.log("User not found");
    }
    
    var query = "SELECT * FROM users WHERE id = " + userId;
    return db.query(query)
}'''
    
    # Build Gradio interface with custom theme
    with gr.Blocks(title="AI Code Review Assistant", theme=gr.themes.Soft()) as interface:
        # Header section
        gr.Markdown("""
        # 🤖 AI-Powered Code Review Assistant
        
        **Instantly analyze your code for security vulnerabilities, performance issues, and best practices!**
        
        This tool uses advanced pattern matching and the CodeT5 AI model to provide comprehensive code analysis for Python and JavaScript.
        
        ### ✨ Features
        - 🔒 **Security Analysis**: Detect SQL injection, XSS, hardcoded secrets, and more
        - ⚡ **Performance Optimization**: Identify inefficient patterns and algorithms
        - 📝 **Best Practices**: Ensure clean, maintainable code following language conventions
        - 🤖 **AI Insights**: Get intelligent suggestions powered by CodeT5 transformer model
        """)
        
        # Main content area with two columns
        with gr.Row():
            # Left column - Input
            with gr.Column():
                # Editor is pre-filled with the Python example
                code_input = gr.Code(
                    label="📝 Enter your code here",
                    language="python",
                    lines=15,
                    value=python_example
                )
                
                # The selected label is lower-cased by analyze_code_handler
                language_select = gr.Radio(
                    choices=["Python", "JavaScript"],
                    value="Python",
                    label="🔤 Select Language"
                )
                
                analyze_btn = gr.Button("🔍 Analyze Code", variant="primary", size="lg")
                
                # Example section: each example fills both the editor and the language selector
                gr.Examples(
                    examples=[
                        [python_example, "Python"],
                        [javascript_example, "JavaScript"]
                    ],
                    inputs=[code_input, language_select],
                    label="📚 Try These Examples"
                )
            
            # Right column - Output
            with gr.Column():
                # Markdown panel that receives the report from analyze_code_handler
                output = gr.Markdown(
                    label="📊 Analysis Results", 
                    value="*Your analysis results will appear here...*"
                )
        
        # Footer with instructions and attribution
        gr.Markdown("""
        ---
        ### 🎯 How to Use
        1. **Paste** your Python or JavaScript code in the editor
        2. **Select** the appropriate programming language
        3. **Click** "Analyze Code" to run the analysis
        4. **Review** the categorized feedback and improve your code!
        
        ### 💡 Tips
        - The tool works best with complete functions or code blocks
        - Line numbers help you quickly locate issues in your code
        - Security issues (🔴) should be fixed immediately
        - Use the AI insights for high-level code improvements
        
        ---
        👨‍💻 **Created by Spencer Purdy** | Computer Science @ Auburn University  
        [GitHub](https://github.com/spencercpurdy) | [LinkedIn](https://linkedin.com/in/spencerpurdy) | [Hugging Face](https://huggingface.co/spencercpurdy)
        """)
        
        # Connect the analyze button to the handler function
        analyze_btn.click(
            fn=analyze_code_handler,
            inputs=[code_input, language_select],
            outputs=output
        )
    
    return interface

# Main execution block: runs only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    # Create and launch the Gradio interface
    interface = create_interface()
    
    # Launch with sharing enabled for easy access
    # (debug and share are passed straight to Gradio's launch()).
    interface.launch(debug=True, share=True)