Spaces:

Girinath11
/

jina-code-debugger

Running

File size: 15,833 Bytes

import os
import threading
# model_wrapper.py - Enhanced version with better debugging
from transformers import (
    AutoTokenizer, 
    AutoModel, 
    AutoModelForSeq2SeqLM, 
    AutoModelForCausalLM,
    AutoConfig,
    pipeline
)
import torch
import logging
import os
import traceback

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class CodeDebuggerWrapper:
    def __init__(self, model_name="Girinath11/aiml_code_debug_model"):
        self.model_name = model_name
        self.model = None
        self.tokenizer = None
        self.model_type = None
        self.pipeline = None
        self._ensure_model()
    
    def _log_system_info(self):
        """Log system information for debugging."""
        logger.info(f"Python version: {os.sys.version}")
        logger.info(f"PyTorch version: {torch.__version__}")
        try:
            import transformers
            logger.info(f"Transformers version: {transformers.__version__}")
        except:
            logger.warning("Could not get transformers version")
    
    def _ensure_model(self):
        """Load model and tokenizer with comprehensive fallback strategies."""
        logger.info(f"Starting model loading process for {self.model_name}")
        self._log_system_info()
        
        try:
            # First, let's inspect the model configuration
            logger.info("Step 1: Inspecting model configuration...")
            config = AutoConfig.from_pretrained(self.model_name)
            logger.info(f"Model architecture: {config.architectures}")
            logger.info(f"Model type: {config.model_type}")
            logger.info(f"Config class: {type(config).__name__}")
            
            # Load tokenizer
            logger.info("Step 2: Loading tokenizer...")
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.model_name,
                trust_remote_code=True,
                use_fast=False  # Sometimes fast tokenizers cause issues
            )
            
            # Add special tokens if missing
            if self.tokenizer.pad_token is None:
                if self.tokenizer.eos_token is not None:
                    self.tokenizer.pad_token = self.tokenizer.eos_token
                else:
                    self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
            
            logger.info("✅ Tokenizer loaded successfully")
            logger.info(f"Vocab size: {len(self.tokenizer)}")
            logger.info(f"Special tokens: pad={self.tokenizer.pad_token}, eos={self.tokenizer.eos_token}")
            
            # Try loading with pipeline first (often more robust)
            logger.info("Step 3: Attempting pipeline loading...")
            pipeline_strategies = [
                ("text2text-generation", lambda: pipeline(
                    "text2text-generation",
                    model=self.model_name,
                    tokenizer=self.tokenizer,
                    trust_remote_code=True,
                    device=-1  # CPU
                )),
                ("text-generation", lambda: pipeline(
                    "text-generation",
                    model=self.model_name,
                    tokenizer=self.tokenizer,
                    trust_remote_code=True,
                    device=-1
                )),
            ]
            
            for pipe_type, pipe_func in pipeline_strategies:
                try:
                    logger.info(f"Trying {pipe_type} pipeline...")
                    self.pipeline = pipe_func()
                    logger.info(f"✅ Successfully loaded {pipe_type} pipeline")
                    self.model_type = f"{pipe_type}_pipeline"
                    return  # Success!
                except Exception as e:
                    logger.warning(f"❌ {pipe_type} pipeline failed: {str(e)[:200]}...")
            
            # If pipeline fails, try direct model loading
            logger.info("Step 4: Attempting direct model loading...")
            loading_strategies = [
                # Strategy 1: Based on config type, try the most appropriate loader
                ("Config-based AutoModel", lambda: self._load_based_on_config(config)),
                
                # Strategy 2: Force different model types with trust_remote_code
                ("AutoModel + trust_remote_code", lambda: AutoModel.from_pretrained(
                    self.model_name, 
                    trust_remote_code=True,
                    torch_dtype=torch.float32,
                    device_map="cpu"
                )),
                
                ("AutoModelForCausalLM + trust_remote_code", lambda: AutoModelForCausalLM.from_pretrained(
                    self.model_name, 
                    trust_remote_code=True,
                    torch_dtype=torch.float32,
                    device_map="cpu"
                )),
                
                ("AutoModelForSeq2SeqLM + trust_remote_code + ignore_mismatched", lambda: AutoModelForSeq2SeqLM.from_pretrained(
                    self.model_name, 
                    trust_remote_code=True,
                    torch_dtype=torch.float32,
                    ignore_mismatched_sizes=True,
                    device_map="cpu"
                )),
                
                # Strategy 3: Try without trust_remote_code but with other options
                ("AutoModel + low_cpu_mem", lambda: AutoModel.from_pretrained(
                    self.model_name,
                    torch_dtype=torch.float32,
                    low_cpu_mem_usage=True,
                    device_map="cpu"
                )),
            ]
            
            for strategy_name, strategy_func in loading_strategies:
                try:
                    logger.info(f"Trying: {strategy_name}")
                    self.model = strategy_func()
                    self.model_type = type(self.model).__name__
                    logger.info(f"✅ Successfully loaded model with {strategy_name}")
                    logger.info(f"Model type: {self.model_type}")
                    
                    # Set to eval mode
                    if hasattr(self.model, 'eval'):
                        self.model.eval()
                    
                    return  # Success!
                    
                except Exception as e:
                    logger.warning(f"❌ {strategy_name} failed: {str(e)[:200]}...")
                    logger.debug(f"Full error: {traceback.format_exc()}")
            
            # If we get here, all strategies failed
            raise RuntimeError("❌ All model loading strategies failed")
            
        except Exception as e:
            logger.error(f"❌ Critical error in model loading: {e}")
            logger.error(f"Full traceback: {traceback.format_exc()}")
            raise
    
    def _load_based_on_config(self, config):
        """Try to load model based on its configuration type."""
        config_type = type(config).__name__
        
        if "T5" in config_type or "Seq2Seq" in config_type:
            return AutoModelForSeq2SeqLM.from_pretrained(
                self.model_name, 
                trust_remote_code=True,
                config=config
            )
        elif "GPT" in config_type or "Causal" in config_type:
            return AutoModelForCausalLM.from_pretrained(
                self.model_name,
                trust_remote_code=True,
                config=config
            )
        else:
            return AutoModel.from_pretrained(
                self.model_name,
                trust_remote_code=True,
                config=config
            )
    
    def debug(self, code: str) -> str:
        """Debug the provided code using the loaded model."""
        if not code or not code.strip():
            return "❌ Please provide some code to debug."
        
        try:
            # Use pipeline if available (more robust)
            if self.pipeline is not None:
                return self._debug_with_pipeline(code)
            
            # Use direct model if pipeline not available
            if self.model is not None:
                return self._debug_with_model(code)
            
            # Fallback: provide manual debugging suggestions
            return self._manual_debug_suggestions(code)
            
        except Exception as e:
            logger.error(f"Error during debugging: {e}")
            return f"❌ Error during debugging: {str(e)}\n\n" + self._manual_debug_suggestions(code)
    
    def _debug_with_pipeline(self, code: str) -> str:
        """Debug using pipeline."""
        try:
            prompt = f"Fix this Python code:\n\n{code}\n\nFixed code:"
            
            if "text2text" in self.model_type:
                result = self.pipeline(prompt, max_length=512, num_beams=3, early_stopping=True)
                return result[0]['generated_text'] if result else self._manual_debug_suggestions(code)
            
            elif "text-generation" in self.model_type:
                result = self.pipeline(prompt, max_new_tokens=256, num_return_sequences=1, temperature=0.7)
                generated = result[0]['generated_text'] if result else ""
                
                # Clean up the response
                if prompt in generated:
                    generated = generated.replace(prompt, "").strip()
                
                return generated if generated else self._manual_debug_suggestions(code)
            
        except Exception as e:
            logger.error(f"Pipeline debugging failed: {e}")
            return self._manual_debug_suggestions(code)
    
    def _debug_with_model(self, code: str) -> str:
        """Debug using direct model."""
        try:
            prompt = f"Debug and fix this Python code:\n\n{code}\n\nFixed code:"
            
            inputs = self.tokenizer(
                prompt,
                return_tensors="pt",
                max_length=512,
                truncation=True,
                padding=True
            )
            
            with torch.no_grad():
                if hasattr(self.model, 'generate'):
                    outputs = self.model.generate(
                        **inputs,
                        max_new_tokens=256,
                        num_beams=3,
                        early_stopping=True,
                        pad_token_id=self.tokenizer.pad_token_id,
                        eos_token_id=getattr(self.tokenizer, 'eos_token_id', None),
                        do_sample=True,
                        temperature=0.7
                    )
                    
                    response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
                    
                    # Clean response
                    if prompt in response:
                        response = response.replace(prompt, "").strip()
                    
                    return response if response else self._manual_debug_suggestions(code)
                
                else:
                    return f"⚠️ Model type '{self.model_type}' doesn't support generation.\n\n" + self._manual_debug_suggestions(code)
        
        except Exception as e:
            logger.error(f"Direct model debugging failed: {e}")
            return self._manual_debug_suggestions(code)
    
    def _manual_debug_suggestions(self, code: str) -> str:
        """Provide manual debugging suggestions when AI model fails."""
        suggestions = []
        
        # Check for common Python syntax errors
        lines = code.split('\n')
        
        for i, line in enumerate(lines, 1):
            line_stripped = line.strip()
            if not line_stripped or line_stripped.startswith('#'):
                continue
            
            # Check for missing colons
            if any(keyword in line_stripped for keyword in ['if ', 'for ', 'while ', 'def ', 'class ', 'try:', 'except', 'else', 'elif']):
                if not line_stripped.endswith(':') and not line_stripped.endswith(':\\'):
                    suggestions.append(f"Line {i}: Missing colon (:) at end of statement")
            
            # Check for obvious indentation issues
            if i > 1 and line_stripped and not line.startswith(' ') and not line.startswith('\t'):
                prev_line = lines[i-2].strip() if i > 1 else ""
                if prev_line.endswith(':'):
                    suggestions.append(f"Line {i}: Possible indentation error - code after ':' should be indented")
        
        # Check for common runtime errors
        if 'len(' in code and '[]' in code:
            suggestions.append("⚠️ Potential division by zero: Check for empty lists before using len()")
        
        if '/0' in code or '/ 0' in code:
            suggestions.append("⚠️ Division by zero detected")
        
        # Create response
        result = f"🔧 **Manual Debug Analysis for:**\n```python\n{code}\n```\n\n"
        
        if suggestions:
            result += "**Issues Found:**\n"
            for suggestion in suggestions:
                result += f"• {suggestion}\n"
        else:
            result += "**No obvious syntax errors detected.**\n"
        
        result += "\n**General Tips:**\n"
        result += "• Check for missing colons (:) after if/for/def statements\n"
        result += "• Verify proper indentation (4 spaces per level)\n"
        result += "• Ensure all parentheses, brackets, and quotes are balanced\n"
        result += "• Check for typos in variable and function names\n"
        result += "• Make sure all required imports are included\n"
        
        return result


# Alternative lightweight debugger if the main one fails completely
class FallbackDebugger:
    def __init__(self):
        self.model = None
        self.tokenizer = None
        logger.info("Using fallback debugger - AI model unavailable")
    
    def debug(self, code: str) -> str:
        """Simple rule-based debugging."""
        if not code or not code.strip():
            return "❌ Please provide some code to debug."
        
        issues = []
        lines = code.split('\n')
        
        # Basic syntax checking
        for i, line in enumerate(lines, 1):
            stripped = line.strip()
            if not stripped or stripped.startswith('#'):
                continue
            
            # Missing colons
            control_words = ['if ', 'elif ', 'else', 'for ', 'while ', 'def ', 'class ', 'try', 'except', 'finally']
            if any(word in stripped for word in control_words):
                if not stripped.endswith(':'):
                    issues.append(f"Line {i}: Missing colon (:)")
            
            # Indentation after colon
            if i < len(lines) and stripped.endswith(':'):
                next_line = lines[i] if i < len(lines) else ""
                if next_line.strip() and not next_line.startswith((' ', '\t')):
                    issues.append(f"Line {i+1}: Should be indented after ':'")
        
        # Generate response
        result = f"🔧 **Code Analysis** (AI Model Unavailable)\n\n"
        result += f"```python\n{code}\n```\n\n"
        
        if issues:
            result += "**Potential Issues:**\n"
            for issue in issues:
                result += f"• {issue}\n"
        else:
            result += "**No obvious syntax errors found.**\n"
        
        result += "\n**Common Debugging Steps:**\n"
        result += "1. Run the code to see specific error messages\n"
        result += "2. Check syntax: colons, indentation, parentheses\n"
        result += "3. Verify variable names and imports\n"
        result += "4. Use print() statements to debug logic\n"
        
        return result