import gradio as gr
import PyPDF2
import requests
from bs4 import BeautifulSoup
import re
import random
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from sentence_transformers import SentenceTransformer
import json
from typing import List, Dict, Tuple
import numpy as np

class InterviewBot:
    def __init__(self):
        """Choose a compute device, load the models and reset session state."""
        # Use the GPU whenever torch reports CUDA as available.
        if torch.cuda.is_available():
            self.device = "cuda"
        else:
            self.device = "cpu"
        print(f"Using device: {self.device}")

        # init_models() reads self.device, so it must run after the line above.
        self.init_models()

        # Storage for the processed content and the generated questions.
        self.content_chunks = []
        self.important_chunks = []
        self.questions = []

        # Progress through the question list plus what has been collected so far.
        self.current_question_idx = 0
        self.user_answers = []
        self.scores = []

    def init_models(self):
        """Load the models used for content analysis and answer evaluation.

        Sets ``self.similarity_model`` to a ``SentenceTransformer`` and
        ``self.summarizer`` to a transformers summarization pipeline.  The
        summarizer is optional: if it cannot be created, ``self.summarizer``
        is set to ``None`` and loading continues.  Any other failure is
        printed and ``init_fallback_models`` is called instead.
        """
        try:
            # Text similarity model for answer evaluation
            print("Loading similarity model...")
            self.similarity_model = SentenceTransformer('all-MiniLM-L6-v2')

            # Text summarization for content processing (optional, with fallback)
            print("Loading summarization pipeline...")
            try:
                self.summarizer = pipeline(
                    "summarization",
                    model="facebook/bart-large-cnn",
                    device=0 if self.device == "cuda" else -1,
                )
            except Exception as e:
                # Catch Exception rather than everything so that Ctrl-C and
                # SystemExit still propagate, and report why we degraded.
                print(f"Summarization pipeline unavailable: {e}")
                print("Using fallback summarization...")
                self.summarizer = None

            print("Models loaded successfully!")

        except Exception as e:
            print(f"Error loading models: {e}")
            self.init_fallback_models()

    def init_fallback_models(self):
        """Initialize minimal models if main models fail to load.

        Loads only the sentence-similarity model and disables summarization
        by setting ``self.summarizer`` to ``None``.
        """
        print("Loading fallback models...")
        # NOTE(review): this is the same model name that init_models() loads;
        # if that load is what failed, this call will presumably fail the same
        # way and the exception will propagate to the caller.
        self.similarity_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.summarizer = None

    def extract_pdf_text(self, pdf_file, max_pages: int = 50) -> str:
        """Extract text from an uploaded PDF.

        Args:
            pdf_file: Raw ``bytes``, an object exposing ``read()``, a
                filesystem path given as ``str``, or any other object that is
                handed to ``PyPDF2.PdfReader`` unchanged.
            max_pages: Maximum number of leading pages to read.  Pages beyond
                this limit are ignored.

        Returns:
            The text of every non-blank page, each followed by a newline.
            On any failure, or when no page yields text, a message starting
            with ``"Error extracting PDF:"`` is returned instead of raising.
        """
        try:
            import io

            # Normalise every supported input into something PdfReader accepts.
            if isinstance(pdf_file, bytes):
                pdf_stream = io.BytesIO(pdf_file)
            elif hasattr(pdf_file, 'read'):
                pdf_stream = io.BytesIO(pdf_file.read())
            elif isinstance(pdf_file, str):
                with open(pdf_file, 'rb') as f:
                    pdf_stream = io.BytesIO(f.read())
            else:
                pdf_stream = pdf_file

            pdf_reader = PyPDF2.PdfReader(pdf_stream)
            page_count = min(max_pages, len(pdf_reader.pages))

            page_texts = []
            for page_num in range(page_count):
                # Treat a page that yields nothing as empty so that one
                # unreadable page cannot abort the whole document.
                page_text = pdf_reader.pages[page_num].extract_text() or ""
                if page_text.strip():
                    page_texts.append(page_text + "\n")

            text = "".join(page_texts)
            if not text.strip():
                return "Error extracting PDF: No readable text found in the PDF"

            return text
        except Exception as e:
            return f"Error extracting PDF: {str(e)}"

    def extract_web_content(self, url: str) -> str:
        """Download ``url`` and return the visible text of its content tags.

        ``<script>`` and ``<style>`` elements are removed first.  Text is then
        taken from ``p``, ``h1``-``h4``, ``article`` and ``div`` elements, one
        output line per element.  Only the outermost matching element of each
        subtree is emitted: a ``div`` that wraps several ``p`` tags already
        contains their text, so emitting the nested tags as well would repeat
        the same content several times.

        Args:
            url: Address of the page to fetch (10 second timeout).

        Returns:
            The extracted text, or a message starting with
            ``"Error extracting web content:"`` if anything fails.
        """
        content_tag_names = ['p', 'h1', 'h2', 'h3', 'h4', 'article', 'div']
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            }
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            for script in soup(["script", "style"]):
                script.decompose()

            lines = []
            for tag in soup.find_all(content_tag_names):
                # Skip tags nested inside another content tag; the ancestor's
                # text already includes them.
                if tag.find_parent(content_tag_names) is not None:
                    continue
                # A separator keeps words from adjacent child elements apart.
                tag_text = tag.get_text(separator=" ", strip=True)
                if tag_text:
                    lines.append(tag_text + "\n")

            return "".join(lines)
        except Exception as e:
            return f"Error extracting web content: {str(e)}"

    def chunk_text(self, text: str, chunk_size: int = 800) -> List[str]:
        """Split text into chunks of at most roughly ``chunk_size`` characters.

        The text is first divided at blank lines into paragraphs, and runs of
        whitespace inside each paragraph are collapsed to single spaces.  A
        paragraph that fits within ``chunk_size`` is kept whole; a longer one
        is cut at sentence boundaries.  The resulting pieces are then packed
        greedily, joined by one space, for as long as a chunk stays within
        ``chunk_size``.

        Sentences are split only where ``.``, ``!`` or ``?`` is followed by
        whitespace, so decimals such as ``3.14`` stay intact and the original
        punctuation is preserved.  A single sentence longer than
        ``chunk_size`` is kept as one oversized chunk rather than cut.

        Args:
            text: Raw text to split.
            chunk_size: Target maximum chunk length in characters.

        Returns:
            The chunks in document order; chunks of 100 characters or fewer
            are dropped.
        """
        pieces = []
        # Paragraphs must be split BEFORE whitespace is collapsed, otherwise
        # the blank lines that mark them are already gone.
        for raw_paragraph in re.split(r'\n\s*\n', text):
            paragraph = re.sub(r'\s+', ' ', raw_paragraph).strip()
            if not paragraph:
                continue
            if len(paragraph) <= chunk_size:
                pieces.append(paragraph)
            else:
                sentences = re.split(r'(?<=[.!?])\s+', paragraph)
                pieces.extend(sentence for sentence in sentences if sentence)

        chunks = []
        current_chunk = ""
        for piece in pieces:
            if not current_chunk:
                current_chunk = piece
            elif len(current_chunk) + 1 + len(piece) <= chunk_size:
                current_chunk = f"{current_chunk} {piece}"
            else:
                chunks.append(current_chunk)
                current_chunk = piece
        if current_chunk:
            chunks.append(current_chunk)

        # Filter very short chunks
        return [chunk for chunk in chunks if len(chunk) > 100]

    def identify_important_chunks(self, chunks: List[str]) -> List[Dict]:
        """Score every chunk and return them ordered from most to least important.

        The score is a heuristic built from:
          * the number of occurrences of each indicator phrase, counted as
            plain substrings of the lower-cased chunk;
          * +2 if the chunk contains a percentage, a decimal number or a
            four-digit number;
          * +0.5 per capitalised word, capped at +5;
          * -2 for chunks shorter than 200 characters, -1 for chunks longer
            than 1500 characters.

        Args:
            chunks: Text chunks to rank.

        Returns:
            One dict per chunk with keys ``'text'``, ``'score'`` and
            ``'index'`` (the chunk's position in ``chunks``), sorted by score
            in descending order.  Chunks with equal scores keep input order.
        """
        # Key phrases that suggest important content
        key_indicators = (
            'definition', 'concept', 'principle', 'theory', 'method', 'process',
            'important', 'significant', 'key', 'main', 'primary', 'essential',
            'result', 'conclusion', 'finding', 'solution', 'approach',
            'example', 'case study', 'research', 'study', 'analysis',
        )
        # No trailing \b after '%': '%' is not a word character, so a boundary
        # there would only exist when a letter or digit directly follows it.
        data_pattern = re.compile(r'\b\d+%|\b\d+\.\d+\b|\b\d{4}\b')
        capitalized_pattern = re.compile(r'\b[A-Z][a-z]+\b')

        important_chunks = []
        for index, chunk in enumerate(chunks):
            chunk_lower = chunk.lower()
            importance_score = sum(
                chunk_lower.count(indicator) for indicator in key_indicators
            )

            # Bonus for chunks with numbers, dates, or specific data
            if data_pattern.search(chunk):
                importance_score += 2

            # Bonus for chunks with technical terms (capitalized words)
            capitalized_words = capitalized_pattern.findall(chunk)
            importance_score += min(len(capitalized_words) * 0.5, 5)

            # Penalty for very short or very long chunks
            if len(chunk) < 200:
                importance_score -= 2
            elif len(chunk) > 1500:
                importance_score -= 1

            important_chunks.append({
                'text': chunk,
                'score': importance_score,
                'index': index,
            })

        important_chunks.sort(key=lambda item: item['score'], reverse=True)
        return important_chunks

    def generate_better_questions(self, chunks: List[str], num_questions: int = 10) -> List[Dict]:
        """Build up to ``num_questions`` interview questions from ``chunks``.

        The chunks are ranked with ``identify_important_chunks`` and the
        highest-ranked ones are used, one question per chunk.  Question types
        cycle through conceptual, analytical, application, definition and
        comparison.  If building a question raises, the error is printed and
        ``create_fallback_question`` supplies the question text instead.

        Returns:
            A list of dicts with keys ``'question'``, ``'context'``,
            ``'key_concepts'``, ``'question_type'``, ``'question_id'``
            (1-based) and ``'importance_score'``.
        """
        type_cycle = ["conceptual", "analytical", "application", "definition", "comparison"]

        ranked = self.identify_important_chunks(chunks)
        top_ranked = ranked[:min(len(ranked), num_questions)]
        limit = min(num_questions, len(top_ranked))

        generated = []
        # zip() against range(limit) yields nothing when limit is not positive.
        for position, entry in zip(range(limit), top_ranked):
            source_text = entry['text']
            kind = type_cycle[position % len(type_cycle)]
            number = position + 1

            try:
                # Rule-based generation plus the key points of the answer.
                question_text = self.create_intelligent_question(source_text, kind)
                concepts = self.extract_key_concepts(source_text)
            except Exception as e:
                print(f"Error generating question {number}: {e}")
                question_text = self.create_fallback_question(source_text, kind)
                concepts = self.extract_key_concepts(source_text)

            generated.append({
                'question': question_text,
                'context': source_text,
                'key_concepts': concepts,
                'question_type': kind,
                'question_id': number,
                'importance_score': entry['score'],
            })

        return generated

    def create_intelligent_question(self, chunk: str, question_type: str) -> str:
        """Analyse ``chunk`` and build a question of the requested type.

        The chunk is passed through ``analyze_chunk_content`` and the result
        is handed to the builder matching ``question_type``.  Any type other
        than the five known ones falls back to ``create_general_question``.
        """
        content_profile = self.analyze_chunk_content(chunk)

        builders = (
            ("conceptual", self.create_conceptual_question),
            ("analytical", self.create_analytical_question),
            ("application", self.create_application_question),
            ("definition", self.create_definition_question),
            ("comparison", self.create_comparison_question),
        )
        for kind, build in builders:
            if question_type == kind:
                return build(content_profile)

        return self.create_general_question(content_profile)

    def analyze_chunk_content(self, chunk: str) -> Dict:
        """Extract the textual features the question templates are built from.

        Args:
            chunk: The text to analyse.

        Returns:
            A dict with these keys, each holding a list of strings:
              * ``'main_topics'``: up to 3 capitalised word runs longer than
                three characters, most frequent first.
              * ``'technical_terms'``: up to 5 distinct camelCase names,
                all-caps words, dotted names and snake_case names, in
                first-seen order (grouped by pattern, in that order).
              * ``'processes'`` / ``'benefits'``: for each indicator word
                found, the first sufficiently long sentence containing it.
              * ``'examples'``: spans introduced by "for example",
                "such as", "like" or "including".
              * ``'numbers_data'``: percentages, decimals, four-digit numbers
                and integers (one number may appear under several patterns).
              * ``'key_phrases'``: up to 5 capitalised phrases longer than
                five characters.
              * ``'concepts'``: up to 8 lower-case words of four or more
                letters, most frequent first, common filler words excluded.
        """
        # Split once; the process and benefit scans both walk these sentences.
        sentences = [part.strip() for part in re.split(r'[.!?]+', chunk)]

        def first_sentence_mentioning(indicator, min_length):
            """Return the first sentence longer than min_length containing indicator."""
            for sentence in sentences:
                if len(sentence) > min_length and indicator in sentence.lower():
                    return sentence
            return None

        # Main topics: capitalised terms ranked by how often they occur.
        topic_freq = {}
        for topic in re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', chunk):
            if len(topic) > 3 and topic not in ['The', 'This', 'That', 'These', 'Those']:
                topic_freq[topic] = topic_freq.get(topic, 0) + 1
        ranked_topics = sorted(topic_freq.items(), key=lambda x: x[1], reverse=True)
        main_topics = [topic for topic, freq in ranked_topics[:3]]

        # Technical terms (words with specific patterns)
        technical_patterns = [
            r'\b[a-z]+(?:[A-Z][a-z]*)+\b',  # camelCase
            r'\b[A-Z]+\b',  # Acronyms
            r'\b\w+\.\w+\b',  # module.function
            r'\b\w+_\w+\b'   # snake_case
        ]
        found_terms = []
        for pattern in technical_patterns:
            found_terms.extend(re.findall(pattern, chunk))
        # dict.fromkeys de-duplicates while keeping first-seen order.  A set
        # would make the order, and so the terms picked for questions,
        # depend on the interpreter's string hash seed.
        technical_terms = list(dict.fromkeys(found_terms))[:5]

        # Processes (words indicating steps or procedures)
        processes = []
        for indicator in ['step', 'process', 'method', 'approach', 'way', 'how to', 'procedure']:
            sentence = first_sentence_mentioning(indicator, 20)
            if sentence is not None:
                processes.append(sentence)

        # Benefits/advantages (positive outcome indicators)
        benefits = []
        for indicator in ['benefit', 'advantage', 'improvement', 'better', 'efficient', 'effective']:
            sentence = first_sentence_mentioning(indicator, 15)
            if sentence is not None:
                benefits.append(sentence)

        # Examples
        example_patterns = [
            r'for example[^.!?]*[.!?]',
            r'such as[^.!?]*[.!?]',
            r'like[^.!?]*[.!?]',
            r'including[^.!?]*[.!?]'
        ]
        examples = []
        for pattern in example_patterns:
            examples.extend(re.findall(pattern, chunk, re.IGNORECASE))

        # Numbers and data
        number_patterns = [
            # No \b after '%': '%' is not a word character, so a boundary
            # there would reject ordinary text such as "50% of users".
            r'\b\d+%',
            r'\b\d+\.\d+\b',
            r'\b\d{4}\b',
            r'\b\d+(?:,\d{3})*\b'
        ]
        numbers_data = []
        for pattern in number_patterns:
            numbers_data.extend(re.findall(pattern, chunk))

        # Key phrases (runs of capitalised words)
        potential_phrases = re.findall(r'\b(?:[A-Z][a-z]*\s+)*[A-Z][a-z]*\b', chunk)
        key_phrases = [
            phrase.strip() for phrase in potential_phrases if len(phrase.strip()) > 5
        ][:5]

        # Core concepts (frequently mentioned lower-case words)
        filler_words = {
            'this', 'that', 'with', 'have', 'will', 'from', 'they', 'been',
            'were', 'said', 'each', 'which', 'their', 'time', 'would', 'there',
        }
        word_freq = {}
        for word in re.findall(r'\b[a-z]{4,}\b', chunk.lower()):
            if word not in filler_words:
                word_freq[word] = word_freq.get(word, 0) + 1
        ranked_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
        concepts = [word for word, freq in ranked_words[:8]]

        return {
            'main_topics': main_topics,
            'technical_terms': technical_terms,
            'processes': processes,
            'benefits': benefits,
            'examples': examples,
            'numbers_data': numbers_data,
            'key_phrases': key_phrases,
            'concepts': concepts,
        }

    def create_conceptual_question(self, analysis: Dict) -> str:
        """Build a conceptual-understanding question from a chunk analysis.

        The subject is the first main topic if there is one, otherwise the
        first key phrase, otherwise the first concept; with none of these a
        generic question is returned.
        """
        if analysis['main_topics']:
            subject = analysis['main_topics'][0]
            return f"Explain the key concepts and principles related to {subject}. What makes it important?"

        if analysis['key_phrases']:
            subject = analysis['key_phrases'][0]
            return f"What are the fundamental concepts behind {subject} and how do they work?"

        if analysis['concepts']:
            subject = analysis['concepts'][0]
            return f"Describe the main ideas and principles related to {subject}."

        return "What are the core concepts and main ideas presented in this content?"

    def create_analytical_question(self, analysis: Dict) -> str:
        """Build an analytical-thinking question from a chunk analysis.

        Preference order: a topic that has benefits mentioned, then any
        described process, then a topic on its own, then a generic question.
        """
        if analysis['benefits'] and analysis['main_topics']:
            subject = analysis['main_topics'][0]
            return f"Analyze the advantages and potential limitations of using {subject}. What factors should be considered?"

        if analysis['processes']:
            return "Analyze the approach described in the content. What are its strengths and weaknesses?"

        if analysis['main_topics']:
            subject = analysis['main_topics'][0]
            return f"Critically evaluate {subject}. What are its implications and potential impact?"

        return "Analyze the main approach or methodology described. What are its pros and cons?"

    def create_application_question(self, analysis: Dict) -> str:
        """Build a practical-application question from a chunk analysis.

        Preference order: a topic accompanied by examples, then the first
        technical term, then any described process, then a generic question.
        """
        if analysis['main_topics'] and analysis['examples']:
            subject = analysis['main_topics'][0]
            return f"How would you implement {subject} in a real-world project? Provide specific steps."

        if analysis['technical_terms']:
            subject = analysis['technical_terms'][0]
            return f"Describe a practical scenario where you would use {subject}. How would you apply it?"

        if analysis['processes']:
            return "Walk through how you would apply the methodology described. What steps would you follow?"

        return "How would you apply the concepts discussed in a practical, real-world situation?"

    def create_definition_question(self, analysis: Dict) -> str:
        """Build a definition/terminology question from a chunk analysis.

        Uses the first one or two technical terms when available, otherwise
        the first main topic, otherwise the first key phrase, otherwise a
        generic question.
        """
        if analysis['technical_terms']:
            leading_terms = analysis['technical_terms'][:2]
            if len(leading_terms) > 1:
                first, second = leading_terms
                return f"Define and explain {first} and {second}. How are they used?"
            return f"Define {leading_terms[0]} and explain its purpose and functionality."

        if analysis['main_topics']:
            subject = analysis['main_topics'][0]
            return f"Provide a comprehensive definition of {subject}. What are its key characteristics?"

        if analysis['key_phrases']:
            subject = analysis['key_phrases'][0]
            return f"Define {subject} and explain its significance in this context."

        return "Define the key terms and concepts mentioned in this content."

    def create_comparison_question(self, analysis: Dict) -> str:
        """Build a compare-and-contrast question from a chunk analysis.

        Compares the first two main topics when there are at least two,
        otherwise the first two technical terms, otherwise the first topic
        against the first concept, otherwise asks a generic question.
        """
        topics = analysis['main_topics']
        if len(topics) >= 2:
            left, right = topics[:2]
            return f"Compare and contrast {left} and {right}. What are the key similarities and differences?"

        terms = analysis['technical_terms']
        if len(terms) >= 2:
            left, right = terms[:2]
            return f"How do {left} and {right} differ? When would you use one over the other?"

        if topics and analysis['concepts']:
            baseline = analysis['concepts'][0]
            return f"How does {topics[0]} compare to traditional {baseline} approaches? What are the trade-offs?"

        return "Compare the different approaches or methods discussed in this content. What are their relative merits?"

    def create_general_question(self, analysis: Dict) -> str:
        """Build a catch-all question for types without a dedicated builder.

        Asks about the first main topic if one exists, about the described
        processes if any were found, and otherwise about the content overall.
        """
        if analysis['main_topics']:
            subject = analysis['main_topics'][0]
            return f"Discuss the important aspects of {subject} covered in this content."

        if analysis['processes']:
            return "Explain the key processes or methodologies described in this section."

        return "What are the most important points covered in this content and why are they significant?"

    def create_question_prompts(self, text: str, question_type: str) -> List[str]:
        """Build model prompts asking for a question of ``question_type``.

        Text of up to 500 characters is embedded in the prompts as is.
        Longer text is summarised from its first 500 characters using
        ``self.summarizer``; when no summarizer is loaded, or summarising
        fails, the first 200 characters followed by ``"..."`` are used.

        Args:
            text: Source content the question should be about.
            question_type: One of ``"conceptual"``, ``"analytical"``,
                ``"application"``, ``"definition"`` or ``"comparison"``.

        Returns:
            Three prompt strings for a known type, or an empty list for any
            other ``question_type``.
        """
        lead_ins_by_type = {
            "conceptual": (
                "Based on this content about key concepts, create an interview question that tests understanding of the main ideas: ",
                "Generate an interview question asking about the core concept explained in: ",
                "Create a thoughtful question about the main principles discussed in: ",
            ),
            "analytical": (
                "Create an analytical interview question that requires critical thinking about: ",
                "Generate a question asking for analysis or evaluation of: ",
                "Design an interview question that tests analytical skills based on: ",
            ),
            "application": (
                "Create an interview question about how to apply or implement concepts from: ",
                "Generate a practical application question based on: ",
                "Design a question asking how someone would use the information in: ",
            ),
            "definition": (
                "Create an interview question asking for definition or explanation of key terms in: ",
                "Generate a question about defining important concepts from: ",
                "Design a question testing knowledge of terminology in: ",
            ),
            "comparison": (
                "Create an interview question asking to compare or contrast ideas from: ",
                "Generate a question about similarities and differences in: ",
                "Design a question asking for comparison based on: ",
            ),
        }

        summary = None
        if len(text) <= 500:
            summary = text
        else:
            # The summarizer is optional and may be None; check for that
            # directly instead of relying on a swallowed TypeError.
            summarizer = getattr(self, 'summarizer', None)
            if summarizer is not None:
                try:
                    summary_input = text[:500] + "..."
                    summary = summarizer(
                        summary_input, max_length=100, min_length=30, do_sample=False
                    )[0]['summary_text']
                except Exception as e:
                    # Exception (not a bare except) so Ctrl-C still propagates.
                    print(f"Summarization failed, using truncated text: {e}")
            if summary is None:
                summary = text[:200] + "..."

        return [
            f"{lead_in}{summary}"
            for lead_in in lead_ins_by_type.get(question_type, ())
        ]

    def generate_single_question(self, prompt: str) -> str:
        """Run the question-generation model on ``prompt``.

        Encodes the prompt with ``self.qg_tokenizer`` (truncated to 512
        tokens), generates with ``self.qg_model`` and returns the decoded
        text after passing it through ``clean_question``.  Any exception is
        printed and reported as an empty string.

        NOTE(review): neither ``qg_tokenizer`` nor ``qg_model`` is assigned
        by ``__init__`` or ``init_models``; unless they are set elsewhere,
        this always takes the error path and returns ``""``.
        """
        generation_options = dict(
            max_length=100,
            min_length=20,
            num_beams=5,
            do_sample=True,
            temperature=0.8,
            no_repeat_ngram_size=2,
            early_stopping=True,
        )

        try:
            token_ids = self.qg_tokenizer.encode(
                prompt,
                return_tensors="pt",
                max_length=512,
                truncation=True
            )
            token_ids = token_ids.to(self.device)

            # Inference only, so gradient tracking is switched off.
            with torch.no_grad():
                generated = self.qg_model.generate(token_ids, **generation_options)

            decoded = self.qg_tokenizer.decode(generated[0], skip_special_tokens=True)
            return self.clean_question(decoded)

        except Exception as e:
            print(f"Error in question generation: {e}")
            return ""

    def clean_question(self, question: str) -> str:
        """Clean and format a generated question.

        Steps, in order:
          1. Trim surrounding whitespace.
          2. Repeatedly remove a leading label such as ``"Question:"``,
             ``"Q:"``, ``"Interview question:"``, ``"Based on"`` or
             ``"According to"`` (case-insensitive), so stacked labels like
             ``"Q: Question: ..."`` are removed completely.
          3. Drop trailing spaces and ``. ! ; : ,`` characters, then make
             sure the text ends with ``?``.
          4. Capitalise the first character.

        Returns:
            The cleaned question, or ``""`` when nothing is left to ask
            (empty input, or input consisting only of labels/punctuation).
        """
        prefixes_to_remove = (
            "question:", "q:", "interview question:", "based on", "according to"
        )

        question = question.strip()

        removed_prefix = True
        while question and removed_prefix:
            removed_prefix = False
            # Recompute the lower-cased form after every removal so the next
            # comparison sees the current text rather than a stale copy.
            question_lower = question.lower()
            for prefix in prefixes_to_remove:
                if question_lower.startswith(prefix):
                    question = question[len(prefix):].strip()
                    removed_prefix = True
                    break

        # Strip sentence-ending punctuation first; otherwise "What is X."
        # would become "What is X.?".
        question = question.rstrip(" .!;:,")
        if not question:
            return ""

        if not question.endswith('?'):
            question += '?'

        return question[0].upper() + question[1:]

    def validate_question(self, question: str, context: str) -> bool:
        """Decide whether a generated question is worth keeping.

        A question is rejected when it is empty or shorter than 10
        characters, does not end with ``?``, matches one of the known generic
        phrasings, or shares fewer than two words of four or more letters
        with ``context``.
        """
        generic_patterns = (
            r"what is this about",
            r"what does this mean",
            r"what is the main point",
            r"what is discussed",
            r"what can you tell me",
        )
        word_pattern = r'\b[A-Za-z]{4,}\b'

        if not question or len(question) < 10 or not question.endswith('?'):
            return False

        lowered = question.lower()
        if any(re.search(pattern, lowered) for pattern in generic_patterns):
            return False

        # The question must reuse at least two content words from the context.
        shared_words = set(re.findall(word_pattern, context.lower())) & set(
            re.findall(word_pattern, lowered)
        )
        return len(shared_words) >= 2

    def select_best_question(self, candidates: List[str], context: str) -> str:
        """Return the highest-scoring candidate, or ``""`` if there are none.

        Each candidate earns points for a medium word count, for containing
        explanatory, analytical or factual cue words (matched as substrings
        of the lower-cased question), and one point per capitalised word.
        When several candidates tie, the earliest one wins.  ``context`` is
        accepted but not used in the scoring.
        """
        cue_groups = (
            (('how', 'why', 'explain', 'describe'), 2),
            (('compare', 'analyze', 'evaluate'), 3),
            (('what', 'define', 'identify'), 1),
        )

        def rate(candidate):
            points = 0

            # Prefer medium-length questions.
            word_count = len(candidate.split())
            if 8 <= word_count <= 20:
                points += 3
            elif 6 <= word_count <= 25:
                points += 1

            lowered = candidate.lower()
            for cues, bonus in cue_groups:
                if any(cue in lowered for cue in cues):
                    points += bonus

            # Capitalised words are treated as a sign of specificity.
            points += len(re.findall(r'\b[A-Z][a-z]+\b', candidate))
            return points

        if not candidates:
            return ""

        # max() keeps the first of several equally rated candidates.
        return max(candidates, key=rate)

    def create_fallback_question(self, chunk: str, question_type: str) -> str:
        """Create fallback questions when generation fails.

        Builds a templated question for ``question_type`` around the first
        key term (or first two, for comparisons) returned by
        ``extract_key_terms``; without enough key terms a generic question
        for that type is returned.

        NOTE(review): a second method named ``create_fallback_question`` is
        defined later in this class.  A later definition in a class body
        rebinds the name, so calls to ``self.create_fallback_question`` reach
        that later version and this one is not used.
        """
        
        # Extract key terms from chunk
        key_terms = self.extract_key_terms(chunk)
        
        if question_type == "conceptual":
            if key_terms:
                return f"What are the key concepts related to {key_terms[0]} mentioned in this content?"
            return "What are the main concepts explained in this section?"
            
        elif question_type == "analytical":
            if key_terms:
                return f"How would you analyze the significance of {key_terms[0]} in this context?"
            return "How would you analyze the main points presented in this content?"
            
        elif question_type == "application":
            if key_terms:
                return f"How could you apply the principles of {key_terms[0]} in a real-world scenario?"
            return "How would you apply the concepts discussed in this content?"
            
        elif question_type == "definition":
            if key_terms:
                return f"How would you define {key_terms[0]} based on this content?"
            return "How would you define the key terms mentioned in this section?"
            
        elif question_type == "comparison":
            # A comparison needs two distinct terms to set against each other.
            if len(key_terms) >= 2:
                return f"How would you compare {key_terms[0]} and {key_terms[1]}?"
            return "What are the similarities and differences between the concepts discussed?"
        
        # Unknown question type: ask a generic question.
        return "What are the main points you would highlight from this content?"

    def extract_key_terms(self, text: str) -> List[str]:
        """Return up to eight distinct key terms found in ``text``.

        The text is first passed through ``separate_concatenated_words``.
        A term is a run of whitespace-separated alphanumeric words, each
        starting with a letter, so a whole phrase between punctuation marks
        counts as a single term.  Terms of two characters or fewer and terms
        that are exactly one of the listed common words are discarded.
        Duplicates are removed case-insensitively, keeping the first spelling
        seen, and the original order is preserved.
        """
        common_words = {'the', 'and', 'but', 'for', 'with', 'this', 'that', 'are', 'you', 'can', 'will', 'have', 'has'}
        term_pattern = r'\b[a-zA-Z][a-zA-Z0-9]*(?:\s+[a-zA-Z][a-zA-Z0-9]*)*\b'

        normalized = self.separate_concatenated_words(text)

        # Keyed by lower-cased term; dicts keep insertion order, so the first
        # occurrence of each term decides both its position and its spelling.
        first_seen = {}
        for match in re.findall(term_pattern, normalized):
            candidate = match.strip()
            if len(candidate) <= 2:
                continue
            if candidate.lower() in common_words or candidate.isdigit():
                continue
            first_seen.setdefault(candidate.lower(), candidate)

        return list(first_seen.values())[:8]

    def separate_concatenated_words(self, text: str) -> str:
        """Split run-together words and expand common programming abbreviations.

        A space is inserted at every lower-to-upper case transition
        (``'CountersHeapq'`` -> ``'Counters Heapq'``) and between a run of
        capitals and a following capitalised word (``'HTTPServer'`` ->
        ``'HTTP Server'``).  Each whitespace-separated word that equals a
        known abbreviation, ignoring case, is replaced by its long form.
        The words are re-joined with single spaces, so all original
        whitespace, newlines included, is normalised.
        """
        abbreviations = {
            'deque': 'deque',
            'dict': 'dictionary',
            'str': 'string',
            'int': 'integer',
            'bool': 'boolean',
            'func': 'function',
            'obj': 'object',
            'var': 'variable',
            'param': 'parameter',
            'arg': 'argument'
        }

        # camelCase / PascalCase boundaries first, then ACRONYMWord boundaries.
        spaced = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)
        spaced = re.sub(r'([A-Z]+)([A-Z][a-z])', r'\1 \2', spaced)

        return ' '.join(
            abbreviations.get(token.lower(), token) for token in spaced.split()
        )

    def create_fallback_question(self, chunk: str, question_type: str) -> str:
        """Build a templated question for ``chunk`` when other methods fail.

        The question is phrased around the main subject returned by
        ``identify_main_subject`` and/or the key terms returned by
        ``extract_key_terms``.  Which of the two is preferred depends on the
        question type: definition and comparison questions look at key terms
        first, the other types at the main subject first.  When neither is
        available a generic question for that type is returned, and unknown
        types get a general discussion question.
        """
        terms = self.extract_key_terms(chunk)
        subject = self.identify_main_subject(chunk)

        if question_type == "conceptual":
            if subject:
                return f"Explain the core concepts and principles of {subject}. What makes it significant?"
            if terms:
                return f"What are the fundamental concepts behind {terms[0]} and how does it work?"
            return "What are the main concepts and principles explained in this content?"

        if question_type == "analytical":
            if subject:
                return f"Analyze the advantages and potential challenges of {subject}. What should be considered when using it?"
            if terms:
                return f"Critically evaluate the approach involving {terms[0]}. What are its strengths and limitations?"
            return "Analyze the methodology described. What are its benefits and potential drawbacks?"

        if question_type == "application":
            if subject:
                return f"How would you implement {subject} in a practical project? Describe the key steps."
            if terms:
                return f"Describe a real-world scenario where you would use {terms[0]}. How would you apply it effectively?"
            return "How would you apply these concepts in a practical, real-world situation?"

        if question_type == "definition":
            # Key terms take priority over the main subject for definitions.
            if len(terms) >= 2:
                return f"Define {terms[0]} and {terms[1]}. How do they relate to each other?"
            if terms:
                return f"Provide a comprehensive definition of {terms[0]}. What are its key characteristics?"
            if subject:
                return f"Define {subject} and explain its primary purpose and functionality."
            return "Define the key terms and concepts presented in this content."

        if question_type == "comparison":
            if len(terms) >= 2:
                return f"Compare {terms[0]} and {terms[1]}. What are their similarities and differences?"
            if subject and terms:
                return f"How does {subject} compare to other {terms[0]} approaches? What are the trade-offs?"
            return "Compare the different approaches or methods discussed. What are their relative advantages?"

        # Unknown question type.
        if subject:
            return f"Discuss the important aspects of {subject} and explain why it's significant."
        return "What are the most important points covered in this content and why do they matter?"

    def identify_main_subject(self, text: str) -> str:
        """Guess the main topic of ``text``; return "" when nothing is found.

        Three heuristics are tried in order: a title-like line among the first
        three lines, the first entry returned by ``extract_key_terms``, and
        finally the first capitalised phrase of the opening sentence.
        """
        # 1) Title heuristic: 6-99 chars, no trailing period, at least one capital.
        for raw_line in text.split('\n')[:3]:
            candidate = raw_line.strip()
            if not 5 < len(candidate) < 100:
                continue
            if candidate.endswith('.'):
                continue
            if not any(ch.isupper() for ch in candidate):
                continue
            # Drop punctuation so the title reads cleanly inside a question.
            title = re.sub(r'[^\w\s]', '', candidate).strip()
            if title:
                return title

        # 2) Fall back to the leading key term.
        terms = self.extract_key_terms(text)
        if terms:
            return terms[0]

        # 3) First run of capitalised words in the opening sentence.
        opening = re.split(r'[.!?]+', text)[0].strip()
        found = re.search(r'\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\b', opening)
        return found.group(1) if found else ""

    def extract_key_concepts(self, text: str) -> List[str]:
        """Extract up to 15 lower-cased keywords from ``text`` for answer scoring.

        Words are alphabetic tokens longer than three characters that are not
        in the stop-word list. Each distinct word is scored as::

            occurrences + min(len(word) - 3, 5) + (1 if it occurs more than once)

        Args:
            text: Source text to mine for keywords.

        Returns:
            The highest-scoring words, best first. Ties keep the order in which
            the words first appear in ``text``.
        """
        # Local import so the method does not depend on a new file-level import.
        from collections import Counter

        stop_words = {
            'this', 'that', 'with', 'have', 'will', 'from', 'they', 'been',
            'were', 'said', 'each', 'which', 'their', 'time', 'would', 'there',
            'when', 'what', 'where', 'how', 'why', 'who', 'can', 'could', 'should',
            'may', 'might', 'must', 'shall', 'also', 'then',
            'than', 'only', 'just', 'even', 'still', 'more', 'most', 'very',
            'much', 'many', 'some', 'any', 'all', 'both', 'either', 'neither'
        }

        words = re.findall(r'\b[A-Za-z]{3,}\b', text.lower())
        candidates = [word for word in words if len(word) > 3 and word not in stop_words]

        # Count once up front instead of calling list.count() per word (O(n^2)).
        # Counter keeps first-occurrence order, so tie-breaking is unchanged.
        frequencies = Counter(candidates)

        word_scores = {
            word: freq                      # base frequency
            + min(len(word) - 3, 5)         # longer words tend to be more specific
            + (1 if freq > 1 else 0)        # bonus for repeated words
            for word, freq in frequencies.items()
        }

        ranked = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)
        return [word for word, _ in ranked[:15]]

    def evaluate_answer(self, question_data: Dict, user_answer: str) -> float:
        """Score ``user_answer`` for one question on a 0-100 scale.

        Four signals are combined with fixed weights:

        * cosine similarity between the embeddings of ``question_data['context']``
          and the answer (30%),
        * fraction of ``question_data['key_concepts']`` mentioned in the answer,
          compared case-insensitively (25%),
        * ``evaluate_answer_quality`` (25%),
        * ``evaluate_question_relevance`` (20%).

        If anything in that pipeline raises, a fallback based on answer length
        (up to 60 points) and presence of the first five key concepts (8 points
        each) is used instead.

        Args:
            question_data: Dict with 'context', 'key_concepts' and 'question'.
            user_answer: The free-text answer to score.

        Returns:
            A built-in ``float`` in [0, 100]; 0.0 for an empty, blank or
            ``None`` answer.
        """
        if not user_answer or not user_answer.strip():
            return 0.0

        answer_lower = user_answer.lower()

        try:
            # Method 1: semantic similarity with the source context
            context_vec = np.asarray(
                self.similarity_model.encode([question_data['context']])[0], dtype=float
            )
            answer_vec = np.asarray(
                self.similarity_model.encode([user_answer])[0], dtype=float
            )
            norm_product = np.linalg.norm(context_vec) * np.linalg.norm(answer_vec)
            # A zero-length embedding would otherwise divide by zero and yield NaN.
            context_similarity = (
                float(np.dot(context_vec, answer_vec) / norm_product) if norm_product else 0.0
            )

            # Method 2: key concept matching (case-insensitive, like the fallback)
            user_words = set(re.findall(r'\b[A-Za-z]{3,}\b', answer_lower))
            key_concepts = {str(concept).lower() for concept in question_data['key_concepts']}
            concept_overlap = len(user_words & key_concepts) / max(len(key_concepts), 1)

            # Method 3: answer quality indicators
            quality_score = self.evaluate_answer_quality(user_answer, question_data)

            # Method 4: question-specific relevance
            question_relevance = self.evaluate_question_relevance(
                question_data['question'], user_answer, question_data['context']
            )

            final_score = (
                context_similarity * 0.3 +
                concept_overlap * 0.25 +
                quality_score * 0.25 +
                question_relevance * 0.2
            ) * 100

            # float(...) so callers never receive a numpy scalar.
            return float(min(100.0, max(0.0, final_score)))

        except Exception as e:
            print(f"Error evaluating answer: {e}")
            # Fallback scoring based on answer length and keyword presence
            word_count = len(user_answer.split())
            base_score = min(word_count * 3, 60)  # Up to 60 points for length

            # Bonus for using key terms (only the first five are considered)
            key_bonus = sum(
                8
                for concept in question_data.get('key_concepts', [])[:5]
                if str(concept).lower() in answer_lower
            )

            return float(min(100.0, base_score + key_bonus))

    def evaluate_answer_quality(self, answer: str, question_data: Dict) -> float:
        """Rate surface-level quality of ``answer`` with simple heuristics.

        Points are awarded for length (0.1-0.3), having at least two
        sentences (0.1), containing a number (0.1), containing a capitalised
        word (0.1) and using a reasoning connective such as "because" (0.1).

        Args:
            answer: The user's answer text.
            question_data: Accepted for interface compatibility; not used here.

        Returns:
            A score between 0.0 and 1.0 (the heuristics currently sum to at
            most 0.7).
        """
        score = 0.0

        # Length scoring (encourage substantial answers)
        word_count = len(answer.split())
        if 20 <= word_count <= 100:
            score += 0.3
        elif 10 <= word_count <= 150:
            score += 0.2
        elif word_count >= 5:
            score += 0.1

        # Structure scoring: count only non-empty fragments, otherwise a single
        # sentence ending in "." splits into ["...", ""] and looks like two.
        sentences = [part for part in re.split(r'[.!?]+', answer) if part.strip()]
        if len(sentences) >= 2:
            score += 0.1

        # Specificity scoring (presence of detailed information)
        if re.search(r'\b\d+\b', answer):  # Contains numbers
            score += 0.1
        if re.search(r'\b[A-Z][a-z]+\b', answer):  # Contains a capitalised word
            score += 0.1

        # Reasoning connectives as a rough proxy for depth
        answer_lower = answer.lower()
        if any(term in answer_lower for term in ('because', 'therefore', 'however', 'furthermore')):
            score += 0.1

        return min(1.0, score)

    def evaluate_question_relevance(self, question: str, answer: str, context: str) -> float:
        """Estimate how directly ``answer`` addresses ``question``.

        Two signals are added together:

        * 0.3 for each question form (what / how / why / comparison) whose
          typical cue words appear in the answer. Cues are matched against
          whole words (longer cues also match as a word prefix, e.g.
          "method" matches "methods") so that "is" no longer matches "this"
          and "by" no longer matches "maybe".
        * 0.4 x cosine similarity between the question and answer embeddings.

        Args:
            question: The question text.
            answer: The user's answer.
            context: Source context; currently unused by this method.

        Returns:
            A built-in ``float`` clamped to [0, 1], or 0.5 (neutral) if
            scoring raises.
        """
        try:
            question_lower = question.lower()
            answer_lower = answer.lower()
            question_words = set(re.findall(r'[a-z]+', question_lower))
            answer_words = set(re.findall(r'[a-z]+', answer_lower))

            def answer_mentions(cues) -> bool:
                # Exact word match; cues longer than four letters may also be
                # a prefix so simple inflections still count.
                return any(
                    word == cue or (len(cue) > 4 and word.startswith(cue))
                    for word in answer_words
                    for cue in cues
                )

            relevance_score = 0.0

            # Question type analysis
            cue_table = (
                ('what', ('is', 'are', 'means', 'refers')),
                ('how', ('by', 'through', 'using', 'method')),
                ('why', ('because', 'since', 'due', 'reason')),
            )
            for interrogative, cues in cue_table:
                if interrogative in question_words and answer_mentions(cues):
                    relevance_score += 0.3

            # Comparison stems are matched as substrings on purpose
            # ("similarities", "differences").
            if any(stem in question_lower for stem in ('compare', 'difference', 'similar')):
                if answer_mentions(('similar', 'different', 'both', 'while', 'whereas')):
                    relevance_score += 0.3

            # Semantic similarity between question and answer
            q_vec = np.asarray(self.similarity_model.encode([question])[0], dtype=float)
            a_vec = np.asarray(self.similarity_model.encode([answer])[0], dtype=float)
            norm_product = np.linalg.norm(q_vec) * np.linalg.norm(a_vec)
            # Guard against a zero-length embedding (division by zero -> NaN).
            semantic_relevance = (
                float(np.dot(q_vec, a_vec) / norm_product) if norm_product else 0.0
            )

            relevance_score += semantic_relevance * 0.4

            # Cosine similarity can be negative; keep the result inside [0, 1].
            return float(min(1.0, max(0.0, relevance_score)))

        except Exception as e:
            print(f"Error evaluating relevance: {e}")
            return 0.5  # Neutral score on error

    def process_content(self, pdf_file=None, web_url="", num_questions=10):
        """Load a PDF or web page, chunk it and prepare a new question set.

        A PDF, when supplied, takes precedence over ``web_url``. On success the
        quiz state (question index, stored answers and scores) is reset.

        Returns:
            A ``(status, first_question, progress)`` tuple of strings. On any
            failure ``status`` carries the message and the other two are "".
        """

        def fail(message):
            # Every failure path uses the same three-slot shape.
            return message, "", ""

        try:
            # Pick the input source
            if pdf_file is not None:
                source = "PDF"
                text = self.extract_pdf_text(pdf_file)
            else:
                url = web_url.strip()
                if not url:
                    return fail("Please provide either a PDF file or a web URL.")
                source = "Web URL"
                text = self.extract_web_content(url)

            # Extraction failures are reported as text beginning with "Error"
            if text.startswith("Error"):
                return fail(text)

            self.content_chunks = self.chunk_text(text)
            chunk_count = len(self.content_chunks) if self.content_chunks else 0

            if not self.content_chunks:
                return fail("No valid content found to generate questions.")

            if chunk_count < 3:
                return fail(
                    f"Content too short. Found only {chunk_count} chunks. Please provide more substantial content."
                )

            # Build questions and start a fresh quiz
            self.questions = self.generate_better_questions(self.content_chunks, num_questions)
            self.current_question_idx = 0
            self.user_answers = []
            self.scores = []

            if not self.questions:
                return fail("Failed to generate questions from the content.")

            question_count = len(self.questions)
            summary = "".join([
                f"✅ Successfully processed {source}!\n",
                f"📄 Extracted {chunk_count} content sections\n",
                f"❓ Generated {question_count} high-quality interview questions\n",
                "🎯 Questions cover: concepts, analysis, applications, and definitions\n",
                "\nReady to start your interview practice!",
            ])

            opener = self.questions[0]
            first_question = (
                f"Question 1/{question_count} [{opener['question_type'].title()}]:\n\n{opener['question']}"
            )

            return summary, first_question, ""

        except Exception as e:
            return fail(f"Error processing content: {str(e)}")

    def submit_answer(self, user_answer):
        """Score the answer to the current question and advance the quiz.

        Returns:
            ``(feedback, next_question_text, progress_or_results)``. When no
            quiz is loaded or the quiz is already finished, a short status
            tuple is returned and nothing is recorded.
        """
        if not self.questions:
            return "No quiz in progress. Please upload content first.", "", "No active quiz"

        total = len(self.questions)
        if self.current_question_idx >= total:
            return "Quiz completed!", "", self.get_final_results()

        # Score and record the answer for the question currently shown
        answered_idx = self.current_question_idx
        score = self.evaluate_answer(self.questions[answered_idx], user_answer)
        self.user_answers.append(user_answer)
        self.scores.append(score)

        # Pick the verdict line for the first band the score reaches
        verdict_bands = (
            (80, "🌟 Excellent answer! Great understanding demonstrated.\n"),
            (60, "👍 Good answer! You covered the main points well.\n"),
            (40, "📚 Fair answer. Try to include more specific details.\n"),
        )
        verdict = next(
            (message for floor, message in verdict_bands if score >= floor),
            "💪 Room for improvement. Focus on key concepts from the content.\n",
        )
        feedback = (
            f"✅ Answer {answered_idx + 1} submitted!\n"
            f"📊 Score: {score:.1f}/100\n"
            f"{verdict}"
        )

        self.current_question_idx = answered_idx + 1

        if self.current_question_idx >= total:
            # That was the last question
            return feedback + "\n🎉 Interview Complete!", "🎉 Interview Complete!", self.get_final_results()

        upcoming = self.questions[self.current_question_idx]
        question_text = "".join([
            f"Question {self.current_question_idx + 1}/{total} [{upcoming['question_type'].title()}]:\n\n",
            f"{upcoming['question']}\n\n",
            "💡 Tip: Use specific details and examples from the content in your answer.",
        ])
        return feedback, question_text, self.get_current_progress()

    def get_current_progress(self):
        """Summarise the quiz so far: answered count, average, latest score, trend.

        From the second answer onwards a trend line (latest vs. previous
        score) and the text from ``get_live_tips`` are included as well.
        """
        scores = self.scores
        if not scores:
            return "No answers submitted yet"

        latest = scores[-1]
        mean = sum(scores) / len(scores)
        parts = [
            f"📈 Progress: {len(scores)}/{len(self.questions)} questions answered\n",
            f"📊 Average Score: {mean:.1f}/100\n",
            f"🎯 Latest Score: {latest:.1f}/100\n",
        ]

        has_history = len(scores) >= 2
        if has_history:
            previous = scores[-2]
            if latest > previous:
                trend = "📈 Improving"
            elif latest < previous:
                trend = "📉 Declining"
            else:
                trend = "➡️ Stable"
            parts.append(f"📉📈 Trend: {trend}\n")

        parts.append("\n")

        # Live tips need at least two data points
        if has_history:
            parts.append(self.get_live_tips())

        return "".join(parts)

    def get_live_tips(self):
        """Return short coaching text based on the most recent (up to 3) scores.

        The average of the recent scores selects one of four advice bands
        (>= 80, >= 60, >= 40, below 40). When at least two recent scores
        exist, a line about the direction of the latest change is appended.

        Returns:
            The tips as a single string. With no scores recorded yet, a
            placeholder hint is returned instead of dividing by zero.
        """
        tips = "🎯 **Live Performance Tips:**\n"

        # Slicing already copes with fewer than three entries.
        recent_scores = self.scores[-3:]
        if not recent_scores:
            return tips + "• Submit an answer to start receiving live performance tips\n"

        avg_recent = sum(recent_scores) / len(recent_scores)

        if avg_recent >= 80:
            tips += "• Excellent work! Keep providing detailed, specific answers\n"
            tips += "• You're demonstrating strong comprehension skills\n"
        elif avg_recent >= 60:
            tips += "• Good progress! Try adding more specific examples from the content\n"
            tips += "• Connect your answers directly to the source material\n"
        elif avg_recent >= 40:
            tips += "• Focus on using key terms and concepts from the source\n"
            tips += "• Take time to structure your thoughts before answering\n"
        else:
            tips += "• Slow down and carefully read both the question and source content\n"
            tips += "• Look for main ideas and important details in each section\n"

        # Performance trend analysis (latest answer vs. the one before it)
        if len(recent_scores) >= 2:
            if recent_scores[-1] > recent_scores[-2]:
                tips += "• 📈 Great improvement! You're adapting well to the format\n"
            elif recent_scores[-1] < recent_scores[-2]:
                tips += "• 🎯 Refocus needed - take a moment to review the content\n"

        return tips

    def get_final_results(self):
        """Assemble the end-of-quiz report.

        The report contains overall statistics and a letter grade, a score
        spread summary, one line per answered question, the average per
        question type, and finally the text from ``get_personalized_tips``.
        Returns a short placeholder when no scores exist.
        """
        scores = self.scores
        if not scores:
            return "No quiz completed yet"

        answered = len(scores)
        mean = sum(scores) / answered
        best = max(scores)
        worst = min(scores)
        spread = best - worst

        # (minimum average, assessment label, grade), checked from the top
        grade_bands = (
            (85, "Outstanding! 🌟", "A"),
            (75, "Excellent! 🎉", "B+"),
            (65, "Good! 👍", "B"),
            (55, "Fair 📚", "C+"),
            (45, "Needs Improvement 💪", "C"),
        )
        performance, grade = next(
            ((label, letter) for floor, label, letter in grade_bands if mean >= floor),
            ("Requires Focus 🎯", "D"),
        )

        if spread < 30:
            consistency = "High"
        elif spread < 50:
            consistency = "Moderate"
        else:
            consistency = "Variable"

        sections = [
            "\n".join([
                "",
                "🎯 INTERVIEW PERFORMANCE REPORT 🎯",
                "",
                "📊 **Overall Performance**",
                f"• Total Questions: {answered}",
                f"• Average Score: {mean:.1f}/100",
                f"• Grade: {grade}",
                f"• Assessment: {performance}",
                "",
                "📈 **Score Analysis**",
                f"• Highest Score: {best:.1f}/100",
                f"• Lowest Score: {worst:.1f}/100",
                f"• Score Range: {spread:.1f} points",
                f"• Consistency: {consistency}",
                "",
                "📋 **Question-by-Question Results**",
                "",
            ])
        ]

        # One pass over the answered questions: per-question lines plus
        # scores grouped by question type (zip stops at the shorter list).
        by_type = {}
        for number, (score, item) in enumerate(zip(scores, self.questions), 1):
            kind = item['question_type']
            if score >= 80:
                badge = "🌟"
            elif score >= 60:
                badge = "👍"
            elif score >= 40:
                badge = "📚"
            else:
                badge = "💪"
            sections.append(f"Q{number} ({kind.title()}): {score:.1f}/100 {badge}\n")
            by_type.setdefault(kind, []).append(score)

        sections.append("\n🎭 **Performance by Question Type**\n")
        for kind, kind_scores in by_type.items():
            kind_mean = sum(kind_scores) / len(kind_scores)
            sections.append(f"• {kind.title()}: {kind_mean:.1f}/100 ({len(kind_scores)} questions)\n")

        sections.append("\n" + self.get_personalized_tips())

        return "".join(sections)

    def get_personalized_tips(self):
        """Build a personalised improvement plan from the recorded scores.

        The text has up to three parts: advice for the overall average band,
        a note on consistency (spread between the best and worst score), and,
        once at least three answers exist, one bullet for every question type
        whose average is more than 10 points below the overall average.

        Returns:
            The tips as a single string, or a short prompt to complete a quiz
            when no scores have been recorded yet.
        """
        if not self.scores:
            return "💡 Complete a quiz to get personalized tips!"

        avg_score = sum(self.scores) / len(self.scores)
        # Spread between best and worst answer (a range, not a statistical variance).
        score_spread = max(self.scores) - min(self.scores)

        tips = "💡 **Personalized Development Plan:**\n\n"

        # Overall performance feedback
        if avg_score >= 85:
            tips += "🌟 **Outstanding Performance!**\n"
            tips += "• You demonstrate excellent comprehension and communication skills\n"
            tips += "• Your answers show deep understanding of the content\n"
            tips += "• Consider mentoring others or taking on leadership roles\n"
            tips += "• Challenge yourself with more complex technical content\n"

        elif avg_score >= 75:
            tips += "🎉 **Excellent Work!**\n"
            tips += "• Strong foundation with room for refinement\n"
            tips += "• Focus on adding more specific examples in your answers\n"
            tips += "• Practice connecting concepts across different sections\n"

        elif avg_score >= 65:
            tips += "👍 **Good Performance with Growth Potential!**\n"
            tips += "• You understand the main concepts well\n"
            tips += "• Work on providing more detailed explanations\n"
            tips += "• Include specific terminology from the source material\n"

        elif avg_score >= 55:
            tips += "📚 **Solid Foundation - Ready for Next Level!**\n"
            tips += "• Focus on reading comprehension before answering\n"
            tips += "• Practice identifying key concepts in each section\n"
            tips += "• Structure your answers with main points first\n"

        elif avg_score >= 45:
            tips += "💪 **Building Skills - You're on the Right Path!**\n"
            tips += "• Spend more time analyzing the content before questions\n"
            tips += "• Practice summarizing key points in your own words\n"
            tips += "• Focus on understanding rather than memorizing\n"

        else:
            tips += "🎯 **Focus Areas for Rapid Improvement:**\n"
            tips += "• Take time to thoroughly read and understand the source content\n"
            tips += "• Practice breaking down complex information into main ideas\n"
            tips += "• Focus on quality over speed in your responses\n"
            tips += "• Consider reviewing fundamental concepts in the subject area\n"

        # Consistency analysis
        if score_spread > 50:
            tips += "\n📊 **Consistency Focus:**\n"
            tips += "• Your scores vary significantly across questions\n"
            tips += "• This suggests uneven understanding - review weaker areas\n"
            tips += "• Practice maintaining consistent quality across all question types\n"
        elif score_spread < 20:
            tips += "\n🎯 **Consistent Performance:**\n"
            tips += "• Great job maintaining steady quality across questions!\n"

        # Question type specific tips
        if len(self.scores) >= 3:
            tips += "\n🎭 **Question Type Strategies:**\n"

            type_advice = {
                "analytical": "Break down problems step by step\n",
                "application": "Think about real-world examples and use cases\n",
                "conceptual": "Focus on understanding core principles and ideas\n",
                "definition": "Use precise terminology and clear explanations\n",
                "comparison": "Identify both similarities and key differences\n",
            }
            # Used for any question type without a dedicated entry, so the
            # bullet never ends as a dangling "...questions: " without a newline.
            default_advice = "Review the related source material and practice similar questions\n"

            # Group scores by question type (zip stops at the shorter list)
            type_performance = {}
            for score, question in zip(self.scores, self.questions):
                type_performance.setdefault(question['question_type'], []).append(score)

            for q_type, scores in type_performance.items():
                avg_type = sum(scores) / len(scores)
                if avg_type < avg_score - 10:  # Significantly below average
                    tips += f"• Improve {q_type} questions: "
                    tips += type_advice.get(q_type, default_advice)

        return tips

# Initialize the bot: a single module-level InterviewBot used by the Gradio
# event handlers in create_interface(). Constructing it runs
# InterviewBot.__init__, which calls init_models(), so the models are loaded
# as soon as this module is executed or imported.
# NOTE(review): the handlers all mutate this one instance, so quiz state is
# presumably shared between concurrent users of the app; confirm if that matters.
bot = InterviewBot()

# Create enhanced Gradio interface
def create_interface():
    """Build and return the Gradio Blocks UI for the interview bot.

    The layout has an input column (PDF upload, URL box, question-count
    slider, generate button), a practice column (status, current question,
    answer box, submit button), a row with progress and tips panels, and a
    static "How It Works" section. Button clicks are routed to the
    module-level ``bot`` through the two wrapper functions defined below.

    Returns:
        The ``gr.Blocks`` object; the caller is responsible for launching it.
    """
    with gr.Blocks(title="Interview Bot - AI-Powered Quiz Generator", theme=gr.themes.Soft()) as demo:
        # Page header and feature overview
        gr.Markdown("""
        # 🤖 AI Interview Bot - Intelligent Quiz Generator

        Transform any PDF document or web article into a comprehensive interview practice session!

        **🚀 Enhanced Features:**
        - **Smart Content Analysis**: Identifies key concepts and important sections
        - **Intelligent Question Generation**: Creates diverse, meaningful interview questions
        - **Multi-Type Questions**: Conceptual, analytical, application, definition, and comparison questions
        - **Advanced Answer Evaluation**: Uses semantic analysis and concept matching
        - **Real-Time Performance Analytics**: Live scoring, trends, and personalized tips
        - **Comprehensive Reporting**: Detailed performance analysis and improvement recommendations
        """)

        with gr.Row():
            # Left column: content source and quiz size
            with gr.Column(scale=1):
                gr.Markdown("### 📤 Content Input")

                pdf_input = gr.File(
                    label="Upload PDF File (up to 50 pages)",
                    file_types=[".pdf"]
                )

                web_input = gr.Textbox(
                    label="Or Enter Web URL",
                    placeholder="https://example.com/article",
                    lines=1
                )

                num_questions = gr.Slider(
                    minimum=5,
                    maximum=20,
                    value=10,
                    step=1,
                    label="Number of Questions"
                )

                process_btn = gr.Button("🚀 Generate Intelligent Quiz", variant="primary", size="lg")

            # Right column: status, current question and answer entry
            with gr.Column(scale=2):
                gr.Markdown("### 📝 Interview Practice")

                status_output = gr.Textbox(
                    label="📋 Status & Instructions",
                    lines=6,
                    value="Welcome! Upload a PDF or enter a web URL to generate your personalized interview quiz.\n\nThe AI will analyze the content and create intelligent questions testing different skills:\n• Conceptual understanding\n• Analytical thinking\n• Practical application\n• Definitions and terminology\n• Comparisons and relationships"
                )

                question_output = gr.Textbox(
                    label="❓ Current Question",
                    lines=4,
                    value="Your intelligently generated questions will appear here..."
                )

                answer_input = gr.Textbox(
                    label="✍️ Your Answer",
                    lines=6,
                    placeholder="Provide a detailed, thoughtful answer here...\n\nTips:\n• Use specific terms from the content\n• Provide examples when possible\n• Explain your reasoning\n• Structure your response clearly"
                )

                submit_btn = gr.Button("✅ Submit Answer & Continue", variant="secondary", size="lg")

        # Second row: progress analytics next to live tips
        with gr.Row():
            with gr.Column(scale=1):
                progress_output = gr.Textbox(
                    label="📊 Progress & Analytics",
                    lines=10,
                    value="Quiz progress and performance analytics will appear here..."
                )
            
            with gr.Column(scale=1):
                tips_output = gr.Textbox(
                    label="💡 Live Tips & Feedback",
                    lines=10,
                    value="Personalized tips and improvement suggestions will appear here as you progress through the quiz..."
                )

        # Event handlers
        def process_content_wrapper(pdf_file, web_url, num_q):
            """Run bot.process_content and append a fixed "getting started" tips text."""
            status, question, progress = bot.process_content(pdf_file, web_url, num_q)
            tips = "🎯 **Getting Started Tips:**\n• Read each question carefully\n• Think about the main concepts from the source\n• Provide specific, detailed answers\n• Use terminology from the original content\n• Take your time to give thoughtful responses"
            return status, question, progress, tips

        def submit_answer_wrapper(user_answer):
            """Run bot.submit_answer, refresh the tips panel and clear the answer box."""
            status, question, progress = bot.submit_answer(user_answer)
            
            # Generate updated tips
            if bot.scores:
                tips = bot.get_personalized_tips()
            else:
                tips = "Complete more questions to get personalized feedback!"
            
            return status, question, progress, tips, ""  # Clear answer input

        # The order of `outputs` must match the order of each wrapper's return values.
        process_btn.click(
            fn=process_content_wrapper,
            inputs=[pdf_input, web_input, num_questions],
            outputs=[status_output, question_output, progress_output, tips_output]
        )

        submit_btn.click(
            fn=submit_answer_wrapper,
            inputs=[answer_input],
            outputs=[status_output, question_output, progress_output, tips_output, answer_input]
        )

        # Additional information section
        with gr.Row():
            gr.Markdown("""
            ### 🎓 How It Works:
            
            1. **Content Analysis**: The AI analyzes your document/webpage to identify key concepts, important sections, and technical terms
            2. **Smart Question Generation**: Creates diverse question types using advanced NLP models trained specifically for question generation
            3. **Intelligent Evaluation**: Your answers are scored using semantic similarity, concept matching, and answer quality metrics
            4. **Personalized Feedback**: Get real-time tips and detailed performance analysis to improve your interview skills
            
            ### 🎯 Question Types Generated:
            - **Conceptual**: Test understanding of main ideas and principles
            - **Analytical**: Require critical thinking and analysis
            - **Application**: Focus on practical use and real-world scenarios
            - **Definition**: Test knowledge of key terms and concepts
            - **Comparison**: Examine relationships and differences between ideas
            """)

    return demo

# Launch the application
if __name__ == "__main__":
    demo = create_interface()
    # Print the banner before launching: when run as a script, launch() keeps
    # the main thread busy serving requests, so anything printed after it
    # would only show up once the server has been stopped.
    print("🚀 Enhanced Interview Bot is ready!")
    print("🎯 Features: Intelligent question generation, advanced scoring, and personalized feedback!")
    demo.launch()