import io
import json
import random
import re
from collections import Counter
from typing import Dict, List, Tuple

import gradio as gr
import numpy as np
import PyPDF2
import requests
import torch
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline


class InterviewBot:
    """Turn a PDF or web page into interview questions and score the answers.

    Content is extracted to plain text, split into chunks, ranked by a
    keyword-based importance heuristic, and converted into templated
    questions. Answers are scored with sentence-embedding similarity.
    """

    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")

        # Optional seq2seq question-generation model. Nothing in this class
        # loads it; generate_single_question() returns "" while it is unset.
        self.qg_tokenizer = None
        self.qg_model = None

        # Initialize models
        self.init_models()

        # Storage for content and questions
        self.content_chunks = []
        self.important_chunks = []
        self.questions = []
        self.current_question_idx = 0
        self.user_answers = []
        self.scores = []

    def init_models(self):
        """Initialize models for content analysis and answer evaluation.

        The summarization pipeline is optional: if it cannot be loaded,
        ``self.summarizer`` is set to ``None``. Any other failure falls
        back to ``init_fallback_models``.
        """
        try:
            # Text similarity model for answer evaluation
            print("Loading similarity model...")
            self.similarity_model = SentenceTransformer('all-MiniLM-L6-v2')

            # Text summarization for content processing (optional, with fallback)
            print("Loading summarization pipeline...")
            try:
                self.summarizer = pipeline(
                    "summarization",
                    model="facebook/bart-large-cnn",
                    device=0 if self.device == "cuda" else -1,
                )
            except Exception as e:
                # Catch Exception (not a bare except) so Ctrl-C still works,
                # and report why the optional model was skipped.
                print(f"Summarization pipeline unavailable: {e}")
                print("Using fallback summarization...")
                self.summarizer = None

            print("Models loaded successfully!")
        except Exception as e:
            print(f"Error loading models: {e}")
            self.init_fallback_models()

    def init_fallback_models(self):
        """Initialize minimal models if main models fail to load."""
        print("Loading fallback models...")
        self.similarity_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.summarizer = None

    def extract_pdf_text(self, pdf_file, max_pages: int = 50) -> str:
        """Extract text from an uploaded PDF.

        Args:
            pdf_file: Raw bytes, a file-like object with ``read()``, a
                filesystem path, or anything ``PyPDF2.PdfReader`` accepts.
            max_pages: Upper bound on the number of pages read.

        Returns:
            The extracted text, or a string starting with
            ``"Error extracting PDF:"`` on failure (callers test for the
            ``"Error"`` prefix).
        """
        try:
            if isinstance(pdf_file, bytes):
                pdf_stream = io.BytesIO(pdf_file)
            elif hasattr(pdf_file, 'read'):
                pdf_stream = io.BytesIO(pdf_file.read())
            elif isinstance(pdf_file, str):
                with open(pdf_file, 'rb') as f:
                    pdf_stream = io.BytesIO(f.read())
            else:
                pdf_stream = pdf_file

            pdf_reader = PyPDF2.PdfReader(pdf_stream)
            page_count = min(max_pages, len(pdf_reader.pages))

            page_texts = []
            for page_num in range(page_count):
                # extract_text() can yield None for pages without a text
                # layer; treat that as empty instead of failing the whole PDF.
                page_text = pdf_reader.pages[page_num].extract_text() or ""
                if page_text.strip():
                    page_texts.append(page_text + "\n")

            text = "".join(page_texts)
            if not text.strip():
                return "Error extracting PDF: No readable text found in the PDF"
            return text
        except Exception as e:
            return f"Error extracting PDF: {str(e)}"

    def extract_web_content(self, url: str, timeout: int = 10) -> str:
        """Extract text content from a web URL.

        Returns the text of the page's paragraph/heading/container tags, one
        per line, or a string starting with ``"Error extracting web content:"``
        on failure.
        """
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            }
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')
            for script in soup(["script", "style"]):
                script.decompose()

            # NOTE(review): 'div' and 'article' containers are collected
            # alongside the tags nested inside them, so nested text can appear
            # more than once in the result.
            content_tags = soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'article', 'div'])
            lines = []
            for tag in content_tags:
                tag_text = tag.get_text().strip()
                if tag_text:
                    lines.append(tag_text + "\n")
            return "".join(lines)
        except Exception as e:
            return f"Error extracting web content: {str(e)}"

    def chunk_text(self, text: str, chunk_size: int = 800) -> List[str]:
        """Split text into chunks of roughly ``chunk_size`` characters.

        Paragraphs (separated by blank lines) are the primary unit. Short
        neighbouring paragraphs are packed together; a paragraph longer than
        ``chunk_size`` is split on sentence boundaries. Chunks of 100
        characters or fewer are discarded.
        """
        chunks = []
        pending = ""  # short paragraphs accumulated into one chunk

        # Split on blank lines BEFORE normalizing whitespace; collapsing
        # whitespace first would erase the paragraph boundaries.
        for raw_paragraph in re.split(r'\n\s*\n', text):
            paragraph = re.sub(r'\s+', ' ', raw_paragraph).strip()
            if not paragraph:
                continue

            if len(paragraph) > chunk_size:
                if pending:
                    chunks.append(pending)
                    pending = ""
                chunks.extend(self._split_by_sentences(paragraph, chunk_size))
            elif pending and len(pending) + 1 + len(paragraph) > chunk_size:
                chunks.append(pending)
                pending = paragraph
            else:
                pending = f"{pending} {paragraph}" if pending else paragraph

        if pending:
            chunks.append(pending)

        return [chunk for chunk in chunks if len(chunk) > 100]  # Filter very short chunks

    def _split_by_sentences(self, paragraph: str, chunk_size: int) -> List[str]:
        """Greedily group the sentences of one long paragraph into chunks."""
        pieces = []
        current_chunk = ""
        for sentence in re.split(r'[.!?]+', paragraph):
            sentence = sentence.strip()
            if not sentence:
                continue
            if len(current_chunk) + len(sentence) < chunk_size:
                current_chunk += sentence + ". "
            else:
                if current_chunk:
                    pieces.append(current_chunk.strip())
                current_chunk = sentence + ". "
        if current_chunk:
            pieces.append(current_chunk.strip())
        return pieces

    def identify_important_chunks(self, chunks: List[str]) -> List[Dict]:
        """Score chunks for question-worthiness and return them best-first.

        Each result is ``{'text', 'score', 'index'}`` where ``index`` is the
        chunk's position in the input list.
        """
        # Key phrases that suggest important content
        key_indicators = (
            'definition', 'concept', 'principle', 'theory', 'method', 'process',
            'important', 'significant', 'key', 'main', 'primary', 'essential',
            'result', 'conclusion', 'finding', 'solution', 'approach',
            'example', 'case study', 'research', 'study', 'analysis',
        )

        important_chunks = []
        for i, chunk in enumerate(chunks):
            chunk_lower = chunk.lower()
            importance_score = sum(chunk_lower.count(indicator) for indicator in key_indicators)

            # Bonus for chunks with numbers, dates, or specific data
            if re.search(r'\b\d+%\b|\b\d+\.\d+\b|\b\d{4}\b', chunk):
                importance_score += 2

            # Bonus for chunks with technical terms (capitalized words)
            capitalized_words = re.findall(r'\b[A-Z][a-z]+\b', chunk)
            importance_score += min(len(capitalized_words) * 0.5, 5)

            # Penalty for very short or very long chunks
            if len(chunk) < 200:
                importance_score -= 2
            elif len(chunk) > 1500:
                importance_score -= 1

            important_chunks.append({'text': chunk, 'score': importance_score, 'index': i})

        # Sort by importance (stable, so ties keep document order)
        important_chunks.sort(key=lambda x: x['score'], reverse=True)
        return important_chunks

    def generate_better_questions(self, chunks: List[str], num_questions: int = 10) -> List[Dict]:
        """Generate up to ``num_questions`` questions from the best chunks.

        Question types rotate through conceptual, analytical, application,
        definition and comparison. Each result carries the question text, its
        source chunk (``context``), key concepts for scoring, the type, a
        1-based ``question_id`` and the chunk's importance score.
        """
        # UI sliders may deliver a float; slicing needs a non-negative int.
        num_questions = max(0, int(num_questions))

        important_chunks = self.identify_important_chunks(chunks)
        question_types = ["conceptual", "analytical", "application", "definition", "comparison"]

        questions = []
        for i, chunk_data in enumerate(important_chunks[:num_questions]):
            chunk = chunk_data['text']
            question_type = question_types[i % len(question_types)]

            try:
                # Use rule-based question generation instead of unreliable AI generation
                question = self.create_intelligent_question(chunk, question_type)
            except Exception as e:
                print(f"Error generating question {i+1}: {e}")
                question = self.create_fallback_question(chunk, question_type)

            questions.append({
                'question': question,
                'context': chunk,
                'key_concepts': self.extract_key_concepts(chunk),
                'question_type': question_type,
                'question_id': i + 1,
                'importance_score': chunk_data['score'],
            })

        return questions

    def create_intelligent_question(self, chunk: str, question_type: str) -> str:
        """Build a question of the requested type from an analysis of ``chunk``.

        Unknown types fall back to a general question.
        """
        analysis = self.analyze_chunk_content(chunk)
        builders = {
            "conceptual": self.create_conceptual_question,
            "analytical": self.create_analytical_question,
            "application": self.create_application_question,
            "definition": self.create_definition_question,
            "comparison": self.create_comparison_question,
        }
        builder = builders.get(question_type, self.create_general_question)
        return builder(analysis)

    def analyze_chunk_content(self, chunk: str) -> Dict:
        """Extract the elements of ``chunk`` used by the question templates.

        Returns a dict with list values under ``main_topics``,
        ``technical_terms``, ``processes``, ``benefits``, ``examples``,
        ``numbers_data``, ``key_phrases`` and ``concepts``.
        """
        analysis = {
            'main_topics': [],
            'technical_terms': [],
            'processes': [],
            'benefits': [],
            'examples': [],
            'numbers_data': [],
            'key_phrases': [],
            'concepts': [],
        }
        chunk_lower = chunk.lower()
        sentences = re.split(r'[.!?]+', chunk)

        # Main topics: the most frequent capitalized terms
        potential_topics = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', chunk)
        ignored_topics = {'The', 'This', 'That', 'These', 'Those'}
        analysis['main_topics'] = self._top_by_frequency(
            [t for t in potential_topics if len(t) > 3 and t not in ignored_topics], 3
        )

        # Technical terms (words with specific patterns)
        technical_patterns = [
            r'\b[a-z]+(?:[A-Z][a-z]*)+\b',  # camelCase
            r'\b[A-Z]+\b',  # Acronyms
            r'\b\w+\.\w+\b',  # module.function
            r'\b\w+_\w+\b',  # snake_case
        ]
        technical_terms = []
        for pattern in technical_patterns:
            technical_terms.extend(re.findall(pattern, chunk))
        # Order-preserving de-duplication: a set would make the selected
        # terms (and so the generated questions) vary between runs.
        analysis['technical_terms'] = list(dict.fromkeys(technical_terms))[:5]

        # Processes: first sufficiently long sentence per process indicator
        for indicator in ['step', 'process', 'method', 'approach', 'way', 'how to', 'procedure']:
            if indicator in chunk_lower:
                match = self._first_sentence_with(sentences, indicator, 20)
                if match is not None:
                    analysis['processes'].append(match)

        # Benefits/advantages: first sufficiently long sentence per indicator
        for indicator in ['benefit', 'advantage', 'improvement', 'better', 'efficient', 'effective']:
            if indicator in chunk_lower:
                match = self._first_sentence_with(sentences, indicator, 15)
                if match is not None:
                    analysis['benefits'].append(match)

        # Examples
        example_patterns = [
            r'for example[^.!?]*[.!?]',
            r'such as[^.!?]*[.!?]',
            r'like[^.!?]*[.!?]',
            r'including[^.!?]*[.!?]',
        ]
        for pattern in example_patterns:
            analysis['examples'].extend(re.findall(pattern, chunk, re.IGNORECASE))

        # Numbers and data
        number_patterns = [r'\b\d+%\b', r'\b\d+\.\d+\b', r'\b\d{4}\b', r'\b\d+(?:,\d{3})*\b']
        for pattern in number_patterns:
            analysis['numbers_data'].extend(re.findall(pattern, chunk))

        # Key phrases (runs of capitalized words)
        potential_phrases = re.findall(r'\b(?:[A-Z][a-z]*\s+)*[A-Z][a-z]*\b', chunk)
        analysis['key_phrases'] = [
            phrase.strip() for phrase in potential_phrases if len(phrase.strip()) > 5
        ][:5]

        # Core concepts (frequently mentioned lowercase words)
        common_words = {
            'this', 'that', 'with', 'have', 'will', 'from', 'they', 'been',
            'were', 'said', 'each', 'which', 'their', 'time', 'would', 'there',
        }
        words = re.findall(r'\b[a-z]{4,}\b', chunk_lower)
        analysis['concepts'] = self._top_by_frequency(
            [w for w in words if w not in common_words], 8
        )

        return analysis

    @staticmethod
    def _top_by_frequency(items: List[str], limit: int) -> List[str]:
        """Return the ``limit`` most frequent items; ties keep first-seen order."""
        ranked = sorted(Counter(items).items(), key=lambda pair: pair[1], reverse=True)
        return [item for item, _ in ranked[:limit]]

    @staticmethod
    def _first_sentence_with(sentences: List[str], indicator: str, min_length: int):
        """Return the first stripped sentence containing ``indicator`` that is
        longer than ``min_length`` characters, or ``None`` if there is none."""
        for sentence in sentences:
            stripped = sentence.strip()
            if indicator in sentence.lower() and len(stripped) > min_length:
                return stripped
        return None

    def create_conceptual_question(self, analysis: Dict) -> str:
        """Create a conceptual-understanding question from ``analysis``."""
        if analysis['main_topics']:
            topic = analysis['main_topics'][0]
            return f"Explain the key concepts and principles related to {topic}. What makes it important?"
        if analysis['key_phrases']:
            phrase = analysis['key_phrases'][0]
            return f"What are the fundamental concepts behind {phrase} and how do they work?"
        if analysis['concepts']:
            concept = analysis['concepts'][0]
            return f"Describe the main ideas and principles related to {concept}."
        return "What are the core concepts and main ideas presented in this content?"

    def create_analytical_question(self, analysis: Dict) -> str:
        """Create an analytical-thinking question from ``analysis``."""
        if analysis['benefits'] and analysis['main_topics']:
            topic = analysis['main_topics'][0]
            return f"Analyze the advantages and potential limitations of using {topic}. What factors should be considered?"
        if analysis['processes']:
            return "Analyze the approach described in the content. What are its strengths and weaknesses?"
        if analysis['main_topics']:
            topic = analysis['main_topics'][0]
            return f"Critically evaluate {topic}. What are its implications and potential impact?"
        return "Analyze the main approach or methodology described. What are its pros and cons?"

    def create_application_question(self, analysis: Dict) -> str:
        """Create a practical-application question from ``analysis``."""
        if analysis['main_topics'] and analysis['examples']:
            topic = analysis['main_topics'][0]
            return f"How would you implement {topic} in a real-world project? Provide specific steps."
        if analysis['technical_terms']:
            term = analysis['technical_terms'][0]
            return f"Describe a practical scenario where you would use {term}. How would you apply it?"
        if analysis['processes']:
            return "Walk through how you would apply the methodology described. What steps would you follow?"
        return "How would you apply the concepts discussed in a practical, real-world situation?"

    def create_definition_question(self, analysis: Dict) -> str:
        """Create a definition/terminology question from ``analysis``."""
        if analysis['technical_terms']:
            terms = analysis['technical_terms'][:2]
            if len(terms) > 1:
                return f"Define and explain {terms[0]} and {terms[1]}. How are they used?"
            return f"Define {terms[0]} and explain its purpose and functionality."
        if analysis['main_topics']:
            topic = analysis['main_topics'][0]
            return f"Provide a comprehensive definition of {topic}. What are its key characteristics?"
        if analysis['key_phrases']:
            phrase = analysis['key_phrases'][0]
            return f"Define {phrase} and explain its significance in this context."
        return "Define the key terms and concepts mentioned in this content."

    def create_comparison_question(self, analysis: Dict) -> str:
        """Create a compare-and-contrast question from ``analysis``."""
        if len(analysis['main_topics']) >= 2:
            topic1, topic2 = analysis['main_topics'][:2]
            return f"Compare and contrast {topic1} and {topic2}. What are the key similarities and differences?"
        if len(analysis['technical_terms']) >= 2:
            term1, term2 = analysis['technical_terms'][:2]
            return f"How do {term1} and {term2} differ? When would you use one over the other?"
        if analysis['main_topics'] and analysis['concepts']:
            topic = analysis['main_topics'][0]
            concept = analysis['concepts'][0]
            return f"How does {topic} compare to traditional {concept} approaches? What are the trade-offs?"
        return "Compare the different approaches or methods discussed in this content. What are their relative merits?"

    def create_general_question(self, analysis: Dict) -> str:
        """Create a general question when no specific type applies."""
        if analysis['main_topics']:
            topic = analysis['main_topics'][0]
            return f"Discuss the important aspects of {topic} covered in this content."
        if analysis['processes']:
            return "Explain the key processes or methodologies described in this section."
        return "What are the most important points covered in this content and why are they significant?"

    def create_question_prompts(self, text: str, question_type: str) -> List[str]:
        """Build model prompts for ``question_type`` around a summary of ``text``.

        Text longer than 500 characters is summarized with ``self.summarizer``;
        if no summarizer is loaded or it fails, the first 200 characters are
        used instead. Unknown question types yield an empty list.
        """
        summary = text
        if len(text) > 500:
            truncated = text[:200] + "..."
            summarizer = getattr(self, "summarizer", None)
            if summarizer is None:
                # Optional model was not loaded; skip straight to truncation.
                summary = truncated
            else:
                try:
                    summary = summarizer(
                        text[:500] + "...", max_length=100, min_length=30, do_sample=False
                    )[0]['summary_text']
                except Exception as e:
                    print(f"Summarization failed, using truncated text: {e}")
                    summary = truncated

        templates = {
            "conceptual": [
                "Based on this content about key concepts, create an interview question that tests understanding of the main ideas: {summary}",
                "Generate an interview question asking about the core concept explained in: {summary}",
                "Create a thoughtful question about the main principles discussed in: {summary}",
            ],
            "analytical": [
                "Create an analytical interview question that requires critical thinking about: {summary}",
                "Generate a question asking for analysis or evaluation of: {summary}",
                "Design an interview question that tests analytical skills based on: {summary}",
            ],
            "application": [
                "Create an interview question about how to apply or implement concepts from: {summary}",
                "Generate a practical application question based on: {summary}",
                "Design a question asking how someone would use the information in: {summary}",
            ],
            "definition": [
                "Create an interview question asking for definition or explanation of key terms in: {summary}",
                "Generate a question about defining important concepts from: {summary}",
                "Design a question testing knowledge of terminology in: {summary}",
            ],
            "comparison": [
                "Create an interview question asking to compare or contrast ideas from: {summary}",
                "Generate a question about similarities and differences in: {summary}",
                "Design a question asking for comparison based on: {summary}",
            ],
        }
        return [template.format(summary=summary) for template in templates.get(question_type, [])]

    def generate_single_question(self, prompt: str) -> str:
        """Generate one question from ``prompt`` with the seq2seq model.

        Returns ``""`` when no question-generation model/tokenizer is set on
        the instance or when generation fails.
        """
        tokenizer = getattr(self, "qg_tokenizer", None)
        model = getattr(self, "qg_model", None)
        if tokenizer is None or model is None:
            return ""

        try:
            inputs = tokenizer.encode(
                prompt, return_tensors="pt", max_length=512, truncation=True
            )
            inputs = inputs.to(self.device)

            with torch.no_grad():
                outputs = model.generate(
                    inputs,
                    max_length=100,
                    min_length=20,
                    num_beams=5,
                    do_sample=True,
                    temperature=0.8,
                    no_repeat_ngram_size=2,
                    early_stopping=True,
                )

            question = tokenizer.decode(outputs[0], skip_special_tokens=True)
            return self.clean_question(question)
        except Exception as e:
            print(f"Error in question generation: {e}")
            return ""

    def clean_question(self, question: str) -> str:
        """Strip a known leading prefix, ensure a trailing '?', capitalize."""
        question = question.strip()

        # Remove common prefixes that models sometimes add. The lowercase copy
        # is taken once, so at most the prefix present in the raw output is
        # removed.
        prefixes_to_remove = [
            "question:", "q:", "interview question:", "based on", "according to"
        ]
        question_lower = question.lower()
        for prefix in prefixes_to_remove:
            if question_lower.startswith(prefix):
                question = question[len(prefix):].strip()

        # Ensure question ends with question mark
        if not question.endswith('?'):
            question += '?'

        # Capitalize first letter
        if question:
            question = question[0].upper() + question[1:]

        return question

    def validate_question(self, question: str, context: str) -> bool:
        """Return True if ``question`` is specific enough to ``context``.

        Rejects empty/short questions, ones not ending in '?', known generic
        phrasings, and questions sharing fewer than two 4+ letter words with
        the context.
        """
        if not question or len(question) < 10:
            return False
        if not question.endswith('?'):
            return False

        # Check if question is too generic
        generic_patterns = [
            r"what is this about",
            r"what does this mean",
            r"what is the main point",
            r"what is discussed",
            r"what can you tell me",
        ]
        question_lower = question.lower()
        if any(re.search(pattern, question_lower) for pattern in generic_patterns):
            return False

        # Question should have some overlap with context content
        context_words = set(re.findall(r'\b[A-Za-z]{4,}\b', context.lower()))
        question_words = set(re.findall(r'\b[A-Za-z]{4,}\b', question_lower))
        return len(context_words & question_words) >= 2

    def select_best_question(self, candidates: List[str], context: str) -> str:
        """Return the highest-scoring candidate ('' if there are none).

        Scoring favours medium length, analytical wording and capitalized
        (specific) terms. Ties go to the earliest candidate.
        """
        if not candidates:
            return ""

        def score_question(question: str) -> int:
            score = 0
            lowered = question.lower()

            # Score based on length (prefer medium length)
            length = len(question.split())
            if 8 <= length <= 20:
                score += 3
            elif 6 <= length <= 25:
                score += 1

            # Score based on question wording
            if any(word in lowered for word in ['how', 'why', 'explain', 'describe']):
                score += 2
            if any(word in lowered for word in ['compare', 'analyze', 'evaluate']):
                score += 3
            if any(word in lowered for word in ['what', 'define', 'identify']):
                score += 1

            # Score based on specificity
            score += len(re.findall(r'\b[A-Z][a-z]+\b', question))
            return score

        # max() returns the first maximal element, matching a stable sort.
        return max(candidates, key=score_question)

    # NOTE: an earlier, simpler create_fallback_question used to be defined
    # here. It was dead code: the class defines create_fallback_question again
    # further down, and the later definition replaces the earlier one.

def extract_key_terms(self, text: str) -> List[str]: """Extract clean, separated key terms from text""" # First, clean up the text to handle concatenated words cleaned_text = self.separate_concatenated_words(text) # Extract meaningful terms terms = [] # Find properly spaced technical terms tech_terms = re.findall(r'\b[a-zA-Z][a-zA-Z0-9]*(?:\s+[a-zA-Z][a-zA-Z0-9]*)*\b', cleaned_text) # Filter and clean terms for term in tech_terms: term = term.strip() if (len(term) > 2 and not term.lower() in {'the', 'and', 'but', 'for', 'with', 'this', 'that', 'are', 'you', 'can', 'will', 'have', 'has'} and not term.isdigit()): terms.append(term) # Remove duplicates while preserving order seen = set() unique_terms = [] for term in terms: term_lower = term.lower() if term_lower not in seen: seen.add(term_lower) unique_terms.append(term) return unique_terms[:8] def separate_concatenated_words(self, text: str) -> str: """Separate concatenated words like 'CountersHeapq' into 'Counters Heapq'""" # Handle camelCase and PascalCase separated = re.sub(r'([a-z])([A-Z])', r'\1 \2', text) # Handle sequences of capitals followed by lowercase separated = re.sub(r'([A-Z]+)([A-Z][a-z])', r'\1 \2', separated) # Handle common programming terms that are often concatenated programming_terms = { 'deque': 'deque', 'dict': 'dictionary', 'str': 'string', 'int': 'integer', 'bool': 'boolean', 'func': 'function', 'obj': 'object', 'var': 'variable', 'param': 'parameter', 'arg': 'argument' } # Expand abbreviated terms words = separated.split() expanded_words = [] for word in words: word_lower = word.lower() if word_lower in programming_terms: expanded_words.append(programming_terms[word_lower]) else: expanded_words.append(word) return ' '.join(expanded_words) def create_fallback_question(self, chunk: str, question_type: str) -> str: """Create high-quality fallback questions when other methods fail""" # Extract clean key terms key_terms = self.extract_key_terms(chunk) # Get the main subject of the content 
main_subject = self.identify_main_subject(chunk) if question_type == "conceptual": if main_subject: return f"Explain the core concepts and principles of {main_subject}. What makes it significant?" elif key_terms: primary_term = key_terms[0] return f"What are the fundamental concepts behind {primary_term} and how does it work?" return "What are the main concepts and principles explained in this content?" elif question_type == "analytical": if main_subject: return f"Analyze the advantages and potential challenges of {main_subject}. What should be considered when using it?" elif key_terms: return f"Critically evaluate the approach involving {key_terms[0]}. What are its strengths and limitations?" return "Analyze the methodology described. What are its benefits and potential drawbacks?" elif question_type == "application": if main_subject: return f"How would you implement {main_subject} in a practical project? Describe the key steps." elif key_terms: return f"Describe a real-world scenario where you would use {key_terms[0]}. How would you apply it effectively?" return "How would you apply these concepts in a practical, real-world situation?" elif question_type == "definition": if len(key_terms) >= 2: return f"Define {key_terms[0]} and {key_terms[1]}. How do they relate to each other?" elif key_terms: return f"Provide a comprehensive definition of {key_terms[0]}. What are its key characteristics?" elif main_subject: return f"Define {main_subject} and explain its primary purpose and functionality." return "Define the key terms and concepts presented in this content." elif question_type == "comparison": if len(key_terms) >= 2: return f"Compare {key_terms[0]} and {key_terms[1]}. What are their similarities and differences?" elif main_subject and key_terms: return f"How does {main_subject} compare to other {key_terms[0]} approaches? What are the trade-offs?" return "Compare the different approaches or methods discussed. What are their relative advantages?" 
# Generic fallback if main_subject: return f"Discuss the important aspects of {main_subject} and explain why it's significant." return "What are the most important points covered in this content and why do they matter?" def identify_main_subject(self, text: str) -> str: """Identify the main subject/topic of the text chunk""" # Look for title-like patterns lines = text.split('\n') # Check first few lines for titles for line in lines[:3]: line = line.strip() if (len(line) > 5 and len(line) < 100 and not line.endswith('.') and any(c.isupper() for c in line)): # Clean up the potential title clean_title = re.sub(r'[^\w\s]', '', line).strip() if clean_title: return clean_title # Look for frequently mentioned terms key_terms = self.extract_key_terms(text) if key_terms: return key_terms[0] # Extract from first sentence sentences = re.split(r'[.!?]+', text) if sentences: first_sentence = sentences[0].strip() # Look for subject patterns subject_match = re.search(r'\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\b', first_sentence) if subject_match: return subject_match.group(1) return "" def extract_key_concepts(self, text: str) -> List[str]: """Extract key concepts for answer evaluation""" # Enhanced keyword extraction words = re.findall(r'\b[A-Za-z]{3,}\b', text.lower()) # Advanced stop words list stop_words = { 'this', 'that', 'with', 'have', 'will', 'from', 'they', 'been', 'were', 'said', 'each', 'which', 'their', 'time', 'would', 'there', 'when', 'what', 'where', 'how', 'why', 'who', 'can', 'could', 'should', 'may', 'might', 'must', 'shall', 'will', 'would', 'also', 'then', 'than', 'only', 'just', 'even', 'still', 'more', 'most', 'very', 'much', 'many', 'some', 'any', 'all', 'both', 'either', 'neither' } # Filter words filtered_words = [word for word in words if word not in stop_words and len(word) > 3] # Calculate word frequency and importance word_scores = {} for word in filtered_words: # Base frequency score freq_score = filtered_words.count(word) # Bonus for longer words (often 
more specific) length_bonus = min(len(word) - 3, 5) # Bonus for words that appear in multiple contexts context_bonus = 1 if freq_score > 1 else 0 word_scores[word] = freq_score + length_bonus + context_bonus # Return top scoring words sorted_words = sorted(word_scores.items(), key=lambda x: x[1], reverse=True) return [word for word, score in sorted_words[:15]] def evaluate_answer(self, question_data: Dict, user_answer: str) -> float: """Enhanced answer evaluation with multiple scoring methods""" if not user_answer.strip(): return 0.0 try: # Method 1: Semantic similarity with context context_embedding = self.similarity_model.encode([question_data['context']]) answer_embedding = self.similarity_model.encode([user_answer]) context_similarity = np.dot(context_embedding[0], answer_embedding[0]) / ( np.linalg.norm(context_embedding[0]) * np.linalg.norm(answer_embedding[0]) ) # Method 2: Key concept matching user_words = set(re.findall(r'\b[A-Za-z]{3,}\b', user_answer.lower())) key_concepts = set(question_data['key_concepts']) concept_overlap = len(user_words.intersection(key_concepts)) / max(len(key_concepts), 1) # Method 3: Answer quality indicators quality_score = self.evaluate_answer_quality(user_answer, question_data) # Method 4: Question-specific relevance question_relevance = self.evaluate_question_relevance( question_data['question'], user_answer, question_data['context'] ) # Weighted combination final_score = ( context_similarity * 0.3 + concept_overlap * 0.25 + quality_score * 0.25 + question_relevance * 0.2 ) * 100 return min(100.0, max(0.0, final_score)) except Exception as e: print(f"Error evaluating answer: {e}") # Fallback scoring based on answer length and keyword presence word_count = len(user_answer.split()) base_score = min(word_count * 3, 60) # Up to 60 points for length # Bonus for using key terms key_bonus = 0 for concept in question_data.get('key_concepts', [])[:5]: if concept.lower() in user_answer.lower(): key_bonus += 8 return min(100.0, 
base_score + key_bonus) def evaluate_answer_quality(self, answer: str, question_data: Dict) -> float: """Evaluate the quality of the answer""" score = 0.0 # Length scoring (encourage substantial answers) word_count = len(answer.split()) if 20 <= word_count <= 100: score += 0.3 elif 10 <= word_count <= 150: score += 0.2 elif word_count >= 5: score += 0.1 # Structure scoring sentences = re.split(r'[.!?]+', answer) if len(sentences) >= 2: score += 0.1 # Specificity scoring (presence of detailed information) if re.search(r'\b\d+\b', answer): # Contains numbers score += 0.1 if re.search(r'\b[A-Z][a-z]+\b', answer): # Contains proper nouns score += 0.1 # Technical depth if any(term in answer.lower() for term in ['because', 'therefore', 'however', 'furthermore']): score += 0.1 return min(1.0, score) def evaluate_question_relevance(self, question: str, answer: str, context: str) -> float: """Evaluate how well the answer addresses the specific question""" try: # Check if answer directly addresses question type question_lower = question.lower() answer_lower = answer.lower() relevance_score = 0.0 # Question type analysis if 'what' in question_lower: if any(word in answer_lower for word in ['is', 'are', 'means', 'refers']): relevance_score += 0.3 if 'how' in question_lower: if any(word in answer_lower for word in ['by', 'through', 'using', 'method']): relevance_score += 0.3 if 'why' in question_lower: if any(word in answer_lower for word in ['because', 'since', 'due', 'reason']): relevance_score += 0.3 if any(word in question_lower for word in ['compare', 'difference', 'similar']): if any(word in answer_lower for word in ['similar', 'different', 'both', 'while', 'whereas']): relevance_score += 0.3 # Semantic similarity between question and answer q_embedding = self.similarity_model.encode([question]) a_embedding = self.similarity_model.encode([answer]) semantic_relevance = np.dot(q_embedding[0], a_embedding[0]) / ( np.linalg.norm(q_embedding[0]) * 
np.linalg.norm(a_embedding[0]) ) relevance_score += semantic_relevance * 0.4 return min(1.0, relevance_score) except Exception as e: print(f"Error evaluating relevance: {e}") return 0.5 # Neutral score on error def process_content(self, pdf_file=None, web_url="", num_questions=10): """Process uploaded content and generate questions""" try: # Extract text based on input type if pdf_file is not None: text = self.extract_pdf_text(pdf_file) source = "PDF" elif web_url.strip(): text = self.extract_web_content(web_url.strip()) source = "Web URL" else: return "Please provide either a PDF file or a web URL.", "", "" if text.startswith("Error"): return text, "", "" # Process text with improved chunking self.content_chunks = self.chunk_text(text) if not self.content_chunks: return "No valid content found to generate questions.", "", "" if len(self.content_chunks) < 3: return f"Content too short. Found only {len(self.content_chunks)} chunks. Please provide more substantial content.", "", "" # Generate high-quality questions self.questions = self.generate_better_questions(self.content_chunks, num_questions) self.current_question_idx = 0 self.user_answers = [] self.scores = [] if not self.questions: return "Failed to generate questions from the content.", "", "" summary = f"āœ… Successfully processed {source}!\n" summary += f"šŸ“„ Extracted {len(self.content_chunks)} content sections\n" summary += f"ā“ Generated {len(self.questions)} high-quality interview questions\n" summary += f"šŸŽÆ Questions cover: concepts, analysis, applications, and definitions\n" summary += "\nReady to start your interview practice!" 
first_question = f"Question 1/{len(self.questions)} [{self.questions[0]['question_type'].title()}]:\n\n{self.questions[0]['question']}" return summary, first_question, "" except Exception as e: return f"Error processing content: {str(e)}", "", "" def submit_answer(self, user_answer): """Submit answer and get next question with enhanced feedback""" if not self.questions: return "No quiz in progress. Please upload content first.", "", "No active quiz" if self.current_question_idx >= len(self.questions): return "Quiz completed!", "", self.get_final_results() # Evaluate current answer current_question = self.questions[self.current_question_idx] score = self.evaluate_answer(current_question, user_answer) self.user_answers.append(user_answer) self.scores.append(score) # Provide detailed feedback feedback = f"āœ… Answer {self.current_question_idx + 1} submitted!\n" feedback += f"šŸ“Š Score: {score:.1f}/100\n" # Add specific feedback based on score if score >= 80: feedback += "🌟 Excellent answer! Great understanding demonstrated.\n" elif score >= 60: feedback += "šŸ‘ Good answer! You covered the main points well.\n" elif score >= 40: feedback += "šŸ“š Fair answer. Try to include more specific details.\n" else: feedback += "šŸ’Ŗ Room for improvement. Focus on key concepts from the content.\n" self.current_question_idx += 1 if self.current_question_idx < len(self.questions): # Next question with enhanced formatting next_question = self.questions[self.current_question_idx] question_text = f"Question {self.current_question_idx + 1}/{len(self.questions)} [{next_question['question_type'].title()}]:\n\n" question_text += f"{next_question['question']}\n\n" question_text += "šŸ’” Tip: Use specific details and examples from the content in your answer." 
return feedback, question_text, self.get_current_progress() else: # Quiz completed return feedback + "\nšŸŽ‰ Interview Complete!", "šŸŽ‰ Interview Complete!", self.get_final_results() def get_current_progress(self): """Get current progress summary with enhanced analytics""" if not self.scores: return "No answers submitted yet" avg_score = sum(self.scores) / len(self.scores) progress = f"šŸ“ˆ Progress: {len(self.scores)}/{len(self.questions)} questions answered\n" progress += f"šŸ“Š Average Score: {avg_score:.1f}/100\n" progress += f"šŸŽÆ Latest Score: {self.scores[-1]:.1f}/100\n" if len(self.scores) >= 2: trend = "šŸ“ˆ Improving" if self.scores[-1] > self.scores[-2] else "šŸ“‰ Declining" if self.scores[-1] < self.scores[-2] else "āž”ļø Stable" progress += f"šŸ“‰šŸ“ˆ Trend: {trend}\n" progress += "\n" # Add dynamic tips based on current performance if len(self.scores) >= 2: progress += self.get_live_tips() return progress def get_live_tips(self): """Generate live tips during the quiz based on current performance""" recent_scores = self.scores[-3:] if len(self.scores) >= 3 else self.scores avg_recent = sum(recent_scores) / len(recent_scores) tips = "šŸŽÆ **Live Performance Tips:**\n" if avg_recent >= 80: tips += "• Excellent work! Keep providing detailed, specific answers\n" tips += "• You're demonstrating strong comprehension skills\n" elif avg_recent >= 60: tips += "• Good progress! Try adding more specific examples from the content\n" tips += "• Connect your answers directly to the source material\n" elif avg_recent >= 40: tips += "• Focus on using key terms and concepts from the source\n" tips += "• Take time to structure your thoughts before answering\n" else: tips += "• Slow down and carefully read both the question and source content\n" tips += "• Look for main ideas and important details in each section\n" # Performance trend analysis if len(recent_scores) >= 2: if recent_scores[-1] > recent_scores[-2]: tips += "• šŸ“ˆ Great improvement! 
You're adapting well to the format\n" elif recent_scores[-1] < recent_scores[-2]: tips += "• šŸŽÆ Refocus needed - take a moment to review the content\n" return tips def get_final_results(self): """Generate comprehensive final quiz results""" if not self.scores: return "No quiz completed yet" total_questions = len(self.scores) avg_score = sum(self.scores) / total_questions max_score = max(self.scores) min_score = min(self.scores) # Enhanced performance categorization if avg_score >= 85: performance = "Outstanding! 🌟" grade = "A" elif avg_score >= 75: performance = "Excellent! šŸŽ‰" grade = "B+" elif avg_score >= 65: performance = "Good! šŸ‘" grade = "B" elif avg_score >= 55: performance = "Fair šŸ“š" grade = "C+" elif avg_score >= 45: performance = "Needs Improvement šŸ’Ŗ" grade = "C" else: performance = "Requires Focus šŸŽÆ" grade = "D" results = f""" šŸŽÆ INTERVIEW PERFORMANCE REPORT šŸŽÆ šŸ“Š **Overall Performance** • Total Questions: {total_questions} • Average Score: {avg_score:.1f}/100 • Grade: {grade} • Assessment: {performance} šŸ“ˆ **Score Analysis** • Highest Score: {max_score:.1f}/100 • Lowest Score: {min_score:.1f}/100 • Score Range: {max_score - min_score:.1f} points • Consistency: {"High" if max_score - min_score < 30 else "Moderate" if max_score - min_score < 50 else "Variable"} šŸ“‹ **Question-by-Question Results** """ for i, (score, question_data) in enumerate(zip(self.scores, self.questions[:len(self.scores)]), 1): q_type = question_data['question_type'].title() emoji = "🌟" if score >= 80 else "šŸ‘" if score >= 60 else "šŸ“š" if score >= 40 else "šŸ’Ŗ" results += f"Q{i} ({q_type}): {score:.1f}/100 {emoji}\n" # Performance by question type results += "\nšŸŽ­ **Performance by Question Type**\n" type_scores = {} for i, score in enumerate(self.scores): if i < len(self.questions): q_type = self.questions[i]['question_type'] if q_type not in type_scores: type_scores[q_type] = [] type_scores[q_type].append(score) for q_type, scores in 
type_scores.items(): avg_type_score = sum(scores) / len(scores) results += f"• {q_type.title()}: {avg_type_score:.1f}/100 ({len(scores)} questions)\n" results += "\n" + self.get_personalized_tips() return results def get_personalized_tips(self): """Generate enhanced personalized tips based on detailed performance analysis""" if not self.scores: return "šŸ’” Complete a quiz to get personalized tips!" avg_score = sum(self.scores) / len(self.scores) score_variance = max(self.scores) - min(self.scores) tips = "šŸ’” **Personalized Development Plan:**\n\n" # Overall performance feedback if avg_score >= 85: tips += "🌟 **Outstanding Performance!**\n" tips += "• You demonstrate excellent comprehension and communication skills\n" tips += "• Your answers show deep understanding of the content\n" tips += "• Consider mentoring others or taking on leadership roles\n" tips += "• Challenge yourself with more complex technical content\n" elif avg_score >= 75: tips += "šŸŽ‰ **Excellent Work!**\n" tips += "• Strong foundation with room for refinement\n" tips += "• Focus on adding more specific examples in your answers\n" tips += "• Practice connecting concepts across different sections\n" elif avg_score >= 65: tips += "šŸ‘ **Good Performance with Growth Potential!**\n" tips += "• You understand the main concepts well\n" tips += "• Work on providing more detailed explanations\n" tips += "• Include specific terminology from the source material\n" elif avg_score >= 55: tips += "šŸ“š **Solid Foundation - Ready for Next Level!**\n" tips += "• Focus on reading comprehension before answering\n" tips += "• Practice identifying key concepts in each section\n" tips += "• Structure your answers with main points first\n" elif avg_score >= 45: tips += "šŸ’Ŗ **Building Skills - You're on the Right Path!**\n" tips += "• Spend more time analyzing the content before questions\n" tips += "• Practice summarizing key points in your own words\n" tips += "• Focus on understanding rather than 
memorizing\n" else: tips += "šŸŽÆ **Focus Areas for Rapid Improvement:**\n" tips += "• Take time to thoroughly read and understand the source content\n" tips += "• Practice breaking down complex information into main ideas\n" tips += "• Focus on quality over speed in your responses\n" tips += "• Consider reviewing fundamental concepts in the subject area\n" # Consistency analysis if score_variance > 50: tips += "\nšŸ“Š **Consistency Focus:**\n" tips += "• Your scores vary significantly across questions\n" tips += "• This suggests uneven understanding - review weaker areas\n" tips += "• Practice maintaining consistent quality across all question types\n" elif score_variance < 20: tips += "\nšŸŽÆ **Consistent Performance:**\n" tips += "• Great job maintaining steady quality across questions!\n" # Question type specific tips if len(self.scores) >= 3: tips += "\nšŸŽ­ **Question Type Strategies:**\n" type_performance = {} for i, score in enumerate(self.scores): if i < len(self.questions): q_type = self.questions[i]['question_type'] if q_type not in type_performance: type_performance[q_type] = [] type_performance[q_type].append(score) for q_type, scores in type_performance.items(): avg_type = sum(scores) / len(scores) if avg_type < avg_score - 10: # Significantly below average tips += f"• Improve {q_type} questions: " if q_type == "analytical": tips += "Break down problems step by step\n" elif q_type == "application": tips += "Think about real-world examples and use cases\n" elif q_type == "conceptual": tips += "Focus on understanding core principles and ideas\n" elif q_type == "definition": tips += "Use precise terminology and clear explanations\n" elif q_type == "comparison": tips += "Identify both similarities and key differences\n" return tips # Initialize the bot bot = InterviewBot() # Create enhanced Gradio interface def create_interface(): with gr.Blocks(title="Interview Bot - AI-Powered Quiz Generator", theme=gr.themes.Soft()) as demo: gr.Markdown(""" # šŸ¤– AI 
Interview Bot - Intelligent Quiz Generator Transform any PDF document or web article into a comprehensive interview practice session! **šŸš€ Enhanced Features:** - **Smart Content Analysis**: Identifies key concepts and important sections - **Intelligent Question Generation**: Creates diverse, meaningful interview questions - **Multi-Type Questions**: Conceptual, analytical, application, definition, and comparison questions - **Advanced Answer Evaluation**: Uses semantic analysis and concept matching - **Real-Time Performance Analytics**: Live scoring, trends, and personalized tips - **Comprehensive Reporting**: Detailed performance analysis and improvement recommendations """) with gr.Row(): with gr.Column(scale=1): gr.Markdown("### šŸ“¤ Content Input") pdf_input = gr.File( label="Upload PDF File (up to 50 pages)", file_types=[".pdf"] ) web_input = gr.Textbox( label="Or Enter Web URL", placeholder="https://example.com/article", lines=1 ) num_questions = gr.Slider( minimum=5, maximum=20, value=10, step=1, label="Number of Questions" ) process_btn = gr.Button("šŸš€ Generate Intelligent Quiz", variant="primary", size="lg") with gr.Column(scale=2): gr.Markdown("### šŸ“ Interview Practice") status_output = gr.Textbox( label="šŸ“‹ Status & Instructions", lines=6, value="Welcome! Upload a PDF or enter a web URL to generate your personalized interview quiz.\n\nThe AI will analyze the content and create intelligent questions testing different skills:\n• Conceptual understanding\n• Analytical thinking\n• Practical application\n• Definitions and terminology\n• Comparisons and relationships" ) question_output = gr.Textbox( label="ā“ Current Question", lines=4, value="Your intelligently generated questions will appear here..." 
) answer_input = gr.Textbox( label="āœļø Your Answer", lines=6, placeholder="Provide a detailed, thoughtful answer here...\n\nTips:\n• Use specific terms from the content\n• Provide examples when possible\n• Explain your reasoning\n• Structure your response clearly" ) submit_btn = gr.Button("āœ… Submit Answer & Continue", variant="secondary", size="lg") with gr.Row(): with gr.Column(scale=1): progress_output = gr.Textbox( label="šŸ“Š Progress & Analytics", lines=10, value="Quiz progress and performance analytics will appear here..." ) with gr.Column(scale=1): tips_output = gr.Textbox( label="šŸ’” Live Tips & Feedback", lines=10, value="Personalized tips and improvement suggestions will appear here as you progress through the quiz..." ) # Event handlers def process_content_wrapper(pdf_file, web_url, num_q): status, question, progress = bot.process_content(pdf_file, web_url, num_q) tips = "šŸŽÆ **Getting Started Tips:**\n• Read each question carefully\n• Think about the main concepts from the source\n• Provide specific, detailed answers\n• Use terminology from the original content\n• Take your time to give thoughtful responses" return status, question, progress, tips def submit_answer_wrapper(user_answer): status, question, progress = bot.submit_answer(user_answer) # Generate updated tips if bot.scores: tips = bot.get_personalized_tips() else: tips = "Complete more questions to get personalized feedback!" return status, question, progress, tips, "" # Clear answer input process_btn.click( fn=process_content_wrapper, inputs=[pdf_input, web_input, num_questions], outputs=[status_output, question_output, progress_output, tips_output] ) submit_btn.click( fn=submit_answer_wrapper, inputs=[answer_input], outputs=[status_output, question_output, progress_output, tips_output, answer_input] ) # Additional information section with gr.Row(): gr.Markdown(""" ### šŸŽ“ How It Works: 1. 
**Content Analysis**: The AI analyzes your document/webpage to identify key concepts, important sections, and technical terms 2. **Smart Question Generation**: Creates diverse question types using advanced NLP models trained specifically for question generation 3. **Intelligent Evaluation**: Your answers are scored using semantic similarity, concept matching, and answer quality metrics 4. **Personalized Feedback**: Get real-time tips and detailed performance analysis to improve your interview skills ### šŸŽÆ Question Types Generated: - **Conceptual**: Test understanding of main ideas and principles - **Analytical**: Require critical thinking and analysis - **Application**: Focus on practical use and real-world scenarios - **Definition**: Test knowledge of key terms and concepts - **Comparison**: Examine relationships and differences between ideas """) return demo # Launch the application if __name__ == "__main__": demo = create_interface() demo.launch() print("šŸš€ Enhanced Interview Bot is ready!") print("šŸŽÆ Features: Intelligent question generation, advanced scoring, and personalized feedback!")