import os
import re
import json
import itertools  # Used to collapse consecutive duplicate lines in _format_response
from pathlib import Path

import faiss
import numpy as np
import google.generativeai as genai
from dotenv import load_dotenv
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.pdf import PyPDFLoader
from langchain_core.documents import Document
from sentence_transformers import SentenceTransformer

# Load environment variables
load_dotenv()

# Initialize Gemini API
genai.configure(api_key=os.getenv('GOOGLE_API_KEY'))
model = genai.GenerativeModel('gemini-1.5-pro')  # Using stable version instead of preview

# Initialize the sentence transformer model for embeddings
embedder = SentenceTransformer('all-mpnet-base-v2')
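# Note: all-mpnet-base-v2 produces 768-dimensional embeddings; the FAISS index
# dimension used later is derived from the model output rather than hard-coded.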

class LearningChatbot:
    def __init__(self, docs_path="./documents"):
        """Initialize chatbot with document path"""
        self.docs_path = docs_path
        self.vector_store = None
        self.documents = []
        self.initialize_knowledge_base()

    def _load_json_file(self, file_path):
        """Load and process JSON file into document chunks"""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            
            # Convert JSON to text chunks
            chunks = []
            
            def process_json(obj, parent_key=''):
                if isinstance(obj, dict):
                    for key, value in obj.items():
                        new_key = f"{parent_key}.{key}" if parent_key else key
                        if isinstance(value, (dict, list)):
                            process_json(value, new_key)
                        else:
                            chunks.append(f"{new_key}: {value}")
                elif isinstance(obj, list):
                    for i, item in enumerate(obj):
                        new_key = f"{parent_key}[{i}]"
                        if isinstance(item, (dict, list)):
                            process_json(item, new_key)
                        else:
                            chunks.append(f"{new_key}: {item}")
            
            process_json(data)
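            # Illustrative flattening (hypothetical input): {"faq": [{"q": "How do I reset my password?"}]}
            # would yield the single chunk "faq[0].q: How do I reset my password?"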
            return chunks
            
        except Exception as e:
            print(f"Error loading JSON file {file_path}: {str(e)}")
            return []

    def initialize_knowledge_base(self):
        """Load and process documents into vector store with memory management"""
        try:
            print("Loading documents...")
            self.documents = []
            
            # Process files in batches
            batch_size = 5
            all_files = list(Path(self.docs_path).glob("**/*.*"))
            
            for i in range(0, len(all_files), batch_size):
                batch_files = all_files[i:i + batch_size]
                batch_docs = []
                
                for file in batch_files:
                    try:
                        if file.suffix.lower() == '.pdf':
                            loader = PyPDFLoader(str(file))
                            batch_docs.extend(loader.load())
                        elif file.suffix.lower() == '.json':
                            chunks = self._load_json_file(str(file))
                            # Convert chunks to document format
                            batch_docs.extend([
                                Document(page_content=chunk, metadata={"source": str(file)})
                                for chunk in chunks
                            ])
                    except Exception as e:
                        print(f"Error loading {file}: {str(e)}")
                        continue
                
                self.documents.extend(batch_docs)
                
                # Clear memory after each batch
                batch_docs = None
            
            print(f"Loaded {len(self.documents)} documents")
            
            # Memory-efficient text splitting
            print("Splitting text...")
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=256,  # Reduced chunk size
                chunk_overlap=20,  # Reduced overlap
                separators=["\n\n", "\n", ".", "!", "?", ";", ",", " "],
                length_function=len,
            )
            
            # Split documents in batches
            processed_chunks = []
            batch_size = 50  # Process 50 chunks at a time
            
            for i in range(0, len(self.documents), batch_size):
                batch = self.documents[i:i + batch_size]
                chunks = text_splitter.split_documents(batch)
                processed_chunks.extend(chunks)
                
                # Clear batch from memory
                batch = None
            
            self.documents = processed_chunks
            print(f"Created {len(self.documents)} chunks")
            
            # Generate embeddings in batches
            print("Generating embeddings...")
            embeddings = []
            batch_size = 32  # Process 32 embeddings at a time
            
            for i in range(0, len(self.documents), batch_size):
                batch = self.documents[i:i + batch_size]
                texts = [doc.page_content for doc in batch]
                
                # Generate embeddings for batch
                batch_embeddings = embedder.encode(texts)
                embeddings.extend(batch_embeddings)
                
                # Clear batch from memory
                batch = None
                texts = None
                batch_embeddings = None
            
            # Initialize the FAISS index
            print("Building search index...")
            if not embeddings:
                raise ValueError(f"No document chunks were produced from {self.docs_path}")
            dimension = embeddings[0].shape[0]
            self.vector_store = faiss.IndexFlatL2(dimension)
            
            # Add embeddings in batches
            batch_size = 1000  # Add 1000 vectors at a time
            embeddings_array = np.array(embeddings, dtype=np.float32)  # FAISS expects float32
            
            for i in range(0, len(embeddings_array), batch_size):
                batch = embeddings_array[i:i + batch_size]
                self.vector_store.add(batch)
                
                # Clear batch from memory
                batch = None
            
            # Clear large objects from memory
            embeddings = None
            embeddings_array = None
            
            print("Knowledge base initialization complete")
            
        except Exception as e:
            print(f"Error initializing knowledge base: {str(e)}")
            raise e

    def verify_knowledge_base(self):
        """

        Verify if the knowledge base is properly initialized

        Returns:

            bool: True if vector store and documents are ready

        """
        try:
            return (
                self.vector_store is not None and 
                len(self.documents) > 0 and
                hasattr(self.vector_store, 'ntotal') and 
                self.vector_store.ntotal > 0
            )
        except Exception as e:
            print(f"Error verifying knowledge base: {str(e)}")
            return False

    def get_relevant_context(self, query, k=3):
        """Retrieve the k most relevant document chunks for a query"""
        try:
            # Generate the query embedding (FAISS expects a 2D float32 array)
            query_vector = embedder.encode([query]).astype(np.float32)

            # IndexFlatL2.search always scans the full index, so a single call
            # is both correct and efficient regardless of how many vectors it holds
            k = min(k, self.vector_store.ntotal)
            distances, indices = self.vector_store.search(query_vector, k)

            # FAISS pads missing results with -1, so filter those out
            relevant_docs = [
                self.documents[i].page_content for i in indices[0] if i >= 0
            ]

            return "\n".join(relevant_docs)

        except Exception as e:
            print(f"Error retrieving context: {str(e)}")
            return ""

    def _construct_educational_prompt(self, query, context):
        """

        Construct an OpexA-focused prompt that delivers clear, concise, and actionable responses

        """
        base_prompt = f"""You are an expert assistant for OpexA, an EdTech platform focused on career growth for IT professionals, 

businesses, and public sector users. Your goal is to deliver clear, concise, and actionable answers while maintaining a friendly 

and supportive tone.



Context from OpexA materials:

{context}



User Question: {query}



Key Guidelines for Your Response:



1. User Segments - Tailor your response based on user type:

β€’ Beginners: Offer foundational guidance and basic concepts

β€’ Career Changers: Focus on transition plans and skill mapping

β€’ Experienced Professionals: Provide advanced insights and industry-specific details

β€’ Business/Public Sector: Address organizational needs and compliance



Response Structure:

1. Start with direct, relevant information

2. Use bullet points for lists and steps

3. Include practical examples or analogies

4. Add proactive tips or next steps

5. End with an engaging question



Handling Special Cases:

β€’ Unclear Questions: Ask for clarification (e.g., "Are you interested in career assessments or account settings?")

β€’ Out-of-Scope: Politely redirect to available features

β€’ Privacy Concerns: Provide reassurance about data protection



Style Guidelines:

β€’ Use natural, conversational language

β€’ Include relevant emojis sparingly (πŸš€ for growth, πŸ”’ for security)

β€’ Format lists and steps with bullet points (β€’)

β€’ Keep responses concise but informative

β€’ End with engaging questions like "What's your next goal?" or "Ready to explore more?"



Now, please provide a helpful response to: {query}"""
        
        return base_prompt

    def _format_response(self, response):
        """Format the response with consistent list formatting and proper line breaks"""
        try:
            text = response.text
            
            # Split into paragraphs
            paragraphs = text.split('\n\n')
            formatted_paragraphs = []
            
            for p in paragraphs:
                lines = p.split('\n')
                formatted_lines = []
                in_list = False
                previous_was_list = False
                
                for line in lines:
                    line = line.strip()
                    
                    # Check if this is a list item (any bullet marker or a "N." numbered prefix)
                    is_list_item = bool(re.match(r'^(?:[•\-*○·►→]|\d+\.)', line))
                    
                    # Add extra line break before list items (except for the first one)
                    if is_list_item and previous_was_list:
                        formatted_lines.append('')  # Add empty line between list items
                    if is_list_item:
                        # Standardize bullet points
                        if line[0].isdigit():  # Numbered item: drop the "N." prefix
                            line = '• ' + re.sub(r'^\d+\.\s*', '', line)
                        else:  # Existing bullet marker: replace it
                            line = '• ' + line[1:].strip()
                        in_list = True
                        previous_was_list = True
                    else:
                        # If this looks like it should be a list item but missing bullet
                        if in_list and line and not line.endswith(':'):
                            if previous_was_list:
                                formatted_lines.append('')  # Add empty line between list items
                            line = '• ' + line
                            previous_was_list = True
                        else:
                            in_list = False
                            previous_was_list = False
                    
                    formatted_lines.append(line)
                
                # Join lines with appropriate spacing
                formatted_text = '\n'.join(formatted_lines)
                
                # Add extra newline before lists for better readability
                if any(line.startswith('• ') for line in formatted_lines):
                    formatted_text = '\n' + formatted_text
                
                formatted_paragraphs.append(formatted_text)
            
            # Join paragraphs with double newlines
            formatted_text = '\n\n'.join(formatted_paragraphs)
            
            # Collapse runs of identical lines (this also squeezes repeated blank lines)
            formatted_text = '\n'.join(line for line, _ in itertools.groupby(formatted_text.splitlines()))
            
            # If response is too long, keep main points while preserving list structure
            if len(formatted_text) > 500:
                main_paragraphs = []
                
                # Always keep the first paragraph (usually the main explanation)
                main_paragraphs.append(formatted_paragraphs[0])
                
                # Keep all bullet point lists
                for p in formatted_paragraphs[1:]:
                    if '• ' in p:
                        main_paragraphs.append(p)
                
                formatted_text = '\n\n'.join(main_paragraphs)
            
            return formatted_text.strip()
            
        except Exception as e:
            return f"I apologize, but I ran into an issue formatting the response. Let me try to help you in a simpler way: {str(e)}"

    def _handle_generation_error(self, error):
        """Handle errors with a natural, supportive tone"""
        return f"""I apologize, but I'm having trouble helping you at the moment. 

        

This might be because:

- I'm still processing some information

- There might be a technical issue

- The question might need to be more specific



Would you mind trying to rephrase your question? I want to make sure I give you the best help possible.



Technical note: {str(error)}"""

    def generate_response(self, query):
        """Generate natural, personalized responses for students"""
        try:
            if not self.verify_knowledge_base():
                return """I'm having trouble accessing our learning materials at the moment. 

                Could you make sure all the study materials are properly loaded? 

                This helps me give you the most accurate and helpful responses."""
                
            # Get relevant context
            context = self.get_relevant_context(query, k=3)
            
            # Construct educational prompt
            prompt = self._construct_educational_prompt(query, context)
            
            # Generate response with simplified parameters
            response = model.generate_content(prompt)
            
            # Return natural response
            return self._format_response(response)
            
        except Exception as e:
            return self._handle_generation_error(e)
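

# --- Usage sketch ---
# A minimal, illustrative way to exercise the chatbot; the sample query is an
# assumption, and GOOGLE_API_KEY must be set in the environment or a .env file.
if __name__ == "__main__":
    bot = LearningChatbot(docs_path="./documents")
    if bot.verify_knowledge_base():
        print(bot.generate_response("How do I get started with a career assessment?"))
    else:
        print("Knowledge base is not ready; check that ./documents contains PDF or JSON files.")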