TahaRasouli committed on
Commit 1abb1bd · verified
1 Parent(s): 73e9e73

Create unified_document_processor.py

Files changed (1)
  1. unified_document_processor.py +765 -0
unified_document_processor.py ADDED
@@ -0,0 +1,765 @@
+ from typing import List, Dict, Union
+ from groq import Groq
+ import chromadb
+ import os
+ import datetime
+ import json
+ import xml.etree.ElementTree as ET
+ import nltk
+ from nltk.tokenize import sent_tokenize
+ import PyPDF2
+ from sentence_transformers import SentenceTransformer
+
+ class CustomEmbeddingFunction:
+     def __init__(self):
+         self.model = SentenceTransformer('all-MiniLM-L6-v2')
+
+     def __call__(self, input: List[str]) -> List[List[float]]:
+         embeddings = self.model.encode(input)
+         return embeddings.tolist()
+
+ class UnifiedDocumentProcessor:
+     def __init__(self, groq_api_key, collection_name="unified_content"):
+         """Initialize the processor with necessary clients"""
+         self.groq_client = Groq(api_key=groq_api_key)
+
+         # XML-specific settings
+         self.max_elements_per_chunk = 50
+
+         # PDF-specific settings
+         self.pdf_chunk_size = 500
+         self.pdf_overlap = 50
+
+         # Initialize NLTK
+         self._initialize_nltk()
+
+         # Initialize ChromaDB with a single collection for all document types
+         self.chroma_client = chromadb.Client()
+         existing_collections = self.chroma_client.list_collections()
+         collection_exists = any(col.name == collection_name for col in existing_collections)
+
+         if collection_exists:
+             print(f"Using existing collection: {collection_name}")
+             self.collection = self.chroma_client.get_collection(
+                 name=collection_name,
+                 embedding_function=CustomEmbeddingFunction()
+             )
+         else:
+             print(f"Creating new collection: {collection_name}")
+             self.collection = self.chroma_client.create_collection(
+                 name=collection_name,
+                 embedding_function=CustomEmbeddingFunction()
+             )
+
+     def _initialize_nltk(self):
+         """Ensure NLTK's `punkt` tokenizer resource is available."""
+         try:
+             nltk.data.find('tokenizers/punkt')
+         except LookupError:
+             print("Downloading NLTK 'punkt' tokenizer...")
+             nltk.download('punkt')
+
+     def extract_text_from_pdf(self, pdf_path: str) -> str:
+         """Extract text from PDF file"""
+         try:
+             text = ""
+             with open(pdf_path, 'rb') as file:
+                 pdf_reader = PyPDF2.PdfReader(file)
+                 for page in pdf_reader.pages:
+                     text += page.extract_text() + " "
+             return text.strip()
+         except Exception as e:
+             raise Exception(f"Error extracting text from PDF: {str(e)}")
+
+     def chunk_text(self, text: str) -> List[str]:
+         """Split text into chunks while preserving sentence boundaries"""
+         sentences = sent_tokenize(text)
+         chunks = []
+         current_chunk = []
+         current_size = 0
+
+         for sentence in sentences:
+             words = sentence.split()
+             sentence_size = len(words)
+
+             if current_size + sentence_size > self.pdf_chunk_size:
+                 if current_chunk:
+                     chunks.append(' '.join(current_chunk))
+                     overlap_words = current_chunk[-self.pdf_overlap:] if self.pdf_overlap > 0 else []
+                     current_chunk = overlap_words + words
+                     current_size = len(current_chunk)
+                 else:
+                     current_chunk = words
+                     current_size = sentence_size
+             else:
+                 current_chunk.extend(words)
+                 current_size += sentence_size
+
+         if current_chunk:
+             chunks.append(' '.join(current_chunk))
+
+         return chunks
+
+     def flatten_xml_to_text(self, element, depth=0) -> str:
+         """Convert XML element and its children to a flat text representation"""
+         text_parts = []
+
+         element_info = f"Element: {element.tag}"
+         if element.attrib:
+             element_info += f", Attributes: {json.dumps(element.attrib)}"
+         if element.text and element.text.strip():
+             element_info += f", Text: {element.text.strip()}"
+         text_parts.append(element_info)
+
+         for child in element:
+             child_text = self.flatten_xml_to_text(child, depth + 1)
+             text_parts.append(child_text)
+
+         return "\n".join(text_parts)
+
+     def chunk_xml_text(self, text: str, max_chunk_size: int = 2000) -> List[str]:
+         """Split flattened XML text into manageable chunks"""
+         lines = text.split('\n')
+         chunks = []
+         current_chunk = []
+         current_size = 0
+
+         for line in lines:
+             line_size = len(line)
+             if current_size + line_size > max_chunk_size and current_chunk:
+                 chunks.append('\n'.join(current_chunk))
+                 current_chunk = []
+                 current_size = 0
+             current_chunk.append(line)
+             current_size += line_size
+
+         if current_chunk:
+             chunks.append('\n'.join(current_chunk))
+
+         return chunks
+
+     def generate_natural_language(self, content: Union[List[Dict], str], content_type: str) -> str:
+         """Generate natural language description with improved error handling and chunking"""
+         try:
+             if content_type == "xml":
+                 prompt = f"Convert this XML structure description to a natural language summary: {content}"
+             else: # pdf
+                 prompt = f"Summarize this text while preserving key information: {content}"
+
+             max_prompt_length = 4000
+             if len(prompt) > max_prompt_length:
+                 prompt = prompt[:max_prompt_length] + "..."
+
+             response = self.groq_client.chat.completions.create(
+                 messages=[{"role": "user", "content": prompt}],
+                 model="llama3-8b-8192",
+                 max_tokens=1000
+             )
+             return response.choices[0].message.content
+         except Exception as e:
+             print(f"Error generating natural language: {str(e)}")
+             if len(content) > 2000:
+                 half_length = len(content) // 2
+                 first_half = content[:half_length]
+                 try:
+                     return self.generate_natural_language(first_half, content_type)
+                 except:
+                     return None
+             return None
+
+     # Additional methods (unchanged but structured for easier review)...
+
+     def store_in_vector_db(self, natural_language: str, metadata: Dict) -> str:
+         """Store content in vector database"""
+         doc_id = f"{metadata['source_file']}_{metadata['content_type']}_{metadata['chunk_id']}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}"
+
+         self.collection.add(
+             documents=[natural_language],
+             metadatas=[metadata],
+             ids=[doc_id]
+         )
+
+         return doc_id
+
+     def process_file(self, file_path: str) -> Dict:
+         """Process any supported file type"""
+         try:
+             file_extension = os.path.splitext(file_path)[1].lower()
+
+             if file_extension == '.xml':
+                 return self.process_xml_file(file_path)
+             elif file_extension == '.pdf':
+                 return self.process_pdf_file(file_path)
+             else:
+                 return {
+                     'success': False,
+                     'error': f'Unsupported file type: {file_extension}'
+                 }
+         except Exception as e:
+             return {
+                 'success': False,
+                 'error': f'Error processing file: {str(e)}'
+             }
+
+     def process_xml_file(self, xml_file_path: str) -> Dict:
+         """Process XML file with improved chunking"""
+         try:
+             tree = ET.parse(xml_file_path)
+             root = tree.getroot()
+             flattened_text = self.flatten_xml_to_text(root)
+             chunks = self.chunk_xml_text(flattened_text)
+
+             print(f"Split XML into {len(chunks)} chunks")
+             results = []
+
+             for i, chunk in enumerate(chunks):
+                 print(f"Processing XML chunk {i+1}/{len(chunks)}")
+                 try:
+                     natural_language = self.generate_natural_language(chunk, "xml")
+
+                     if natural_language:
+                         metadata = {
+                             'source_file': os.path.basename(xml_file_path),
+                             'content_type': 'xml',
+                             'chunk_id': i,
+                             'total_chunks': len(chunks),
+                             'timestamp': str(datetime.datetime.now())
+                         }
+                         doc_id = self.store_in_vector_db(natural_language, metadata)
+                         results.append({
+                             'chunk': i,
+                             'success': True,
+                             'doc_id': doc_id,
+                             'natural_language': natural_language
+                         })
+                     else:
+                         results.append({
+                             'chunk': i,
+                             'success': False,
+                             'error': 'Failed to generate natural language'
+                         })
+                 except Exception as e:
+                     print(f"Error processing chunk {i}: {str(e)}")
+                     results.append({
+                         'chunk': i,
+                         'success': False,
+                         'error': str(e)
+                     })
+
+             return {
+                 'success': True,
+                 'total_chunks': len(chunks),
+                 'results': results
+             }
+
+         except Exception as e:
+             return {
+                 'success': False,
+                 'error': str(e)
+             }
+
+     def process_pdf_file(self, pdf_file_path: str) -> Dict:
+         """Process PDF file"""
+         try:
+             full_text = self.extract_text_from_pdf(pdf_file_path)
+             chunks = self.chunk_text(full_text)
+
+             print(f"Split PDF into {len(chunks)} chunks")
+             results = []
+
+             for i, chunk in enumerate(chunks):
+                 print(f"Processing PDF chunk {i+1}/{len(chunks)}")
+                 natural_language = self.generate_natural_language(chunk, "pdf")
+
+                 if natural_language:
+                     metadata = {
+                         'source_file': os.path.basename(pdf_file_path),
+                         'content_type': 'pdf',
+                         'chunk_id': i,
+                         'total_chunks': len(chunks),
+                         'timestamp': str(datetime.datetime.now()),
+                         'chunk_size': len(chunk.split())
+                     }
+                     doc_id = self.store_in_vector_db(natural_language, metadata)
+                     results.append({
+                         'chunk': i,
+                         'success': True,
+                         'doc_id': doc_id,
+                         'natural_language': natural_language,
+                         'original_text': chunk[:200] + "..."
+                     })
+                 else:
+                     results.append({
+                         'chunk': i,
+                         'success': False,
+                         'error': 'Failed to generate natural language summary'
+                     })
+
+             return {
+                 'success': True,
+                 'total_chunks': len(chunks),
+                 'results': results
+             }
+
+         except Exception as e:
+             return {
+                 'success': False,
+                 'error': str(e)
+             }
+
+     def get_available_files(self) -> Dict[str, List[str]]:
+         """Get list of all files in the database"""
+         try:
+             all_entries = self.collection.get(
+                 include=['metadatas']
+             )
+
+             files = {
+                 'pdf': set(),
+                 'xml': set()
+             }
+
+             for metadata in all_entries['metadatas']:
+                 file_type = metadata['content_type']
+                 file_name = metadata['source_file']
+                 files[file_type].add(file_name)
+
+             return {
+                 'pdf': sorted(list(files['pdf'])),
+                 'xml': sorted(list(files['xml']))
+             }
+         except Exception as e:
+             print(f"Error getting available files: {str(e)}")
+             return {'pdf': [], 'xml': []}
+
+     def ask_question_selective(self, question: str, selected_files: List[str], n_results: int = 5) -> str:
+         """Ask a question using only the selected files"""
+         try:
+             filter_dict = {
+                 'source_file': {'$in': selected_files}
+             }
+
+             results = self.collection.query(
+                 query_texts=[question],
+                 n_results=n_results,
+                 where=filter_dict,
+                 include=["documents", "metadatas"]
+             )
+
+             if not results['documents'][0]:
+                 return "No relevant content found in the selected files."
+
+             context = "\n\n".join(results['documents'][0])
+
+             prompt = f"""Based on the following content from the selected files, please answer this question: {question}
+
+ Content:
+ {context}
+
+ Please provide a direct answer based only on the information provided above."""
+
+             response = self.groq_client.chat.completions.create(
+                 messages=[{"role": "user", "content": prompt}],
+                 model="llama3-8b-8192",
+                 temperature=0.2
+             )
+
+             return response.choices[0].message.content
+
+         except Exception as e:
+             return f"Error processing your question: {str(e)}"
+
+
+ from typing import List, Dict, Union
+ from groq import Groq
+ import chromadb
+ import os
+ import datetime
+ import json
+ import xml.etree.ElementTree as ET
+ import nltk
+ from nltk.tokenize import sent_tokenize
+ import PyPDF2
+ from sentence_transformers import SentenceTransformer
+
+ class CustomEmbeddingFunction:
+     def __init__(self):
+         self.model = SentenceTransformer('all-MiniLM-L6-v2')
+
+     def __call__(self, input: List[str]) -> List[List[float]]:
+         embeddings = self.model.encode(input)
+         return embeddings.tolist()
+
+ class UnifiedDocumentProcessor:
+     def __init__(self, groq_api_key, collection_name="unified_content"):
+         """Initialize the processor with necessary clients"""
+         self.groq_client = Groq(api_key=groq_api_key)
+
+         # XML-specific settings
+         self.max_elements_per_chunk = 50
+
+         # PDF-specific settings
+         self.pdf_chunk_size = 500
+         self.pdf_overlap = 50
+
+         # Initialize NLTK - Updated to handle both resources
+         self._initialize_nltk()
+
+         # Initialize ChromaDB with a single collection for all document types
+         self.chroma_client = chromadb.Client()
+         existing_collections = self.chroma_client.list_collections()
+         collection_exists = any(col.name == collection_name for col in existing_collections)
+
+         if collection_exists:
+             print(f"Using existing collection: {collection_name}")
+             self.collection = self.chroma_client.get_collection(
+                 name=collection_name,
+                 embedding_function=CustomEmbeddingFunction()
+             )
+         else:
+             print(f"Creating new collection: {collection_name}")
+             self.collection = self.chroma_client.create_collection(
+                 name=collection_name,
+                 embedding_function=CustomEmbeddingFunction()
+             )
+
+     def _initialize_nltk(self):
+         """Ensure both NLTK resources are available."""
+         try:
+             nltk.download('punkt')
+             try:
+                 nltk.data.find('tokenizers/punkt_tab')
+             except LookupError:
+                 nltk.download('punkt_tab')
+         except Exception as e:
+             print(f"Warning: Error downloading NLTK resources: {str(e)}")
+             print("Falling back to basic sentence splitting...")
+
+     def _basic_sentence_split(self, text: str) -> List[str]:
+         """Fallback method for sentence tokenization"""
+         sentences = []
+         current = ""
+
+         for char in text:
+             current += char
+             if char in ['.', '!', '?'] and len(current.strip()) > 0:
+                 sentences.append(current.strip())
+                 current = ""
+
+         if current.strip():
+             sentences.append(current.strip())
+
+         return sentences
+
+     def process_file(self, file_path: str) -> Dict:
+         """Process any supported file type"""
+         try:
+             file_extension = os.path.splitext(file_path)[1].lower()
+
+             if file_extension == '.xml':
+                 return self.process_xml_file(file_path)
+             elif file_extension == '.pdf':
+                 return self.process_pdf_file(file_path)
+             else:
+                 return {
+                     'success': False,
+                     'error': f'Unsupported file type: {file_extension}'
+                 }
+         except Exception as e:
+             return {
+                 'success': False,
+                 'error': f'Error processing file: {str(e)}'
+             }
+
+     def extract_text_from_pdf(self, pdf_path: str) -> str:
+         """Extract text from PDF file"""
+         try:
+             text = ""
+             with open(pdf_path, 'rb') as file:
+                 pdf_reader = PyPDF2.PdfReader(file)
+                 for page in pdf_reader.pages:
+                     text += page.extract_text() + " "
+             return text.strip()
+         except Exception as e:
+             raise Exception(f"Error extracting text from PDF: {str(e)}")
+
+     def chunk_text(self, text: str) -> List[str]:
+         """Split text into chunks while preserving sentence boundaries"""
+         try:
+             sentences = sent_tokenize(text)
+         except Exception as e:
+             print(f"Warning: Using fallback sentence splitting: {str(e)}")
+             sentences = self._basic_sentence_split(text)
+
+         chunks = []
+         current_chunk = []
+         current_size = 0
+
+         for sentence in sentences:
+             words = sentence.split()
+             sentence_size = len(words)
+
+             if current_size + sentence_size > self.pdf_chunk_size:
+                 if current_chunk:
+                     chunks.append(' '.join(current_chunk))
+                     overlap_words = current_chunk[-self.pdf_overlap:] if self.pdf_overlap > 0 else []
+                     current_chunk = overlap_words + words
+                     current_size = len(current_chunk)
+                 else:
+                     current_chunk = words
+                     current_size = sentence_size
+             else:
+                 current_chunk.extend(words)
+                 current_size += sentence_size
+
+         if current_chunk:
+             chunks.append(' '.join(current_chunk))
+
+         return chunks
+
+     def flatten_xml_to_text(self, element, depth=0) -> str:
+         """Convert XML element and its children to a flat text representation"""
+         text_parts = []
+
+         element_info = f"Element: {element.tag}"
+         if element.attrib:
+             element_info += f", Attributes: {json.dumps(element.attrib)}"
+         if element.text and element.text.strip():
+             element_info += f", Text: {element.text.strip()}"
+         text_parts.append(element_info)
+
+         for child in element:
+             child_text = self.flatten_xml_to_text(child, depth + 1)
+             text_parts.append(child_text)
+
+         return "\n".join(text_parts)
+
+     def chunk_xml_text(self, text: str, max_chunk_size: int = 2000) -> List[str]:
+         """Split flattened XML text into manageable chunks"""
+         lines = text.split('\n')
+         chunks = []
+         current_chunk = []
+         current_size = 0
+
+         for line in lines:
+             line_size = len(line)
+             if current_size + line_size > max_chunk_size and current_chunk:
+                 chunks.append('\n'.join(current_chunk))
+                 current_chunk = []
+                 current_size = 0
+             current_chunk.append(line)
+             current_size += line_size
+
+         if current_chunk:
+             chunks.append('\n'.join(current_chunk))
+
+         return chunks
+
+     def generate_natural_language(self, content: Union[List[Dict], str], content_type: str) -> str:
+         """Generate natural language description with improved error handling and chunking"""
+         try:
+             if content_type == "xml":
+                 prompt = f"Convert this XML structure description to a natural language summary: {content}"
+             else: # pdf
+                 prompt = f"Summarize this text while preserving key information: {content}"
+
+             max_prompt_length = 4000
+             if len(prompt) > max_prompt_length:
+                 prompt = prompt[:max_prompt_length] + "..."
+
+             response = self.groq_client.chat.completions.create(
+                 messages=[{"role": "user", "content": prompt}],
+                 model="llama3-8b-8192",
+                 max_tokens=1000
+             )
+             return response.choices[0].message.content
+         except Exception as e:
+             print(f"Error generating natural language: {str(e)}")
+             if len(content) > 2000:
+                 half_length = len(content) // 2
+                 first_half = content[:half_length]
+                 try:
+                     return self.generate_natural_language(first_half, content_type)
+                 except:
+                     return None
+             return None
+
+     def store_in_vector_db(self, natural_language: str, metadata: Dict) -> str:
+         """Store content in vector database"""
+         doc_id = f"{metadata['source_file']}_{metadata['content_type']}_{metadata['chunk_id']}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}"
+
+         self.collection.add(
+             documents=[natural_language],
+             metadatas=[metadata],
+             ids=[doc_id]
+         )
+
+         return doc_id
+
+     def process_xml_file(self, xml_file_path: str) -> Dict:
+         """Process XML file with improved chunking"""
+         try:
+             tree = ET.parse(xml_file_path)
+             root = tree.getroot()
+             flattened_text = self.flatten_xml_to_text(root)
+             chunks = self.chunk_xml_text(flattened_text)
+
+             print(f"Split XML into {len(chunks)} chunks")
+             results = []
+
+             for i, chunk in enumerate(chunks):
+                 print(f"Processing XML chunk {i+1}/{len(chunks)}")
+                 try:
+                     natural_language = self.generate_natural_language(chunk, "xml")
+
+                     if natural_language:
+                         metadata = {
+                             'source_file': os.path.basename(xml_file_path),
+                             'content_type': 'xml',
+                             'chunk_id': i,
+                             'total_chunks': len(chunks),
+                             'timestamp': str(datetime.datetime.now())
+                         }
+                         doc_id = self.store_in_vector_db(natural_language, metadata)
+                         results.append({
+                             'chunk': i,
+                             'success': True,
+                             'doc_id': doc_id,
+                             'natural_language': natural_language
+                         })
+                     else:
+                         results.append({
+                             'chunk': i,
+                             'success': False,
+                             'error': 'Failed to generate natural language'
+                         })
+                 except Exception as e:
+                     print(f"Error processing chunk {i}: {str(e)}")
+                     results.append({
+                         'chunk': i,
+                         'success': False,
+                         'error': str(e)
+                     })
+
+             return {
+                 'success': True,
+                 'total_chunks': len(chunks),
+                 'results': results
+             }
+
+         except Exception as e:
+             return {
+                 'success': False,
+                 'error': str(e)
+             }
+
+     def process_pdf_file(self, pdf_file_path: str) -> Dict:
+         """Process PDF file"""
+         try:
+             full_text = self.extract_text_from_pdf(pdf_file_path)
+             chunks = self.chunk_text(full_text)
+
+             print(f"Split PDF into {len(chunks)} chunks")
+             results = []
+
+             for i, chunk in enumerate(chunks):
+                 print(f"Processing PDF chunk {i+1}/{len(chunks)}")
+                 natural_language = self.generate_natural_language(chunk, "pdf")
+
+                 if natural_language:
+                     metadata = {
+                         'source_file': os.path.basename(pdf_file_path),
+                         'content_type': 'pdf',
+                         'chunk_id': i,
+                         'total_chunks': len(chunks),
+                         'timestamp': str(datetime.datetime.now()),
+                         'chunk_size': len(chunk.split())
+                     }
+                     doc_id = self.store_in_vector_db(natural_language, metadata)
+                     results.append({
+                         'chunk': i,
+                         'success': True,
+                         'doc_id': doc_id,
+                         'natural_language': natural_language,
+                         'original_text': chunk[:200] + "..."
+                     })
+                 else:
+                     results.append({
+                         'chunk': i,
+                         'success': False,
+                         'error': 'Failed to generate natural language summary'
+                     })
+
+             return {
+                 'success': True,
+                 'total_chunks': len(chunks),
+                 'results': results
+             }
+
+         except Exception as e:
+             return {
+                 'success': False,
+                 'error': str(e)
+             }
+
+     def get_available_files(self) -> Dict[str, List[str]]:
+         """Get list of all files in the database"""
+         try:
+             all_entries = self.collection.get(
+                 include=['metadatas']
+             )
+
+             files = {
+                 'pdf': set(),
+                 'xml': set()
+             }
+
+             for metadata in all_entries['metadatas']:
+                 file_type = metadata['content_type']
+                 file_name = metadata['source_file']
+                 files[file_type].add(file_name)
+
+             return {
+                 'pdf': sorted(list(files['pdf'])),
+                 'xml': sorted(list(files['xml']))
+             }
+         except Exception as e:
+             print(f"Error getting available files: {str(e)}")
+             return {'pdf': [], 'xml': []}
+
+     def ask_question_selective(self, question: str, selected_files: List[str], n_results: int = 5) -> str:
+         """Ask a question using only the selected files"""
+         try:
+             filter_dict = {
+                 'source_file': {'$in': selected_files}
+             }
+
+             results = self.collection.query(
+                 query_texts=[question],
+                 n_results=n_results,
+                 where=filter_dict,
+                 include=["documents", "metadatas"]
+             )
+
+             if not results['documents'][0]:
+                 return "No relevant content found in the selected files."
+
+             context = "\n\n".join(results['documents'][0])
+
+             prompt = f"""Based on the following content from the selected files, please answer this question: {question}
+
+ Content:
+ {context}
+
+ Please provide a direct answer based only on the information provided above."""
+
+             response = self.groq_client.chat.completions.create(
+                 messages=[{"role": "user", "content": prompt}],
+                 model="llama3-8b-8192",
+                 temperature=0.2
+             )
+
+             return response.choices[0].message.content
+
+         except Exception as e:
+             return f"Error processing your question: {str(e)}"
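
For review purposes, here is a minimal usage sketch of the class this commit adds. It is not part of the committed file: it assumes a Groq API key in a GROQ_API_KEY environment variable and uses placeholder file paths (sample.pdf, config.xml); the calls mirror the public methods defined above.

    import os
    from unified_document_processor import UnifiedDocumentProcessor

    # Hypothetical key source and file paths; adjust to your environment.
    processor = UnifiedDocumentProcessor(groq_api_key=os.environ["GROQ_API_KEY"])

    # Ingest documents; each call returns a dict with 'success' and per-chunk 'results'.
    pdf_result = processor.process_file("sample.pdf")
    xml_result = processor.process_file("config.xml")
    print(pdf_result["success"], pdf_result.get("total_chunks"))

    # List stored files, then ask a question restricted to a selection of them.
    files = processor.get_available_files()  # {'pdf': [...], 'xml': [...]}
    answer = processor.ask_question_selective(
        "What does the document describe?",
        selected_files=files["pdf"] + files["xml"],
    )
    print(answer)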