avimittal30 committed
Commit 031085e · verified · 1 Parent(s): 20897be

Update tools.py

Files changed (1)
  1. tools.py +559 -559
tools.py CHANGED
@@ -1,559 +1,559 @@
- from langchain_groq import ChatGroq
1
+ from langchain.tools import DuckDuckGoSearchResults, WikipediaQueryRun
2
+ from langchain.utilities import WikipediaAPIWrapper
3
+ from PIL import Image
4
+ import re
5
+ import time
6
+ import json
7
+ import pandas as pd
8
+ from pathlib import Path
9
+ from typing import List, Dict, Optional, Union
10
+ from tabulate import tabulate
11
+ import whisper
12
+
13
+ import numpy as np
14
+ import os
15
+
16
+ from youtube_transcript_api import YouTubeTranscriptApi
17
+ import re
18
+
19
+ from langchain_openai import ChatOpenAI
20
+ llm = ChatOpenAI(model='gpt-4o', temperature=0)
21
+
22
+ # ----------- Enhanced Search Functionality -----------
23
+ class EnhancedSearchTool:
24
+ """Enhanced web search with intelligent query processing and result filtering"""
25
+
26
+ def __init__(self, max_results: int = 10):
27
+ self.base_tool = DuckDuckGoSearchResults(num_results=max_results)
28
+ self.max_results = max_results
29
+
30
+ def _extract_key_terms(self, question: str) -> List[str]:
31
+ """Extract key search terms from the question using LLM"""
32
+ try:
33
+ extract_prompt = f"""
34
+ Extract the most important search terms from this question for web search:
35
+ Question: {question}
36
+
37
+ Return ONLY a comma-separated list of key terms, no explanations.
38
+ Focus on: proper nouns, specific concepts, technical terms, dates, numbers.
39
+ Avoid: common words like 'what', 'how', 'when', 'the', 'is', 'are'.
40
+
41
+ Example: "What is the population of Tokyo in 2023?" -> "Tokyo population 2023"
42
+ """
43
+
44
+ response = llm.invoke(extract_prompt).content.strip()
45
+ return [term.strip() for term in response.split(',')]
46
+ except Exception:
47
+ # Fallback to simple keyword extraction
48
+ return self._simple_keyword_extraction(question)
49
+
50
+ def _simple_keyword_extraction(self, question: str) -> List[str]:
51
+ """Fallback keyword extraction using regex"""
52
+ # Remove common question words
53
+ stop_words = {'what', 'how', 'when', 'where', 'why', 'who', 'which', 'the', 'is', 'are', 'was', 'were', 'do', 'does', 'did', 'can', 'could', 'should', 'would'}
54
+ words = re.findall(r'\b[A-Za-z]+\b', question.lower())
55
+ return [word for word in words if word not in stop_words and len(word) > 2]
56
+
57
+ def _generate_search_queries(self, question: str) -> List[str]:
58
+ """Generate multiple search queries for comprehensive results"""
59
+ key_terms = self._extract_key_terms(question)
60
+
61
+ queries = []
62
+
63
+ # Original question (cleaned)
64
+ cleaned_question = re.sub(r'[^\w\s]', ' ', question).strip()
65
+ queries.append(cleaned_question)
66
+
67
+ # Key terms combined
68
+ if key_terms:
69
+ queries.append(' '.join(key_terms[:5])) # Top 5 terms
70
+
71
+ # Specific query patterns based on question type
72
+ if any(word in question.lower() for word in ['latest', 'recent', 'current', 'new']):
73
+ queries.append(f"{' '.join(key_terms[:3])} 2024 2025")
74
+
75
+ if any(word in question.lower() for word in ['statistics', 'data', 'number', 'count']):
76
+ queries.append(f"{' '.join(key_terms[:3])} statistics data")
77
+
78
+ if any(word in question.lower() for word in ['definition', 'what is', 'meaning']):
79
+ queries.append(f"{' '.join(key_terms[:2])} definition meaning")
80
+
81
+ return list(dict.fromkeys(queries)) # Remove duplicates while preserving order
82
+
83
+ def _filter_and_rank_results(self, results: List[Dict], question: str) -> List[Dict]:
84
+ """Filter and rank search results based on relevance"""
85
+ if not results:
86
+ return results
87
+
88
+ key_terms = self._extract_key_terms(question)
89
+ key_terms_lower = [term.lower() for term in key_terms]
90
+
91
+ scored_results = []
92
+ for result in results:
93
+ score = 0
94
+ text_content = (result.get('snippet', '') + ' ' + result.get('title', '')).lower()
95
+
96
+ # Score based on key term matches
97
+ for term in key_terms_lower:
98
+ if term in text_content:
99
+ score += text_content.count(term)
100
+
101
+ # Bonus for recent dates
102
+ if any(year in text_content for year in ['2024', '2025', '2023']):
103
+ score += 2
104
+
105
+ # Penalty for very short snippets
106
+ if len(result.get('snippet', '')) < 50:
107
+ score -= 1
108
+
109
+ scored_results.append((score, result))
110
+
111
+ # Sort by score and return top results
112
+ scored_results.sort(key=lambda x: x[0], reverse=True)
113
+ return [result for score, result in scored_results[:self.max_results]]
114
+
115
+ def run(self, question: str) -> str:
116
+ """Enhanced search execution with multiple queries and result filtering"""
117
+ try:
118
+ search_queries = self._generate_search_queries(question)
119
+ all_results = []
120
+
121
+ for query in search_queries[:3]: # Limit to 3 queries to avoid rate limits
122
+ try:
123
+ results = self.base_tool.run(query)
124
+ if isinstance(results, str):
125
+ # Parse string results if needed
126
+ try:
127
+ results = json.loads(results) if results.startswith('[') else [{'snippet': results, 'title': 'Search Result'}]
128
+ except Exception:
129
+ results = [{'snippet': results, 'title': 'Search Result'}]
130
+
131
+ if isinstance(results, list):
132
+ all_results.extend(results)
133
+
134
+ time.sleep(0.5) # Rate limiting
135
+ except Exception as e:
136
+ print(f"Search query failed: {query} - {e}")
137
+ continue
138
+
139
+ if not all_results:
140
+ return "No search results found."
141
+
142
+ # Filter and rank results
143
+ filtered_results = self._filter_and_rank_results(all_results, question)
144
+
145
+ # Format results
146
+ formatted_results = []
147
+ for i, result in enumerate(filtered_results[:5], 1):
148
+ title = result.get('title', 'No title')
149
+ snippet = result.get('snippet', 'No description')
150
+ link = result.get('link', '')
151
+
152
+ formatted_results.append(f"{i}. {title}\n {snippet}\n Source: {link}\n")
153
+
154
+ return "ENHANCED SEARCH RESULTS:\n" + "\n".join(formatted_results)
155
+
156
+ except Exception as e:
157
+ return f"Enhanced search error: {str(e)}"
158
+
159
+ # ----------- Enhanced Wikipedia Tool -----------
160
+
161
+ class EnhancedWikipediaTool:
162
+ """Enhanced Wikipedia search with intelligent query processing and content extraction"""
163
+
164
+ def __init__(self):
165
+ self.base_wrapper = WikipediaAPIWrapper(
166
+ top_k_results=3,
167
+ doc_content_chars_max=3000,
168
+ load_all_available_meta=True
169
+ )
170
+ self.base_tool = WikipediaQueryRun(api_wrapper=self.base_wrapper)
171
+
172
+ def _extract_entities(self, question: str) -> List[str]:
173
+ """Extract named entities for Wikipedia search"""
174
+ try:
175
+ entity_prompt = f"""
176
+ Extract named entities (people, places, organizations, concepts) from this question for Wikipedia search:
177
+ Question: {question}
178
+
179
+ Return ONLY a comma-separated list of the most important entities.
180
+ Focus on: proper nouns, specific names, places, organizations, historical events, scientific concepts.
181
+
182
+ Example: "Tell me about Einstein's theory of relativity" -> "Albert Einstein, theory of relativity, relativity"
183
+ """
184
+ response = llm.invoke(entity_prompt).content.strip()
185
+ print(f'inside extract_entities:{response}')
186
+ entities = [entity.strip() for entity in response.split(',')]
187
+ return [e for e in entities if len(e) > 2]
188
+ except Exception:
189
+ # Fallback: extract capitalized words and phrases
190
+ return self._extract_capitalized_terms(question)
191
+
192
+ def _extract_capitalized_terms(self, question: str) -> List[str]:
193
+ """Fallback: extract capitalized terms as potential entities"""
194
+ # Find capitalized words and phrases
195
+ capitalized_words = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', question)
196
+ # Also look for quoted terms
197
+ quoted_terms = re.findall(r'"([^"]+)"', question)
198
+ quoted_terms.extend(re.findall(r"'([^']+)'", question))
199
+
200
+ return capitalized_words + quoted_terms
201
+
202
+ def _search_multiple_terms(self, entities: List[str]) -> Dict[str, str]:
203
+ """Search Wikipedia for multiple entities and return best results"""
204
+ results = {}
205
+
206
+ for entity in entities[:3]: # Limit to avoid too many API calls
207
+ try:
208
+ result = self.base_tool.run(entity)
209
+ print(f'Inside _search_multiple_terms: {result}')
210
+ if result and "Page:" in result and len(result) > 100:
211
+ results[entity] = result
212
+ time.sleep(0.5) # Rate limiting
213
+ except Exception as e:
214
+ print(f"Wikipedia search failed for '{entity}': {e}")
215
+ continue
216
+
217
+ return results
218
+
219
+ def _extract_relevant_sections(self, content: str, question: str) -> str:
220
+ """Extract the most relevant sections from Wikipedia content"""
221
+ if not content or len(content) < 200:
222
+ return content
223
+
224
+ # Split content into sections (usually separated by double newlines)
225
+ sections = re.split(r'\n\s*\n', content)
226
+ print(f'Inside _extract relevant sections:{sections}')
227
+
228
+ # Score sections based on relevance to question
229
+ key_terms = self._extract_entities(question)
230
+ key_terms_lower = [term.lower() for term in key_terms]
231
+
232
+ scored_sections = []
233
+ for section in sections:
234
+ if len(section.strip()) < 500:
235
+ continue
236
+
237
+ score = 0
238
+ section_lower = section.lower()
239
+
240
+ # Score based on key term matches
241
+ for term in key_terms_lower:
242
+ score += section_lower.count(term)
243
+
244
+ # Bonus for sections with dates, numbers, or specific facts
245
+ if re.search(r'\b(19|20)\d{2}\b', section): # Years
246
+ score += 1
247
+ if re.search(r'\b\d+([.,]\d+)?\s*(million|billion|thousand|percent|%)\b', section):
248
+ score += 1
249
+
250
+ scored_sections.append((score, section))
251
+
252
+ # Sort by relevance and take top sections
253
+ scored_sections.sort(key=lambda x: x[0], reverse=True)
254
+ top_sections = [section for score, section in scored_sections[:7] if score > 0]
255
+ print(f'Inside extract relevant sections, top sections:{top_sections}')
256
+
257
+ if not top_sections:
258
+ # If no highly relevant sections, take first few sections
259
+ top_sections = sections[:2]
260
+
261
+ return '\n\n'.join(top_sections)
262
+
263
+ def run(self, question: str) -> str:
264
+ """Enhanced Wikipedia search with entity extraction and content filtering"""
265
+ try:
266
+ entities = self._extract_entities(question)
267
+
268
+ if not entities:
269
+ # Fallback to direct search with cleaned question
270
+ cleaned_question = re.sub(r'[^\w\s]', ' ', question).strip()
271
+ try:
272
+ result = self.base_tool.run(cleaned_question)
273
+ print(f'******************Inside run*************:{result} ')
274
+ return self._extract_relevant_sections(result, question) if result else "No Wikipedia results found."
275
+ except Exception as e:
276
+ return f"Wikipedia search error: {str(e)}"
277
+
278
+ # Search for multiple entities
279
+ search_results = self._search_multiple_terms(entities)
280
+
281
+ if not search_results:
282
+ return "No relevant Wikipedia articles found."
283
+
284
+ # Combine and format results
285
+ formatted_results = []
286
+ for entity, content in search_results.items():
287
+ relevant_content = self._extract_relevant_sections(content, question)
288
+ if relevant_content:
289
+ formatted_results.append(f"=== {entity} ===\n{relevant_content}")
290
+
291
+ if not formatted_results:
292
+ return "No relevant information found in Wikipedia articles."
293
+
294
+ return "ENHANCED WIKIPEDIA RESULTS:\n\n" + "\n\n".join(formatted_results)
295
+
296
+ except Exception as e:
297
+ return f"Enhanced Wikipedia error: {str(e)}"
298
+
299
+ # ----------- Enhanced File Processing Tools -----------
300
+ def excel_to_markdown(excel_path: str, sheet_name: Optional[str] = None) -> str:
301
+ """Enhanced Excel tool with better error handling and data analysis"""
302
+ try:
303
+ file_path = Path(excel_path).expanduser().resolve()
304
+ if not file_path.is_file():
305
+ return f"Error: Excel file not found at {file_path}"
306
+
307
+ sheet: Union[str, int] = (
308
+ int(sheet_name) if sheet_name and sheet_name.isdigit() else sheet_name or 0
309
+ )
310
+ df = pd.read_excel(file_path, sheet_name=sheet)
311
+ df = df.iloc[:, :-1]  # NOTE: drops the last column of the sheet
312
+
313
+ # Enhanced metadata
314
+ metadata = f"EXCEL FILE ANALYSIS:\n"
315
+ metadata += f"File: {file_path.name}\n"
316
+ metadata += f"Dimensions: {len(df)} rows × {len(df.columns)} columns\n"
317
+ metadata += f"Columns: {', '.join(df.columns.tolist())}\n"
318
+ metadata += f"Data types: {dict(df.dtypes)}\n"
319
+
320
+ # Basic statistics for numeric columns
321
+ numeric_cols = df.select_dtypes(include=['number']).columns
322
+ if len(numeric_cols) > 0:
323
+ metadata += f"Numeric columns: {list(numeric_cols)}\n"
324
+ for col in numeric_cols:
325
+ metadata += f" {col}: mean={df[col].mean():.2f}, min={df[col].min()}, max={df[col].max()}, sum={df[col].sum()}\n"
326
+
327
+ metadata += "\nSAMPLE DATA (first 10 rows):\n"
328
+
329
+ # if hasattr(df, "to_markdown"):
330
+ # sample_data = df.head(10).to_markdown(index=False)
331
+ # else:
332
+ # sample_data = tabulate(df.head(10), headers="keys", tablefmt="github", showindex=False)
333
+
334
+ # return metadata + sample_data + f"\n\n(Showing first 10 rows of {len(df)} total rows)"
335
+ return metadata
336
+
337
+ except Exception as e:
338
+ return f"Error reading Excel file: {str(e)}"
339
+
340
+
341
+ import os
342
+ import mimetypes
343
+ from pathlib import Path
344
+
345
+ def image_file_info(image_path: str, question: str) -> str:
346
+ """Enhanced image file analysis using Gemini API"""
347
+ try:
348
+ # Check if file exists
349
+ if not os.path.exists(image_path):
350
+ return f"Error: Image file not found at {image_path}"
351
+
352
+ # Try the older google.generativeai library first (more stable)
353
+ try:
354
+ import google.generativeai as genai
355
+ from PIL import Image
356
+
357
+ # Configure the API key
358
+ genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
359
+
360
+ # Create the model - using a more stable model
361
+ model = genai.GenerativeModel('gemini-1.5-flash')
362
+
363
+ # Open and validate the image
364
+ try:
365
+ image = Image.open(image_path)
366
+ # Convert to RGB if necessary (handles PNG with transparency)
367
+ if image.mode in ('RGBA', 'LA'):
368
+ background = Image.new('RGB', image.size, (255, 255, 255))
369
+ # Both RGBA and LA keep alpha in the last band; use it as the paste mask
+ background.paste(image, mask=image.split()[-1])
373
+ image = background
374
+ elif image.mode != 'RGB':
375
+ image = image.convert('RGB')
376
+
377
+ except Exception as img_error:
378
+ return f"Error opening image: {img_error}"
379
+
380
+ # Generate content using the older SDK
381
+ response = model.generate_content([question, image])
382
+
383
+ return response.text
384
+
385
+ except ImportError:
386
+ # Fall back to the newer google.genai library
387
+ try:
388
+ from google import genai
389
+ from google.genai import types
390
+
391
+ # Initialize the client
392
+ client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))
393
+
394
+ # Read content from a local file
395
+ with open(image_path, "rb") as f:
396
+ img_bytes = f.read()
397
+
398
+ # Determine the correct MIME type based on file extension
399
+ mime_type, _ = mimetypes.guess_type(image_path)
400
+ if mime_type is None or not mime_type.startswith('image/'):
401
+ # For PNG files specifically
402
+ if image_path.lower().endswith('.png'):
403
+ mime_type = "image/png"
404
+ else:
405
+ mime_type = "image/jpeg"
406
+
407
+ # Generate content using the newer SDK
408
+ response = client.models.generate_content(
409
+ model="gemini-1.5-flash", # Using more stable model
410
+ contents=[
411
+ question,
412
+ types.Part.from_bytes(data=img_bytes, mime_type=mime_type)
413
+ ],
414
+ )
415
+
416
+ return response.text
417
+
418
+ except Exception as new_sdk_error:
419
+ return f"Error with both SDKs. New SDK error: {new_sdk_error}"
420
+
421
+ except Exception as e:
422
+ return f"Error during image analysis: {e}"
423
+
424
+ def audio_file_info(audio_path: str) -> str:
425
+ """Returns only the transcription of an audio file."""
426
+ try:
427
+ model = whisper.load_model("tiny")  # Smallest, fastest Whisper model; trades accuracy for speed
428
+ result = model.transcribe(audio_path, fp16=False)
429
+ return result['text']
430
+ except Exception as e:
431
+ return f"Error transcribing audio: {str(e)}"
432
+
433
+ def code_file_read(code_path: str) -> str:
434
+ """Enhanced code file analysis"""
435
+ try:
436
+ with open(code_path, "r", encoding="utf-8") as f:
437
+ content = f.read()
438
+
439
+ file_path = Path(code_path)
440
+
441
+ info = f"CODE FILE ANALYSIS:\n"
442
+ info += f"File: {file_path.name}\n"
443
+ info += f"Extension: {file_path.suffix}\n"
444
+ info += f"Size: {len(content)} characters, {len(content.splitlines())} lines\n"
445
+
446
+ # Language-specific analysis
447
+ if file_path.suffix == '.py':
448
+ # Python-specific analysis
449
+ import_lines = [line for line in content.splitlines() if line.strip().startswith(('import ', 'from '))]
450
+ if import_lines:
451
+ info += f"Imports ({len(import_lines)}): {', '.join(import_lines[:5])}\n"
452
+
453
+ # Count functions and classes
454
+ func_count = len(re.findall(r'^def\s+\w+', content, re.MULTILINE))
455
+ class_count = len(re.findall(r'^class\s+\w+', content, re.MULTILINE))
456
+ info += f"Functions: {func_count}, Classes: {class_count}\n"
457
+
458
+ info += f"\nCODE CONTENT:\n{content}"
459
+ return info
460
+
461
+ except Exception as e:
462
+ return f"Error reading code file: {e}"
463
+
464
+
465
+ import yt_dlp
466
+ from pathlib import Path
467
+
468
+ def extract_youtube_info(question: str) -> str:
469
+ """
470
+ Download a YouTube video or audio using yt-dlp without merging.
471
+
472
+ Parameters:
473
+ - url: str — YouTube URL
474
+ - audio_only: bool — if True, downloads audio only; else best single video+audio stream
475
+
476
+ Returns:
477
+ - str: path to downloaded file or error message
478
+ """
479
+ pattern = r"(https?://(?:www\.)?(?:youtube\.com/watch\?v=[\w\-]+|youtu\.be/[\w\-]+))"
480
+ match = re.search(pattern, question)
481
+ youtube_url = match.group(1) if match else None
+ if youtube_url is None:
+ return "Error getting transcript: no YouTube URL found in the question."
+
+ print(f"Extracting YouTube URL: {youtube_url}")
484
+ try:
485
+ # Extract video ID from URL
486
+ video_id = re.search(r'(?:youtube\.com/watch\?v=|youtu\.be/|youtube\.com/embed/)([a-zA-Z0-9_-]{11})', youtube_url).group(1)
487
+
488
+ # Get transcript
489
+ transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
490
+
491
+ # Combine all text segments
492
+ full_transcript = ' '.join([entry['text'] for entry in transcript_list])
493
+
494
+ # Clean up the text
495
+ full_transcript = re.sub(r'\s+', ' ', full_transcript).strip()
496
+
497
+ return full_transcript
498
+
499
+ except Exception as e:
500
+ print(f"Error getting transcript: {e}")
501
+ return f"Error getting transcript: {e}"
502
+
503
+
504
+ # def get_youtube_transcript(video_url):
505
+ # """
506
+ # Get transcription from a YouTube video.
507
+
508
+ # Args:
509
+ # video_url (str): YouTube video URL
510
+
511
+ # Returns:
512
+ # str: Full transcription text or None if not available
513
+ # """
514
+ # try:
515
+ # # Extract video ID from URL
516
+ # video_id = re.search(r'(?:youtube\.com/watch\?v=|youtu\.be/|youtube\.com/embed/)([a-zA-Z0-9_-]{11})', video_url).group(1)
517
+
518
+ # # Get transcript
519
+ # transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
520
+
521
+ # # Combine all text segments
522
+ # full_transcript = ' '.join([entry['text'] for entry in transcript_list])
523
+
524
+ # # Clean up the text
525
+ # full_transcript = re.sub(r'\s+', ' ', full_transcript).strip()
526
+
527
+ # return full_transcript
528
+
529
+ # except Exception as e:
530
+ # print(f"Error getting transcript: {e}")
531
+ # return None
532
+
533
+ # extract_youtube_info
534
+
535
+ # question="How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia."
536
+ # wiki=EnhancedWikipediaTool()
537
+ # wiki.run(question)
538
+
539
+
540
+ # entity_prompt = f"""
541
+ # Extract named entities (people, places, organizations, concepts) from this question for Wikipedia search:
542
+ # Question: {question}
543
+
544
+ # Return ONLY a comma-separated list of the most important entities.
545
+ # Focus on: proper nouns, specific names, places, organizations, historical events, scientific concepts.
546
+
547
+ # Example: "Tell me about Einstein's theory of relativity" -> "Albert Einstein, theory of relativity, relativity"
548
+ # """
549
+
550
+ # response = llm.invoke(entity_prompt).content.strip()
551
+
552
+
553
+
554
+
555
+ # result=extract_youtube_info("Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec.\n\nWhat does Teal'c say in response to the question \"Isn't that hot")
556
+
557
+
558
+
559
+
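+ # Hedged usage sketch, in the same commented-out style as the examples above.
+ # Assumes OPENAI_API_KEY and GEMINI_API_KEY are set; the sample file names are
+ # illustrative placeholders, not files shipped with this repo.
+ #
+ # search = EnhancedSearchTool(max_results=5)
+ # print(search.run("What is the population of Tokyo in 2023?"))
+ #
+ # wiki = EnhancedWikipediaTool()
+ # print(wiki.run("Tell me about Einstein's theory of relativity"))
+ #
+ # print(excel_to_markdown("sales.xlsx"))
+ # print(audio_file_info("interview.mp3"))
+ # print(image_file_info("chart.png", "What does this chart show?"))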