from duckduckgo_search import DDGS
import requests
from bs4 import BeautifulSoup
import logging
from typing import List, Dict, Any
from config.settings import Settings

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class WebSearchTool:
    def __init__(self):
        self.ddgs = DDGS()
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def search(self, query: str, max_results: int = Settings.MAX_SEARCH_RESULTS) -> List[Dict[str, Any]]:
        """
        Search the web using DuckDuckGo
        """
        try:
            results = []
            search_results = self.ddgs.text(query, max_results=max_results)

            for result in search_results:
                results.append({
                    'title': result.get('title', ''),
                    'url': result.get('href', ''),
                    'snippet': result.get('body', ''),
                    'source': 'DuckDuckGo'
                })

            logger.info(f"Found {len(results)} search results for: {query}")
            return results

        except Exception as e:
            logger.error(f"Error searching web: {e}")
            return []

    def get_page_content(self, url: str, max_chars: int = 5000) -> str:
        """
        Extract text content from a web page
        """
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # Remove script and style elements
            for script in soup(["script", "style"]):
                script.decompose()

            # Get text content
            text = soup.get_text()

            # Clean up whitespace
            lines = (line.strip() for line in text.splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
            text = ' '.join(chunk for chunk in chunks if chunk)

            # Limit length
            if len(text) > max_chars:
                text = text[:max_chars] + "..."

            return text

        except Exception as e:
            logger.error(f"Error extracting content from {url}: {e}")
            return f"Error: Could not extract content from {url}"

    def search_and_summarize(self, query: str, include_content: bool = False) -> str:
        """
        Search and format results for LLM consumption
        """
        results = self.search(query)

        if not results:
            return "No search results found."

        summary_parts = [f"Search results for: {query}\n"]

        for i, result in enumerate(results, 1):
            summary_parts.append(f"{i}. **{result['title']}**")
**{result['title']}**") summary_parts.append(f" URL: {result['url']}") summary_parts.append(f" Summary: {result['snippet']}") if include_content and i <= 2: # Only get content for top 2 results content = self.get_page_content(result['url']) if content and not content.startswith("Error:"): summary_parts.append(f" Content Preview: {content[:500]}...") summary_parts.append("") return "\n".join(summary_parts) def search_news(self, query: str, max_results: int = 5) -> List[Dict[str, Any]]: """ Search for news articles """ try: results = [] news_results = self.ddgs.news(query, max_results=max_results) for result in news_results: results.append({ 'title': result.get('title', ''), 'url': result.get('url', ''), 'snippet': result.get('body', ''), 'source': result.get('source', ''), 'date': result.get('date', ''), 'type': 'news' }) logger.info(f"Found {len(results)} news results for: {query}") return results except Exception as e: logger.error(f"Error searching news: {e}") return [] def search_images(self, query: str, max_results: int = 5) -> List[Dict[str, Any]]: """ Search for images """ try: results = [] image_results = self.ddgs.images(query, max_results=max_results) for result in image_results: results.append({ 'title': result.get('title', ''), 'url': result.get('image', ''), 'thumbnail': result.get('thumbnail', ''), 'source': result.get('source', ''), 'type': 'image' }) logger.info(f"Found {len(results)} image results for: {query}") return results except Exception as e: logger.error(f"Error searching images: {e}") return [] def quick_fact_search(self, query: str) -> str: """ Quick search for factual information """ try: # Try to get instant answer first instant_answer = self.ddgs.answers(query) if instant_answer: return f"Quick Fact: {instant_answer[0].get('text', '')}" # Fall back to regular search results = self.search(query, max_results=2) if results: return f"From search: {results[0]['snippet']}" return "No quick facts found." except Exception as e: logger.error(f"Error in quick fact search: {e}") return "Error retrieving quick facts." def research_topic(self, topic: str) -> Dict[str, Any]: """ Comprehensive research on a topic """ research_data = { 'topic': topic, 'general_info': [], 'news': [], 'related_queries': [] } try: # General search general_results = self.search(topic, max_results=5) research_data['general_info'] = general_results # News search news_results = self.search_news(topic, max_results=3) research_data['news'] = news_results # Generate related queries related_queries = [ f"{topic} definition", f"{topic} examples", f"{topic} applications", f"latest {topic} developments" ] research_data['related_queries'] = related_queries return research_data except Exception as e: logger.error(f"Error researching topic {topic}: {e}") return research_data def format_research_for_llm(self, research_data: Dict[str, Any]) -> str: """ Format research data for LLM consumption """ formatted_parts = [f"Research Results for: {research_data['topic']}\n"] if research_data['general_info']: formatted_parts.append("## General Information:") for i, result in enumerate(research_data['general_info'], 1): formatted_parts.append(f"{i}. {result['title']}") formatted_parts.append(f" {result['snippet']}\n") if research_data['news']: formatted_parts.append("## Recent News:") for i, result in enumerate(research_data['news'], 1): formatted_parts.append(f"{i}. 
{result['title']}") formatted_parts.append(f" {result['snippet']}") if result.get('date'): formatted_parts.append(f" Date: {result['date']}\n") return "\n".join(formatted_parts)