Spaces:

Agents-MCP-Hackathon
/

course-creator-ai

Running

App Files Files Community

course-creator-ai / coursecrafter /tools /web_research.py

sizzlebop

Upload 34 files

8be2f43 verified 3 months ago

raw

history blame contribute delete

16.2 kB

	"""
	🔍 Web Research Tools
	Advanced web research using DuckDuckGo search and Crawl4AI content extraction
	"""

	import asyncio
	import logging
	import os
	from typing import List, Dict, Any, Optional

	import requests
	from bs4 import BeautifulSoup
	from duckduckgo_search import DDGS

	# Crawl4AI is an optional dependency. If the import fails, CRAWL4AI_AVAILABLE is
	# set to False and a notice is printed once at import time; WebResearcher reads
	# this flag to choose between Crawl4AI and the requests + BeautifulSoup fallback.
	try:
	    from crawl4ai import (
	        AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig,
	        LLMContentFilter, DefaultMarkdownGenerator
	    )
	    CRAWL4AI_AVAILABLE = True
	except ImportError:
	    CRAWL4AI_AVAILABLE = False
	    print("⚠️ Crawl4AI not available, using fallback web scraping")

	# Module-level logger, named after this module.
	logger = logging.getLogger(__name__)


	class WebResearcher:
	    """Research a topic on the web: DuckDuckGo search plus page content extraction.

	    Search uses ``duckduckgo_search.DDGS``. Content extraction uses Crawl4AI
	    (with an optional LLM content filter) when the package imported
	    successfully and a Playwright Chromium binary is present; otherwise pages
	    are scraped with ``requests`` + BeautifulSoup.

	    Every extraction record is a dict with the keys ``url``, ``title``,
	    ``content``, ``word_count`` and ``extraction_success``; failed records
	    additionally carry ``error``.
	    """

	    # Crawl4AI "provider/model" string used for OpenAI and as the default.
	    _OPENAI_MODEL = "openai/gpt-4o-mini"

	    # Short provider names accepted by ``llm_provider`` -> Crawl4AI provider strings.
	    _PROVIDER_MODELS = {
	        "openai": "openai/gpt-4o-mini",
	        "google": "gemini/gemini-2.0-flash-exp",
	        "gemini": "gemini/gemini-2.0-flash-exp",
	        # Mapped to Gemini by the original author, with the note that Crawl4AI
	        # does not support Anthropic directly -- TODO confirm against Crawl4AI docs.
	        "anthropic": "gemini/gemini-2.0-flash-exp",
	    }

	    # Browser-like User-Agent sent by the requests-based fallback scraper.
	    _FALLBACK_HEADERS = {
	        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
	    }

	    # CSS selectors tried in order to locate the main content of a page.
	    _CONTENT_SELECTORS = (
	        'main', 'article', '.content', '#content',
	        '.post-content', '.entry-content', '.article-content',
	    )

	    # Substrings of an exception message that indicate missing Playwright browsers.
	    _PLAYWRIGHT_ERROR_MARKERS = (
	        "Executable doesn't exist",
	        "BrowserType.launch",
	        "playwright install",
	        "Playwright was just installed",
	        "download new browsers",
	        "chromium-",
	        "chrome-linux/chrome",
	    )

	    def __init__(self, max_results: int = 10, max_crawl_pages: int = 7, llm_provider: Optional[str] = None):
	        """Configure the researcher.

	        Args:
	            max_results: Maximum number of search results requested from DuckDuckGo.
	            max_crawl_pages: Maximum number of URLs whose content is extracted.
	            llm_provider: Short provider name ("openai", "google", "gemini",
	                "anthropic") for the LLM content filter; falsy values mean "openai".
	        """
	        self.max_results = max_results
	        self.max_crawl_pages = max_crawl_pages
	        self.llm_provider = llm_provider or "openai"  # Default fallback

	        if CRAWL4AI_AVAILABLE:
	            self.browser_config = BrowserConfig(
	                headless=True,
	                viewport_width=1280,
	                viewport_height=720
	            )
	        else:
	            self.browser_config = None
	            print("🔄 Using fallback web scraping (requests + BeautifulSoup)")

	    # ------------------------------------------------------------------
	    # Small pure helpers
	    # ------------------------------------------------------------------

	    @staticmethod
	    def _failed_extraction(url: str, error: Any) -> Dict[str, Any]:
	        """Build the record stored for a URL whose content could not be extracted."""
	        return {
	            "url": url,
	            "title": "",
	            "content": "",
	            "word_count": 0,
	            "extraction_success": False,
	            "error": error
	        }

	    @staticmethod
	    def _candidate_urls(search_results: List[Dict[str, Any]]) -> List[str]:
	        """Return the non-empty result URLs, de-duplicated, in first-seen order."""
	        return list(dict.fromkeys(
	            result.get("url") for result in search_results if result.get("url")
	        ))

	    @staticmethod
	    def _result_markdown(result: Any) -> str:
	        """Return the best markdown text available on a Crawl4AI result as a plain str.

	        Crawl4AI documents ``fit_markdown`` as the content-filtered markdown, so it
	        is preferred when present and non-empty; otherwise the string form of
	        ``result.markdown`` is used (which also covers versions where
	        ``result.markdown`` is a plain string).
	        """
	        markdown = result.markdown
	        if not markdown:
	            return ""
	        filtered = getattr(markdown, "fit_markdown", None)
	        return filtered if filtered else str(markdown)

	    @staticmethod
	    def _research_failure(topic: str, summary: str) -> Dict[str, Any]:
	        """Build the result returned by research_topic when research did not succeed."""
	        return {
	            "topic": topic,
	            "search_results": [],
	            "extracted_content": [],
	            "summary": summary,
	            # Same keys as the success result, so callers can read them unconditionally.
	            "total_words": 0,
	            "successful_sources": 0,
	            "success": False
	        }

	    # ------------------------------------------------------------------
	    # Search
	    # ------------------------------------------------------------------

	    def _run_search(self, topic: str, region: str) -> List[Dict[str, Any]]:
	        """Blocking DuckDuckGo text search; returns normalized result dicts."""
	        with DDGS() as ddgs:
	            search_results = ddgs.text(
	                keywords=topic,
	                region=region,
	                safesearch="moderate",
	                max_results=self.max_results
	            )
	            return [
	                {
	                    "title": result.get("title", ""),
	                    "url": result.get("href", ""),
	                    "snippet": result.get("body", ""),
	                    "source": "duckduckgo"
	                }
	                for result in (search_results or [])
	            ]

	    async def search_topic(self, topic: str, region: str = "us-en") -> List[Dict[str, Any]]:
	        """Search for a topic using DuckDuckGo.

	        Returns a list of dicts with ``title``, ``url``, ``snippet`` and
	        ``source``. Any error is logged and printed, and an empty list is returned.
	        """
	        try:
	            print(f"🔍 Searching DuckDuckGo for: {topic}")

	            # DDGS is a blocking client; run it in a worker thread so the
	            # event loop stays responsive while the search is in flight.
	            loop = asyncio.get_running_loop()
	            results = await loop.run_in_executor(None, self._run_search, topic, region)

	            print(f"✅ Found {len(results)} search results")
	            return results

	        except Exception as e:
	            logger.error(f"Search failed: {e}")
	            print(f"❌ Search failed: {e}")
	            return []

	    # ------------------------------------------------------------------
	    # Fallback extraction (requests + BeautifulSoup)
	    # ------------------------------------------------------------------

	    def _scrape_page(self, url: str) -> Dict[str, Any]:
	        """Blocking fetch + parse of one page; raises on HTTP or parsing errors."""
	        response = requests.get(url, headers=self._FALLBACK_HEADERS, timeout=10)
	        response.raise_for_status()

	        soup = BeautifulSoup(response.content, 'html.parser')

	        # Remove non-content elements before extracting text
	        for element in soup(["script", "style", "nav", "footer", "header"]):
	            element.decompose()

	        # get_text() instead of .string: .string is None for an empty <title>
	        # or one containing nested markup, which used to raise AttributeError
	        # and mark an otherwise good page as failed.
	        title = soup.title.get_text(strip=True) if soup.title else ""

	        # Use the first matching main-content container, if any
	        content = ""
	        for selector in self._CONTENT_SELECTORS:
	            content_elem = soup.select_one(selector)
	            if content_elem:
	                content = content_elem.get_text(separator='\n', strip=True)
	                break

	        # If no specific content area yielded text, use the whole document
	        if not content:
	            content = soup.get_text(separator='\n', strip=True)

	        # Drop blank lines and surrounding whitespace
	        content = '\n'.join(line.strip() for line in content.split('\n') if line.strip())

	        return {
	            "url": url,
	            "title": title,
	            "content": content,
	            "word_count": len(content.split()),
	            "extraction_success": True
	        }

	    async def _fallback_extract_content(self, urls: List[str]) -> List[Dict[str, Any]]:
	        """Fallback content extraction using requests and BeautifulSoup.

	        Processes at most ``max_crawl_pages`` URLs, one after another, and
	        returns one record per attempted URL (successful or failed).
	        """
	        targets = urls[:self.max_crawl_pages]
	        extracted_content = []
	        loop = asyncio.get_running_loop()

	        for i, url in enumerate(targets, start=1):
	            try:
	                print(f"📖 Scraping {i}/{len(targets)}: {url}")

	                # requests is blocking; keep it off the event loop thread.
	                page = await loop.run_in_executor(None, self._scrape_page, url)
	                extracted_content.append(page)

	                print(f"✅ Extracted {page['word_count']} words from {url}")

	            except Exception as e:
	                logger.error(f"Error scraping {url}: {e}")
	                print(f"❌ Error scraping {url}: {e}")
	                extracted_content.append(self._failed_extraction(url, str(e)))

	        successful_extractions = [c for c in extracted_content if c["extraction_success"]]
	        # Report against the URLs actually attempted, not the uncapped input list.
	        print(f"✅ Successfully extracted content from {len(successful_extractions)}/{len(extracted_content)} URLs")

	        return extracted_content

	    # ------------------------------------------------------------------
	    # Crawl4AI extraction
	    # ------------------------------------------------------------------

	    def _build_llm_config(self) -> Any:
	        """Return the LLMConfig for the content filter, or None if no API key is set.

	        Gemini is used when the configured provider maps to it and
	        GOOGLE_API_KEY is set; otherwise OpenAI gpt-4o-mini is used.
	        """
	        provider = self._PROVIDER_MODELS.get(self.llm_provider, self._OPENAI_MODEL)
	        openai_label = "gpt-4o-mini"

	        if provider.startswith("gemini"):
	            if os.getenv("GOOGLE_API_KEY"):
	                llm_config = LLMConfig(
	                    provider=provider,
	                    api_token="env:GOOGLE_API_KEY"
	                )
	                print(f"🧠 Using Gemini for content filtering: {provider}")
	                return llm_config
	            print("⚠️ GOOGLE_API_KEY not found, falling back to OpenAI")
	            openai_label = "gpt-4o-mini (fallback)"

	        if not os.getenv("OPENAI_API_KEY"):
	            # The config below reads its token from this variable, so without
	            # it the LLM filter cannot authenticate.
	            print("⚠️ OPENAI_API_KEY not found, skipping LLM content filtering")
	            return None

	        llm_config = LLMConfig(
	            provider=self._OPENAI_MODEL,
	            api_token="env:OPENAI_API_KEY"
	        )
	        print(f"🧠 Using OpenAI for content filtering: {openai_label}")
	        return llm_config

	    def _build_markdown_generator(self, topic: str) -> Any:
	        """Return a markdown generator, LLM-filtered for ``topic`` when possible.

	        Falls back to an unfiltered DefaultMarkdownGenerator when no API key is
	        available or the filter cannot be configured.
	        """
	        try:
	            llm_config = self._build_llm_config()
	            if llm_config is not None:
	                content_filter = LLMContentFilter(
	                    llm_config=llm_config,
	                    instruction=f"""
	Extract educational content related to "{topic}".
	Focus on:
	- Key concepts and explanations
	- Practical examples and tutorials
	- Technical details and specifications
	- Best practices and guidelines
	- Code examples and implementations

	Exclude:
	- Navigation menus and sidebars
	- Advertisements and promotional content
	- Footer content and legal text
	- Unrelated content

	Format as clean markdown with proper headers and code blocks.
	""",
	                    chunk_token_threshold=1000,
	                    verbose=False
	                )
	                return DefaultMarkdownGenerator(
	                    content_filter=content_filter,
	                    options={"ignore_links": False}
	                )
	        except Exception as e:
	            print(f"⚠️ Could not configure LLM content filter: {e}")

	        # Fallback to basic markdown generator
	        return DefaultMarkdownGenerator(
	            options={"ignore_links": False}
	        )

	    async def extract_content(self, urls: List[str], topic: str) -> List[Dict[str, Any]]:
	        """Extract content from URLs using Crawl4AI with LLM filtering.

	        Uses the requests/BeautifulSoup fallback when Crawl4AI is unavailable,
	        when the Playwright Chromium binary is missing, or when crawling fails
	        with a Playwright-related error. At most ``max_crawl_pages`` URLs are
	        processed. Returns one record per attempted URL, or an empty list when
	        crawling fails for a reason unrelated to Playwright.
	        """
	        # If Crawl4AI is not available, use fallback immediately
	        if not CRAWL4AI_AVAILABLE:
	            print("🔄 Using fallback content extraction (Crawl4AI not available)")
	            return await self._fallback_extract_content(urls)

	        # Check if Playwright browsers are installed
	        try:
	            from playwright.async_api import async_playwright
	            async with async_playwright() as p:
	                browser_path = p.chromium.executable_path
	                browser_missing = not browser_path or not os.path.exists(browser_path)
	        except Exception as e:
	            print(f"🔄 Playwright check failed ({e}), using fallback content extraction")
	            return await self._fallback_extract_content(urls)

	        # Decide after the probe context has closed, so the Playwright driver is
	        # not kept alive for the whole duration of the fallback scrape.
	        if browser_missing:
	            print("🔄 Playwright browsers not installed, using fallback content extraction")
	            return await self._fallback_extract_content(urls)

	        try:
	            print(f"📄 Extracting content from {len(urls)} URLs...")

	            run_config = CrawlerRunConfig(
	                markdown_generator=self._build_markdown_generator(topic),
	                cache_mode=CacheMode.BYPASS,
	                wait_for_images=False,
	                process_iframes=False,
	                remove_overlay_elements=True
	            )

	            targets = urls[:self.max_crawl_pages]
	            extracted_content = []

	            async with AsyncWebCrawler(config=self.browser_config) as crawler:
	                for i, url in enumerate(targets, start=1):
	                    try:
	                        print(f"📖 Crawling {i}/{len(targets)}: {url}")

	                        result = await crawler.arun(url=url, config=run_config)
	                        markdown = self._result_markdown(result)

	                        if result.success and markdown:
	                            word_count = len(markdown.split())
	                            # metadata (or its title) may be None; that must not
	                            # turn a successful crawl into a failure record.
	                            metadata = result.metadata or {}
	                            extracted_content.append({
	                                "url": url,
	                                "title": metadata.get("title") or "",
	                                "content": markdown,
	                                "word_count": word_count,
	                                "extraction_success": True
	                            })
	                            print(f"✅ Extracted {word_count} words from {url}")
	                        else:
	                            print(f"⚠️ Failed to extract content from {url}: {result.error_message}")
	                            extracted_content.append(
	                                self._failed_extraction(url, result.error_message)
	                            )

	                    except Exception as e:
	                        logger.error(f"Error crawling {url}: {e}")
	                        print(f"❌ Error crawling {url}: {e}")
	                        extracted_content.append(self._failed_extraction(url, str(e)))

	            successful_extractions = [c for c in extracted_content if c["extraction_success"]]
	            # Report against the URLs actually attempted, not the uncapped input list.
	            print(f"✅ Successfully extracted content from {len(successful_extractions)}/{len(extracted_content)} URLs")

	            return extracted_content

	        except Exception as e:
	            logger.error(f"Content extraction failed: {e}")
	            print(f"❌ Content extraction failed: {e}")

	            # If Crawl4AI fails (likely due to Playwright), try fallback
	            error_str = str(e)
	            if any(marker in error_str for marker in self._PLAYWRIGHT_ERROR_MARKERS):
	                print("🔄 Playwright browser binaries not available, falling back to simple web scraping")
	                return await self._fallback_extract_content(urls)

	            return []

	    # ------------------------------------------------------------------
	    # End-to-end workflow
	    # ------------------------------------------------------------------

	    async def research_topic(self, topic: str) -> Dict[str, Any]:
	        """Complete research workflow: search + extract + summarize.

	        Returns a dict with ``topic``, ``search_results``, ``extracted_content``,
	        ``summary``, ``total_words``, ``successful_sources`` and ``success``.
	        ``success`` is False when the search returned nothing or an unexpected
	        error occurred.
	        """
	        try:
	            print(f"🚀 Starting comprehensive research for: {topic}")

	            # Step 1: Search for relevant URLs
	            search_results = await self.search_topic(topic)

	            if not search_results:
	                return self._research_failure(topic, f"No search results found for {topic}")

	            # Step 2: Extract content from top URLs (results without a URL and
	            # repeated URLs are skipped so they do not use up crawl slots)
	            urls = self._candidate_urls(search_results)
	            extracted_content = await self.extract_content(urls, topic)

	            # Step 3: Compile research summary
	            successful_content = [c for c in extracted_content if c["extraction_success"]]
	            total_words = sum(c["word_count"] for c in successful_content)

	            summary = f"""
	Research completed for "{topic}":
	- Found {len(search_results)} search results
	- Successfully extracted content from {len(successful_content)} sources
	- Total content: {total_words} words
	- Sources include educational articles, documentation, and tutorials
	"""

	            print(f"🎉 Research completed: {len(successful_content)} sources, {total_words} words")

	            return {
	                "topic": topic,
	                "search_results": search_results,
	                "extracted_content": extracted_content,
	                "summary": summary.strip(),
	                "total_words": total_words,
	                "successful_sources": len(successful_content),
	                "success": True
	            }

	        except Exception as e:
	            logger.error(f"Research failed: {e}")
	            print(f"❌ Research failed: {e}")
	            return self._research_failure(topic, f"Research failed for {topic}: {str(e)}")


	async def research_topic(topic: str, llm_provider: str = "openai") -> Dict[str, Any]:
	    """Run the full research workflow for ``topic`` on a fresh WebResearcher.

	    ``llm_provider`` is forwarded to the WebResearcher constructor; the
	    return value is whatever ``WebResearcher.research_topic`` produces.
	    """
	    return await WebResearcher(llm_provider=llm_provider).research_topic(topic)