# Provenance: commit 4bc06b1 — "Enh search strats" (author: LiamKhoaLe)
import logging
import time
from typing import List, Dict
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
logger = logging.getLogger(__name__)
class CookingSearchEngine:
    """Specialized cooking search engine with curated sources.

    Maintains a persistent HTTP session and a registry of recipe sites,
    each described by its base/search URLs, accepted domain, supported
    search query parameters, and a priority tier (1 is searched first).
    """

    def __init__(self, timeout: int = 15):
        """Create the engine.

        Args:
            timeout: Per-request timeout in seconds applied to all HTTP calls.
        """
        self.session = requests.Session()
        # Browser-like User-Agent so the recipe sites serve their normal HTML.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        })
        self.timeout = timeout
        # Every curated site has the same config shape, so expand a compact
        # (key, base_url, domain, priority) spec table into the full dicts.
        site_specs = [
            ('allrecipes', 'https://www.allrecipes.com', 'allrecipes.com', 1),
            ('food_network', 'https://www.foodnetwork.com', 'foodnetwork.com', 1),
            ('epicurious', 'https://www.epicurious.com', 'epicurious.com', 1),
            ('serious_eats', 'https://www.seriouseats.com', 'seriouseats.com', 1),
            ('bon_appetit', 'https://www.bonappetit.com', 'bonappetit.com', 1),
            ('taste_of_home', 'https://www.tasteofhome.com', 'tasteofhome.com', 2),
            ('food_com', 'https://www.food.com', 'food.com', 2),
            ('bbc_good_food', 'https://www.bbcgoodfood.com', 'bbcgoodfood.com', 2),
            ('martha_stewart', 'https://www.marthastewart.com', 'marthastewart.com', 2),
            ('king_arthur_baking', 'https://www.kingarthurbaking.com', 'kingarthurbaking.com', 2),
        ]
        # Comprehensive cooking sources with enhanced search strategies.
        self.cooking_sources = {
            name: {
                'base_url': base_url,
                'search_url': base_url + '/search',
                'domains': [domain],
                'search_params': ['q', 'query', 'search'],
                'priority': priority,
            }
            for name, base_url, domain, priority in site_specs
        }
def search(self, query: str, num_results: int = 10) -> List[Dict]:
    """Search cooking sources for relevant information with enhanced strategies.

    Runs four strategies in order, stopping early once enough results exist:
    (1) priority-tiered source searches over query variations,
    (2) recipe-specific searches, (3) technique-specific searches,
    (4) static fallback landing pages.

    Args:
        query: Free-text cooking question or recipe name.
        num_results: Maximum number of results to return.

    Returns:
        Up to ``num_results`` unique dicts with 'url', 'title', 'source'
        and 'domain' keys, in the order the strategies produced them.
    """
    results = []
    # Enhanced query processing: expand the raw query into up to 5 variants.
    enhanced_queries = self._create_enhanced_queries(query)
    logger.info(f"Enhanced queries for cooking search: {enhanced_queries}")
    # Strategy 1: Priority-based source searches
    priority_sources = self._get_priority_sources()
    for priority_level in [1, 2]:  # Search priority 1 sources first, then priority 2
        if len(results) >= num_results:
            break
        for source_name in priority_sources.get(priority_level, []):
            if len(results) >= num_results:
                break
            source_config = self.cooking_sources[source_name]
            # Try multiple query variations for each source
            for query_variant in enhanced_queries:
                if len(results) >= num_results:
                    break
                source_results = self._search_cooking_source(query_variant, source_name, source_config)
                if source_results:
                    results.extend(source_results)
                    logger.info(f"{source_name} found {len(source_results)} results for query: {query_variant}")
                    break  # Move to next source if we found results
            # Add delay between requests — crude per-source rate limiting.
            # NOTE(review): placement inferred from the original comment; the
            # paste's indentation was lost, so confirm it sits per-source.
            time.sleep(0.3)
    # Strategy 2: Recipe-specific searches if we need more results
    if len(results) < num_results:
        recipe_results = self._search_recipe_specific(query, num_results - len(results))
        results.extend(recipe_results)
    # Strategy 3: Technique-specific searches
    if len(results) < num_results:
        technique_results = self._search_technique_specific(query, num_results - len(results))
        results.extend(technique_results)
    # Strategy 4: Cooking fallback sources
    if len(results) < num_results:
        fallback_results = self._get_fallback_sources(query, num_results - len(results))
        results.extend(fallback_results)
    # Remove duplicates (by URL, first occurrence wins) and return top results.
    unique_results = self._remove_duplicates(results)
    return unique_results[:num_results]
def _search_cooking_source(self, query: str, source_name: str, source_config: Dict) -> List[Dict]:
    """Search a specific cooking source and scrape result links.

    Args:
        query: Query string to submit to the site's search page.
        source_name: Key into the selector map (e.g. 'allrecipes').
        source_config: Config dict with 'search_url', 'base_url', 'domains'.

    Returns:
        Up to 3 result dicts ('url', 'title', 'source', 'domain');
        empty list on any request/parsing failure (best-effort by design).
    """
    try:
        search_url = source_config.get('search_url')
        if not search_url:
            return []
        # Send the query under every param name the curated sites accept;
        # sites simply ignore the params they don't recognize.
        params = {
            'q': query,
            'query': query,
            'search': query
        }
        response = self.session.get(search_url, params=params, timeout=self.timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        results = []
        links = []  # explicit init so an empty selector list can't leave it unbound
        # Source-specific selectors, tried in order; first hit wins.
        selectors = self._get_source_selectors(source_name)
        for selector in selectors:
            links = soup.select(selector)
            if links:
                logger.info(f"{source_name} found {len(links)} results with selector: {selector}")
                break
        base_url = source_config['base_url']
        seen_hrefs = set()  # avoid emitting the same link twice from one page
        for link in links[:3]:  # Limit per source
            try:
                href = link.get('href')
                if not href:
                    continue
                # BUG FIX: the old code only absolutized hrefs starting with
                # '/', silently dropping other relative forms ('recipe/x',
                # '//host/x'). urljoin handles all of them and leaves
                # already-absolute URLs untouched.
                href = urljoin(base_url, href)
                title = link.get_text(strip=True)
                if title and href.startswith('http') and href not in seen_hrefs:
                    seen_hrefs.add(href)
                    results.append({
                        'url': href,
                        'title': title,
                        'source': source_name,
                        'domain': source_config['domains'][0]
                    })
            except Exception as e:
                logger.debug(f"Error parsing {source_name} link: {e}")
                continue
        return results
    except Exception as e:
        # Best-effort: a failing source must not abort the overall search.
        logger.warning(f"Cooking source {source_name} search failed: {e}")
        return []
def _get_source_selectors(self, source_name: str) -> List[str]:
"""Get CSS selectors for specific cooking sources"""
selectors_map = {
'allrecipes': [
'a[href*="/recipe/"]',
'a[href*="/recipes/"]',
'.search-result a',
'.result-title a'
],
'food_network': [
'a[href*="/recipes/"]',
'.search-result a',
'.result-title a',
'a[href*="/recipe/"]'
],
'epicurious': [
'a[href*="/recipes/"]',
'.search-result a',
'.result-title a',
'a[href*="/recipe/"]'
],
'serious_eats': [
'a[href*="/recipes/"]',
'.search-result a',
'.result-title a',
'a[href*="/recipe/"]'
],
'bon_appetit': [
'a[href*="/recipes/"]',
'.search-result a',
'.result-title a',
'a[href*="/recipe/"]'
]
}
return selectors_map.get(source_name, ['a[href*="http"]'])
def _get_fallback_sources(self, query: str, num_results: int) -> List[Dict]:
"""Get fallback cooking sources when direct search fails"""
fallback_sources = [
{
'url': 'https://www.allrecipes.com/recipes',
'title': f'AllRecipes: {query}',
'source': 'allrecipes_fallback',
'domain': 'allrecipes.com'
},
{
'url': 'https://www.foodnetwork.com/recipes',
'title': f'Food Network: {query}',
'source': 'foodnetwork_fallback',
'domain': 'foodnetwork.com'
},
{
'url': 'https://www.epicurious.com/recipes-menus',
'title': f'Epicurious: {query}',
'source': 'epicurious_fallback',
'domain': 'epicurious.com'
},
{
'url': 'https://www.seriouseats.com/recipes',
'title': f'Serious Eats: {query}',
'source': 'seriouseats_fallback',
'domain': 'seriouseats.com'
},
{
'url': 'https://www.bonappetit.com/recipes',
'title': f'Bon Appétit: {query}',
'source': 'bonappetit_fallback',
'domain': 'bonappetit.com'
}
]
return fallback_sources[:num_results]
def _create_enhanced_queries(self, query: str) -> List[str]:
"""Create enhanced query variations for better cooking search results"""
import re
# Clean the base query
base_query = re.sub(r'[^\w\s\-\.]', ' ', query).strip()
base_query = re.sub(r'\s+', ' ', base_query)
enhanced_queries = [base_query]
# Add cooking-specific enhancements
cooking_enhancements = [
f"{base_query} recipe",
f"{base_query} cooking method",
f"{base_query} how to cook",
f"{base_query} ingredients",
f"{base_query} technique",
f"{base_query} tutorial"
]
# Add technique-specific queries
cooking_techniques = ['bake', 'roast', 'grill', 'fry', 'boil', 'steam', 'sauté', 'braise', 'poach']
for technique in cooking_techniques:
if technique in base_query.lower():
enhanced_queries.append(f"{base_query} {technique} method")
enhanced_queries.append(f"how to {technique} {base_query}")
# Add cuisine-specific enhancements
cuisines = ['italian', 'chinese', 'mexican', 'french', 'indian', 'thai', 'japanese', 'mediterranean']
for cuisine in cuisines:
if cuisine in base_query.lower():
enhanced_queries.append(f"{cuisine} {base_query} recipe")
enhanced_queries.append(f"authentic {cuisine} {base_query}")
# Remove duplicates and limit
unique_queries = list(dict.fromkeys(enhanced_queries))
return unique_queries[:5] # Limit to 5 query variations
def _get_priority_sources(self) -> Dict[int, List[str]]:
"""Get sources organized by priority"""
priority_sources = {1: [], 2: []}
for source_name, config in self.cooking_sources.items():
priority = config.get('priority', 2)
priority_sources[priority].append(source_name)
return priority_sources
def _search_recipe_specific(self, query: str, num_results: int) -> List[Dict]:
    """Search top-priority sources with recipe-oriented query phrasings.

    Args:
        query: Base user query.
        num_results: Maximum number of results to collect.

    Returns:
        Up to ``num_results`` result dicts from priority-1 sources.
    """
    recipe_queries = [
        f"{query} recipe ingredients",
        f"{query} recipe instructions",
        f"{query} recipe steps",
        f"how to make {query}",
        f"{query} cooking recipe"
    ]
    results = []
    # PERF FIX: the priority map was recomputed on every query iteration;
    # it never changes during this call, so compute it once.
    top_sources = self._get_priority_sources().get(1, [])
    for recipe_query in recipe_queries:
        if len(results) >= num_results:
            break
        # Search top priority sources for recipe content
        for source_name in top_sources:
            if len(results) >= num_results:
                break
            source_config = self.cooking_sources[source_name]
            source_results = self._search_cooking_source(recipe_query, source_name, source_config)
            results.extend(source_results)
            time.sleep(0.2)  # crude rate limiting between site requests
    return results[:num_results]
def _search_technique_specific(self, query: str, num_results: int) -> List[Dict]:
    """Search top-priority sources with technique-oriented query phrasings.

    Args:
        query: Base user query.
        num_results: Maximum number of results to collect.

    Returns:
        Up to ``num_results`` result dicts from priority-1 sources.
    """
    technique_queries = [
        f"{query} cooking technique",
        f"{query} cooking method",
        f"how to cook {query}",
        f"{query} preparation method",
        f"{query} cooking tips"
    ]
    results = []
    # PERF FIX: the priority map was recomputed on every query iteration;
    # it never changes during this call, so compute it once.
    top_sources = self._get_priority_sources().get(1, [])
    for technique_query in technique_queries:
        if len(results) >= num_results:
            break
        # Search priority sources for technique content
        for source_name in top_sources:
            if len(results) >= num_results:
                break
            source_config = self.cooking_sources[source_name]
            source_results = self._search_cooking_source(technique_query, source_name, source_config)
            results.extend(source_results)
            time.sleep(0.2)  # crude rate limiting between site requests
    return results[:num_results]
def _remove_duplicates(self, results: List[Dict]) -> List[Dict]:
"""Remove duplicate results based on URL"""
seen_urls = set()
unique_results = []
for result in results:
url = result.get('url', '')
if url and url not in seen_urls:
seen_urls.add(url)
unique_results.append(result)
return unique_results