|
from fastapi import FastAPI, HTTPException, Query, Header
|
from pydantic import BaseModel, ConfigDict |
|
import openai |
|
from duckduckgo_search import DDGS |
|
from googleapiclient.discovery import build |
|
import concurrent.futures

import threading
|
from docling.document_converter import DocumentConverter |
|
import dotenv |
|
import os |
|
import time |
|
import fastapi |
|
import sys |
|
import pytz |
|
from datetime import datetime |
|
import uvicorn |
|
import logging |
|
|
|
|
|
dotenv.load_dotenv() |
|
SEARCH_ENGINE = os.getenv("SEARCH_ENGINE", "duckduckgo") |
|
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY") |
|
GOOGLE_CSE_ID = os.getenv("GOOGLE_CSE_ID") |
|
|
|
|
|
openai_api_key = os.getenv("OPENAI_API_KEY")

api_key = os.getenv("API_KEY")  # shared secret checked against the "Auth" request header

client = openai.Client(api_key=openai_api_key)
|
|
|
app = FastAPI( |
|
title="Search & Summary API", |
|
description=""" |
|
This API provides an enhanced search service that combines web search results (DuckDuckGo or Google Custom Search, selected via the SEARCH_ENGINE environment variable) with AI-powered summaries.
|
|
|
## Features |
|
|
|
- **Web Search**: Queries DuckDuckGo or Google Custom Search (configurable) to find relevant web pages
|
- **Content Processing**: Extracts and converts web content to markdown format |
|
- **AI Summaries**: Generates intelligent summaries of search results using OpenAI's GPT model |
|
- **Concurrent Processing**: Handles multiple results simultaneously for better performance |
|
|
|
## Main Endpoints |
|
|
|
### /search |
|
The main search endpoint accepts the following parameters: |
|
- `query` (required): The search term or phrase |
|
- `timelimit` (optional): Time range for results ('d' for day, 'w' for week, 'm' for month, 'y' for year) |
|
- `region` (optional): Geographic region for results (e.g., 'us-en', 'es-es') |
|
- `max_results` (optional): Maximum number of results to return (default: 3) |
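
Example call with Python's `requests` (hypothetical host and key):

```python
import requests

resp = requests.get(
    "http://localhost:8000/search",
    params={"query": "quantum computing", "max_results": 3},
    headers={"Auth": "your-api-key"},
)
print(resp.json()["results"][0]["url_summary"])
```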
|
|
|
### /fetch |
|
The fetch endpoint retrieves and summarizes content from a specific URL: |
|
- `url` (required): The URL to fetch and summarize |
|
|
|
## Response Format |
|
|
|
The API returns: |
|
- Search results with URLs and snippets |
|
- AI-generated summaries for each result |
|
- Processing time information |
|
""", |
|
version="1.0.0", |
|
contact={ |
|
"name": "API Support", |
|
"email": "your-email@example.com", |
|
}, |
|
license_info={ |
|
"name": "MIT", |
|
} |
|
) |
|
|
|
|
|
class SearchResult(BaseModel): |
|
url: str |
|
|
|
url_snippet: str |
|
url_summary: str |
|
input_tokens: int |
|
completion_tokens: int |
|
|
|
class SearchResponse(BaseModel): |
|
query: str |
|
results: list[SearchResult] |
|
processing_time: float |
|
timestamp: str |
|
total_input_tokens: int |
|
total_completion_tokens: int |
|
total_url_summaries_ok: int |
|
total_url_summaries_nok: int |
|
|
|
class VersionInfo(BaseModel): |
|
python: str = "Python Runtime" |
|
openai: str = "OpenAI Language Model API" |
|
fastapi: str = "API Server Framework" |
|
|
|
model_config = ConfigDict( |
|
title="Version Information", |
|
description="Current versions of all major dependencies", |
|
json_schema_extra={ |
|
"example": { |
|
"python": "3.11.4 [CPython] - Python Runtime", |
|
"openai": "1.12.0 - AI Language Model API", |
|
"fastapi": "0.109.0 - API Server Framework" |
|
} |
|
} |
|
) |
|
|
|
class FetchResult(BaseModel): |
|
url: str |
|
url_md: str |
|
content_summary: str |
|
processing_time: float |
|
timestamp: str |
|
input_tokens: int |
|
completion_tokens: int |
|
total_url_summaries_ok: int |
|
total_url_summaries_nok: int |
|
|
|
class SearchEngineFactory: |
|
@staticmethod |
|
def create_search_engine(): |
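        # the SEARCH_ENGINE env var picks the backend; anything but "google" falls back to DuckDuckGo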
|
if SEARCH_ENGINE.lower() == "google": |
|
return GoogleSearchEngine() |
|
return DuckDuckGoSearchEngine() |
|
|
|
class SearchEngineBase: |
|
async def search(self, query: str, max_results: int, **kwargs) -> list: |
|
        raise NotImplementedError
|
|
|
class DuckDuckGoSearchEngine(SearchEngineBase): |
|
async def search(self, query: str, max_results: int, timelimit: str = "m", region: str = "us-en") -> list: |
|
try: |
|
with DDGS() as ddgs: |
|
results = list(ddgs.text(query, max_results=max_results, timelimit=timelimit, region=region)) |
|
return [ |
|
{"href": r["link"], "body": r["body"]} for r in results |
|
] if results else [] |
|
except Exception as e: |
|
raise HTTPException(status_code=500, detail=f"DuckDuckGo search error: {str(e)}") |
|
|
|
class GoogleSearchEngine(SearchEngineBase): |
|
def __init__(self): |
|
if not GOOGLE_API_KEY or not GOOGLE_CSE_ID: |
|
raise ValueError("Google API key and Custom Search Engine ID are required") |
|
self.service = build("customsearch", "v1", developerKey=GOOGLE_API_KEY) |
|
|
|
    def _convert_timelimit(self, timelimit: str) -> str | None:
|
"""Convert DuckDuckGo timelimit format to Google dateRestrict format""" |
|
if not timelimit: |
|
return None |
|
|
|
|
|
time_map = { |
|
'd': 'd', |
|
'w': 'w', |
|
'm': 'm', |
|
'y': 'y' |
|
} |
|
|
|
|
|
unit = timelimit[0].lower() |
|
number = timelimit[1:] if len(timelimit) > 1 else '1' |
|
|
|
print(f"unit: {unit}, number: {number}") |
|
|
|
if unit not in time_map: |
|
return None |
|
|
|
try: |
|
int(number) |
|
return f"{time_map[unit]}{number}" |
|
except ValueError: |
|
return None |
|
|
|
    def _convert_region_to_lang(self, region: str) -> str | None:
|
"""Convert DuckDuckGo region format to Google Search API language restriction""" |
|
if not region: |
|
return None |
|
|
|
|
|
try: |
|
lang_code = region.split('-')[1].lower() |
|
except IndexError: |
|
return None |
|
|
|
|
|
lang_map = { |
|
'ar': 'lang_ar', |
|
'bg': 'lang_bg', |
|
'ca': 'lang_ca', |
|
'cs': 'lang_cs', |
|
'da': 'lang_da', |
|
'de': 'lang_de', |
|
'el': 'lang_el', |
|
'en': 'lang_en', |
|
'es': 'lang_es', |
|
'et': 'lang_et', |
|
'fi': 'lang_fi', |
|
'fr': 'lang_fr', |
|
'hr': 'lang_hr', |
|
'hu': 'lang_hu', |
|
'id': 'lang_id', |
|
'is': 'lang_is', |
|
'it': 'lang_it', |
|
'iw': 'lang_iw', |
|
'ja': 'lang_ja', |
|
'ko': 'lang_ko', |
|
'lt': 'lang_lt', |
|
'lv': 'lang_lv', |
|
'nl': 'lang_nl', |
|
'no': 'lang_no', |
|
'pl': 'lang_pl', |
|
'pt': 'lang_pt', |
|
'ro': 'lang_ro', |
|
'ru': 'lang_ru', |
|
'sk': 'lang_sk', |
|
'sl': 'lang_sl', |
|
'sr': 'lang_sr', |
|
'sv': 'lang_sv', |
|
'tr': 'lang_tr', |
|
'zh': 'lang_zh-CN' |
|
} |
|
|
|
print(f"Converting region {region} to language code {lang_code}") |
|
return lang_map.get(lang_code) |
|
|
|
async def search(self, query: str, max_results: int, timelimit: str = "m", region: str = "us-en", **kwargs) -> list: |
|
try: |
|
results = [] |
|
date_restrict = self._convert_timelimit(timelimit) |
|
language = self._convert_region_to_lang(region) |
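            # Google CSE returns at most 10 items per call, so page through with the 1-based "start" offset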
|
|
|
for i in range(0, max_results, 10): |
|
search_params = { |
|
'q': query, |
|
'cx': GOOGLE_CSE_ID, |
|
'start': i + 1, |
|
'num': min(10, max_results - i) |
|
} |
|
|
|
if date_restrict: |
|
print(f"Adding dateRestrict: {date_restrict}") |
|
search_params['dateRestrict'] = date_restrict |
|
|
|
if language: |
|
print(f"Adding language restriction: {language}") |
|
search_params['lr'] = language |
|
|
|
try: |
|
response = self.service.cse().list(**search_params).execute() |
|
|
|
if "items" not in response: |
|
print(f"No results found for query: {query}") |
|
continue |
|
|
|
results.extend([ |
|
{ |
|
"href": item["link"], |
|
"body": item.get("snippet", "") |
|
} for item in response["items"] |
|
]) |
|
except Exception as search_error: |
|
print(f"Error during Google search: {str(search_error)}") |
|
raise HTTPException(status_code=500, detail=f"Google search error: {str(search_error)}") |
|
|
|
if not results: |
|
|
|
raise HTTPException( |
|
status_code=404, |
|
detail=f"No results found for query: {query}" |
|
) |
|
|
|
return results[:max_results] |
|
|
|
        except HTTPException:

            raise
|
except Exception as e: |
|
raise HTTPException(status_code=500, detail=f"Google search error: {str(e)}") |
|
|
|
@app.get("/", |
|
response_model=VersionInfo, |
|
summary="API Version Information", |
|
description="Returns version information for all major dependencies including Python runtime", |
|
operation_id="get_version_info" |
|
) |
|
async def get_version_info(): |
|
python_version = f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro} [CPython]" |
|
return VersionInfo( |
|
python=f"{python_version} - Python Runtime", |
|
openai=f"{getattr(openai, '__version__', 'unknown')} - OpenAI Language Model API", |
|
fastapi=f"{fastapi.__version__} - API Server Framework" |
|
) |
|
|
|
@app.get("/search", |
|
response_model=SearchResponse, |
|
summary="Search and summarize web content", |
|
description=""" |
|
Performs a web search and generates AI-powered summaries of the results. |
|
|
|
The search process involves: |
|
1. Querying the configured search engine (DuckDuckGo or Google) for relevant web pages
|
2. Converting found web content to markdown format |
|
3. Generating intelligent summaries using OpenAI's GPT model |
|
4. Processing results concurrently for improved performance |
|
|
|
Returns a detailed response including URLs, snippets, and AI-generated summaries. |
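
An illustrative (trimmed) response, with made-up values:

```python
{
    "query": "quantum computing",
    "results": [
        {
            "url": "https://example.com/article",
            "url_snippet": "...",
            "url_summary": "...",
            "input_tokens": 1500,
            "completion_tokens": 400,
        }
    ],
    "processing_time": 8.42,
    "timestamp": "2025-01-15 12:34:56 CET",
    "total_input_tokens": 1500,
    "total_completion_tokens": 400,
    "total_url_summaries_ok": 1,
    "total_url_summaries_nok": 0,
}
```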
|
""", |
|
response_description="A list of search results with AI-generated summaries", |
|
operation_id="search_and_summarize" |
|
) |
|
async def search( |
|
query: str = Query(..., description="The search query term or phrase"), |
|
    timelimit: str = Query("m", description="Time range for results: 'd' (day), 'w' (week), 'm' (month), 'y' (year); mapped to dateRestrict for Google"),
|
    region: str = Query("us-en", description="Geographic region for results (e.g., 'us-en', 'es-es'); mapped to a language restriction for Google"),
|
max_results: int = Query(3, description="Maximum number of results to return", ge=1, le=10), |
|
authorization: str = Header(..., description="API key for authorization", alias="Auth") |
|
): |
|
if authorization != api_key: |
|
raise HTTPException(status_code=401, detail="Unauthorized") |
|
|
|
start_time = time.time() |
|
madrid_tz = pytz.timezone('Europe/Madrid') |
|
current_time = datetime.now(madrid_tz).strftime('%Y-%m-%d %H:%M:%S %Z') |
|
|
|
|
|
total_input_tokens = 0 |
|
total_completion_tokens = 0 |
|
total_url_summaries_ok = 0 |
|
total_url_summaries_nok = 0 |
|
|
|
|
|
print(f"Using search engine: {SEARCH_ENGINE}") |
|
search_engine = SearchEngineFactory.create_search_engine() |
|
|
|
try: |
|
|
|
search_results = await search_engine.search( |
|
query=query, |
|
max_results=max_results, |
|
timelimit=timelimit, |
|
region=region |
|
) |
|
|
|
if not search_results: |
|
return SearchResponse( |
|
query=query, |
|
results=[], |
|
processing_time=round(time.time() - start_time, 2), |
|
timestamp=current_time, |
|
total_input_tokens=0, |
|
total_completion_tokens=0, |
|
total_url_summaries_ok=0, |
|
total_url_summaries_nok=0 |
|
) |
|
|
|
    except HTTPException:

        raise
|
except Exception as e: |
|
|
|
if "No results found" in str(e): |
|
return SearchResponse( |
|
query=query, |
|
results=[], |
|
processing_time=round(time.time() - start_time, 2), |
|
timestamp=current_time, |
|
total_input_tokens=0, |
|
total_completion_tokens=0, |
|
total_url_summaries_ok=0, |
|
total_url_summaries_nok=0 |
|
) |
|
raise HTTPException(status_code=500, detail=f"Search error: {str(e)}") |
|
|
|
|
|
converter = DocumentConverter() |
|
|
|
    counter_lock = threading.Lock()  # "+=" on the shared counters below is not atomic across threads

    def process_result(result):

        nonlocal total_url_summaries_ok, total_url_summaries_nok
|
url = result.get('href', '') |
|
try: |
|
content = converter.convert(url) |
|
url_md = content.document.export_to_markdown() |
|
except Exception as exc: |
|
print(f"Error converting {url}: {exc}") |
|
url_md = f"content error: {str(exc)}" |
|
|
|
|
|
        prompt = (

            "OBJECTIVE: "

            f"Create a detailed summary of the provided markdown content, focusing on the topic <{query}>. "

            f"Your task is to distill this information into a focused summary that emphasizes the aspects related to <{query}>."

            "\n\nINSTRUCTIONS: "

            f"Analyze the markdown content and extract the key points related to <{query}>. Your summary should capture the essential details and insights in a clear and verbose manner."

            "\n\nFormat: Provide the summary as three well-organized paragraphs in markdown format."

            "\n\nCONTEXT: The markdown content provided below contains detailed information:"

            "\n\n<content>"

            f"{url_md}"

            "\n\n</content>"

            "\n\nSummary:"

        )
|
|
|
if url_md.startswith("content error:"): |
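            # conversion failed upstream; record the failure and skip the model call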
|
summary = f"Summary error: Failed to convert URL content - {url_md}" |
|
input_tokens = 0 |
|
completion_tokens = 0 |
|
            with counter_lock:

                total_url_summaries_nok += 1
|
else: |
|
try: |
|
response = client.chat.completions.create( |
|
model="gpt-4o-mini", |
|
messages=[ |
|
{"role": "developer", "content": "You are a technology expert."}, |
|
{"role": "user", "content": prompt} |
|
], |
|
temperature=0.3, |
|
max_completion_tokens=1000 |
|
) |
|
|
|
summary = response.choices[0].message.content.strip() |
|
summary = summary.replace("```markdown", "").replace("```", "").strip() |
|
input_tokens = response.usage.prompt_tokens |
|
completion_tokens = response.usage.completion_tokens |
|
                with counter_lock:

                    total_url_summaries_ok += 1
|
except Exception as exc: |
|
print(f"Error generating summary for {url}: {exc}") |
|
summary = f"Summary error: Failed to generate summary - {str(exc)}" |
|
input_tokens = 0 |
|
completion_tokens = 0 |
|
                with counter_lock:

                    total_url_summaries_nok += 1
|
|
|
return SearchResult( |
|
url=url, |
|
|
|
url_snippet=result.get('body', ''), |
|
url_summary=summary, |
|
input_tokens=input_tokens, |
|
completion_tokens=completion_tokens |
|
) |
|
|
|
|
|
with concurrent.futures.ThreadPoolExecutor() as executor: |
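        # docling conversion and the OpenAI call are I/O bound, so threads can overlap them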
|
results = list(executor.map(process_result, search_results)) |
|
|
|
processing_time = time.time() - start_time |
|
|
|
|
|
total_input_tokens = sum(r.input_tokens for r in results) |
|
total_completion_tokens = sum(r.completion_tokens for r in results) |
|
|
|
return SearchResponse( |
|
query=query, |
|
results=results, |
|
processing_time=round(processing_time, 2), |
|
timestamp=current_time, |
|
total_input_tokens=total_input_tokens, |
|
total_completion_tokens=total_completion_tokens, |
|
total_url_summaries_ok=total_url_summaries_ok, |
|
total_url_summaries_nok=total_url_summaries_nok |
|
) |
|
|
|
@app.get("/fetch", |
|
response_model=FetchResult, |
|
summary="Fetch and summarize specific URL content", |
|
description=""" |
|
Fetches a specific URL, converts its content to markdown format using docling,
|
and generates an AI-powered summary using OpenAI's GPT model. |
|
|
|
The process involves: |
|
1. Fetching and converting the web content to markdown |
|
2. Generating an intelligent summary using OpenAI's GPT model |
|
|
|
Returns the URL and its AI-generated summary. |
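
Example call with Python's `requests` (hypothetical host and key):

```python
import requests

resp = requests.get(
    "http://localhost:8000/fetch",
    params={"url": "https://example.com/article"},
    headers={"Auth": "your-api-key"},
)
print(resp.json()["content_summary"])
```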
|
""", |
|
response_description="URL content with AI-generated summary", |
|
operation_id="fetch_and_summarize" |
|
) |
|
async def fetch( |
|
url: str = Query(..., description="The URL to crawl and summarize"), |
|
authorization: str = Header(..., description="API key for authorization", alias="Auth") |
|
): |
|
if authorization != api_key: |
|
raise HTTPException(status_code=401, detail="Unauthorized") |
|
|
|
start_time = time.time() |
|
madrid_tz = pytz.timezone('Europe/Madrid') |
|
current_time = datetime.now(madrid_tz).strftime('%Y-%m-%d %H:%M:%S %Z') |
|
total_url_summaries_ok = 0 |
|
total_url_summaries_nok = 0 |
|
|
|
|
|
converter = DocumentConverter() |
|
|
|
try: |
|
content = converter.convert(url) |
|
url_md = content.document.export_to_markdown() |
|
except Exception as exc: |
|
total_url_summaries_nok = 1 |
|
error_msg = f"Error converting URL content: {str(exc)}" |
|
return FetchResult( |
|
url=url, |
|
url_md="", |
|
content_summary=error_msg, |
|
processing_time=round(time.time() - start_time, 2), |
|
timestamp=current_time, |
|
input_tokens=0, |
|
completion_tokens=0, |
|
total_url_summaries_ok=0, |
|
total_url_summaries_nok=1 |
|
) |
|
|
|
|
|
prompt = ( |
|
f"OBJECTIVE: Create a comprehensive summary of the provided webpage content." |
|
f"Your task is to distill the information into a focused and detailed summary." |
|
"\n\nINSTRUCTIONS:" |
|
"Analyze the markdown content and extract the key points. Your summary should capture " |
|
"the essential details and insights in a clear and verbose manner." |
|
"\n\nFormat: Provide the summary as a well-organized three paragraphs in markdown format." |
|
f"\n\nCONTEXT: The markdown content provided below contains detailed information: " |
|
"\n\n<content>" |
|
f"{url_md}" |
|
"\n\n</content>" |
|
"\n\nSummary:" |
|
) |
|
|
|
try: |
|
response = client.chat.completions.create( |
|
model="gpt-4o-mini", |
|
messages=[ |
|
{"role": "developer", "content": "You are a technology expert."}, |
|
{"role": "user", "content": prompt} |
|
], |
|
temperature=0.3, |
|
max_completion_tokens=1000 |
|
) |
|
summary = response.choices[0].message.content.strip() |
|
summary = summary.replace("```markdown", "").replace("```", "").strip() |
|
input_tokens = response.usage.prompt_tokens |
|
completion_tokens = response.usage.completion_tokens |
|
total_url_summaries_ok = 1 |
|
except Exception as exc: |
|
error_msg = f"Error generating summary: {str(exc)}" |
|
return FetchResult( |
|
url=url, |
|
url_md=url_md, |
|
content_summary=error_msg, |
|
processing_time=round(time.time() - start_time, 2), |
|
timestamp=current_time, |
|
input_tokens=0, |
|
completion_tokens=0, |
|
total_url_summaries_ok=0, |
|
total_url_summaries_nok=1 |
|
) |
|
|
|
processing_time = time.time() - start_time |
|
|
|
return FetchResult( |
|
url=url, |
|
url_md=url_md, |
|
content_summary=summary, |
|
processing_time=round(processing_time, 2), |
|
timestamp=current_time, |
|
input_tokens=input_tokens, |
|
completion_tokens=completion_tokens, |
|
total_url_summaries_ok=total_url_summaries_ok, |
|
total_url_summaries_nok=total_url_summaries_nok |
|
) |
|
|
|
if __name__ == "__main__": |
|
|
|
logging.basicConfig( |
|
level=logging.INFO, |
|
format='%(asctime)s - %(levelname)s - %(message)s', |
|
datefmt='%Y-%m-%d %H:%M:%S' |
|
) |
|
|
|
|
|
uvicorn.run( |
|
"index:app", |
|
host="0.0.0.0", |
|
port=8000, |
|
reload=True, |
|
reload_dirs=["./"], |
|
log_level="info" |
|
) |