adding offset to Fetch_Webpage and Search_DuckDuckGo so you can pick up where you left off
app.py
CHANGED
@@ -288,13 +288,24 @@ def _fullpage_markdown_from_soup(full_soup: BeautifulSoup, base_url: str, strip_
     return markdown_text or "No content could be extracted."
 
 
-def _truncate_markdown(markdown: str, max_chars: int) -> str:
+def _truncate_markdown(markdown: str, max_chars: int) -> Tuple[str, Dict[str, any]]:
     """
     Truncate markdown content to a maximum character count while preserving structure.
     Tries to break at paragraph boundaries when possible.
+
+    Returns:
+        Tuple[str, Dict]: (truncated_content, metadata_dict)
+        metadata_dict contains: truncated, returned_chars, total_chars_estimate, next_cursor
     """
-    if len(markdown) <= max_chars:
-        return markdown
+    total_chars = len(markdown)
+
+    if total_chars <= max_chars:
+        return markdown, {
+            "truncated": False,
+            "returned_chars": total_chars,
+            "total_chars_estimate": total_chars,
+            "next_cursor": None
+        }
 
     # Find a good break point near the limit
     truncated = markdown[:max_chars]
@@ -303,14 +314,37 @@ def _truncate_markdown(markdown: str, max_chars: int) -> str:
     last_paragraph = truncated.rfind('\n\n')
     if last_paragraph > max_chars * 0.7:  # If we find a paragraph break in the last 30%
         truncated = truncated[:last_paragraph]
-
+        cursor_pos = last_paragraph
     # Try to break at the end of a sentence
     elif '.' in truncated[-100:]:  # Look for a period in the last 100 chars
         last_period = truncated.rfind('.')
         if last_period > max_chars * 0.8:  # If we find a period in the last 20%
             truncated = truncated[:last_period + 1]
+            cursor_pos = last_period + 1
+        else:
+            cursor_pos = len(truncated)
+    else:
+        cursor_pos = len(truncated)
+
+    metadata = {
+        "truncated": True,
+        "returned_chars": len(truncated),
+        "total_chars_estimate": total_chars,
+        "next_cursor": cursor_pos
+    }
+
+    truncated = truncated.rstrip()
+
+    # Add informative truncation notice
+    truncation_notice = (
+        f"\n\n---\n"
+        f"**Content Truncated:** Showing {metadata['returned_chars']:,} of {metadata['total_chars_estimate']:,} characters "
+        f"({(metadata['returned_chars']/metadata['total_chars_estimate']*100):.1f}%)\n"
+        f"**Next cursor:** {metadata['next_cursor']} (use this value with offset parameter for continuation)\n"
+        f"---"
+    )
 
-    return truncated
+    return truncated + truncation_notice, metadata
 
 
 def Fetch_Webpage(  # <-- MCP tool #1 (Fetch)
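The net effect of the two hunks above: `_truncate_markdown` now returns a `(content, metadata)` pair instead of a bare string, and `next_cursor` records the exact character position the cut landed on. A minimal sketch of that contract (an illustrative reimplementation, not the app's code; the real helper also appends the Markdown truncation notice):

```python
# Sketch of the new (content, metadata) contract, assuming the same
# paragraph-boundary heuristic as the diff above.
def truncate_with_cursor(markdown: str, max_chars: int):
    total = len(markdown)
    if total <= max_chars:
        return markdown, {"truncated": False, "returned_chars": total,
                          "total_chars_estimate": total, "next_cursor": None}
    truncated = markdown[:max_chars]
    cut = truncated.rfind("\n\n")      # prefer a paragraph boundary...
    if cut <= max_chars * 0.7:         # ...but only if it falls in the last 30%
        cut = len(truncated)           # otherwise keep the hard cut
    return markdown[:cut], {"truncated": True, "returned_chars": cut,
                            "total_chars_estimate": total, "next_cursor": cut}

text, meta = truncate_with_cursor("intro\n\n" + "x" * 5000, 3000)
assert meta["next_cursor"] == meta["returned_chars"]  # resume point == cut point
```

Because the notice embedded in the returned string repeats `next_cursor`, even a caller that only ever sees the string (an MCP client, say) can recover the resume point.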
@@ -318,6 +352,7 @@ def Fetch_Webpage(  # <-- MCP tool #1 (Fetch)
     max_chars: Annotated[int, "Maximum characters to return (0 = no limit, full page content)."] = 3000,
     strip_selectors: Annotated[str, "CSS selectors to remove (comma-separated, e.g., '.header, .footer, nav')."] = "",
     url_scraper: Annotated[bool, "Extract only links from the page instead of content."] = False,
+    offset: Annotated[int, "Character offset to start from (for pagination, use next_cursor from previous call)."] = 0,
 ) -> str:
     """
     Fetch a web page and return it converted to Markdown format with configurable options.
@@ -331,12 +366,14 @@ def Fetch_Webpage(  # <-- MCP tool #1 (Fetch)
         max_chars (int): Maximum characters to return. Use 0 for no limit (full page).
         strip_selectors (str): CSS selectors to remove before processing (comma-separated).
         url_scraper (bool): If True, extract only links instead of content.
+        offset (int): Character offset to start from (for pagination, use next_cursor from previous call).
 
     Returns:
         str: Either the webpage content converted to Markdown or a list of all links,
-             depending on the url_scraper setting. Content is length-limited by max_chars.
+             depending on the url_scraper setting. Content is length-limited by max_chars
+             and includes detailed truncation metadata when content is truncated.
     """
-    _log_call_start("Fetch_Webpage", url=url, max_chars=max_chars, strip_selectors=strip_selectors, url_scraper=url_scraper)
+    _log_call_start("Fetch_Webpage", url=url, max_chars=max_chars, strip_selectors=strip_selectors, url_scraper=url_scraper, offset=offset)
     if not url or not url.strip():
         result = "Please enter a valid URL."
         _log_call_end("Fetch_Webpage", _truncate_for_log(result))
@@ -367,15 +404,34 @@ def Fetch_Webpage(  # <-- MCP tool #1 (Fetch)
     if url_scraper:
         # Extract links mode
         result = _extract_links_from_soup(full_soup, final_url)
+        # Apply offset and truncation for link extraction too
+        if offset > 0:
+            result = result[offset:]
+        if max_chars > 0 and len(result) > max_chars:
+            result, metadata = _truncate_markdown(result, max_chars)
     else:
         # Convert to markdown mode
+        full_result = _fullpage_markdown_from_soup(full_soup, final_url, strip_selectors)
+
+        # Apply offset if specified
+        if offset > 0:
+            if offset >= len(full_result):
+                result = f"Offset {offset} exceeds content length ({len(full_result)} characters). Content ends at position {len(full_result)}."
+                _log_call_end("Fetch_Webpage", _truncate_for_log(result))
+                return result
+            result = full_result[offset:]
+        else:
+            result = full_result
+
+        # Apply max_chars truncation if specified
+        if max_chars > 0 and len(result) > max_chars:
+            result, metadata = _truncate_markdown(result, max_chars)
+            # Adjust metadata to account for offset
+            if offset > 0:
+                metadata["total_chars_estimate"] = len(full_result)
+                metadata["next_cursor"] = offset + metadata["next_cursor"] if metadata["next_cursor"] else None
 
-    _log_call_end("Fetch_Webpage", f"chars={len(result)}, url_scraper={url_scraper}")
+    _log_call_end("Fetch_Webpage", f"chars={len(result)}, url_scraper={url_scraper}, offset={offset}")
     return result
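Consuming this from the caller's side is a fetch-parse-refetch loop. A hypothetical sketch (the regex keys off the notice format from `_truncate_markdown` above; note that the notice's cursor is relative to the current slice, since the absolute adjustment in this hunk is applied only to the metadata dict, so the loop accumulates offsets):

```python
import re

# Hypothetical client loop: read a long page in max_chars-sized slices.
offset = 0
parts = []
while True:
    chunk = Fetch_Webpage("https://example.com/long-article", max_chars=3000, offset=offset)
    parts.append(chunk)
    match = re.search(r"\*\*Next cursor:\*\* (\d+)", chunk)
    if not match:                    # no truncation notice -> reached the end
        break
    offset += int(match.group(1))    # cursor is relative to the slice just read

full_text = "".join(parts)           # still contains per-slice notices; strip if needed
```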
@@ -578,9 +634,13 @@ def Search_DuckDuckGo(  # <-- MCP tool #2 (DDG Search)
     max_results: Annotated[int, "Number of results to return (1–20)."] = 5,
     page: Annotated[int, "Page number for pagination (1-based, each page contains max_results items)."] = 1,
     search_type: Annotated[str, "Type of search: 'text' (web pages), 'news', 'images', 'videos', or 'books'."] = "text",
+    offset: Annotated[int, "Result offset to start from (overrides page if > 0, for precise continuation)."] = 0,
 ) -> str:
     """
     Run a DuckDuckGo search and return formatted results with support for multiple content types.
+
+    Features smart fallback: if 'news' search returns no results, automatically retries with 'text'
+    search to catch sources like Hacker News that might not appear in news-specific results.
 
     Args:
         query (str): The search query string. Supports operators like site:, quotes for exact matching,
@@ -591,18 +651,22 @@ def Search_DuckDuckGo(  # <-- MCP tool #2 (DDG Search)
             - Exact phrase: "artificial intelligence"
             - Exclude terms: "cats -dogs"
         max_results (int): Number of results to return per page (1–20). Default: 5.
-        page (int): Page number for pagination (1-based). Default: 1.
+        page (int): Page number for pagination (1-based). Default: 1. Ignored if offset > 0.
         search_type (str): Type of search to perform:
            - "text": Web pages (default)
-            - "news": News articles with dates and sources
+            - "news": News articles with dates and sources (with smart fallback to 'text')
            - "images": Image results with dimensions and sources
            - "videos": Video results with duration and upload info
            - "books": Book search results
+        offset (int): Result offset to start from (0-based). If > 0, overrides page parameter
+            for precise continuation. Use this to pick up exactly where you left off.
 
     Returns:
         str: Search results formatted appropriately for the search type, with pagination info.
+            If 'news' search fails, results include a note about automatic fallback to 'text' search.
+            Includes next_offset information for easy continuation.
     """
-    _log_call_start("Search_DuckDuckGo", query=query, max_results=max_results, page=page, search_type=search_type)
+    _log_call_start("Search_DuckDuckGo", query=query, max_results=max_results, page=page, search_type=search_type, offset=offset)
     if not query or not query.strip():
         result = "No search query provided. Please enter a search term."
         _log_call_end("Search_DuckDuckGo", _truncate_for_log(result))
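With the new signature, continuing a search is just a second call. Values here are illustrative:

```python
# First slice of results, then an exact continuation via the reported offset.
first = Search_DuckDuckGo("rust async runtime", max_results=5, page=1, search_type="news")
# When more results exist, the output ends with a hint such as "Next offset: offset=5".
more = Search_DuckDuckGo("rust async runtime", max_results=5, search_type="news", offset=5)
```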
@@ -611,77 +675,143 @@ def Search_DuckDuckGo(  # <-- MCP tool #2 (DDG Search)
     # Validate parameters
     max_results = max(1, min(20, max_results))
     page = max(1, page)
+    offset = max(0, offset)
     valid_types = ["text", "news", "images", "videos", "books"]
     if search_type not in valid_types:
         search_type = "text"
 
-    # Calculate offset
-    offset = (page - 1) * max_results
-    total_needed = offset + max_results
-            raw_gen = ddgs.images(query, max_results=total_needed + 10)
-        elif search_type == "videos":
-            raw_gen = ddgs.videos(query, max_results=total_needed + 10)
-        elif search_type == "books":
-            raw_gen = ddgs.books(query, max_results=total_needed + 10)
-        raw = list(raw_gen)
+    # Calculate actual offset: use provided offset if > 0, otherwise calculate from page
+    if offset > 0:
+        actual_offset = offset
+        calculated_page = (offset // max_results) + 1
+    else:
+        actual_offset = (page - 1) * max_results
+        calculated_page = page
+
+    total_needed = actual_offset + max_results
+
+    # Track if we used fallback
+    used_fallback = False
+    original_search_type = search_type
+
+    def _perform_search(stype: str):
+        """Perform the actual search with the given search type."""
+        try:
+            # Apply rate limiting to avoid being blocked
+            _search_rate_limiter.acquire()
+
+            # Perform search with timeout handling based on search type
+            with DDGS() as ddgs:
+                if stype == "text":
+                    raw_gen = ddgs.text(query, max_results=total_needed + 10)
+                elif stype == "news":
+                    raw_gen = ddgs.news(query, max_results=total_needed + 10)
+                elif stype == "images":
+                    raw_gen = ddgs.images(query, max_results=total_needed + 10)
+                elif stype == "videos":
+                    raw_gen = ddgs.videos(query, max_results=total_needed + 10)
+                elif stype == "books":
+                    raw_gen = ddgs.books(query, max_results=total_needed + 10)
+
+                # Convert generator to list, handle case where no results are found
+                try:
+                    return list(raw_gen)
+                except Exception as inner_e:
+                    # If the generator fails (e.g., no results), return empty list
+                    if "no results" in str(inner_e).lower() or "not found" in str(inner_e).lower():
+                        return []
+                    else:
+                        raise inner_e
+
+        except Exception as e:
+            error_msg = f"Search failed: {str(e)[:200]}"
+            if "blocked" in str(e).lower() or "rate" in str(e).lower():
+                error_msg = "Search temporarily blocked due to rate limiting. Please try again in a few minutes."
+            elif "timeout" in str(e).lower():
+                error_msg = "Search timed out. Please try again with a simpler query."
+            elif "network" in str(e).lower() or "connection" in str(e).lower():
+                error_msg = "Network connection error. Please check your internet connection and try again."
+            elif "no results" in str(e).lower() or "not found" in str(e).lower():
+                # This is expected for some searches, return empty list
+                return []
+            raise Exception(error_msg)
+
+    # Try the primary search
+    try:
+        raw = _perform_search(search_type)
     except Exception as e:
-        error_msg = f"Search failed: {str(e)[:200]}"
-        if "blocked" in str(e).lower() or "rate" in str(e).lower():
-            error_msg = "Search temporarily blocked due to rate limiting. Please try again in a few minutes."
-        elif "timeout" in str(e).lower():
-            error_msg = "Search timed out. Please try again with a simpler query."
-        elif "network" in str(e).lower() or "connection" in str(e).lower():
-            error_msg = "Network connection error. Please check your internet connection and try again."
-        result = f"Error: {error_msg}"
+        result = f"Error: {str(e)}"
         _log_call_end("Search_DuckDuckGo", _truncate_for_log(result))
         return result
 
+    # Smart fallback: if news search returns empty and we haven't tried text yet, try text search
+    if not raw and search_type == "news":
+        try:
+            raw = _perform_search("text")
+            if raw:  # Only mark as fallback if we actually got results
+                used_fallback = True
+                search_type = "text"  # Update for result formatting
+        except Exception:
+            # If fallback also fails, continue with empty results from original search
+            pass
+
     if not raw:
+        fallback_note = " (also tried 'text' search as fallback)" if original_search_type == "news" and used_fallback else ""
+        result = f"No {original_search_type} results found for query: {query}{fallback_note}"
         _log_call_end("Search_DuckDuckGo", _truncate_for_log(result))
         return result
 
     # Apply pagination by slicing the results
-    paginated_results = raw[offset:offset + max_results]
+    paginated_results = raw[actual_offset:actual_offset + max_results]
 
     if not paginated_results:
+        if actual_offset >= len(raw):
+            result = f"Offset {actual_offset} exceeds available results ({len(raw)} total). Try offset=0 to start from beginning."
+        else:
+            result = f"No {original_search_type} results found on page {calculated_page} for query: {query}. Try page 1 or reduce page number."
         _log_call_end("Search_DuckDuckGo", _truncate_for_log(result))
         return result
 
     # Format results based on search type
     total_available = len(raw)
-    start_num = offset + 1
-    end_num = offset + len(paginated_results)
+    start_num = actual_offset + 1
+    end_num = actual_offset + len(paginated_results)
+    next_offset = actual_offset + len(paginated_results)
 
+    # Create header with fallback notification if applicable
+    search_label = original_search_type.title()
+    if used_fallback:
+        search_label += " → Text (Smart Fallback)"
+
+    # Show both page and offset information for clarity
+    pagination_info = f"Page {calculated_page}"
+    if offset > 0:
+        pagination_info = f"Offset {actual_offset} (≈ {pagination_info})"
+
+    lines = [f"{search_label} search results for: {query}"]
+
+    if used_fallback:
+        lines.append("📍 Note: News search returned no results, automatically searched general web content instead")
+
+    lines.append(f"{pagination_info} (results {start_num}-{end_num} of ~{total_available}+ available)\n")
 
     for i, result in enumerate(paginated_results, start_num):
         result_lines = _format_search_result(result, search_type, i)
         lines.extend(result_lines)
         lines.append("")  # Empty line between results
 
-    # Add pagination
+    # Add pagination/continuation hints
     if total_available > end_num:
-        lines.append(f"💡 More results available
+        lines.append(f"💡 More results available:")
+        lines.append(f"   • Next page: page={calculated_page + 1}")
+        lines.append(f"   • Next offset: offset={next_offset}")
+        lines.append(f"   • Use offset={next_offset} to continue exactly from result {next_offset + 1}")
 
     result = "\n".join(lines)
+    search_info = f"type={original_search_type}"
+    if used_fallback:
+        search_info += "→text"
+    _log_call_end("Search_DuckDuckGo", f"{search_info} page={calculated_page} offset={actual_offset} results={len(paginated_results)} chars={len(result)}")
     return result
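The offset/page reconciliation in this hunk reduces to two lines of arithmetic; a standalone check of the invariant (offset wins when set, page drives otherwise):

```python
# Mirrors the reconciliation logic from the hunk above, for a quick sanity check.
def resolve(page: int, offset: int, max_results: int) -> tuple[int, int]:
    """Return (actual_offset, calculated_page) the way Search_DuckDuckGo computes them."""
    if offset > 0:
        return offset, (offset // max_results) + 1   # offset overrides page
    return (page - 1) * max_results, page            # page drives the offset

assert resolve(page=3, offset=0, max_results=5) == (10, 3)   # page 3 skips 10 results
assert resolve(page=1, offset=12, max_results=5) == (12, 3)  # offset=12 lands mid-"page 3"
```

Continuing with the reported `next_offset` rather than bumping `page` avoids re-reading overlap when an earlier call used a different `max_results`.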
@@ -1300,19 +1430,29 @@ fetch_interface = gr.Interface(
             value=False,
             info="Extract only links instead of content"
         ),
+        gr.Slider(
+            minimum=0,
+            maximum=100000,
+            value=0,
+            step=100,
+            label="Offset",
+            info="Character offset to start from (use next_cursor from previous call for pagination)"
+        ),
     ],
     outputs=gr.Markdown(label="Extracted Content"),
     title="Fetch Webpage",
     description=(
-        "<div style=\"text-align:center\">Convert any webpage to clean Markdown format with precision controls, or extract all links. Supports custom element removal and length limits.</div>"
+        "<div style=\"text-align:center\">Convert any webpage to clean Markdown format with precision controls, or extract all links. Supports custom element removal, length limits, and pagination with offset.</div>"
     ),
     api_description=(
         "Fetch a web page and return it converted to Markdown format or extract links with configurable options. "
+        "Includes enhanced truncation with detailed metadata and pagination support via offset parameter. "
         "Parameters: url (str - absolute URL), max_chars (int - 0=no limit, default 3000), "
         "strip_selectors (str - CSS selectors to remove, comma-separated), "
-        "url_scraper (bool - extract only links instead of content, default False). "
-        "When url_scraper=True, returns formatted list of all links found on the page."
+        "url_scraper (bool - extract only links instead of content, default False), "
+        "offset (int - character offset for pagination, use next_cursor from previous call). "
+        "When content is truncated, returns detailed metadata including truncated status, character counts, "
+        "and next_cursor for continuation. When url_scraper=True, returns formatted list of all links found on the page."
     ),
     flagging_mode="never",
 )
@@ -1323,27 +1463,38 @@ concise_interface = gr.Interface(
     inputs=[
         gr.Textbox(label="Query", placeholder="topic OR site:example.com"),
         gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Max results"),
-        gr.Slider(minimum=1, maximum=10, value=1, step=1, label="Page", info="Page number for pagination"),
+        gr.Slider(minimum=1, maximum=10, value=1, step=1, label="Page", info="Page number for pagination (ignored if offset > 0)"),
         gr.Radio(
             label="Search Type",
             choices=["text", "news", "images", "videos", "books"],
             value="text",
             info="Type of content to search for"
         ),
+        gr.Slider(
+            minimum=0,
+            maximum=1000,
+            value=0,
+            step=1,
+            label="Offset",
+            info="Result offset to start from (overrides page if > 0, use next_offset from previous search)"
+        ),
     ],
     outputs=gr.Textbox(label="Search Results", interactive=False),
     title="DuckDuckGo Search",
     description=(
-        "<div style=\"text-align:center\">Multi-type web search with readable output format, date detection, and pagination. Supports text, news, images, videos, and books.</div>"
+        "<div style=\"text-align:center\">Multi-type web search with readable output format, date detection, and flexible pagination. Supports text, news, images, videos, and books. Features smart fallback for news searches and precise offset control.</div>"
     ),
     api_description=(
         "Run a DuckDuckGo search with support for multiple content types and return formatted results. "
+        "Features smart fallback: if 'news' search returns no results, automatically retries with 'text' search "
+        "to catch sources like Hacker News that might not appear in news-specific results. "
         "Supports advanced search operators: site: for specific domains, quotes for exact phrases, "
         "OR for alternatives, and - to exclude terms. Examples: 'Python programming', 'site:example.com', "
         "'\"artificial intelligence\"', 'cats -dogs', 'Python OR JavaScript'. "
         "Parameters: query (str), max_results (int, 1-20), page (int, 1-based pagination), "
-        "search_type (str: text/news/images/videos/books). "
-        "Returns appropriately formatted results with metadata for each content type."
+        "search_type (str: text/news/images/videos/books), offset (int, result offset for precise continuation). "
+        "If offset > 0, it overrides the page parameter. Returns appropriately formatted results with metadata, "
+        "pagination hints, and next_offset information for each content type."
     ),
     flagging_mode="never",
     submit_btn="Search",
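One wiring detail behind these two interface hunks: `gr.Interface` maps `inputs` to the wrapped function's parameters positionally, which is why each new Offset slider is appended at the same position `offset` holds in the corresponding signature. A stripped-down equivalent (hypothetical miniature, not the app's actual interface):

```python
import gradio as gr

def fetch(url: str, max_chars: int = 3000, offset: int = 0) -> str:
    return f"would fetch {url}[{offset}:{offset + max_chars}]"

demo = gr.Interface(
    fn=fetch,
    inputs=[
        gr.Textbox(label="URL"),
        gr.Slider(minimum=0, maximum=10000, value=3000, step=100, label="Max chars"),
        gr.Slider(minimum=0, maximum=100000, value=0, step=100, label="Offset"),  # 3rd input -> `offset`
    ],
    outputs=gr.Markdown(label="Extracted Content"),
)
```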