Big improvements to Fetch and Web Search
app.py (CHANGED)
@@ -209,9 +209,54 @@ def _extract_main_text(html: str) -> Tuple[str, BeautifulSoup]:
     return clean_text, s


+def _extract_links_from_soup(soup: BeautifulSoup, base_url: str) -> str:
+    """
+    Extract all links from the page and return as formatted text.
+    """
+    links = []
+    for link in soup.find_all("a", href=True):
+        href = link.get("href")
+        text = link.get_text(strip=True)
+
+        # Make relative URLs absolute
+        if href.startswith("http"):
+            full_url = href
+        elif href.startswith("//"):
+            full_url = "https:" + href
+        elif href.startswith("/"):
+            from urllib.parse import urljoin
+            full_url = urljoin(base_url, href)
+        else:
+            from urllib.parse import urljoin
+            full_url = urljoin(base_url, href)
+
+        if text and href not in ["#", "javascript:void(0)"]:
+            links.append(f"- [{text}]({full_url})")
+
+    if not links:
+        return "No links found on this page."
+
+    # Add title if present
+    title = soup.find("title")
+    title_text = title.get_text(strip=True) if title else "Links from webpage"
+
+    return f"# {title_text}\n\n" + "\n".join(links)
+
+
+def _fullpage_markdown_from_soup(full_soup: BeautifulSoup, base_url: str, strip_selectors: str = "") -> str:

+    # Remove custom selectors first if provided
+    if strip_selectors:
+        selectors = [s.strip() for s in strip_selectors.split(",") if s.strip()]
+        for selector in selectors:
+            try:
+                for element in full_soup.select(selector):
+                    element.decompose()
+            except Exception:
+                # Invalid CSS selector, skip it
+                continue
+
+    # Remove unwanted elements globally
     for element in full_soup.select("script, style, nav, footer, header, aside"):
         element.decompose()

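For context, a minimal, self-contained sketch of the link-extraction idea the new _extract_links_from_soup helper implements. The HTML snippet, base URL, and variable names below are illustrative only, not taken from the repo:

from urllib.parse import urljoin
from bs4 import BeautifulSoup

html = """
<html><head><title>Demo page</title></head>
<body>
  <a href="/docs">Docs</a>
  <a href="https://example.org/blog">Blog</a>
  <a href="#">Skip me</a>
</body></html>
"""

soup = BeautifulSoup(html, "lxml")
links = []
for a in soup.find_all("a", href=True):
    href = a["href"]
    text = a.get_text(strip=True)
    if text and href not in ("#", "javascript:void(0)"):
        # urljoin handles absolute, protocol-relative, and relative hrefs alike
        links.append(f"- [{text}]({urljoin('https://example.com/page', href)})")

print("\n".join(links))
# - [Docs](https://example.com/docs)
# - [Blog](https://example.org/blog)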
@@ -270,32 +315,28 @@ def _truncate_markdown(markdown: str, max_chars: int) -> str:

 def Fetch_Webpage( # <-- MCP tool #1 (Fetch)
     url: Annotated[str, "The absolute URL to fetch (must return HTML)."],
+    max_chars: Annotated[int, "Maximum characters to return (0 = no limit, full page content)."] = 3000,
+    strip_selectors: Annotated[str, "CSS selectors to remove (comma-separated, e.g., '.header, .footer, nav')."] = "",
+    url_scraper: Annotated[bool, "Extract only links from the page instead of content."] = False,
 ) -> str:
     """
+    Fetch a web page and return it converted to Markdown format with configurable options.

+    This function retrieves a webpage and either converts its main content to clean Markdown
+    or extracts all links from the page. It automatically removes navigation, footers,
+    scripts, and other non-content elements, plus any custom selectors you specify.

     Args:
         url (str): The absolute URL to fetch (must return HTML).
+        max_chars (int): Maximum characters to return. Use 0 for no limit (full page).
+        strip_selectors (str): CSS selectors to remove before processing (comma-separated).
+        url_scraper (bool): If True, extract only links instead of content.

     Returns:
+        str: Either the webpage content converted to Markdown or a list of all links,
+            depending on the url_scraper setting. Content is length-limited by max_chars.
     """
+    _log_call_start("Fetch_Webpage", url=url, max_chars=max_chars, strip_selectors=strip_selectors, url_scraper=url_scraper)
     if not url or not url.strip():
         result = "Please enter a valid URL."
         _log_call_end("Fetch_Webpage", _truncate_for_log(result))
@@ -320,18 +361,21 @@ def Fetch_Webpage( # <-- MCP tool #1 (Fetch)
     resp.encoding = resp.encoding or resp.apparent_encoding
     html = resp.text

+    # Parse HTML
     full_soup = BeautifulSoup(html, "lxml")

+    if url_scraper:
+        # Extract links mode
+        result = _extract_links_from_soup(full_soup, final_url)
+    else:
+        # Convert to markdown mode
+        result = _fullpage_markdown_from_soup(full_soup, final_url, strip_selectors)
+
+    # Apply max_chars truncation if specified
+    if max_chars > 0 and len(result) > max_chars:
+        result = _truncate_markdown(result, max_chars)
+
+    _log_call_end("Fetch_Webpage", f"chars={len(result)}, url_scraper={url_scraper}")
     return result

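A hedged usage sketch of the new Fetch_Webpage signature. It assumes app.py is importable as a module named app and that the example URL returns HTML; neither the import path nor the URL comes from the repo:

from app import Fetch_Webpage  # hypothetical import path

# Markdown mode: strip extra selectors, no length cap (max_chars=0)
md = Fetch_Webpage(
    url="https://example.com/article",
    max_chars=0,
    strip_selectors=".sidebar, .newsletter-signup",
    url_scraper=False,
)

# Link-scraper mode: returns a Markdown bullet list of the page's links.
# Per the diff, strip_selectors is not applied in this mode, but max_chars truncation still is.
links = Fetch_Webpage("https://example.com/article", max_chars=3000, url_scraper=True)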
@@ -412,12 +456,45 @@ def _log_call_end(func_name: str, output_desc: str) -> None:
     except Exception as e: # pragma: no cover
         print(f"[TOOL RESULT] {func_name} (failed to log output: {e})", flush=True)

+def _extract_date_from_snippet(snippet: str) -> str:
+    """
+    Extract publication date from search result snippet using common patterns.
+    """
+    import re
+    from datetime import datetime
+
+    if not snippet:
+        return ""
+
+    # Common date patterns
+    date_patterns = [
+        # ISO format: 2023-12-25, 2023/12/25
+        r'\b(\d{4}[-/]\d{1,2}[-/]\d{1,2})\b',
+        # US format: Dec 25, 2023 | December 25, 2023
+        r'\b([A-Za-z]{3,9}\s+\d{1,2},?\s+\d{4})\b',
+        # EU format: 25 Dec 2023 | 25 December 2023
+        r'\b(\d{1,2}\s+[A-Za-z]{3,9}\s+\d{4})\b',
+        # Relative: "2 days ago", "1 week ago", "3 months ago"
+        r'\b(\d+\s+(?:day|week|month|year)s?\s+ago)\b',
+        # Common prefixes: "Published: ", "Updated: ", "Posted: "
+        r'(?:Published|Updated|Posted):\s*([^,\n]+?)(?:[,\n]|$)',
+    ]
+
+    for pattern in date_patterns:
+        matches = re.findall(pattern, snippet, re.IGNORECASE)
+        if matches:
+            return matches[0].strip()
+
+    return ""
+
+
 def Search_DuckDuckGo( # <-- MCP tool #2 (DDG Search)
     query: Annotated[str, "The search query (supports operators like site:, quotes, OR)."],
     max_results: Annotated[int, "Number of results to return (1–20)."] = 5,
+    page: Annotated[int, "Page number for pagination (1-based, each page contains max_results items)."] = 1,
 ) -> str:
     """
+    Run a DuckDuckGo search and return numbered results with URLs, titles, snippets, and dates.

     Args:
         query (str): The search query string. Supports operators like site:, quotes for exact matching,
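To see what the new date patterns actually match, here is a small standalone check that reuses the same regexes on made-up snippets (the sample strings are invented):

import re

date_patterns = [
    r'\b(\d{4}[-/]\d{1,2}[-/]\d{1,2})\b',
    r'\b([A-Za-z]{3,9}\s+\d{1,2},?\s+\d{4})\b',
    r'\b(\d{1,2}\s+[A-Za-z]{3,9}\s+\d{4})\b',
    r'\b(\d+\s+(?:day|week|month|year)s?\s+ago)\b',
    r'(?:Published|Updated|Posted):\s*([^,\n]+?)(?:[,\n]|$)',
]

samples = [
    "Release notes, 2023-12-25, covering the holiday build",
    "Posted on December 25, 2023 by the team",
    "Updated: 2 days ago",
]

for snippet in samples:
    for pattern in date_patterns:  # first matching pattern wins, as in the helper
        matches = re.findall(pattern, snippet, re.IGNORECASE)
        if matches:
            print(matches[0].strip())
            break
# 2023-12-25
# December 25, 2023
# 2 days ago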
@@ -427,27 +504,36 @@ def Search_DuckDuckGo( # <-- MCP tool #2 (DDG Search)
             - Site search: "site:example.com"
             - Exact phrase: "artificial intelligence"
             - Exclude terms: "cats -dogs"
+        max_results (int): Number of results to return per page (1–20). Default: 5.
+        page (int): Page number for pagination (1-based). Default: 1.

     Returns:
+        str: Search results in readable format with titles, URLs, snippets, and publication dates
+            when available, formatted as a numbered list with pagination info.
     """
+    _log_call_start("Search_DuckDuckGo", query=query, max_results=max_results, page=page)
     if not query or not query.strip():
         result = "No search query provided. Please enter a search term."
         _log_call_end("Search_DuckDuckGo", _truncate_for_log(result))
         return result

+    # Validate parameters
     max_results = max(1, min(20, max_results))
+    page = max(1, page)
+
+    # Calculate offset for pagination
+    offset = (page - 1) * max_results
+    total_needed = offset + max_results

     try:
         # Apply rate limiting to avoid being blocked
         _search_rate_limiter.acquire()

         # Perform search with timeout handling
+        # We need to get more results than needed for pagination
         with DDGS() as ddgs:
+            raw_gen = ddgs.text(query, max_results=total_needed + 10)  # Get extra for safety
+            raw = list(raw_gen)

     except Exception as e:
         error_msg = f"Search failed: {str(e)[:200]}"
@@ -466,9 +552,16 @@ def Search_DuckDuckGo( # <-- MCP tool #2 (DDG Search)
         _log_call_end("Search_DuckDuckGo", _truncate_for_log(result))
         return result

+    # Apply pagination by slicing the results
+    paginated_results = raw[offset:offset + max_results]
+
+    if not paginated_results:
+        result = f"No results found on page {page} for query: {query}. Try page 1 or reduce page number."
+        _log_call_end("Search_DuckDuckGo", _truncate_for_log(result))
+        return result

+    results = []
+    for r in paginated_results:
         title = (r.get("title") or "").strip()
         url = (r.get("href") or r.get("link") or "").strip()
         body = (r.get("body") or r.get("snippet") or "").strip()
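The pagination is plain offset arithmetic over one over-fetched result list; a worked example with numbers only (no network call, placeholder strings):

page, max_results = 3, 5
offset = (page - 1) * max_results        # 10
total_needed = offset + max_results      # 15; the tool asks DDGS for total_needed + 10 hits

raw = [f"result {i}" for i in range(1, 31)]      # stand-in for raw DDGS results
paginated_results = raw[offset:offset + max_results]
print(paginated_results)
# ['result 11', 'result 12', 'result 13', 'result 14', 'result 15']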
@@ -476,29 +569,46 @@ def Search_DuckDuckGo( # <-- MCP tool #2 (DDG Search)
         if not url:
             continue

+        # Extract date from snippet
+        date_found = _extract_date_from_snippet(body)
+
         result_obj = {
             "title": title or _domain_of(url),
             "url": url,
+            "snippet": body,
+            "date": date_found
         }

         results.append(result_obj)

     if not results:
+        result = f"No valid results found on page {page} for query: {query}"
         _log_call_end("Search_DuckDuckGo", _truncate_for_log(result))
         return result

+    # Format output in readable format with pagination info
+    total_available = len(raw)
+    start_num = offset + 1
+    end_num = offset + len(results)
+
+    lines = [f"Search results for: {query}"]
+    lines.append(f"Page {page} (results {start_num}-{end_num} of ~{total_available}+ available)\n")
+
+    for i, result in enumerate(results, start_num):
         lines.append(f"{i}. {result['title']}")
         lines.append(f" URL: {result['url']}")
         if result['snippet']:
             lines.append(f" Summary: {result['snippet']}")
+        if result['date']:
+            lines.append(f" Date: {result['date']}")
         lines.append("")  # Empty line between results
+
+    # Add pagination hint
+    if total_available > end_num:
+        lines.append(f"💡 More results available - use page={page + 1} to see next {max_results} results")
+
     result = "\n".join(lines)
+    _log_call_end("Search_DuckDuckGo", f"page={page} results={len(results)} chars={len(result)}")
     return result

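One detail worth noting in the formatting loop above: result numbering continues across pages because enumerate starts at start_num rather than 1. A tiny illustration with placeholder values:

results = ["a", "b", "c"]   # placeholder result objects
start_num = 6               # page 2 with max_results=5
print([i for i, _ in enumerate(results, start_num)])   # [6, 7, 8]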
@@ -985,21 +1095,38 @@ fetch_interface = gr.Interface(
     fn=Fetch_Webpage,
     inputs=[
         gr.Textbox(label="URL", placeholder="https://example.com/article"),
+        gr.Slider(
+            minimum=0,
+            maximum=20000,
+            value=3000,
+            step=100,
+            label="Max Characters",
+            info="0 = no limit (full page), default 3000"
+        ),
+        gr.Textbox(
+            label="Strip Selectors",
+            placeholder=".header, .footer, nav, .sidebar",
+            value="",
+            info="CSS selectors to remove (comma-separated)"
+        ),
+        gr.Checkbox(
+            label="URL Scraper",
+            value=False,
+            info="Extract only links instead of content"
         ),
     ],
+    outputs=gr.Markdown(label="Extracted Content"),
     title="Fetch Webpage",
     description=(
+        "<div style=\"text-align:center\">Convert any webpage to clean Markdown format with precision controls, or extract all links. Supports custom element removal and length limits.</div>"
     ),
     api_description=(
+        "Fetch a web page and return it converted to Markdown format or extract links with configurable options. "
+        "Parameters: url (str - absolute URL), max_chars (int - 0=no limit, default 3000), "
+        "strip_selectors (str - CSS selectors to remove, comma-separated), "
+        "url_scraper (bool - extract only links instead of content, default False). "
+        "When url_scraper=True, returns formatted list of all links found on the page. "
+        "When False, returns clean Markdown content with custom element removal and length control."
     ),
     flagging_mode="never",
 )
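For readers less familiar with Gradio: gr.Interface passes its input components to the function positionally, so the three new components line up with the three new Fetch_Webpage parameters. A minimal, hypothetical stub (not the app's real function) showing the same wiring:

import gradio as gr

def fetch_stub(url: str, max_chars: int, strip_selectors: str, url_scraper: bool) -> str:
    # Echo the received values; the real tool fetches and converts the page
    return f"{url=} {max_chars=} {strip_selectors=} {url_scraper=}"

demo = gr.Interface(
    fn=fetch_stub,
    inputs=[
        gr.Textbox(label="URL"),
        gr.Slider(minimum=0, maximum=20000, value=3000, step=100, label="Max Characters"),
        gr.Textbox(label="Strip Selectors"),
        gr.Checkbox(label="URL Scraper", value=False),
    ],
    outputs=gr.Markdown(label="Extracted Content"),
)
# demo.launch()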
@@ -1010,17 +1137,20 @@ concise_interface = gr.Interface(
     inputs=[
         gr.Textbox(label="Query", placeholder="topic OR site:example.com"),
         gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Max results"),
+        gr.Slider(minimum=1, maximum=10, value=1, step=1, label="Page", info="Page number for pagination"),
     ],
     outputs=gr.Textbox(label="Search Results", interactive=False),
     title="DuckDuckGo Search",
     description=(
+        "<div style=\"text-align:center\">Web search with readable output format, date detection, and pagination support. Supports advanced search operators.</div>"
     ),
     api_description=(
+        "Run a DuckDuckGo search and return numbered results with URLs, titles, summaries, and publication dates when detectable. "
         "Supports advanced search operators: site: for specific domains, quotes for exact phrases, "
         "OR for alternatives, and - to exclude terms. Examples: 'Python programming', 'site:example.com', "
+        "'\"artificial intelligence\"', 'cats -dogs', 'Python OR JavaScript'. "
+        "Parameters: query (str), max_results (int, 1-20), page (int, 1-based pagination). "
+        "Returns formatted results with date metadata and pagination hints for accessing more results."
     ),
     flagging_mode="never",
     submit_btn="Search",