Search MCP changes... READ ME
I simplified the `Fetch_Webpage` and `Search_DuckDuckGo` tools a lot; they're much cleaner and more reliable now. That said, the MCP instructions given to the LLM are still a bit verbose.
Currently, having all six tools enabled adds about 2000 tokens to context. I'd like that to be a bit less, but the current prompts are very effective for small, shitty models. They also work very well for big models, but I test locally.
Fetch and Search now use a much better user agent and spam protection to avoid bot detection. Both have better error handling.
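The bot-detection piece is basically just realistic browser headers plus a requests-per-minute throttle. Here's a minimal sketch of the idea (illustrative only: in `app.py` the real pieces are `_http_get_enhanced` and the `RateLimiter` instances you can see in the diff below; the header values and helper names here are made up for the example):

```python
import time
import requests

# Illustrative sketch only (not the exact app.py code): a browser-like
# User-Agent plus a simple requests-per-minute throttle, in the spirit of
# _http_get_enhanced and the RateLimiter instances in app.py.
BROWSER_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
}

class SimpleRateLimiter:
    """Block just long enough to stay under a requests-per-minute budget."""
    def __init__(self, requests_per_minute: int):
        self.min_interval = 60.0 / requests_per_minute
        self._last = 0.0

    def wait(self) -> None:
        elapsed = time.monotonic() - self._last
        if elapsed < self.min_interval:
            time.sleep(self.min_interval - elapsed)
        self._last = time.monotonic()

fetch_limiter = SimpleRateLimiter(requests_per_minute=25)  # app.py uses 25/min for Fetch, 20/min for Search

def polite_get(url: str) -> requests.Response:
    # Throttle, send browser-like headers, and surface HTTP errors as exceptions.
    fetch_limiter.wait()
    resp = requests.get(url, headers=BROWSER_HEADERS, timeout=30)
    resp.raise_for_status()
    return resp
```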
Search now makes full use of DDG's operators (site: searches, quoted phrases, OR, exclusions, and other advanced search parameters).
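For example, queries like these now pass straight through (placeholder queries; this assumes you're calling the tool function from `app.py` directly):

```python
# Placeholder queries; the operator syntax is standard DuckDuckGo.
from app import Search_DuckDuckGo  # assumption: app.py is importable as a module

print(Search_DuckDuckGo(query="gradio mcp server site:huggingface.co", max_results=5))   # one domain only
print(Search_DuckDuckGo(query='"model context protocol" OR "MCP server"', max_results=5))  # exact phrase + OR
print(Search_DuckDuckGo(query="kokoro tts -github", max_results=5))                       # exclude a term
```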
`app.py` (CHANGED):

```diff
@@ -1,5 +1,5 @@
 # Purpose: One Space that offers six tools/tabs (all exposed as MCP tools):
-# 1) Fetch — …
+# 1) Fetch — convert webpages to clean Markdown format
 # 2) DuckDuckGo Search — compact JSONL search output (short keys to minimize tokens)
 # 3) Python Code Executor — run Python code and capture stdout/errors
 # 4) Kokoro TTS — synthesize speech from text using Kokoro-82M with 54 voice options
@@ -85,96 +85,6 @@ def _http_get_enhanced(url: str) -> requests.Response:
     else:
         raise requests.exceptions.RequestException(f"HTTP error {response.status_code}: {str(e)}")

-def _extract_main_text_enhanced(html: str) -> Tuple[str, BeautifulSoup]:
-    """
-    Enhanced main text extraction with better fallback mechanisms.
-    """
-    try:
-        # Try Readability first
-        doc = Document(html)
-        readable_html = doc.summary(html_partial=True)
-
-        if readable_html and readable_html.strip():
-            soup = BeautifulSoup(readable_html, "lxml")
-
-            # Remove noisy tags more comprehensively
-            for sel in ["script", "style", "noscript", "iframe", "svg", "nav", "header", "footer", "aside", "[role='banner']", "[role='navigation']", "[role='complementary']"]:
-                for tag in soup.select(sel):
-                    tag.decompose()
-
-            # Extract text with better structure preservation
-            text_parts = []
-            for element in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6", "p", "li", "blockquote", "div"]):
-                chunk = element.get_text(" ", strip=True)
-                if chunk and len(chunk) > 15:  # Only include substantial content
-                    # Avoid repetitive disclaimers
-                    if not ("responses are generated using ai" in chunk.lower() and len(chunk) < 100):
-                        text_parts.append(chunk)
-
-            if text_parts:
-                clean_text = _normalize_whitespace("\n\n".join(text_parts))
-                # Check if we got substantial content
-                if len(clean_text) > 100:
-                    return clean_text, soup
-
-    except Exception:
-        pass  # Fall through to backup extraction
-
-    # Fallback: Parse original HTML more carefully
-    try:
-        full_soup = BeautifulSoup(html, "lxml")
-
-        # Remove unwanted elements
-        for element in full_soup.select("script, style, nav, footer, header, aside, [role='banner'], [role='navigation'], [role='complementary']"):
-            element.decompose()
-
-        # Try to find main content areas
-        main_content = (
-            full_soup.find("main")
-            or full_soup.find("article")
-            or full_soup.find("div", class_=re.compile(r"content|main|post|article|body", re.I))
-            or full_soup.find("div", id=re.compile(r"content|main|post|article|body", re.I))
-            or full_soup.find("section", class_=re.compile(r"content|main|post|article|body", re.I))
-            or full_soup.find("body")
-            or full_soup
-        )
-
-        if main_content:
-            # More aggressive removal of common noise patterns
-            for element in main_content.select(".disclaimer, .warning, .alert, .notice, [class*='cookie'], [class*='banner'], [id*='cookie'], [id*='banner']"):
-                element.decompose()
-
-            text = main_content.get_text(" ", strip=True)
-            text = _normalize_whitespace(text)
-
-            # Filter out repetitive text
-            lines = text.split('\n')
-            filtered_lines = []
-            seen_lines = set()
-
-            for line in lines:
-                line_clean = line.strip()
-                if len(line_clean) > 10 and line_clean not in seen_lines:
-                    # Skip common disclaimers and repetitive content
-                    if not ("responses are generated using ai" in line_clean.lower() and len(line_clean) < 100):
-                        filtered_lines.append(line)
-                        seen_lines.add(line_clean)
-
-            clean_text = '\n'.join(filtered_lines)
-
-            # Create a minimal soup for link extraction
-            minimal_soup = BeautifulSoup(str(main_content), "lxml")
-            return clean_text, minimal_soup
-
-    except Exception:
-        pass
-
-    # Last resort: Just get all text
-    fallback_soup = BeautifulSoup(html, "lxml")
-    text = fallback_soup.get_text(" ", strip=True)
-    return _normalize_whitespace(text), fallback_soup
-
-
 def _normalize_whitespace(text: str) -> str:
     """
     Squeeze extra spaces and blank lines to keep things compact.
@@ -338,136 +248,57 @@ def _fullpage_markdown_from_soup(full_soup: BeautifulSoup, base_url: str) -> str
     return markdown_text or "No content could be extracted."


-def _extract_links(…):
-    """
-    Collect clean, unique, absolute links from the readable section only.
-    (Layman's terms: pull a tidy list of links from the article body.)
-    """
-    seen = set()
-    links: List[Tuple[str, str]] = []
-
-    for a in readable_soup.find_all("a", href=True):
-        href = a.get("href").strip()
-        # Skip junk links we can't use
-        if not href or href.startswith("#") or href.startswith("mailto:") or href.startswith("javascript:"):
-            continue
-
-        # Resolve relative URLs, strip fragments (#…)
-        absolute = urljoin(base_url, href)
-        absolute, _ = urldefrag(absolute)
-
-        if absolute in seen:
-            continue
-        seen.add(absolute)
-
-        text = a.get_text(" ", strip=True)
-        if len(text) > 120:
-            text = text[:117] + "…"
-
-        links.append((text or absolute, absolute))
-
-        if len(links) >= max_links > 0:
-            break
-
-    return links
-
-
-def _format_markdown(
-    meta: Dict[str, str],
-    body: str,
-    body_truncated: bool,
-    links: List[Tuple[str, str]],
-    include_text: bool,
-    include_metadata: bool,
-    include_links: bool,
-    verbosity: str,
-) -> str:
-    """
-    …
-    """
-    …
-        md.append(f"- **Fetched From:** {meta['fetched_url']}")
-    if md:
-        lines.append("## Metadata")
-        lines.extend(md)
-
-    # Body text
-    if include_text and body:
-        if verbosity == "Brief":
-            brief, was_more = _truncate(body, 800)
-            lines.append("## Text")
-            lines.append(brief)
-            if was_more or body_truncated:
-                lines.append("\n> (Trimmed for brevity)")
-        else:
-            lines.append("## Text")
-            lines.append(body)
-            if body_truncated:
-                lines.append("\n> (Trimmed for brevity)")
-
-    # Links section
-    if include_links and links:
-        lines.append(f"## Links ({len(links)})")
-        for text, url in links:
-            lines.append(f"- [{text}]({url})")
-
-    return "\n\n".join(lines).strip()
+def _truncate_markdown(markdown: str, max_chars: int) -> str:
+    """
+    Truncate markdown content to a maximum character count while preserving structure.
+    Tries to break at paragraph boundaries when possible.
+    """
+    if len(markdown) <= max_chars:
+        return markdown
+
+    # Find a good break point near the limit
+    truncated = markdown[:max_chars]
+
+    # Try to break at the end of a paragraph (double newline)
+    last_paragraph = truncated.rfind('\n\n')
+    if last_paragraph > max_chars * 0.7:  # If we find a paragraph break in the last 30%
+        truncated = truncated[:last_paragraph]
+
+    # Try to break at the end of a sentence
+    elif '.' in truncated[-100:]:  # Look for a period in the last 100 chars
+        last_period = truncated.rfind('.')
+        if last_period > max_chars * 0.8:  # If we find a period in the last 20%
+            truncated = truncated[:last_period + 1]
+
+    return truncated.rstrip() + "\n\n> *[Content truncated for brevity]*"


 def Fetch_Webpage(  # <-- MCP tool #1 (Fetch)
-    url: Annotated[str, "The absolute URL to fetch (must return HTML)."]
-    verbosity: Annotated[str, "Controls …
-    include_metadata: Annotated[bool, "Include a Metadata section (description, site name, canonical, lang, fetched URL)."] = True,
-    include_text: Annotated[bool, "Include the readable main text extracted with Readability."] = True,
-    include_links: Annotated[bool, "Include outbound links discovered in the readable section."] = True,
-    max_chars: Annotated[int, "Hard cap for body characters after the verbosity preset. Use 0 to disable the cap."] = 3000,
-    max_links: Annotated[int, "Maximum number of links to include from the readable content. Set 0 to omit links."] = 20,
-    full_page_markdown: Annotated[bool, "If true, return the page as full Markdown (Content Scraper mode) instead of a compact summary."] = False,
+    url: Annotated[str, "The absolute URL to fetch (must return HTML)."],
+    verbosity: Annotated[str, "Controls output length: 'Brief' (1000 chars), 'Standard' (3000 chars), or 'Full' (complete page)."] = "Standard",
 ) -> str:
     """
-    Fetch a web page and return …
-    …
+    Fetch a web page and return it converted to Markdown format with configurable length.
+
+    This function retrieves a webpage and converts its main content to clean Markdown,
+    preserving headings, formatting, and structure. It automatically removes navigation,
+    footers, scripts, and other non-content elements to focus on the main article or
+    content area.

     Args:
-        url: The absolute URL to fetch (must return HTML).
-        verbosity: Controls …
-        …
-        max_chars: Hard cap for body characters after the verbosity preset. Use 0 to disable the cap.
-        max_links: Maximum number of links to include from the readable content. Set 0 to omit links.
-        full_page_markdown: If True, return the page converted to full Markdown (Content Scraper mode)
-            instead of the compact summary. This ignores verbosity/include_* and max_* limits and
-            attempts to convert the main content area to Markdown with headings preserved.
+        url (str): The absolute URL to fetch (must return HTML).
+        verbosity (str): Controls output length:
+            - "Brief": Truncate to 1000 characters for quick summaries
+            - "Standard": Truncate to 3000 characters for balanced content
+            - "Full": Return complete page content with no length limit

     Returns:
-        str:
-            - …
-            - …
-            - …
-            - …
-
-    Special mode:
-        If full_page_markdown=True, the function returns the page converted to Markdown,
-        similar to the "Content Scraper" tool, ignoring verbosity/include_* limits.
+        str: The webpage content converted to Markdown format with:
+            - Page title as H1 header
+            - Main content converted to clean Markdown
+            - Preserved heading hierarchy
+            - Clean formatting without navigation/sidebar elements
+            - Length controlled by verbosity setting
     """
     if not url or not url.strip():
         return "Please enter a valid URL."
@@ -487,42 +318,17 @@ def Fetch_Webpage(  # <-- MCP tool #1 (Fetch)
     resp.encoding = resp.encoding or resp.apparent_encoding
     html = resp.text

-    # …
+    # Parse HTML and convert to full-page Markdown
     full_soup = BeautifulSoup(html, "lxml")
-    …
-    # …
-    if …
-        return …
-
-    # Fallback to "whole-page text" if Readability found nothing
-    fallback_text = full_soup.get_text(" ", strip=True)
-    body_text = _normalize_whitespace(fallback_text)
-
-    # Verbosity presets (we keep the smaller of preset vs. user cap)
-    preset_caps = {"Brief": 1200, "Standard": 3000, "Full": 999_999}
-    target_cap = preset_caps.get(verbosity, 3000)
-    cap = min(max_chars if max_chars > 0 else target_cap, target_cap)
-    body_text, truncated = _truncate(body_text, cap) if include_text else ("", False)
-
-    # Extract links from the simplified content only
-    links = _extract_links(readable_soup, final_url, max_links=max_links if include_links else 0)
-
-    # Final compact Markdown
-    md = _format_markdown(
-        meta=meta,
-        body=body_text,
-        body_truncated=truncated,
-        links=links,
-        include_text=include_text,
-        include_metadata=include_metadata,
-        include_links=include_links,
-        verbosity=verbosity,
-    )
-    return md or "No content could be extracted."
+    markdown_content = _fullpage_markdown_from_soup(full_soup, final_url)
+
+    # Apply verbosity-based truncation
+    if verbosity == "Brief":
+        return _truncate_markdown(markdown_content, 1000)
+    elif verbosity == "Standard":
+        return _truncate_markdown(markdown_content, 3000)
+    else:  # "Full"
+        return markdown_content


 # ============================================
@@ -558,37 +364,28 @@ _search_rate_limiter = RateLimiter(requests_per_minute=20)
 _fetch_rate_limiter = RateLimiter(requests_per_minute=25)

 def Search_DuckDuckGo(  # <-- MCP tool #2 (DDG Search)
-    query: Annotated[str, "The search query (supports operators like site:, quotes, OR)."]
+    query: Annotated[str, "The search query (supports operators like site:, quotes, OR)."],
     max_results: Annotated[int, "Number of results to return (1–20)."] = 5,
-    include_snippets: Annotated[bool, "Include a short snippet for each result (adds tokens)."] = False,
-    max_snippet_chars: Annotated[int, "Character cap applied to each snippet when included."] = 80,
-    dedupe_domains: Annotated[bool, "If true, only the first result from each domain is kept."] = True,
-    title_chars: Annotated[int, "Character cap applied to titles."] = 80,
-    output_format: Annotated[str, "Output format: 'jsonl' for compact JSON or 'readable' for LLM-friendly text."] = "jsonl",
 ) -> str:
     """
-    Run a DuckDuckGo search with enhanced error handling and …
-    …
-    format optimized for LLM consumption with better error messages.
+    Run a DuckDuckGo search with enhanced error handling and readable text output.
+    Always returns results in human-friendly format with snippets included.

     Args:
-        query: The search query …
-        …
+        query (str): The search query string. Supports operators like site:, quotes for exact matching,
+            OR for alternatives, and other DuckDuckGo search syntax.
+            Examples:
+            - Basic search: "Python programming"
+            - Site search: "site:example.com"
+            - Exact phrase: "artificial intelligence"
+            - Exclude terms: "cats -dogs"
+        max_results (int): Number of results to return (1–20). Default: 5.

     Returns:
-        str: …
-            or readable text format for better LLM consumption.
+        str: Search results in readable format with titles, URLs, and snippets as a numbered list.
     """
     if not query or not query.strip():
-        …
-        if output_format == "readable":
-            return error_msg
-        return json.dumps({"error": error_msg}, ensure_ascii=False, separators=(",", ":"))
+        return "No search query provided. Please enter a search term."

     # Validate max_results
     max_results = max(1, min(20, max_results))
@@ -610,68 +407,41 @@ def Search_DuckDuckGo(  # <-- MCP tool #2 (DDG Search)
         elif "network" in str(e).lower() or "connection" in str(e).lower():
             error_msg = "Network connection error. Please check your internet connection and try again."

-        …
-            return f"Error: {error_msg}"
-        return json.dumps({"error": error_msg}, ensure_ascii=False, separators=(",", ":"))
+        return f"Error: {error_msg}"

     if not raw:
-        …
-        if output_format == "readable":
-            return no_results_msg
-        return json.dumps({"info": no_results_msg}, ensure_ascii=False, separators=(",", ":"))
+        return f"No results found for query: {query}"

-    seen_domains = set()
     results = []

     for r in raw or []:
-        title = …
+        title = (r.get("title") or "").strip()
         url = (r.get("href") or r.get("link") or "").strip()
         body = (r.get("body") or r.get("snippet") or "").strip()

         if not url:
             continue

-        if dedupe_domains:
-            dom = _domain_of(url)
-            if dom in seen_domains:
-                continue
-            seen_domains.add(dom)
-
         result_obj = {
             "title": title or _domain_of(url),
-            "url": url
+            "url": url,
+            "snippet": body
         }
-
-        if include_snippets and body:
-            result_obj["snippet"] = _shorten(body, max_snippet_chars)

         results.append(result_obj)

     if not results:
-        …
-            if "snippet" in result:
-                lines.append(f"   Summary: {result['snippet']}")
-            lines.append("")  # Empty line between results
-        return "\n".join(lines)
-    else:
-        # JSONL format with compact keys
-        lines = []
-        for result in results:
-            obj = {"t": result["title"], "u": result["url"]}
-            if "snippet" in result:
-                obj["s"] = result["snippet"]
-            lines.append(json.dumps(obj, ensure_ascii=False, separators=(",", ":")))
-        return "\n".join(lines)
+        return f"No valid results found for query: {query}"
+
+    # Format output in readable format
+    lines = [f"Found {len(results)} search results for: {query}\n"]
+    for i, result in enumerate(results, 1):
+        lines.append(f"{i}. {result['title']}")
+        lines.append(f"   URL: {result['url']}")
+        if result['snippet']:
+            lines.append(f"   Summary: {result['snippet']}")
+        lines.append("")  # Empty line between results
+    return "\n".join(lines)


 # ======================================
@@ -683,7 +453,7 @@ def Execute_Python(code: Annotated[str, "Python source code to run; stdout is ca…
     Execute arbitrary Python code and return captured stdout or an error message.

     Args:
-        code: Python source code to run; stdout is captured and returned.
+        code (str): Python source code to run; stdout is captured and returned.

     Returns:
         str: Combined stdout produced by the code, or the exception text if
@@ -850,9 +620,9 @@ def Generate_Speech(  # <-- MCP tool #4 (Generate Speech)
     - Voice defaults to "af_heart" (American Female, Heart voice)

     Args:
-        text: The text to synthesize. Works best with English but supports multiple languages.
-        speed: Speech speed multiplier in 0.5–2.0; 1.0 = normal speed. Default: 1.25 (slightly brisk).
-        voice: Voice identifier from 54 available options. Use List_Kokoro_Voices() to see all choices. Default: 'af_heart'.
+        text (str): The text to synthesize. Works best with English but supports multiple languages.
+        speed (float): Speech speed multiplier in 0.5–2.0; 1.0 = normal speed. Default: 1.25 (slightly brisk).
+        voice (str): Voice identifier from 54 available options. Use List_Kokoro_Voices() to see all choices. Default: 'af_heart'.

     Returns:
         A tuple of (sample_rate_hz, audio_waveform) where:
@@ -928,52 +698,50 @@ def Generate_Speech(  # <-- MCP tool #4 (Generate Speech)

 # --- Fetch tab (compact controllable extraction) ---
 fetch_interface = gr.Interface(
-    fn=Fetch_Webpage,
+    fn=Fetch_Webpage,
     inputs=[
         gr.Textbox(label="URL", placeholder="https://example.com/article"),
-        gr.Dropdown(
-            …
-        gr.Checkbox(value=False, label="Full-page Markdown (Content Scraper mode)"),
+        gr.Dropdown(
+            label="Verbosity",
+            choices=["Brief", "Standard", "Full"],
+            value="Standard",
+            info="Brief: 1000 chars, Standard: 3000 chars, Full: complete page"
+        ),
     ],
-    outputs=gr.Markdown(label="Extracted …
+    outputs=gr.Markdown(label="Extracted Markdown"),
     title="Fetch Webpage",
     description=(
-        …
+        "<div style=\"text-align:center\">Convert any webpage to clean Markdown format with configurable length, preserving structure and formatting while removing navigation and clutter.</div>"
     ),
     api_description=(
-        …
+        "Fetch a web page and return it converted to Markdown format with configurable length. "
+        "This function retrieves a webpage and converts its main content to clean Markdown, "
+        "preserving headings, formatting, and structure while removing navigation, footers, scripts, "
+        "and other non-content elements. Parameters: url (str - absolute URL), verbosity (str - "
+        "Brief/Standard/Full controlling output length: Brief=1000 chars, Standard=3000 chars, Full=complete page). "
+        "Returns clean Markdown with page title as H1 header and preserved content hierarchy."
     ),
     allow_flagging="never",
 )

-# --- …
+# --- Simplified DDG tab (readable output only) ---
 concise_interface = gr.Interface(
     fn=Search_DuckDuckGo,
     inputs=[
         gr.Textbox(label="Query", placeholder="topic OR site:example.com"),
         gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Max results"),
-        gr.Checkbox(value=False, label="Include snippets (adds tokens)"),
-        gr.Slider(minimum=20, maximum=200, value=80, step=5, label="Max snippet chars"),
-        gr.Checkbox(value=True, label="Dedupe by domain"),
-        gr.Slider(minimum=20, maximum=120, value=80, step=5, label="Max title chars"),
-        gr.Radio(label="Output format", choices=["jsonl", "readable"], value="jsonl", info="JSONL for compact JSON, readable for LLM-friendly text"),
     ],
     outputs=gr.Textbox(label="Search Results", interactive=False),
     title="DuckDuckGo Search",
     description=(
-        "<div style=\"text-align:center\">Enhanced web search with …
+        "<div style=\"text-align:center\">Enhanced web search with readable output format. Always includes snippets for better context and understanding.</div>"
     ),
     api_description=(
-        "Run a DuckDuckGo search with enhanced error handling and …
-        "…
-        "…
+        "Run a DuckDuckGo search with enhanced error handling and readable text output. "
+        "Always returns results in human-friendly format with snippets included for better context. "
+        "Supports advanced search operators: site: for specific domains, quotes for exact phrases, "
+        "OR for alternatives, and - to exclude terms. Examples: 'Python programming', 'site:example.com', "
+        "'\"artificial intelligence\"', 'cats -dogs', 'Python OR JavaScript'."
     ),
     allow_flagging="never",
     submit_btn="Search",
@@ -991,11 +759,11 @@ code_interface = gr.Interface(
         "<div style=\"text-align:center\">Execute Python code and see the output.</div>"
     ),
     api_description=(
-        "Execute arbitrary Python code and return captured stdout or an error message …
-        "…
-        "…
-        "…
-        "…
+        "Execute arbitrary Python code and return captured stdout or an error message. "
+        "Supports any valid Python code including imports, variables, functions, loops, and calculations. "
+        "Examples: 'print(2+2)', 'import math; print(math.sqrt(16))', 'for i in range(3): print(i)'. "
+        "Parameters: code (str - Python source code to execute). "
+        "Returns: Combined stdout output or exception text if execution fails."
    ),
    allow_flagging="never",
 )
@@ -1057,9 +825,10 @@ kokoro_interface = gr.Interface(
     ),
     api_description=(
         "Synthesize speech from text using Kokoro-82M with 54 voice options. Returns (sample_rate, waveform) suitable for playback. "
-        "…
-        "…
-        "…
+        "Supports unlimited text length by processing all segments. Voice examples: 'af_heart' (US female), 'am_adam' (US male), "
+        "'bf_alice' (British female), 'bm_daniel' (British male), 'jf_alpha' (Japanese female), 'zf_xiaoni' (Chinese female). "
+        "Parameters: text (str), speed (float 0.5–2.0, default 1.25x), voice (str from 54 available options, default 'af_heart'). "
+        "Use List_Kokoro_Voices() to see all available voices. "
         "Return the generated media to the user in this format ``"
     ),
     allow_flagging="never",
@@ -1184,9 +953,11 @@ image_generation_interface = gr.Interface(
     ),
     api_description=(
         "Generate a single image from a text prompt using a Hugging Face model (serverless Inference). "
-        "…
-        "…
-        "…
+        "Supports creative prompts like 'a serene mountain landscape at sunset', 'portrait of a wise owl', "
+        "'futuristic city with flying cars'. Default model: FLUX.1-Krea-dev (high quality). "
+        "Parameters: prompt (str), model_id (str, creator/model-name), negative_prompt (str), steps (int, 1–100), "
+        "cfg_scale (float, 1–20), sampler (str), seed (int, -1=random), width/height (int, 64–1216). "
+        "Returns a PIL.Image. Return the generated media to the user in this format ``"
     ),
     allow_flagging="never",
 )
@@ -1362,8 +1133,11 @@ video_generation_interface = gr.Interface(
     ),
     api_description=(
         "Generate a short video from a text prompt using a Hugging Face model (Serverless Inference). "
+        "Create dynamic scenes like 'a red fox running through a snowy forest at sunrise', 'waves crashing on a rocky shore', "
+        "'time-lapse of clouds moving across a blue sky'. Default model: Wan2.2-T2V-A14B (2-6 second videos). "
         "Parameters: prompt (str), model_id (str), negative_prompt (str), steps (int), cfg_scale (float), seed (int), "
-        "width/height (int), fps (int), duration (float). …
+        "width/height (int), fps (int), duration (float in seconds). Returns MP4 file path. "
+        "Return the generated media to the user in this format ``"
     ),
     allow_flagging="never",
 )
```