Nymbo committed (verified)
Commit b3ae1ba · Parent(s): ed7ddca

Search MCP changes... READ ME

I simplified the `Fetch_Webpage` and `Search_DuckDuckGo` tools a lot; they're much cleaner and more reliable now. That said, the MCP instructions given to the LLM are still a bit verbose.

Currently, having all six tools enabled adds about 2,000 tokens to context. I'd like it to be a bit less, but the current prompts are very effective for small, shitty models. They also work very well for big models, but I test locally.

Fetch and Search now use a much better user agent and spam protection to avoid bot detection, and both have better error handling.
Search now makes full use of DDG's operators (site: searches, quoted phrases, and other advanced search parameters).
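For reference, a minimal sketch of how the two simplified tools are now called (Search keeps only `query`/`max_results`, Fetch keeps only `url`/`verbosity`); the queries and URL below are just illustrative, not from the repo:

```python
# Illustrative calls against the simplified signatures in this commit.
print(Search_DuckDuckGo(query='site:huggingface.co "MCP" tools', max_results=5))  # DDG operators pass straight through
print(Search_DuckDuckGo(query="kokoro tts -github", max_results=3))               # exclude-term operator
print(Fetch_Webpage(url="https://example.com/article", verbosity="Brief"))        # ~1000-char Markdown output
```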

Files changed (1)
  1. app.py +124 -350
app.py CHANGED
@@ -1,5 +1,5 @@
  # Purpose: One Space that offers six tools/tabs (all exposed as MCP tools):
- # 1) Fetch — extract relevant page content (title, metadata, clean text, hyperlinks)
  # 2) DuckDuckGo Search — compact JSONL search output (short keys to minimize tokens)
  # 3) Python Code Executor — run Python code and capture stdout/errors
  # 4) Kokoro TTS — synthesize speech from text using Kokoro-82M with 54 voice options
@@ -85,96 +85,6 @@ def _http_get_enhanced(url: str) -> requests.Response:
  else:
  raise requests.exceptions.RequestException(f"HTTP error {response.status_code}: {str(e)}")

- def _extract_main_text_enhanced(html: str) -> Tuple[str, BeautifulSoup]:
- """
- Enhanced main text extraction with better fallback mechanisms.
- """
- try:
- # Try Readability first
- doc = Document(html)
- readable_html = doc.summary(html_partial=True)
-
- if readable_html and readable_html.strip():
- soup = BeautifulSoup(readable_html, "lxml")
-
- # Remove noisy tags more comprehensively
- for sel in ["script", "style", "noscript", "iframe", "svg", "nav", "header", "footer", "aside", "[role='banner']", "[role='navigation']", "[role='complementary']"]:
- for tag in soup.select(sel):
- tag.decompose()
-
- # Extract text with better structure preservation
- text_parts = []
- for element in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6", "p", "li", "blockquote", "div"]):
- chunk = element.get_text(" ", strip=True)
- if chunk and len(chunk) > 15: # Only include substantial content
- # Avoid repetitive disclaimers
- if not ("responses are generated using ai" in chunk.lower() and len(chunk) < 100):
- text_parts.append(chunk)
-
- if text_parts:
- clean_text = _normalize_whitespace("\n\n".join(text_parts))
- # Check if we got substantial content
- if len(clean_text) > 100:
- return clean_text, soup
-
- except Exception:
- pass # Fall through to backup extraction
-
- # Fallback: Parse original HTML more carefully
- try:
- full_soup = BeautifulSoup(html, "lxml")
-
- # Remove unwanted elements
- for element in full_soup.select("script, style, nav, footer, header, aside, [role='banner'], [role='navigation'], [role='complementary']"):
- element.decompose()
-
- # Try to find main content areas
- main_content = (
- full_soup.find("main")
- or full_soup.find("article")
- or full_soup.find("div", class_=re.compile(r"content|main|post|article|body", re.I))
- or full_soup.find("div", id=re.compile(r"content|main|post|article|body", re.I))
- or full_soup.find("section", class_=re.compile(r"content|main|post|article|body", re.I))
- or full_soup.find("body")
- or full_soup
- )
-
- if main_content:
- # More aggressive removal of common noise patterns
- for element in main_content.select(".disclaimer, .warning, .alert, .notice, [class*='cookie'], [class*='banner'], [id*='cookie'], [id*='banner']"):
- element.decompose()
-
- text = main_content.get_text(" ", strip=True)
- text = _normalize_whitespace(text)
-
- # Filter out repetitive text
- lines = text.split('\n')
- filtered_lines = []
- seen_lines = set()
-
- for line in lines:
- line_clean = line.strip()
- if len(line_clean) > 10 and line_clean not in seen_lines:
- # Skip common disclaimers and repetitive content
- if not ("responses are generated using ai" in line_clean.lower() and len(line_clean) < 100):
- filtered_lines.append(line)
- seen_lines.add(line_clean)
-
- clean_text = '\n'.join(filtered_lines)
-
- # Create a minimal soup for link extraction
- minimal_soup = BeautifulSoup(str(main_content), "lxml")
- return clean_text, minimal_soup
-
- except Exception:
- pass
-
- # Last resort: Just get all text
- fallback_soup = BeautifulSoup(html, "lxml")
- text = fallback_soup.get_text(" ", strip=True)
- return _normalize_whitespace(text), fallback_soup
-
-
  def _normalize_whitespace(text: str) -> str:
  """
  Squeeze extra spaces and blank lines to keep things compact.
@@ -338,136 +248,57 @@ def _fullpage_markdown_from_soup(full_soup: BeautifulSoup, base_url: str) -> str
  return markdown_text or "No content could be extracted."


- def _extract_links(readable_soup: BeautifulSoup, base_url: str, max_links: int) -> List[Tuple[str, str]]:
- """
- Collect clean, unique, absolute links from the readable section only.
- (Layman's terms: pull a tidy list of links from the article body.)
- """
- seen = set()
- links: List[Tuple[str, str]] = []
-
- for a in readable_soup.find_all("a", href=True):
- href = a.get("href").strip()
- # Skip junk links we can't use
- if not href or href.startswith("#") or href.startswith("mailto:") or href.startswith("javascript:"):
- continue
-
- # Resolve relative URLs, strip fragments (#…)
- absolute = urljoin(base_url, href)
- absolute, _ = urldefrag(absolute)
-
- if absolute in seen:
- continue
- seen.add(absolute)
-
- text = a.get_text(" ", strip=True)
- if len(text) > 120:
- text = text[:117] + "…"
-
- links.append((text or absolute, absolute))
-
- if len(links) >= max_links > 0:
- break
-
- return links
-
-
- def _format_markdown(
- meta: Dict[str, str],
- body: str,
- body_truncated: bool,
- links: List[Tuple[str, str]],
- include_text: bool,
- include_metadata: bool,
- include_links: bool,
- verbosity: str,
- ) -> str:
  """
- Assemble a compact Markdown summary with optional sections.
- (Layman's terms: build the final markdown output with options.)
  """
- lines: List[str] = []
-
- # Title header
- title = meta.get("title") or meta.get("domain") or "Untitled"
- lines.append(f"# {title}")
-
- # Metadata section (only show what exists)
- if include_metadata:
- md: List[str] = []
- if meta.get("description"):
- md.append(f"- **Description:** {meta['description']}")
- if meta.get("site_name"):
- md.append(f"- **Site:** {meta['site_name']}")
- if meta.get("canonical"):
- md.append(f"- **Canonical:** {meta['canonical']}")
- if meta.get("lang"):
- md.append(f"- **Language:** {meta['lang']}")
- if meta.get("fetched_url"):
- md.append(f"- **Fetched From:** {meta['fetched_url']}")
- if md:
- lines.append("## Metadata")
- lines.extend(md)
-
- # Body text
- if include_text and body:
- if verbosity == "Brief":
- brief, was_more = _truncate(body, 800)
- lines.append("## Text")
- lines.append(brief)
- if was_more or body_truncated:
- lines.append("\n> (Trimmed for brevity)")
- else:
- lines.append("## Text")
- lines.append(body)
- if body_truncated:
- lines.append("\n> (Trimmed for brevity)")
-
- # Links section
- if include_links and links:
- lines.append(f"## Links ({len(links)})")
- for text, url in links:
- lines.append(f"- [{text}]({url})")
-
- return "\n\n".join(lines).strip()


  def Fetch_Webpage( # <-- MCP tool #1 (Fetch)
- url: Annotated[str, "The absolute URL to fetch (must return HTML)."] ,
- verbosity: Annotated[str, "Controls body length: one of 'Brief', 'Standard', or 'Full'."] = "Standard",
- include_metadata: Annotated[bool, "Include a Metadata section (description, site name, canonical, lang, fetched URL)."] = True,
- include_text: Annotated[bool, "Include the readable main text extracted with Readability."] = True,
- include_links: Annotated[bool, "Include outbound links discovered in the readable section."] = True,
- max_chars: Annotated[int, "Hard cap for body characters after the verbosity preset. Use 0 to disable the cap."] = 3000,
- max_links: Annotated[int, "Maximum number of links to include from the readable content. Set 0 to omit links."] = 20,
- full_page_markdown: Annotated[bool, "If true, return the page as full Markdown (Content Scraper mode) instead of a compact summary."] = False,
  ) -> str:
  """
- Fetch a web page and return a compact Markdown summary containing title, key
- metadata, readable main text, and outbound links.

  Args:
- url: The absolute URL to fetch (must return HTML).
- verbosity: Controls body length: one of 'Brief', 'Standard', or 'Full'.
- include_metadata: Include a Metadata section (description, site name, canonical, lang, fetched URL).
- include_text: Include the readable main text extracted with Readability.
- include_links: Include outbound links discovered in the readable section.
- max_chars: Hard cap for body characters after the verbosity preset. Use 0 to disable the cap.
- max_links: Maximum number of links to include from the readable content. Set 0 to omit links.
- full_page_markdown: If True, return the page converted to full Markdown (Content Scraper mode)
- instead of the compact summary. This ignores verbosity/include_* and max_* limits and
- attempts to convert the main content area to Markdown with headings preserved.

  Returns:
- str: Markdown that may contain the following sections:
- - Title (H1)
- - Metadata (optional)
- - Text (optional, may be trimmed)
- - Links (optional, deduped and absolute)
-
- Special mode:
- If full_page_markdown=True, the function returns the page converted to Markdown,
- similar to the "Content Scraper" tool, ignoring verbosity/include_* limits.
  """
  if not url or not url.strip():
  return "Please enter a valid URL."
@@ -487,42 +318,17 @@ def Fetch_Webpage( # <-- MCP tool #1 (Fetch)
  resp.encoding = resp.encoding or resp.apparent_encoding
  html = resp.text

- # Full-page soup for metadata (and potential Markdown conversion)
  full_soup = BeautifulSoup(html, "lxml")
- meta = _extract_metadata(full_soup, final_url)
-
- # Content Scraper mode: return full-page Markdown early
- if full_page_markdown:
- return _fullpage_markdown_from_soup(full_soup, final_url)
-
- # Readable content with enhanced extraction
- body_text, readable_soup = _extract_main_text_enhanced(html)
- if not body_text:
- # Fallback to "whole-page text" if Readability found nothing
- fallback_text = full_soup.get_text(" ", strip=True)
- body_text = _normalize_whitespace(fallback_text)
-
- # Verbosity presets (we keep the smaller of preset vs. user cap)
- preset_caps = {"Brief": 1200, "Standard": 3000, "Full": 999_999}
- target_cap = preset_caps.get(verbosity, 3000)
- cap = min(max_chars if max_chars > 0 else target_cap, target_cap)
- body_text, truncated = _truncate(body_text, cap) if include_text else ("", False)
-
- # Extract links from the simplified content only
- links = _extract_links(readable_soup, final_url, max_links=max_links if include_links else 0)
-
- # Final compact Markdown
- md = _format_markdown(
- meta=meta,
- body=body_text,
- body_truncated=truncated,
- links=links,
- include_text=include_text,
- include_metadata=include_metadata,
- include_links=include_links,
- verbosity=verbosity,
- )
- return md or "No content could be extracted."


  # ============================================
@@ -558,37 +364,28 @@ _search_rate_limiter = RateLimiter(requests_per_minute=20)
  _fetch_rate_limiter = RateLimiter(requests_per_minute=25)

  def Search_DuckDuckGo( # <-- MCP tool #2 (DDG Search)
- query: Annotated[str, "The search query (supports operators like site:, quotes, OR)."] ,
  max_results: Annotated[int, "Number of results to return (1–20)."] = 5,
- include_snippets: Annotated[bool, "Include a short snippet for each result (adds tokens)."] = False,
- max_snippet_chars: Annotated[int, "Character cap applied to each snippet when included."] = 80,
- dedupe_domains: Annotated[bool, "If true, only the first result from each domain is kept."] = True,
- title_chars: Annotated[int, "Character cap applied to titles."] = 80,
- output_format: Annotated[str, "Output format: 'jsonl' for compact JSON or 'readable' for LLM-friendly text."] = "jsonl",
  ) -> str:
  """
- Run a DuckDuckGo search with enhanced error handling and multiple output formats.
- Returns either compact JSONL (t=title, u=url, optional s=snippet) or readable text
- format optimized for LLM consumption with better error messages.

  Args:
- query: The search query (supports operators like site:, quotes, OR).
- max_results: Number of results to return (1–20).
- include_snippets: Include a short snippet for each result (adds tokens).
- max_snippet_chars: Character cap applied to each snippet when included.
- dedupe_domains: If true, only the first result from each domain is kept.
- title_chars: Character cap applied to titles.
- output_format: Output format: 'jsonl' for compact JSON or 'readable' for LLM-friendly text.

  Returns:
- str: Either JSONL format with {"t": "title", "u": "url"[, "s": "snippet"]}
- or readable text format for better LLM consumption.
  """
  if not query or not query.strip():
- error_msg = "No search query provided. Please enter a search term."
- if output_format == "readable":
- return error_msg
- return json.dumps({"error": error_msg}, ensure_ascii=False, separators=(",", ":"))

  # Validate max_results
  max_results = max(1, min(20, max_results))
@@ -610,68 +407,41 @@ def Search_DuckDuckGo( # <-- MCP tool #2 (DDG Search)
  elif "network" in str(e).lower() or "connection" in str(e).lower():
  error_msg = "Network connection error. Please check your internet connection and try again."

- if output_format == "readable":
- return f"Error: {error_msg}"
- return json.dumps({"error": error_msg}, ensure_ascii=False, separators=(",", ":"))

  if not raw:
- no_results_msg = f"No results found for query: {query}"
- if output_format == "readable":
- return no_results_msg
- return json.dumps({"info": no_results_msg}, ensure_ascii=False, separators=(",", ":"))

- seen_domains = set()
  results = []

  for r in raw or []:
- title = _shorten((r.get("title") or "").strip(), title_chars)
  url = (r.get("href") or r.get("link") or "").strip()
  body = (r.get("body") or r.get("snippet") or "").strip()

  if not url:
  continue

- if dedupe_domains:
- dom = _domain_of(url)
- if dom in seen_domains:
- continue
- seen_domains.add(dom)
-
  result_obj = {
  "title": title or _domain_of(url),
- "url": url
  }
-
- if include_snippets and body:
- result_obj["snippet"] = _shorten(body, max_snippet_chars)

  results.append(result_obj)

  if not results:
- no_results_msg = f"No valid results found for query: {query}"
- if output_format == "readable":
- return no_results_msg
- return json.dumps({"info": no_results_msg}, ensure_ascii=False, separators=(",", ":"))
-
- # Format output based on requested format
- if output_format == "readable":
- lines = [f"Found {len(results)} search results for: {query}\n"]
- for i, result in enumerate(results, 1):
- lines.append(f"{i}. {result['title']}")
- lines.append(f" URL: {result['url']}")
- if "snippet" in result:
- lines.append(f" Summary: {result['snippet']}")
- lines.append("") # Empty line between results
- return "\n".join(lines)
- else:
- # JSONL format with compact keys
- lines = []
- for result in results:
- obj = {"t": result["title"], "u": result["url"]}
- if "snippet" in result:
- obj["s"] = result["snippet"]
- lines.append(json.dumps(obj, ensure_ascii=False, separators=(",", ":")))
- return "\n".join(lines)


  # ======================================
@@ -683,7 +453,7 @@ def Execute_Python(code: Annotated[str, "Python source code to run; stdout is ca
  Execute arbitrary Python code and return captured stdout or an error message.

  Args:
- code: Python source code to run; stdout is captured and returned.

  Returns:
  str: Combined stdout produced by the code, or the exception text if
@@ -850,9 +620,9 @@ def Generate_Speech( # <-- MCP tool #4 (Generate Speech)
  - Voice defaults to "af_heart" (American Female, Heart voice)

  Args:
- text: The text to synthesize. Works best with English but supports multiple languages.
- speed: Speech speed multiplier in 0.5–2.0; 1.0 = normal speed. Default: 1.25 (slightly brisk).
- voice: Voice identifier from 54 available options. Use List_Kokoro_Voices() to see all choices. Default: 'af_heart'.

  Returns:
  A tuple of (sample_rate_hz, audio_waveform) where:
@@ -928,52 +698,50 @@ def Generate_Speech( # <-- MCP tool #4 (Generate Speech)
  # --- Fetch tab (compact controllable extraction) ---
  fetch_interface = gr.Interface(
- fn=Fetch_Webpage, # connect the function to the UI
  inputs=[
  gr.Textbox(label="URL", placeholder="https://example.com/article"),
- gr.Dropdown(label="Verbosity", choices=["Brief", "Standard", "Full"], value="Standard"),
- gr.Checkbox(value=True, label="Include Metadata"),
- gr.Checkbox(value=True, label="Include Main Text"),
- gr.Checkbox(value=True, label="Include Links"),
- gr.Slider(400, 12000, value=3000, step=100, label="Max Characters (body text)"),
- gr.Slider(0, 100, value=20, step=1, label="Max Links"),
- gr.Checkbox(value=False, label="Full-page Markdown (Content Scraper mode)"),
  ],
- outputs=gr.Markdown(label="Extracted Summary"),
  title="Fetch Webpage",
  description=(
- "<div style=\"text-align:center\">Extract title, key metadata, readable text, and links from webpages or toggle full-page Markdown.</div>"
  ),
  api_description=(
- "Fetch a web page and return a compact Markdown summary with title, key "
- "metadata, readable body text, and outbound links. Or, enable the "
- "'Full-page Markdown (Content Scraper mode)' option to return the page "
- "converted to Markdown."
  ),
  allow_flagging="never",
  )

- # --- Concise DDG tab (JSONL with short keys, minimal tokens) ---
  concise_interface = gr.Interface(
  fn=Search_DuckDuckGo,
  inputs=[
  gr.Textbox(label="Query", placeholder="topic OR site:example.com"),
  gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Max results"),
- gr.Checkbox(value=False, label="Include snippets (adds tokens)"),
- gr.Slider(minimum=20, maximum=200, value=80, step=5, label="Max snippet chars"),
- gr.Checkbox(value=True, label="Dedupe by domain"),
- gr.Slider(minimum=20, maximum=120, value=80, step=5, label="Max title chars"),
- gr.Radio(label="Output format", choices=["jsonl", "readable"], value="jsonl", info="JSONL for compact JSON, readable for LLM-friendly text"),
  ],
  outputs=gr.Textbox(label="Search Results", interactive=False),
  title="DuckDuckGo Search",
  description=(
- "<div style=\"text-align:center\">Enhanced web search with better error handling and multiple output formats. JSONL format emits compact keys (t,u[,s]), readable format provides LLM-friendly text.</div>"
  ),
  api_description=(
- "Run a DuckDuckGo search with enhanced error handling and multiple output formats. "
- "Returns either compact JSONL (t=title, u=url, optional s=snippet) or readable text "
- "format optimized for LLM consumption with better error messages."
  ),
  allow_flagging="never",
  submit_btn="Search",
@@ -991,11 +759,11 @@ code_interface = gr.Interface(
  "<div style=\"text-align:center\">Execute Python code and see the output.</div>"
  ),
  api_description=(
- "Execute arbitrary Python code and return captured stdout or an error message.\n\n"
- "Parameters:\n"
- "- code (string): The Python source code to run.\n\n"
- "Returns:\n"
- "- string: Combined stdout produced by the code, or the exception text if execution failed."
  ),
  allow_flagging="never",
  )
@@ -1057,9 +825,10 @@ kokoro_interface = gr.Interface(
  ),
  api_description=(
  "Synthesize speech from text using Kokoro-82M with 54 voice options. Returns (sample_rate, waveform) suitable for playback. "
- "Parameters: text (str), speed (float 0.5–2.0, default 1.25x), voice (str from 54 available options). "
- "Default voice: `af_heart`. "
- "Can generate audio of unlimited length by processing all text segments. "
  "Return the generated media to the user in this format `![Alt text](URL)`"
  ),
  allow_flagging="never",
@@ -1184,9 +953,11 @@ image_generation_interface = gr.Interface(
  ),
  api_description=(
  "Generate a single image from a text prompt using a Hugging Face model (serverless Inference). "
- "Parameters: prompt (str), model_id (str, creator/model-name), negative_prompt (str), steps (int, 1–100), cfg_scale (float, 1–20), "
- "sampler (str, label only), seed (int, -1=random), width/height (int, 64–1216). Returns a PIL.Image. "
- "Return the generated media to the user in this format `![Alt text](URL)`"
  ),
  allow_flagging="never",
  )
@@ -1362,8 +1133,11 @@ video_generation_interface = gr.Interface(
  ),
  api_description=(
  "Generate a short video from a text prompt using a Hugging Face model (Serverless Inference). "
  "Parameters: prompt (str), model_id (str), negative_prompt (str), steps (int), cfg_scale (float), seed (int), "
- "width/height (int), fps (int), duration (float). Return the generated media to the user in this format `![Alt text](URL)`"
  ),
  allow_flagging="never",
  )
 
  # Purpose: One Space that offers six tools/tabs (all exposed as MCP tools):
+ # 1) Fetch — convert webpages to clean Markdown format
  # 2) DuckDuckGo Search — compact JSONL search output (short keys to minimize tokens)
  # 3) Python Code Executor — run Python code and capture stdout/errors
  # 4) Kokoro TTS — synthesize speech from text using Kokoro-82M with 54 voice options

  else:
  raise requests.exceptions.RequestException(f"HTTP error {response.status_code}: {str(e)}")

  def _normalize_whitespace(text: str) -> str:
  """
  Squeeze extra spaces and blank lines to keep things compact.
 
  return markdown_text or "No content could be extracted."


+ def _truncate_markdown(markdown: str, max_chars: int) -> str:
  """
+ Truncate markdown content to a maximum character count while preserving structure.
+ Tries to break at paragraph boundaries when possible.
  """
+ if len(markdown) <= max_chars:
+ return markdown
+
+ # Find a good break point near the limit
+ truncated = markdown[:max_chars]
+
+ # Try to break at the end of a paragraph (double newline)
+ last_paragraph = truncated.rfind('\n\n')
+ if last_paragraph > max_chars * 0.7: # If we find a paragraph break in the last 30%
+ truncated = truncated[:last_paragraph]
+
+ # Try to break at the end of a sentence
+ elif '.' in truncated[-100:]: # Look for a period in the last 100 chars
+ last_period = truncated.rfind('.')
+ if last_period > max_chars * 0.8: # If we find a period in the last 20%
+ truncated = truncated[:last_period + 1]
+
+ return truncated.rstrip() + "\n\n> *[Content truncated for brevity]*"
  def Fetch_Webpage( # <-- MCP tool #1 (Fetch)
+ url: Annotated[str, "The absolute URL to fetch (must return HTML)."],
+ verbosity: Annotated[str, "Controls output length: 'Brief' (1000 chars), 'Standard' (3000 chars), or 'Full' (complete page)."] = "Standard",
  ) -> str:
  """
+ Fetch a web page and return it converted to Markdown format with configurable length.
+
+ This function retrieves a webpage and converts its main content to clean Markdown,
+ preserving headings, formatting, and structure. It automatically removes navigation,
+ footers, scripts, and other non-content elements to focus on the main article or
+ content area.

  Args:
+ url (str): The absolute URL to fetch (must return HTML).
+ verbosity (str): Controls output length:
+ - "Brief": Truncate to 1000 characters for quick summaries
+ - "Standard": Truncate to 3000 characters for balanced content
+ - "Full": Return complete page content with no length limit

  Returns:
+ str: The webpage content converted to Markdown format with:
+ - Page title as H1 header
+ - Main content converted to clean Markdown
+ - Preserved heading hierarchy
+ - Clean formatting without navigation/sidebar elements
+ - Length controlled by verbosity setting
  """
  if not url or not url.strip():
  return "Please enter a valid URL."

  resp.encoding = resp.encoding or resp.apparent_encoding
  html = resp.text

+ # Parse HTML and convert to full-page Markdown
  full_soup = BeautifulSoup(html, "lxml")
+ markdown_content = _fullpage_markdown_from_soup(full_soup, final_url)
+
+ # Apply verbosity-based truncation
+ if verbosity == "Brief":
+ return _truncate_markdown(markdown_content, 1000)
+ elif verbosity == "Standard":
+ return _truncate_markdown(markdown_content, 3000)
+ else: # "Full"
+ return markdown_content


  # ============================================
 
  _fetch_rate_limiter = RateLimiter(requests_per_minute=25)

  def Search_DuckDuckGo( # <-- MCP tool #2 (DDG Search)
+ query: Annotated[str, "The search query (supports operators like site:, quotes, OR)."],
  max_results: Annotated[int, "Number of results to return (1–20)."] = 5,
  ) -> str:
  """
+ Run a DuckDuckGo search with enhanced error handling and readable text output.
+ Always returns results in human-friendly format with snippets included.

  Args:
+ query (str): The search query string. Supports operators like site:, quotes for exact matching,
+ OR for alternatives, and other DuckDuckGo search syntax.
+ Examples:
+ - Basic search: "Python programming"
+ - Site search: "site:example.com"
+ - Exact phrase: "artificial intelligence"
+ - Exclude terms: "cats -dogs"
+ max_results (int): Number of results to return (1–20). Default: 5.

  Returns:
+ str: Search results in readable format with titles, URLs, and snippets as a numbered list.
  """
  if not query or not query.strip():
+ return "No search query provided. Please enter a search term."

  # Validate max_results
  max_results = max(1, min(20, max_results))

  elif "network" in str(e).lower() or "connection" in str(e).lower():
  error_msg = "Network connection error. Please check your internet connection and try again."

+ return f"Error: {error_msg}"

  if not raw:
+ return f"No results found for query: {query}"

  results = []

  for r in raw or []:
+ title = (r.get("title") or "").strip()
  url = (r.get("href") or r.get("link") or "").strip()
  body = (r.get("body") or r.get("snippet") or "").strip()

  if not url:
  continue

  result_obj = {
  "title": title or _domain_of(url),
+ "url": url,
+ "snippet": body
  }

  results.append(result_obj)

  if not results:
+ return f"No valid results found for query: {query}"
+
+ # Format output in readable format
+ lines = [f"Found {len(results)} search results for: {query}\n"]
+ for i, result in enumerate(results, 1):
+ lines.append(f"{i}. {result['title']}")
+ lines.append(f" URL: {result['url']}")
+ if result['snippet']:
+ lines.append(f" Summary: {result['snippet']}")
+ lines.append("") # Empty line between results
+ return "\n".join(lines)


  # ======================================
 
  Execute arbitrary Python code and return captured stdout or an error message.

  Args:
+ code (str): Python source code to run; stdout is captured and returned.

  Returns:
  str: Combined stdout produced by the code, or the exception text if
 
  - Voice defaults to "af_heart" (American Female, Heart voice)

  Args:
+ text (str): The text to synthesize. Works best with English but supports multiple languages.
+ speed (float): Speech speed multiplier in 0.5–2.0; 1.0 = normal speed. Default: 1.25 (slightly brisk).
+ voice (str): Voice identifier from 54 available options. Use List_Kokoro_Voices() to see all choices. Default: 'af_heart'.

  Returns:
  A tuple of (sample_rate_hz, audio_waveform) where:
 
  # --- Fetch tab (compact controllable extraction) ---
  fetch_interface = gr.Interface(
+ fn=Fetch_Webpage,
  inputs=[
  gr.Textbox(label="URL", placeholder="https://example.com/article"),
+ gr.Dropdown(
+ label="Verbosity",
+ choices=["Brief", "Standard", "Full"],
+ value="Standard",
+ info="Brief: 1000 chars, Standard: 3000 chars, Full: complete page"
+ ),
  ],
+ outputs=gr.Markdown(label="Extracted Markdown"),
  title="Fetch Webpage",
  description=(
+ "<div style=\"text-align:center\">Convert any webpage to clean Markdown format with configurable length, preserving structure and formatting while removing navigation and clutter.</div>"
  ),
  api_description=(
+ "Fetch a web page and return it converted to Markdown format with configurable length. "
+ "This function retrieves a webpage and converts its main content to clean Markdown, "
+ "preserving headings, formatting, and structure while removing navigation, footers, scripts, "
+ "and other non-content elements. Parameters: url (str - absolute URL), verbosity (str - "
+ "Brief/Standard/Full controlling output length: Brief=1000 chars, Standard=3000 chars, Full=complete page). "
+ "Returns clean Markdown with page title as H1 header and preserved content hierarchy."
  ),
  allow_flagging="never",
  )

+ # --- Simplified DDG tab (readable output only) ---
  concise_interface = gr.Interface(
  fn=Search_DuckDuckGo,
  inputs=[
  gr.Textbox(label="Query", placeholder="topic OR site:example.com"),
  gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Max results"),
  ],
  outputs=gr.Textbox(label="Search Results", interactive=False),
  title="DuckDuckGo Search",
  description=(
+ "<div style=\"text-align:center\">Enhanced web search with readable output format. Always includes snippets for better context and understanding.</div>"
  ),
  api_description=(
+ "Run a DuckDuckGo search with enhanced error handling and readable text output. "
+ "Always returns results in human-friendly format with snippets included for better context. "
+ "Supports advanced search operators: site: for specific domains, quotes for exact phrases, "
+ "OR for alternatives, and - to exclude terms. Examples: 'Python programming', 'site:example.com', "
+ "'\"artificial intelligence\"', 'cats -dogs', 'Python OR JavaScript'."
  ),
  allow_flagging="never",
  submit_btn="Search",
 
  "<div style=\"text-align:center\">Execute Python code and see the output.</div>"
  ),
  api_description=(
+ "Execute arbitrary Python code and return captured stdout or an error message. "
+ "Supports any valid Python code including imports, variables, functions, loops, and calculations. "
+ "Examples: 'print(2+2)', 'import math; print(math.sqrt(16))', 'for i in range(3): print(i)'. "
+ "Parameters: code (str - Python source code to execute). "
+ "Returns: Combined stdout output or exception text if execution fails."
  ),
  allow_flagging="never",
  )
 
  ),
  api_description=(
  "Synthesize speech from text using Kokoro-82M with 54 voice options. Returns (sample_rate, waveform) suitable for playback. "
+ "Supports unlimited text length by processing all segments. Voice examples: 'af_heart' (US female), 'am_adam' (US male), "
+ "'bf_alice' (British female), 'bm_daniel' (British male), 'jf_alpha' (Japanese female), 'zf_xiaoni' (Chinese female). "
+ "Parameters: text (str), speed (float 0.5–2.0, default 1.25x), voice (str from 54 available options, default 'af_heart'). "
+ "Use List_Kokoro_Voices() to see all available voices. "
  "Return the generated media to the user in this format `![Alt text](URL)`"
  ),
  allow_flagging="never",
 
  ),
  api_description=(
  "Generate a single image from a text prompt using a Hugging Face model (serverless Inference). "
+ "Supports creative prompts like 'a serene mountain landscape at sunset', 'portrait of a wise owl', "
+ "'futuristic city with flying cars'. Default model: FLUX.1-Krea-dev (high quality). "
+ "Parameters: prompt (str), model_id (str, creator/model-name), negative_prompt (str), steps (int, 1–100), "
+ "cfg_scale (float, 1–20), sampler (str), seed (int, -1=random), width/height (int, 64–1216). "
+ "Returns a PIL.Image. Return the generated media to the user in this format `![Alt text](URL)`"
  ),
  allow_flagging="never",
  )
 
  ),
  api_description=(
  "Generate a short video from a text prompt using a Hugging Face model (Serverless Inference). "
+ "Create dynamic scenes like 'a red fox running through a snowy forest at sunrise', 'waves crashing on a rocky shore', "
+ "'time-lapse of clouds moving across a blue sky'. Default model: Wan2.2-T2V-A14B (2-6 second videos). "
  "Parameters: prompt (str), model_id (str), negative_prompt (str), steps (int), cfg_scale (float), seed (int), "
+ "width/height (int), fps (int), duration (float in seconds). Returns MP4 file path. "
+ "Return the generated media to the user in this format `![Alt text](URL)`"
  ),
  allow_flagging="never",
  )