Nymbo committed on
Commit 4b97eaa · verified · 1 Parent(s): e48cd48

Update Modules/Web_Fetch.py

Files changed (1)
  1. Modules/Web_Fetch.py +287 -280
Modules/Web_Fetch.py CHANGED
@@ -1,280 +1,287 @@
- from __future__ import annotations
-
- import re
- from typing import Annotated, Dict, List, Tuple
- from urllib.parse import urlparse, urljoin
-
- import gradio as gr
- import requests
- from bs4 import BeautifulSoup
- from markdownify import markdownify as md
- from readability import Document
-
- from app import _fetch_rate_limiter, _log_call_end, _log_call_start, _truncate_for_log
-
-
- def _http_get_enhanced(url: str, timeout: int | float = 30, *, skip_rate_limit: bool = False) -> requests.Response:
-     headers = {
-         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
-         "Accept-Language": "en-US,en;q=0.9",
-         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
-         "Accept-Encoding": "gzip, deflate, br",
-         "DNT": "1",
-         "Connection": "keep-alive",
-         "Upgrade-Insecure-Requests": "1",
-     }
-     if not skip_rate_limit:
-         _fetch_rate_limiter.acquire()
-     try:
-         response = requests.get(
-             url,
-             headers=headers,
-             timeout=timeout,
-             allow_redirects=True,
-             stream=False,
-         )
-         response.raise_for_status()
-         return response
-     except requests.exceptions.Timeout as exc:
-         raise requests.exceptions.RequestException("Request timed out. The webpage took too long to respond.") from exc
-     except requests.exceptions.ConnectionError as exc:
-         raise requests.exceptions.RequestException("Connection error. Please check the URL and your internet connection.") from exc
-     except requests.exceptions.HTTPError as exc:
-         if response.status_code == 403:
-             raise requests.exceptions.RequestException("Access forbidden. The website may be blocking automated requests.") from exc
-         if response.status_code == 404:
-             raise requests.exceptions.RequestException("Page not found. Please check the URL.") from exc
-         if response.status_code == 429:
-             raise requests.exceptions.RequestException("Rate limited. Please try again in a few minutes.") from exc
-         raise requests.exceptions.RequestException(f"HTTP error {response.status_code}: {exc}") from exc
-
-
- def _normalize_whitespace(text: str) -> str:
-     text = re.sub(r"[ \t\u00A0]+", " ", text)
-     text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text.strip())
-     return text.strip()
-
-
- def _truncate(text: str, max_chars: int) -> Tuple[str, bool]:
-     if max_chars is None or max_chars <= 0 or len(text) <= max_chars:
-         return text, False
-     return text[:max_chars].rstrip() + " …", True
-
-
- def _shorten(text: str, limit: int) -> str:
-     if limit <= 0 or len(text) <= limit:
-         return text
-     return text[: max(0, limit - 1)].rstrip() + "…"
-
-
- def _domain_of(url: str) -> str:
-     try:
-         return urlparse(url).netloc or ""
-     except Exception:
-         return ""
-
-
- def _extract_links_from_soup(soup: BeautifulSoup, base_url: str) -> str:
-     links = []
-     for link in soup.find_all("a", href=True):
-         href = link.get("href")
-         text = link.get_text(strip=True)
-         if href.startswith("http"):
-             full_url = href
-         elif href.startswith("//"):
-             full_url = "https:" + href
-         elif href.startswith("/"):
-             full_url = urljoin(base_url, href)
-         else:
-             full_url = urljoin(base_url, href)
-         if text and href not in ["#", "javascript:void(0)"]:
-             links.append(f"- [{text}]({full_url})")
-     if not links:
-         return "No links found on this page."
-     title = soup.find("title")
-     title_text = title.get_text(strip=True) if title else "Links from webpage"
-     return f"# {title_text}\n\n" + "\n".join(links)
-
-
- def _fullpage_markdown_from_soup(full_soup: BeautifulSoup, base_url: str, strip_selectors: str = "") -> str:
-     if strip_selectors:
-         selectors = [s.strip() for s in strip_selectors.split(",") if s.strip()]
-         for selector in selectors:
-             try:
-                 for element in full_soup.select(selector):
-                     element.decompose()
-             except Exception:
-                 continue
-     for element in full_soup.select("script, style, nav, footer, header, aside"):
-         element.decompose()
-     main = (
-         full_soup.find("main")
-         or full_soup.find("article")
-         or full_soup.find("div", class_=re.compile(r"content|main|post|article", re.I))
-         or full_soup.find("body")
-     )
-     if not main:
-         return "No main content found on the webpage."
-     markdown_text = md(str(main), heading_style="ATX")
-     markdown_text = re.sub(r"\n{3,}", "\n\n", markdown_text)
-     markdown_text = re.sub(r"\[\s*\]\([^)]*\)", "", markdown_text)
-     markdown_text = re.sub(r"[ \t]+", " ", markdown_text)
-     markdown_text = markdown_text.strip()
-     title = full_soup.find("title")
-     if title and title.get_text(strip=True):
-         markdown_text = f"# {title.get_text(strip=True)}\n\n{markdown_text}"
-     return markdown_text or "No content could be extracted."
-
-
- def _truncate_markdown(markdown: str, max_chars: int) -> Tuple[str, Dict[str, object]]:
-     total_chars = len(markdown)
-     if total_chars <= max_chars:
-         return markdown, {
-             "truncated": False,
-             "returned_chars": total_chars,
-             "total_chars_estimate": total_chars,
-             "next_cursor": None,
-         }
-     truncated = markdown[:max_chars]
-     last_paragraph = truncated.rfind("\n\n")
-     if last_paragraph > max_chars * 0.7:
-         truncated = truncated[:last_paragraph]
-         cursor_pos = last_paragraph
-     elif "." in truncated[-100:]:
-         last_period = truncated.rfind(".")
-         if last_period > max_chars * 0.8:
-             truncated = truncated[: last_period + 1]
-             cursor_pos = last_period + 1
-         else:
-             cursor_pos = len(truncated)
-     else:
-         cursor_pos = len(truncated)
-     metadata = {
-         "truncated": True,
-         "returned_chars": len(truncated),
-         "total_chars_estimate": total_chars,
-         "next_cursor": cursor_pos,
-     }
-     truncated = truncated.rstrip()
-     truncation_notice = (
-         "\n\n---\n"
-         f"**Content Truncated:** Showing {metadata['returned_chars']:,} of {metadata['total_chars_estimate']:,} characters "
-         f"({(metadata['returned_chars']/metadata['total_chars_estimate']*100):.1f}%)\n"
-         f"**Next cursor:** {metadata['next_cursor']} (use this value with offset parameter for continuation)\n"
-         "---"
-     )
-     return truncated + truncation_notice, metadata
-
-
- def Web_Fetch(
-     url: Annotated[str, "The absolute URL to fetch (must return HTML)."],
-     max_chars: Annotated[int, "Maximum characters to return (0 = no limit, full page content)."] = 3000,
-     strip_selectors: Annotated[str, "CSS selectors to remove (comma-separated, e.g., '.header, .footer, nav')."] = "",
-     url_scraper: Annotated[bool, "Extract only links from the page instead of content."] = False,
-     offset: Annotated[int, "Character offset to start from (for pagination, use next_cursor from previous call)."] = 0,
- ) -> str:
-     _log_call_start(
-         "Web_Fetch",
-         url=url,
-         max_chars=max_chars,
-         strip_selectors=strip_selectors,
-         url_scraper=url_scraper,
-         offset=offset,
-     )
-     if not url or not url.strip():
-         result = "Please enter a valid URL."
-         _log_call_end("Web_Fetch", _truncate_for_log(result))
-         return result
-     try:
-         resp = _http_get_enhanced(url)
-         resp.raise_for_status()
-     except requests.exceptions.RequestException as exc:
-         result = f"An error occurred: {exc}"
-         _log_call_end("Web_Fetch", _truncate_for_log(result))
-         return result
-     final_url = str(resp.url)
-     ctype = resp.headers.get("Content-Type", "")
-     if "html" not in ctype.lower():
-         result = f"Unsupported content type for extraction: {ctype or 'unknown'}"
-         _log_call_end("Web_Fetch", _truncate_for_log(result))
-         return result
-     resp.encoding = resp.encoding or resp.apparent_encoding
-     html = resp.text
-     full_soup = BeautifulSoup(html, "lxml")
-     if url_scraper:
-         result = _extract_links_from_soup(full_soup, final_url)
-         if offset > 0:
-             result = result[offset:]
-         if max_chars > 0 and len(result) > max_chars:
-             result, _ = _truncate_markdown(result, max_chars)
-     else:
-         full_result = _fullpage_markdown_from_soup(full_soup, final_url, strip_selectors)
-         if offset > 0:
-             if offset >= len(full_result):
-                 result = (
-                     f"Offset {offset} exceeds content length ({len(full_result)} characters). "
-                     f"Content ends at position {len(full_result)}."
-                 )
-                 _log_call_end("Web_Fetch", _truncate_for_log(result))
-                 return result
-             result = full_result[offset:]
-         else:
-             result = full_result
-         if max_chars > 0 and len(result) > max_chars:
-             result, metadata = _truncate_markdown(result, max_chars)
-             if offset > 0:
-                 metadata["total_chars_estimate"] = len(full_result)
-                 metadata["next_cursor"] = offset + metadata["next_cursor"] if metadata["next_cursor"] else None
-     _log_call_end("Web_Fetch", f"chars={len(result)}, url_scraper={url_scraper}, offset={offset}")
-     return result
-
-
- def build_interface() -> gr.Interface:
-     return gr.Interface(
-         fn=Web_Fetch,
-         inputs=[
-             gr.Textbox(label="URL", placeholder="https://example.com/article", max_lines=1),
-             gr.Slider(minimum=0, maximum=20000, value=3000, step=100, label="Max Characters", info="0 = no limit (full page), default 3000"),
-             gr.Textbox(
-                 label="Strip Selectors",
-                 placeholder=".header, .footer, nav, .sidebar",
-                 value="",
-                 max_lines=1,
-                 info="CSS selectors to remove (comma-separated)",
-             ),
-             gr.Checkbox(label="URL Scraper", value=False, info="Extract only links instead of content"),
-             gr.Slider(
-                 minimum=0,
-                 maximum=100000,
-                 value=0,
-                 step=100,
-                 label="Offset",
-                 info="Character offset to start from (use next_cursor from previous call for pagination)",
-             ),
-         ],
-         outputs=gr.Markdown(label="Extracted Content"),
-         title="Web Fetch",
-         description=(
-             "<div style=\"text-align:center\">Convert any webpage to clean Markdown format with precision controls, "
-             "or extract all links. Supports custom element removal, length limits, and pagination with offset.</div>"
-         ),
-         api_description=(
-             "Fetch a web page and return it converted to Markdown format or extract links with configurable options. "
-             "Includes enhanced truncation with detailed metadata and pagination support via offset parameter. "
-             "Parameters: url (str - absolute URL), max_chars (int - 0=no limit, default 3000), "
-             "strip_selectors (str - CSS selectors to remove, comma-separated), "
-             "url_scraper (bool - extract only links instead of content, default False), "
-             "offset (int - character offset for pagination, use next_cursor from previous call). "
-             "When content is truncated, returns detailed metadata including truncated status, character counts, "
-             "and next_cursor for continuation. When url_scraper=True, returns formatted list of all links found on the page."
-         ),
-         flagging_mode="never",
-     )
-
-
- __all__ = [
-     "Web_Fetch",
-     "build_interface",
-     "_http_get_enhanced",
-     "_fullpage_markdown_from_soup",
- ]
+ from __future__ import annotations
+
+ import re
+ from typing import Annotated, Dict, List, Tuple
+ from urllib.parse import urlparse, urljoin
+
+ import gradio as gr
+ import requests
+ from bs4 import BeautifulSoup
+ from markdownify import markdownify as md
+ from readability import Document
+
+ from app import _fetch_rate_limiter, _log_call_end, _log_call_start, _truncate_for_log
+ from ._docstrings import autodoc
+
+
+ def _http_get_enhanced(url: str, timeout: int | float = 30, *, skip_rate_limit: bool = False) -> requests.Response:
+     headers = {
+         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+         "Accept-Language": "en-US,en;q=0.9",
+         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+         "Accept-Encoding": "gzip, deflate, br",
+         "DNT": "1",
+         "Connection": "keep-alive",
+         "Upgrade-Insecure-Requests": "1",
+     }
+     if not skip_rate_limit:
+         _fetch_rate_limiter.acquire()
+     try:
+         response = requests.get(
+             url,
+             headers=headers,
+             timeout=timeout,
+             allow_redirects=True,
+             stream=False,
+         )
+         response.raise_for_status()
+         return response
+     except requests.exceptions.Timeout as exc:
+         raise requests.exceptions.RequestException("Request timed out. The webpage took too long to respond.") from exc
+     except requests.exceptions.ConnectionError as exc:
+         raise requests.exceptions.RequestException("Connection error. Please check the URL and your internet connection.") from exc
+     except requests.exceptions.HTTPError as exc:
+         if response.status_code == 403:
+             raise requests.exceptions.RequestException("Access forbidden. The website may be blocking automated requests.") from exc
+         if response.status_code == 404:
+             raise requests.exceptions.RequestException("Page not found. Please check the URL.") from exc
+         if response.status_code == 429:
+             raise requests.exceptions.RequestException("Rate limited. Please try again in a few minutes.") from exc
+         raise requests.exceptions.RequestException(f"HTTP error {response.status_code}: {exc}") from exc
+
+
+ def _normalize_whitespace(text: str) -> str:
+     text = re.sub(r"[ \t\u00A0]+", " ", text)
+     text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text.strip())
+     return text.strip()
+
+
+ def _truncate(text: str, max_chars: int) -> Tuple[str, bool]:
+     if max_chars is None or max_chars <= 0 or len(text) <= max_chars:
+         return text, False
+     return text[:max_chars].rstrip() + " …", True
+
+
+ def _shorten(text: str, limit: int) -> str:
+     if limit <= 0 or len(text) <= limit:
+         return text
+     return text[: max(0, limit - 1)].rstrip() + "…"
+
+
+ def _domain_of(url: str) -> str:
+     try:
+         return urlparse(url).netloc or ""
+     except Exception:
+         return ""
+
+
+ def _extract_links_from_soup(soup: BeautifulSoup, base_url: str) -> str:
+     links = []
+     for link in soup.find_all("a", href=True):
+         href = link.get("href")
+         text = link.get_text(strip=True)
+         if href.startswith("http"):
+             full_url = href
+         elif href.startswith("//"):
+             full_url = "https:" + href
+         elif href.startswith("/"):
+             full_url = urljoin(base_url, href)
+         else:
+             full_url = urljoin(base_url, href)
+         if text and href not in ["#", "javascript:void(0)"]:
+             links.append(f"- [{text}]({full_url})")
+     if not links:
+         return "No links found on this page."
+     title = soup.find("title")
+     title_text = title.get_text(strip=True) if title else "Links from webpage"
+     return f"# {title_text}\n\n" + "\n".join(links)
+
+
+ def _fullpage_markdown_from_soup(full_soup: BeautifulSoup, base_url: str, strip_selectors: str = "") -> str:
+     if strip_selectors:
+         selectors = [s.strip() for s in strip_selectors.split(",") if s.strip()]
+         for selector in selectors:
+             try:
+                 for element in full_soup.select(selector):
+                     element.decompose()
+             except Exception:
+                 continue
+     for element in full_soup.select("script, style, nav, footer, header, aside"):
+         element.decompose()
+     main = (
+         full_soup.find("main")
+         or full_soup.find("article")
+         or full_soup.find("div", class_=re.compile(r"content|main|post|article", re.I))
+         or full_soup.find("body")
+     )
+     if not main:
+         return "No main content found on the webpage."
+     markdown_text = md(str(main), heading_style="ATX")
+     markdown_text = re.sub(r"\n{3,}", "\n\n", markdown_text)
+     markdown_text = re.sub(r"\[\s*\]\([^)]*\)", "", markdown_text)
+     markdown_text = re.sub(r"[ \t]+", " ", markdown_text)
+     markdown_text = markdown_text.strip()
+     title = full_soup.find("title")
+     if title and title.get_text(strip=True):
+         markdown_text = f"# {title.get_text(strip=True)}\n\n{markdown_text}"
+     return markdown_text or "No content could be extracted."
+
+
+ def _truncate_markdown(markdown: str, max_chars: int) -> Tuple[str, Dict[str, object]]:
+     total_chars = len(markdown)
+     if total_chars <= max_chars:
+         return markdown, {
+             "truncated": False,
+             "returned_chars": total_chars,
+             "total_chars_estimate": total_chars,
+             "next_cursor": None,
+         }
+     truncated = markdown[:max_chars]
+     last_paragraph = truncated.rfind("\n\n")
+     if last_paragraph > max_chars * 0.7:
+         truncated = truncated[:last_paragraph]
+         cursor_pos = last_paragraph
+     elif "." in truncated[-100:]:
+         last_period = truncated.rfind(".")
+         if last_period > max_chars * 0.8:
+             truncated = truncated[: last_period + 1]
+             cursor_pos = last_period + 1
+         else:
+             cursor_pos = len(truncated)
+     else:
+         cursor_pos = len(truncated)
+     metadata = {
+         "truncated": True,
+         "returned_chars": len(truncated),
+         "total_chars_estimate": total_chars,
+         "next_cursor": cursor_pos,
+     }
+     truncated = truncated.rstrip()
+     truncation_notice = (
+         "\n\n---\n"
+         f"**Content Truncated:** Showing {metadata['returned_chars']:,} of {metadata['total_chars_estimate']:,} characters "
+         f"({(metadata['returned_chars']/metadata['total_chars_estimate']*100):.1f}%)\n"
+         f"**Next cursor:** {metadata['next_cursor']} (use this value with offset parameter for continuation)\n"
+         "---"
+     )
+     return truncated + truncation_notice, metadata
+
+
+ @autodoc(
+     summary=(
+         "Fetch a webpage and return clean Markdown or a list of links, with max length and pagination via offset."
+     ),
+     returns="Markdown content (or links) possibly with a truncation notice when max_chars is exceeded.",
+ )
+ def Web_Fetch(
+     url: Annotated[str, "The absolute URL to fetch (must return HTML)."],
+     max_chars: Annotated[int, "Maximum characters to return (0 = no limit, full page content)."] = 3000,
+     strip_selectors: Annotated[str, "CSS selectors to remove (comma-separated, e.g., '.header, .footer, nav')."] = "",
+     url_scraper: Annotated[bool, "Extract only links from the page instead of content."] = False,
+     offset: Annotated[int, "Character offset to start from (for pagination, use next_cursor from previous call)."] = 0,
+ ) -> str:
+     _log_call_start(
+         "Web_Fetch",
+         url=url,
+         max_chars=max_chars,
+         strip_selectors=strip_selectors,
+         url_scraper=url_scraper,
+         offset=offset,
+     )
+     if not url or not url.strip():
+         result = "Please enter a valid URL."
+         _log_call_end("Web_Fetch", _truncate_for_log(result))
+         return result
+     try:
+         resp = _http_get_enhanced(url)
+         resp.raise_for_status()
+     except requests.exceptions.RequestException as exc:
+         result = f"An error occurred: {exc}"
+         _log_call_end("Web_Fetch", _truncate_for_log(result))
+         return result
+     final_url = str(resp.url)
+     ctype = resp.headers.get("Content-Type", "")
+     if "html" not in ctype.lower():
+         result = f"Unsupported content type for extraction: {ctype or 'unknown'}"
+         _log_call_end("Web_Fetch", _truncate_for_log(result))
+         return result
+     resp.encoding = resp.encoding or resp.apparent_encoding
+     html = resp.text
+     full_soup = BeautifulSoup(html, "lxml")
+     if url_scraper:
+         result = _extract_links_from_soup(full_soup, final_url)
+         if offset > 0:
+             result = result[offset:]
+         if max_chars > 0 and len(result) > max_chars:
+             result, _ = _truncate_markdown(result, max_chars)
+     else:
+         full_result = _fullpage_markdown_from_soup(full_soup, final_url, strip_selectors)
+         if offset > 0:
+             if offset >= len(full_result):
+                 result = (
+                     f"Offset {offset} exceeds content length ({len(full_result)} characters). "
+                     f"Content ends at position {len(full_result)}."
+                 )
+                 _log_call_end("Web_Fetch", _truncate_for_log(result))
+                 return result
+             result = full_result[offset:]
+         else:
+             result = full_result
+         if max_chars > 0 and len(result) > max_chars:
+             result, metadata = _truncate_markdown(result, max_chars)
+             if offset > 0:
+                 metadata["total_chars_estimate"] = len(full_result)
+                 metadata["next_cursor"] = offset + metadata["next_cursor"] if metadata["next_cursor"] else None
+     _log_call_end("Web_Fetch", f"chars={len(result)}, url_scraper={url_scraper}, offset={offset}")
+     return result
+
+
+ def build_interface() -> gr.Interface:
+     return gr.Interface(
+         fn=Web_Fetch,
+         inputs=[
+             gr.Textbox(label="URL", placeholder="https://example.com/article", max_lines=1),
+             gr.Slider(minimum=0, maximum=20000, value=3000, step=100, label="Max Characters", info="0 = no limit (full page), default 3000"),
+             gr.Textbox(
+                 label="Strip Selectors",
+                 placeholder=".header, .footer, nav, .sidebar",
+                 value="",
+                 max_lines=1,
+                 info="CSS selectors to remove (comma-separated)",
+             ),
+             gr.Checkbox(label="URL Scraper", value=False, info="Extract only links instead of content"),
+             gr.Slider(
+                 minimum=0,
+                 maximum=100000,
+                 value=0,
+                 step=100,
+                 label="Offset",
+                 info="Character offset to start from (use next_cursor from previous call for pagination)",
+             ),
+         ],
+         outputs=gr.Markdown(label="Extracted Content"),
+         title="Web Fetch",
+         description=(
+             "<div style=\"text-align:center\">Convert any webpage to clean Markdown format with precision controls, "
+             "or extract all links. Supports custom element removal, length limits, and pagination with offset.</div>"
+         ),
+         api_description=(
+             "Fetch a web page and return it converted to Markdown format or extract links with configurable options. "
+             "Includes enhanced truncation with detailed metadata and pagination support via offset parameter. "
+             "Parameters: url (str - absolute URL), max_chars (int - 0=no limit, default 3000), "
+             "strip_selectors (str - CSS selectors to remove, comma-separated), "
+             "url_scraper (bool - extract only links instead of content, default False), "
+             "offset (int - character offset for pagination, use next_cursor from previous call). "
+             "When content is truncated, returns detailed metadata including truncated status, character counts, "
+             "and next_cursor for continuation. When url_scraper=True, returns formatted list of all links found on the page."
+         ),
+         flagging_mode="never",
+     )
+
+
+ __all__ = [
+     "Web_Fetch",
+     "build_interface",
+     "_http_get_enhanced",
+     "_fullpage_markdown_from_soup",
+ ]
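
A minimal usage sketch of the pagination contract this tool exposes, assuming the module's imports resolve (it pulls rate-limiting and logging helpers from app) and that the target URL serves HTML; the URL and the offset value below are illustrative placeholders, not values from this commit:

from Modules.Web_Fetch import Web_Fetch

# First call: up to 3000 characters of Markdown. If the page is longer, the
# returned text ends with a truncation notice that reports a next_cursor position.
first_page = Web_Fetch("https://example.com/article", max_chars=3000)

# Continuation: pass the reported next_cursor as offset to resume where the
# previous call stopped (2871 stands in for whatever value was reported).
second_page = Web_Fetch("https://example.com/article", max_chars=3000, offset=2871)

# Link-extraction mode: returns a Markdown list of the page's anchors instead of content.
links = Web_Fetch("https://example.com/article", url_scraper=True, max_chars=0)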