| from pathlib import Path |
| from subprocess import check_output |
| from sys import executable as python_executable |
|
|
| from scrapling.core.utils import log |
| from scrapling.engines.toolbelt.custom import Response |
| from scrapling.core.utils._shell import _CookieParser, _ParseHeaders |
| from scrapling.core._types import List, Optional, Dict, Tuple, Any, Callable |
|
|
| from orjson import loads as json_loads, JSONDecodeError |
|
|
| try: |
| from click import command, option, Choice, group, argument |
| except (ImportError, ModuleNotFoundError) as e: |
| raise ModuleNotFoundError( |
| "You need to install scrapling with any of the extras to enable Shell commands. See: https://scrapling.readthedocs.io/en/latest/#installation" |
| ) from e |
|
|
| __OUTPUT_FILE_HELP__ = "The output file path can be an HTML file, a Markdown file of the HTML content, or the text content itself. Use file extensions (`.html`/`.md`/`.txt`) respectively." |
| __PACKAGE_DIR__ = Path(__file__).parent |
|
|
|
|
def __Execute(cmd: List[str], help_line: str) -> None:
    """Run an installation command via subprocess after announcing it on stdout."""
    print(f"Installing {help_line}...")
    check_output(cmd, shell=False)  # shell=False: cmd is a list, no shell injection surface
|
|
|
|
def __ParseJSONData(json_string: Optional[str] = None) -> Optional[Dict[str, Any]]:
    """Parse a JSON string into a Python object.

    :param json_string: Raw JSON text; ``None`` or empty string yields ``None``.
    :return: The decoded object, or ``None`` when no input was given.
    :raises ValueError: If the string is not valid JSON.
    """
    if not json_string:
        return None

    try:
        return json_loads(json_string)
    except JSONDecodeError as err:
        # Chain the decoder error so the original parse position/context is preserved
        raise ValueError(f"Invalid JSON data '{json_string}': {err}") from err
|
|
|
|
def __Request_and_Save(
    fetcher_func: Callable[..., Response],
    url: str,
    output_file: str,
    css_selector: Optional[str] = None,
    **kwargs,
) -> None:
    """Fetch ``url`` with the given fetcher callable and write the result to disk."""
    from scrapling.core.shell import Convertor

    # Relative output paths are anchored at the current working directory
    target = Path(output_file)
    if not target.is_absolute():
        target = Path.cwd() / target

    page = fetcher_func(url, **kwargs)
    Convertor.write_content_to_file(page, str(target), css_selector)
    log.info(f"Content successfully saved to '{target}'")
|
|
|
|
def __ParseExtractArguments(
    headers: List[str], cookies: str, params: List[str], json: Optional[str] = None
) -> Tuple[Dict[str, str], Dict[str, str], Dict[str, str], Optional[Dict[str, str]]]:
    """Parse raw CLI arguments shared by the extract commands.

    :param headers: Raw ``"Key: Value"`` header strings.
    :param cookies: Cookie string in ``"name1=value1; name2=value2"`` format.
    :param params: Raw ``"key=value"`` query-parameter strings.
    :param json: Optional JSON body as a string.
    :return: ``(headers, cookies, params, json)`` each parsed into dicts
        (``json`` may be ``None``).
    :raises ValueError: If the cookie string or the JSON body cannot be parsed.
    """
    parsed_headers, parsed_cookies = _ParseHeaders(headers)
    if cookies:
        # Wrap the whole iteration: _CookieParser is what can actually raise,
        # not the dict assignment the original try/except guarded.
        try:
            for key, value in _CookieParser(cookies):
                parsed_cookies[key] = value
        except Exception as err:
            raise ValueError(f"Could not parse cookies '{cookies}': {err}") from err

    parsed_json = __ParseJSONData(json)
    # Silently skip malformed entries without "=" (same behavior as before)
    parsed_params = {}
    for param in params:
        if "=" in param:
            key, value = param.split("=", 1)
            parsed_params[key] = value

    return parsed_headers, parsed_cookies, parsed_params, parsed_json
|
|
|
|
def __BuildRequest(headers: List[str], cookies: str, params: List[str], json: Optional[str] = None, **kwargs) -> Dict:
    """Build the keyword-argument dict passed to a Fetcher request method.

    :param headers: Raw ``"Key: Value"`` header strings from the CLI.
    :param cookies: Cookie string in ``"name1=value1; name2=value2"`` format.
    :param params: Raw ``"key=value"`` query-parameter strings from the CLI.
    :param json: Optional JSON request body as a string.
    :param kwargs: Extra request options forwarded as-is (timeout, verify, ...).
    :return: A dict ready to be unpacked into a fetcher call.
    """
    parsed_headers, parsed_cookies, parsed_params, parsed_json = __ParseExtractArguments(headers, cookies, params, json)

    # headers/cookies are always present as keys, normalized to None when empty
    request_kwargs: Dict[str, Any] = {
        "headers": parsed_headers if parsed_headers else None,
        "cookies": parsed_cookies if parsed_cookies else None,
    }
    # Optional pieces are only included when they carry data
    if parsed_json:
        request_kwargs["json"] = parsed_json
    if parsed_params:
        request_kwargs["params"] = parsed_params
    if "proxy" in kwargs:
        request_kwargs["proxy"] = kwargs.pop("proxy")

    # A comma-separated impersonate value becomes a list of browser names
    if "impersonate" in kwargs and "," in (kwargs.get("impersonate") or ""):
        kwargs["impersonate"] = [browser.strip() for browser in kwargs["impersonate"].split(",")]

    return {**request_kwargs, **kwargs}
|
|
|
|
@command(help="Install all Scrapling's Fetchers dependencies")
@option(
    "-f",
    "--force",
    "force",
    is_flag=True,
    default=False,
    type=bool,
    help="Force Scrapling to reinstall all Fetchers dependencies",
)
def install(force):
    """Install Playwright browsers/deps, refresh TLD names, then drop a marker file."""
    marker = __PACKAGE_DIR__.joinpath(".scrapling_dependencies_installed")
    if not force and marker.exists():
        print("The dependencies are already installed")
        return

    __Execute(
        [python_executable, "-m", "playwright", "install", "chromium"],
        "Playwright browsers",
    )
    __Execute(
        [python_executable, "-m", "playwright", "install-deps", "chromium"],
        "Playwright dependencies",
    )

    from tld.utils import update_tld_names

    # Best-effort refresh of the TLD names cache; failures are ignored
    update_tld_names(fail_silently=True)

    # Marker file signals future runs that installation already happened
    marker.touch()
|
|
|
|
@command(help="Run Scrapling's MCP server (Check the docs for more info).")
@option(
    "--http",
    is_flag=True,
    default=False,
    help="Whether to run the MCP server in streamable-http transport or leave it as stdio (Default: False)",
)
@option(
    "--host",
    type=str,
    default="0.0.0.0",
    help="The host to use if streamable-http transport is enabled (Default: '0.0.0.0')",
)
@option(
    "--port", type=int, default=8000, help="The port to use if streamable-http transport is enabled (Default: 8000)"
)
def mcp(http, host, port):
    """Start the Scrapling MCP server with the selected transport settings."""
    from scrapling.core.ai import ScraplingMCPServer

    ScraplingMCPServer().serve(http, host, port)
|
|
|
|
@command(help="Interactive scraping console")
@option(
    "-c",
    "--code",
    "code",
    is_flag=False,
    default="",
    type=str,
    help="Evaluate the code in the shell, print the result and exit",
)
@option(
    "-L",
    "--loglevel",
    "level",
    is_flag=False,
    default="debug",
    type=Choice(["debug", "info", "warning", "error", "critical", "fatal"], case_sensitive=False),
    help="Log level (default: DEBUG)",
)
def shell(code, level):
    """Launch the interactive scraping console (or evaluate `code` and exit)."""
    from scrapling.core.shell import CustomShell

    CustomShell(code=code, log_level=level).start()
|
|
|
|
@group(
    help="Fetch web pages using various fetchers and extract full/selected HTML content as HTML, Markdown, or extract text content."
)
def extract():
    """Command group holding the page-extraction subcommands; does nothing itself."""
|
|
|
|
@extract.command(help=f"Perform a GET request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}")
@argument("url", required=True)
@argument("output_file", required=True)
@option(
    "--headers",
    "-H",
    multiple=True,
    help='HTTP headers in format "Key: Value" (can be used multiple times)',
)
@option("--cookies", help='Cookies string in format "name1=value1; name2=value2"')
@option("--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)")
@option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
@option(
    "--css-selector",
    "-s",
    help="CSS selector to extract specific content from the page. It returns all matches.",
)
@option(
    "--params",
    "-p",
    multiple=True,
    help='Query parameters in format "key=value" (can be used multiple times)',
)
@option(
    "--follow-redirects/--no-follow-redirects",
    default=True,
    help="Whether to follow redirects (default: True)",
)
@option(
    "--verify/--no-verify",
    default=True,
    help="Whether to verify SSL certificates (default: True)",
)
@option(
    "--impersonate",
    help="Browser to impersonate. Can be a single browser (e.g., chrome) or comma-separated list for random selection (e.g., chrome,firefox,safari).",
)
@option(
    "--stealthy-headers/--no-stealthy-headers",
    default=True,
    help="Use stealthy browser headers (default: True)",
)
def get(
    url,
    output_file,
    headers,
    cookies,
    timeout,
    proxy,
    css_selector,
    params,
    follow_redirects,
    verify,
    impersonate,
    stealthy_headers,
):
    """
    Perform a GET request and save the content to a file.

    :param url: Target URL for the request.
    :param output_file: Output file path (.md for Markdown, .html for HTML).
    :param headers: HTTP headers to include in the request.
    :param cookies: Cookies to use in the request.
    :param timeout: Number of seconds to wait before timing out.
    :param proxy: Proxy URL to use. (Format: "http://username:password@localhost:8030")
    :param css_selector: CSS selector to extract specific content.
    :param params: Query string parameters for the request.
    :param follow_redirects: Whether to follow redirects.
    :param verify: Whether to verify HTTPS certificates.
    :param impersonate: Browser version to impersonate.
    :param stealthy_headers: If enabled, creates and adds real browser headers.
    """

    # Fourth positional argument (json body) is None: GET requests carry no body here
    kwargs = __BuildRequest(
        headers,
        cookies,
        params,
        None,
        timeout=timeout,
        follow_redirects=follow_redirects,
        verify=verify,
        stealthy_headers=stealthy_headers,
        impersonate=impersonate,
        proxy=proxy,
    )
    # Imported inside the command body (lazy-import pattern used throughout this file)
    from scrapling.fetchers import Fetcher

    __Request_and_Save(Fetcher.get, url, output_file, css_selector, **kwargs)
|
|
|
|
@extract.command(help=f"Perform a POST request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}")
@argument("url", required=True)
@argument("output_file", required=True)
@option(
    "--data",
    "-d",
    help='Form data to include in the request body (as string, ex: "param1=value1&param2=value2")',
)
@option("--json", "-j", help="JSON data to include in the request body (as string)")
@option(
    "--headers",
    "-H",
    multiple=True,
    help='HTTP headers in format "Key: Value" (can be used multiple times)',
)
@option("--cookies", help='Cookies string in format "name1=value1; name2=value2"')
@option("--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)")
@option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
@option(
    "--css-selector",
    "-s",
    help="CSS selector to extract specific content from the page. It returns all matches.",
)
@option(
    "--params",
    "-p",
    multiple=True,
    help='Query parameters in format "key=value" (can be used multiple times)',
)
@option(
    "--follow-redirects/--no-follow-redirects",
    default=True,
    help="Whether to follow redirects (default: True)",
)
@option(
    "--verify/--no-verify",
    default=True,
    help="Whether to verify SSL certificates (default: True)",
)
@option(
    "--impersonate",
    help="Browser to impersonate. Can be a single browser (e.g., chrome) or comma-separated list for random selection (e.g., chrome,firefox,safari).",
)
@option(
    "--stealthy-headers/--no-stealthy-headers",
    default=True,
    help="Use stealthy browser headers (default: True)",
)
def post(
    url,
    output_file,
    data,
    json,
    headers,
    cookies,
    timeout,
    proxy,
    css_selector,
    params,
    follow_redirects,
    verify,
    impersonate,
    stealthy_headers,
):
    """
    Perform a POST request and save the content to a file.

    :param url: Target URL for the request.
    :param output_file: Output file path (.md for Markdown, .html for HTML).
    :param data: Form data to include in the request body. (as string, ex: "param1=value1&param2=value2")
    :param json: A JSON serializable object to include in the body of the request.
    :param headers: Headers to include in the request.
    :param cookies: Cookies to use in the request.
    :param timeout: Number of seconds to wait before timing out.
    :param proxy: Proxy URL to use.
    :param css_selector: CSS selector to extract specific content.
    :param params: Query string parameters for the request.
    :param follow_redirects: Whether to follow redirects.
    :param verify: Whether to verify HTTPS certificates.
    :param impersonate: Browser version to impersonate.
    :param stealthy_headers: If enabled, creates and adds real browser headers.
    """

    # Both `json` and `data` bodies are forwarded; `data` rides along in **kwargs
    kwargs = __BuildRequest(
        headers,
        cookies,
        params,
        json,
        timeout=timeout,
        follow_redirects=follow_redirects,
        verify=verify,
        stealthy_headers=stealthy_headers,
        impersonate=impersonate,
        proxy=proxy,
        data=data,
    )
    # Imported inside the command body (lazy-import pattern used throughout this file)
    from scrapling.fetchers import Fetcher

    __Request_and_Save(Fetcher.post, url, output_file, css_selector, **kwargs)
|
|
|
|
@extract.command(help=f"Perform a PUT request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}")
@argument("url", required=True)
@argument("output_file", required=True)
@option("--data", "-d", help="Form data to include in the request body")
@option("--json", "-j", help="JSON data to include in the request body (as string)")
@option(
    "--headers",
    "-H",
    multiple=True,
    help='HTTP headers in format "Key: Value" (can be used multiple times)',
)
@option("--cookies", help='Cookies string in format "name1=value1; name2=value2"')
@option("--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)")
@option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
@option(
    "--css-selector",
    "-s",
    help="CSS selector to extract specific content from the page. It returns all matches.",
)
@option(
    "--params",
    "-p",
    multiple=True,
    help='Query parameters in format "key=value" (can be used multiple times)',
)
@option(
    "--follow-redirects/--no-follow-redirects",
    default=True,
    help="Whether to follow redirects (default: True)",
)
@option(
    "--verify/--no-verify",
    default=True,
    help="Whether to verify SSL certificates (default: True)",
)
@option(
    "--impersonate",
    help="Browser to impersonate. Can be a single browser (e.g., chrome) or comma-separated list for random selection (e.g., chrome,firefox,safari).",
)
@option(
    "--stealthy-headers/--no-stealthy-headers",
    default=True,
    help="Use stealthy browser headers (default: True)",
)
def put(
    url,
    output_file,
    data,
    json,
    headers,
    cookies,
    timeout,
    proxy,
    css_selector,
    params,
    follow_redirects,
    verify,
    impersonate,
    stealthy_headers,
):
    """
    Perform a PUT request and save the content to a file.

    :param url: Target URL for the request.
    :param output_file: Output file path (.md for Markdown, .html for HTML).
    :param data: Form data to include in the request body.
    :param json: A JSON serializable object to include in the body of the request.
    :param headers: Headers to include in the request.
    :param cookies: Cookies to use in the request.
    :param timeout: Number of seconds to wait before timing out.
    :param proxy: Proxy URL to use.
    :param css_selector: CSS selector to extract specific content.
    :param params: Query string parameters for the request.
    :param follow_redirects: Whether to follow redirects.
    :param verify: Whether to verify HTTPS certificates.
    :param impersonate: Browser version to impersonate.
    :param stealthy_headers: If enabled, creates and adds real browser headers.
    """

    # Both `json` and `data` bodies are forwarded; `data` rides along in **kwargs
    kwargs = __BuildRequest(
        headers,
        cookies,
        params,
        json,
        timeout=timeout,
        follow_redirects=follow_redirects,
        verify=verify,
        stealthy_headers=stealthy_headers,
        impersonate=impersonate,
        proxy=proxy,
        data=data,
    )
    # Imported inside the command body (lazy-import pattern used throughout this file)
    from scrapling.fetchers import Fetcher

    __Request_and_Save(Fetcher.put, url, output_file, css_selector, **kwargs)
|
|
|
|
@extract.command(help=f"Perform a DELETE request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}")
@argument("url", required=True)
@argument("output_file", required=True)
@option(
    "--headers",
    "-H",
    multiple=True,
    help='HTTP headers in format "Key: Value" (can be used multiple times)',
)
@option("--cookies", help='Cookies string in format "name1=value1; name2=value2"')
@option("--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)")
@option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
@option(
    "--css-selector",
    "-s",
    help="CSS selector to extract specific content from the page. It returns all matches.",
)
@option(
    "--params",
    "-p",
    multiple=True,
    help='Query parameters in format "key=value" (can be used multiple times)',
)
@option(
    "--follow-redirects/--no-follow-redirects",
    default=True,
    help="Whether to follow redirects (default: True)",
)
@option(
    "--verify/--no-verify",
    default=True,
    help="Whether to verify SSL certificates (default: True)",
)
@option(
    "--impersonate",
    help="Browser to impersonate. Can be a single browser (e.g., chrome) or comma-separated list for random selection (e.g., chrome,firefox,safari).",
)
@option(
    "--stealthy-headers/--no-stealthy-headers",
    default=True,
    help="Use stealthy browser headers (default: True)",
)
def delete(
    url,
    output_file,
    headers,
    cookies,
    timeout,
    proxy,
    css_selector,
    params,
    follow_redirects,
    verify,
    impersonate,
    stealthy_headers,
):
    """
    Perform a DELETE request and save the content to a file.

    :param url: Target URL for the request.
    :param output_file: Output file path (.md for Markdown, .html for HTML).
    :param headers: Headers to include in the request.
    :param cookies: Cookies to use in the request.
    :param timeout: Number of seconds to wait before timing out.
    :param proxy: Proxy URL to use.
    :param css_selector: CSS selector to extract specific content.
    :param params: Query string parameters for the request.
    :param follow_redirects: Whether to follow redirects.
    :param verify: Whether to verify HTTPS certificates.
    :param impersonate: Browser version to impersonate.
    :param stealthy_headers: If enabled, creates and adds real browser headers.
    """

    # Fourth positional argument (json body) is None: no body is sent with DELETE here
    kwargs = __BuildRequest(
        headers,
        cookies,
        params,
        None,
        timeout=timeout,
        follow_redirects=follow_redirects,
        verify=verify,
        stealthy_headers=stealthy_headers,
        impersonate=impersonate,
        proxy=proxy,
    )
    # Imported inside the command body (lazy-import pattern used throughout this file)
    from scrapling.fetchers import Fetcher

    __Request_and_Save(Fetcher.delete, url, output_file, css_selector, **kwargs)
|
|
|
|
@extract.command(help=f"Use DynamicFetcher to fetch content with browser automation.\n\n{__OUTPUT_FILE_HELP__}")
@argument("url", required=True)
@argument("output_file", required=True)
@option(
    "--headless/--no-headless",
    default=True,
    help="Run browser in headless mode (default: True)",
)
@option(
    "--disable-resources/--enable-resources",
    default=False,
    help="Drop unnecessary resources for speed boost (default: False)",
)
@option(
    "--network-idle/--no-network-idle",
    default=False,
    help="Wait for network idle (default: False)",
)
@option(
    "--timeout",
    type=int,
    default=30000,
    help="Timeout in milliseconds (default: 30000)",
)
@option(
    "--wait",
    type=int,
    default=0,
    help="Additional wait time in milliseconds after page load (default: 0)",
)
@option(
    "--css-selector",
    "-s",
    help="CSS selector to extract specific content from the page. It returns all matches.",
)
@option("--wait-selector", help="CSS selector to wait for before proceeding")
@option("--locale", default=None, help="Specify user locale. Defaults to the system default locale.")
@option(
    "--real-chrome/--no-real-chrome",
    default=False,
    help="If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it. (default: False)",
)
@option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
@option(
    "--extra-headers",
    "-H",
    multiple=True,
    help='Extra headers in format "Key: Value" (can be used multiple times)',
)
def fetch(
    url,
    output_file,
    headless,
    disable_resources,
    network_idle,
    timeout,
    wait,
    css_selector,
    wait_selector,
    locale,
    real_chrome,
    proxy,
    extra_headers,
):
    """
    Opens up a browser and fetch content using DynamicFetcher.

    :param url: Target url.
    :param output_file: Output file path (.md for Markdown, .html for HTML).
    :param headless: Run the browser in headless/hidden or headful/visible mode.
    :param disable_resources: Drop requests of unnecessary resources for a speed boost.
    :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
    :param timeout: The timeout in milliseconds that is used in all operations and waits through the page.
    :param wait: The time (milliseconds) the fetcher will wait after everything finishes before returning.
    :param css_selector: CSS selector to extract specific content.
    :param wait_selector: Wait for a specific CSS selector to be in a specific state.
    :param locale: Set the locale for the browser.
    :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
    :param proxy: The proxy to be used with requests.
    :param extra_headers: Extra headers to add to the request.
    """

    # Second argument False presumably skips cookie extraction from the header
    # strings — TODO confirm against _ParseHeaders; cookies are discarded here.
    parsed_headers, _ = _ParseHeaders(extra_headers, False)

    # Options that always have a value are passed unconditionally
    kwargs = {
        "headless": headless,
        "disable_resources": disable_resources,
        "network_idle": network_idle,
        "timeout": timeout,
        "locale": locale,
        "real_chrome": real_chrome,
    }

    # Optional settings are only forwarded when the user actually set them
    if wait > 0:
        kwargs["wait"] = wait
    if wait_selector:
        kwargs["wait_selector"] = wait_selector
    if proxy:
        kwargs["proxy"] = proxy
    if parsed_headers:
        kwargs["extra_headers"] = parsed_headers

    # Imported inside the command body (lazy-import pattern used throughout this file)
    from scrapling.fetchers import DynamicFetcher

    __Request_and_Save(DynamicFetcher.fetch, url, output_file, css_selector, **kwargs)
|
|
|
|
@extract.command(help=f"Use StealthyFetcher to fetch content with advanced stealth features.\n\n{__OUTPUT_FILE_HELP__}")
@argument("url", required=True)
@argument("output_file", required=True)
@option(
    "--headless/--no-headless",
    default=True,
    help="Run browser in headless mode (default: True)",
)
@option(
    "--disable-resources/--enable-resources",
    default=False,
    help="Drop unnecessary resources for speed boost (default: False)",
)
@option(
    "--block-webrtc/--allow-webrtc",
    default=False,
    help="Block WebRTC entirely (default: False)",
)
@option(
    "--solve-cloudflare/--no-solve-cloudflare",
    default=False,
    help="Solve Cloudflare challenges (default: False)",
)
@option("--allow-webgl/--block-webgl", default=True, help="Allow WebGL (default: True)")
@option(
    "--network-idle/--no-network-idle",
    default=False,
    help="Wait for network idle (default: False)",
)
@option(
    "--real-chrome/--no-real-chrome",
    default=False,
    help="If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it. (default: False)",
)
@option(
    "--hide-canvas/--show-canvas",
    default=False,
    help="Add noise to canvas operations (default: False)",
)
@option(
    "--timeout",
    type=int,
    default=30000,
    help="Timeout in milliseconds (default: 30000)",
)
@option(
    "--wait",
    type=int,
    default=0,
    help="Additional wait time in milliseconds after page load (default: 0)",
)
@option(
    "--css-selector",
    "-s",
    help="CSS selector to extract specific content from the page. It returns all matches.",
)
@option("--wait-selector", help="CSS selector to wait for before proceeding")
@option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
@option(
    "--extra-headers",
    "-H",
    multiple=True,
    help='Extra headers in format "Key: Value" (can be used multiple times)',
)
def stealthy_fetch(
    url,
    output_file,
    headless,
    disable_resources,
    block_webrtc,
    solve_cloudflare,
    allow_webgl,
    network_idle,
    real_chrome,
    hide_canvas,
    timeout,
    wait,
    css_selector,
    wait_selector,
    proxy,
    extra_headers,
):
    """
    Opens up a browser with advanced stealth features and fetch content using StealthyFetcher.

    :param url: Target url.
    :param output_file: Output file path (.md for Markdown, .html for HTML).
    :param headless: Run the browser in headless/hidden, or headful/visible mode.
    :param disable_resources: Drop requests of unnecessary resources for a speed boost.
    :param block_webrtc: Blocks WebRTC entirely.
    :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges.
    :param allow_webgl: Allow WebGL (recommended to keep enabled).
    :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
    :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
    :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
    :param timeout: The timeout in milliseconds that is used in all operations and waits through the page.
    :param wait: The time (milliseconds) the fetcher will wait after everything finishes before returning.
    :param css_selector: CSS selector to extract specific content.
    :param wait_selector: Wait for a specific CSS selector to be in a specific state.
    :param proxy: The proxy to be used with requests.
    :param extra_headers: Extra headers to add to the request.
    """

    # Second argument False presumably skips cookie extraction from the header
    # strings — TODO confirm against _ParseHeaders; cookies are discarded here.
    parsed_headers, _ = _ParseHeaders(extra_headers, False)

    # Options that always have a value are passed unconditionally
    kwargs = {
        "headless": headless,
        "disable_resources": disable_resources,
        "block_webrtc": block_webrtc,
        "solve_cloudflare": solve_cloudflare,
        "allow_webgl": allow_webgl,
        "network_idle": network_idle,
        "real_chrome": real_chrome,
        "hide_canvas": hide_canvas,
        "timeout": timeout,
    }

    # Optional settings are only forwarded when the user actually set them
    if wait > 0:
        kwargs["wait"] = wait
    if wait_selector:
        kwargs["wait_selector"] = wait_selector
    if proxy:
        kwargs["proxy"] = proxy
    if parsed_headers:
        kwargs["extra_headers"] = parsed_headers

    # Imported inside the command body (lazy-import pattern used throughout this file)
    from scrapling.fetchers import StealthyFetcher

    __Request_and_Save(StealthyFetcher.fetch, url, output_file, css_selector, **kwargs)
|
|
|
|
@group()
def main():
    """Root command group for the Scrapling CLI; subcommands are attached below."""
|
|
|
|
| |
# Register the top-level commands on the root CLI group
main.add_command(install)
main.add_command(shell)
main.add_command(extract)
main.add_command(mcp)
|
|