| #!/usr/bin/env python3 | |
| from __future__ import annotations | |
| import asyncio | |
| import contextlib | |
| import inspect | |
| import threading | |
| import traceback | |
| from collections import deque | |
| from dataclasses import dataclass | |
| from datetime import datetime, timezone | |
| from html import escape | |
| from pathlib import Path | |
| from typing import Any | |
| from urllib.parse import urlsplit | |
| import huggingface_hub as hf_hub | |
if not hasattr(hf_hub, "HfFolder"):
    # Compatibility shim: newer huggingface_hub releases removed HfFolder,
    # but some dependencies still reference it.  Provide a no-op stand-in.
    # The methods are @staticmethod so both class-level calls
    # (HfFolder.get_token()) and instance calls (HfFolder().get_token())
    # work, matching how the original API was commonly invoked.
    class _CompatHfFolder:
        """No-op replacement for the removed ``huggingface_hub.HfFolder``."""

        @staticmethod
        def get_token() -> str | None:
            """Return None: this shim caches no token."""
            return None

        @staticmethod
        def save_token(token: str) -> None:
            """Discard the token; the shim persists nothing."""
            del token
            return None

        @staticmethod
        def delete_token() -> None:
            """Nothing is stored, so nothing to delete."""
            return None

    hf_hub.HfFolder = _CompatHfFolder  # type: ignore[attr-defined]
| import gradio as gr | |
| from crawler import ( | |
| MAX_SHARD_ROWS, | |
| MAX_SHARDS, | |
| NORMAL_TOTAL_WORKERS, | |
| SUPER_TOTAL_WORKERS, | |
| AsyncCrawler, | |
| CrawlerConfig, | |
| ) | |
| APP_CSS = """ | |
| :root { | |
| --bg-main: #0a0d12; | |
| --bg-surface: #151a22; | |
| --bg-panel: #1b2230; | |
| --text-main: #f0f4fb; | |
| --text-muted: #9aa4b6; | |
| --accent: #3bd9ff; | |
| --accent-2: #4cffb1; | |
| --border: #2f3a50; | |
| --shadow: 0 18px 36px rgba(0, 0, 0, 0.45); | |
| } | |
| :root[data-crawler-theme="red"] { | |
| --bg-main: #17080c; | |
| --bg-surface: #250d15; | |
| --bg-panel: #341322; | |
| --text-main: #f8e8ee; | |
| --text-muted: #d5b0c0; | |
| --accent: #7a0018; | |
| --accent-2: #8e3ff5; | |
| --border: #5a2035; | |
| } | |
| :root[data-crawler-theme="blue"] { | |
| --bg-main: #021116; | |
| --bg-surface: #08222c; | |
| --bg-panel: #0e2f3b; | |
| --text-main: #eaffff; | |
| --text-muted: #8fbcc7; | |
| --accent: #2fff9d; | |
| --accent-2: #13e5ff; | |
| --border: #1e5662; | |
| } | |
| :root[data-crawler-theme="light"] { | |
| --bg-main: #f6f7f9; | |
| --bg-surface: #ffffff; | |
| --bg-panel: #eceff2; | |
| --text-main: #111317; | |
| --text-muted: #60666f; | |
| --accent: #2a2f37; | |
| --accent-2: #868b95; | |
| --border: #d0d4db; | |
| --shadow: 0 10px 25px rgba(35, 42, 52, 0.16); | |
| } | |
| :root[data-crawler-theme="dark"] { | |
| --bg-main: #090909; | |
| --bg-surface: #141414; | |
| --bg-panel: #1d1d1d; | |
| --text-main: #f0f0f0; | |
| --text-muted: #a8a8a8; | |
| --accent: #444444; | |
| --accent-2: #686868; | |
| --border: #2b2b2b; | |
| } | |
| :root[data-crawler-theme="green"] { | |
| --bg-main: #08110b; | |
| --bg-surface: #0f1d14; | |
| --bg-panel: #17301e; | |
| --text-main: #e8f8ed; | |
| --text-muted: #97bc9f; | |
| --accent: #2ea84b; | |
| --accent-2: #185f2a; | |
| --border: #2a5d36; | |
| } | |
| :root[data-crawler-theme="sunset"] { | |
| --bg-main: #1c0f0b; | |
| --bg-surface: #2f1810; | |
| --bg-panel: #422015; | |
| --text-main: #ffeede; | |
| --text-muted: #d9b59d; | |
| --accent: #ff7f3f; | |
| --accent-2: #ff4f81; | |
| --border: #6e3525; | |
| } | |
| :root[data-crawler-theme="ocean"] { | |
| --bg-main: #04131d; | |
| --bg-surface: #092230; | |
| --bg-panel: #0e3144; | |
| --text-main: #e8fbff; | |
| --text-muted: #9fc3cf; | |
| --accent: #2cd9ff; | |
| --accent-2: #38ffcb; | |
| --border: #1b5062; | |
| } | |
| :root[data-crawler-theme="amber"] { | |
| --bg-main: #171104; | |
| --bg-surface: #2a1d07; | |
| --bg-panel: #3a2a0a; | |
| --text-main: #fff6dc; | |
| --text-muted: #d2bd84; | |
| --accent: #ffb300; | |
| --accent-2: #ffd54f; | |
| --border: #6b4d12; | |
| } | |
| :root[data-crawler-theme="graphite"] { | |
| --bg-main: #0f1012; | |
| --bg-surface: #1a1d21; | |
| --bg-panel: #242a30; | |
| --text-main: #f1f3f6; | |
| --text-muted: #adb5bf; | |
| --accent: #8fa0b7; | |
| --accent-2: #d7dce2; | |
| --border: #3a424c; | |
| } | |
| :root[data-crawler-theme="mint"] { | |
| --bg-main: #06140f; | |
| --bg-surface: #0b2319; | |
| --bg-panel: #123125; | |
| --text-main: #e8fff4; | |
| --text-muted: #a4d8bf; | |
| --accent: #3dffb2; | |
| --accent-2: #7df9d1; | |
| --border: #256347; | |
| } | |
| .gradio-container { | |
| background: | |
| radial-gradient(1200px 550px at 8% 0%, color-mix(in srgb, var(--accent) 18%, transparent), transparent), | |
| radial-gradient(900px 600px at 100% 0%, color-mix(in srgb, var(--accent-2) 14%, transparent), transparent), | |
| var(--bg-main); | |
| color: var(--text-main); | |
| } | |
| .gradio-container .block, | |
| .gradio-container .form, | |
| .gradio-container .gr-box, | |
| .gradio-container .panel-wrap { | |
| background: color-mix(in srgb, var(--bg-surface) 92%, transparent) !important; | |
| border: 1px solid var(--border) !important; | |
| box-shadow: var(--shadow); | |
| } | |
| .gradio-container h1, | |
| .gradio-container h2, | |
| .gradio-container h3, | |
| .gradio-container p, | |
| .gradio-container label, | |
| .gradio-container .prose, | |
| .gradio-container .prose * { | |
| color: var(--text-main) !important; | |
| } | |
| .gradio-container input, | |
| .gradio-container textarea, | |
| .gradio-container select { | |
| background: var(--bg-panel) !important; | |
| color: var(--text-main) !important; | |
| border: 1px solid var(--border) !important; | |
| } | |
| .gradio-container button { | |
| border: 1px solid var(--border) !important; | |
| } | |
| .gradio-container button.primary { | |
| background: linear-gradient(135deg, var(--accent), var(--accent-2)) !important; | |
| color: #0b0e13 !important; | |
| font-weight: 700; | |
| } | |
| .seed-widget, | |
| .token-widget { | |
| display: flex; | |
| flex-direction: column; | |
| gap: 0.75rem; | |
| border: 1px solid var(--border); | |
| border-radius: 0.9rem; | |
| padding: 0.85rem; | |
| background: color-mix(in srgb, var(--bg-panel) 86%, transparent); | |
| } | |
| .seed-stats, | |
| .token-stats { | |
| display: grid; | |
| grid-template-columns: repeat(3, minmax(0, 1fr)); | |
| gap: 0.6rem; | |
| } | |
| .seed-stats > span, | |
| .token-stats > span { | |
| display: block; | |
| padding: 0.55rem; | |
| border: 1px solid var(--border); | |
| border-radius: 0.6rem; | |
| background: color-mix(in srgb, var(--bg-surface) 90%, transparent); | |
| color: var(--text-main); | |
| font-size: 0.9rem; | |
| } | |
| .seed-chip-wrap { | |
| display: flex; | |
| flex-wrap: wrap; | |
| gap: 0.45rem; | |
| } | |
| .seed-chip { | |
| border: 1px solid var(--border); | |
| border-radius: 999px; | |
| padding: 0.24rem 0.7rem; | |
| color: var(--text-main); | |
| background: linear-gradient( | |
| 145deg, | |
| color-mix(in srgb, var(--accent) 20%, transparent), | |
| color-mix(in srgb, var(--accent-2) 15%, transparent) | |
| ); | |
| font-size: 0.83rem; | |
| } | |
| .seed-empty, | |
| .seed-overflow, | |
| .token-note { | |
| color: var(--text-muted); | |
| font-size: 0.83rem; | |
| padding: 0.24rem 0.3rem; | |
| } | |
| .setting-help-q { | |
| position: relative; | |
| display: inline-flex; | |
| align-items: center; | |
| justify-content: center; | |
| width: 1.05rem; | |
| height: 1.05rem; | |
| margin-left: 0.42rem; | |
| border: 1px solid var(--border); | |
| border-radius: 999px; | |
| color: var(--text-main); | |
| background: color-mix(in srgb, var(--bg-panel) 90%, transparent); | |
| font-size: 0.74rem; | |
| font-weight: 700; | |
| cursor: help; | |
| line-height: 1; | |
| vertical-align: middle; | |
| } | |
| .setting-help-tooltip { | |
| position: fixed; | |
| left: 0; | |
| top: 0; | |
| transform: translate(-50%, -100%); | |
| min-width: 180px; | |
| max-width: 320px; | |
| padding: 0.42rem 0.55rem; | |
| border: 1px solid var(--border); | |
| border-radius: 0.5rem; | |
| background: color-mix(in srgb, var(--bg-surface) 98%, transparent); | |
| color: var(--text-main); | |
| font-size: 0.74rem; | |
| font-weight: 500; | |
| line-height: 1.25; | |
| box-shadow: var(--shadow); | |
| opacity: 0; | |
| visibility: hidden; | |
| transition: opacity 120ms ease; | |
| z-index: 10000; | |
| pointer-events: none; | |
| white-space: normal; | |
| } | |
| .setting-help-tooltip.is-visible { | |
| opacity: 1; | |
| visibility: visible; | |
| } | |
| """ | |
| THEME_JS = """ | |
| (theme_name) => { | |
| const theme = theme_name || "dark"; | |
| document.documentElement.setAttribute("data-crawler-theme", theme); | |
| return []; | |
| } | |
| """ | |
| SEED_WIDGET_JS = """ | |
| (seed_text) => { | |
| const parseSeedText = (value) => { | |
| if (typeof value !== "string") return []; | |
| return value | |
| .split(/\\r?\\n/) | |
| .map((line) => line.trim()) | |
| .filter((line) => line.length > 0); | |
| }; | |
| const dedupe = (values) => { | |
| const seen = new Set(); | |
| const out = []; | |
| for (const value of values) { | |
| if (!seen.has(value)) { | |
| seen.add(value); | |
| out.push(value); | |
| } | |
| } | |
| return out; | |
| }; | |
| const domainOf = (value) => { | |
| try { | |
| return new URL(value).hostname || ""; | |
| } catch { | |
| return ""; | |
| } | |
| }; | |
| const escapeHtml = (value) => String(value) | |
| .replaceAll("&", "&") | |
| .replaceAll("<", "<") | |
| .replaceAll(">", ">") | |
| .replaceAll('"', """) | |
| .replaceAll("'", "'"); | |
| const seeds = dedupe(parseSeedText(seed_text)); | |
| const domainSet = new Set(seeds.map(domainOf).filter(Boolean)); | |
| const chips = seeds.length | |
| ? seeds.slice(0, 12).map((url) => `<span class=\"seed-chip\">${escapeHtml(url)}</span>`).join("") | |
| : '<span class=\"seed-empty\">No seed URLs configured yet.</span>'; | |
| const overflow = seeds.length > 12 | |
| ? `<span class=\"seed-overflow\">+${seeds.length - 12} more</span>` | |
| : ""; | |
| const firstUrlChars = seeds.length ? seeds[0].length : 0; | |
| return `<div class=\"seed-widget\"><div class=\"seed-stats\"><span><strong>${seeds.length}</strong> seeds</span><span><strong>${domainSet.size}</strong> domains</span><span><strong>${firstUrlChars}</strong> first-url chars</span></div><div class=\"seed-chip-wrap\">${chips}${overflow}</div></div>`; | |
| } | |
| """ | |
| SETTING_HELP_JS = """ | |
| () => { | |
| const helpByPrefix = [ | |
| ["Theme", "Switch between visual color themes."], | |
| ["Seed URL List (one URL per line)", "Provide crawl entry points. Put one URL per line; duplicates are ignored."], | |
| ["Shard Size Rows", "Rows written per parquet shard before a full shard is emitted."], | |
| ["Shard Limit", "Maximum number of shards to produce for a run (1 to 15)."], | |
| ["Max Links Per Page", "Maximum discovered links to enqueue from each parsed page."], | |
| ["Request Timeout (seconds)", "HTTP request timeout per URL."], | |
| ["Max Response Bytes", "Maximum response body bytes to read per page."], | |
| ["Upload shards to my HF repo", "Enable direct upload of produced shards to your Hugging Face Space repo."], | |
| ["HF Repo ID", "Target Hugging Face repo in owner/name format."], | |
| ["HF Token (write permissions)", "Token with write access to the target repo."], | |
| ["Private HF Repo", "Create the target repo as private if it does not exist."], | |
| ["HF Path Prefix", "Folder path inside the repo where shards are uploaded."], | |
| ["Upload incomplete shard buffers", "On crawl finish/stop, flush the current partial shard buffer and upload it too."], | |
| ]; | |
| const tooltipId = "setting-help-tooltip"; | |
| let tooltip = document.getElementById(tooltipId); | |
| if (!tooltip) { | |
| tooltip = document.createElement("div"); | |
| tooltip.id = tooltipId; | |
| tooltip.className = "setting-help-tooltip"; | |
| document.body.appendChild(tooltip); | |
| } | |
| const placeTooltip = (target) => { | |
| const rect = target.getBoundingClientRect(); | |
| const aboveTop = rect.top - 10; | |
| tooltip.style.left = `${rect.left + (rect.width / 2)}px`; | |
| if (aboveTop < 44) { | |
| tooltip.style.top = `${rect.bottom + 10}px`; | |
| tooltip.style.transform = "translate(-50%, 0)"; | |
| } else { | |
| tooltip.style.top = `${rect.top - 10}px`; | |
| tooltip.style.transform = "translate(-50%, -100%)"; | |
| } | |
| }; | |
| const showTooltip = (target) => { | |
| const message = target.getAttribute("data-help") || ""; | |
| if (!message) return; | |
| tooltip.textContent = message; | |
| placeTooltip(target); | |
| tooltip.classList.add("is-visible"); | |
| }; | |
| const hideTooltip = () => { | |
| tooltip.classList.remove("is-visible"); | |
| }; | |
| const clean = (value) => String(value || "").replace(/\\s+/g, " ").trim(); | |
| const labels = document.querySelectorAll(".gradio-container label"); | |
| for (const label of labels) { | |
| const text = clean(label.textContent); | |
| const match = helpByPrefix.find(([prefix]) => text.startsWith(prefix)); | |
| if (!match) continue; | |
| let q = label.querySelector(".setting-help-q"); | |
| if (!q) { | |
| q = document.createElement("span"); | |
| q.className = "setting-help-q"; | |
| q.textContent = "?"; | |
| label.appendChild(q); | |
| } | |
| q.setAttribute("data-help", match[1]); | |
| q.setAttribute("aria-label", match[1]); | |
| q.setAttribute("title", match[1]); | |
| q.tabIndex = 0; | |
| if (!q.hasAttribute("data-help-bound")) { | |
| q.addEventListener("mouseenter", () => showTooltip(q)); | |
| q.addEventListener("mouseleave", hideTooltip); | |
| q.addEventListener("focus", () => showTooltip(q)); | |
| q.addEventListener("blur", hideTooltip); | |
| q.setAttribute("data-help-bound", "1"); | |
| } | |
| } | |
| return []; | |
| } | |
| """ | |
def utc_now_iso() -> str:
    """Current UTC time as an ISO-8601 string with seconds precision."""
    now = datetime.now(timezone.utc)
    return now.isoformat(timespec="seconds")
def safe_queue_size(queue: Any) -> int:
    """Return ``queue.qsize()`` as an int, or -1 when it cannot be read.

    Some queue-like objects raise (or lack qsize entirely); callers treat
    -1 as "unknown".
    """
    try:
        size = int(queue.qsize())
    except Exception:
        return -1
    return size
def parse_seed_url_rows(seed_urls_input: Any) -> list[str]:
    """Normalize heterogeneous seed-URL input into a flat list of lines.

    Accepts None, a newline-separated string, a list/tuple of rows (each
    row may be a dict, list/tuple, or scalar), or a pandas-style object
    exposing ``.values``.  Returns every non-empty stripped line in
    order; duplicates are preserved.
    """
    if seed_urls_input is None:
        return []
    if isinstance(seed_urls_input, str):
        return [ln.strip() for ln in seed_urls_input.splitlines() if ln.strip()]

    if isinstance(seed_urls_input, (list, tuple)):
        rows = list(seed_urls_input)
    elif hasattr(seed_urls_input, "values"):
        # pandas.DataFrame path; degrade to empty on anything unexpected.
        try:
            rows = seed_urls_input.values.tolist()
        except Exception:
            rows = []
    else:
        rows = [seed_urls_input]

    collected: list[str] = []
    for row in rows:
        # Pick the single cell this row contributes.
        if isinstance(row, dict):
            cell = next(iter(row.values()), "")
        elif isinstance(row, (list, tuple)):
            cell = row[0] if row else ""
        else:
            cell = row
        if cell is None:
            continue
        for raw_line in str(cell).splitlines():
            stripped = raw_line.strip()
            if stripped:
                collected.append(stripped)
    return collected
def unique_preserve_order(values: list[str]) -> list[str]:
    """Drop duplicate strings, keeping the first occurrence's position."""
    # dict preserves insertion order, so fromkeys dedupes in one pass.
    return list(dict.fromkeys(values))
def collect_seed_urls(seed_urls_input: Any) -> list[str]:
    """Parse seed input and return the de-duplicated URL list, order kept."""
    parsed = parse_seed_url_rows(seed_urls_input)
    return unique_preserve_order(parsed)
def render_seed_widget_html(seed_urls_input: Any) -> str:
    """Render the seed-URL summary widget as an HTML fragment.

    Shows seed/domain counts, the first URL's length, and up to 12
    HTML-escaped URL chips with a "+N more" overflow marker.
    """
    seeds = collect_seed_urls(seed_urls_input)
    hostnames = {(urlsplit(url).hostname or "").lower().strip(".") for url in seeds}
    hostnames.discard("")
    if seeds:
        chips_html = "".join(
            f'<span class="seed-chip">{escape(url)}</span>' for url in seeds[:12]
        )
    else:
        chips_html = '<span class="seed-empty">No seed URLs configured yet.</span>'
    overflow_html = ""
    if len(seeds) > 12:
        overflow_html = f'<span class="seed-overflow">+{len(seeds) - 12} more</span>'
    first_len = len(seeds[0]) if seeds else 0
    parts = [
        '<div class="seed-widget">',
        '<div class="seed-stats">',
        f"<span><strong>{len(seeds)}</strong> seeds</span>",
        f"<span><strong>{len(hostnames)}</strong> domains</span>",
        f"<span><strong>{first_len}</strong> first-url chars</span>",
        "</div>",
        f'<div class="seed-chip-wrap">{chips_html}{overflow_html}</div>',
        "</div>",
    ]
    return "".join(parts)
def render_seed_summary_text(seed_urls_input: Any) -> str:
    """Render a plain-text summary of the seed list for a read-only textbox."""
    seeds = collect_seed_urls(seed_urls_input)
    hostnames = {(urlsplit(url).hostname or "").lower().strip(".") for url in seeds}
    hostnames.discard("")
    summary = [
        f"Seeds: {len(seeds)}",
        f"Domains: {len(hostnames)}",
        f"First URL chars: {len(seeds[0]) if seeds else 0}",
        "",
        "Seed URLs:",
    ]
    if seeds:
        summary += [f"- {url}" for url in seeds]
    else:
        summary.append("- (none)")
    return "\n".join(summary)
def render_tokenization_widget_html(snapshot: dict[str, Any]) -> str:
    """Render live-tokenization counters from a stats snapshot as HTML."""

    def _count(key: str) -> int:
        # Treat missing keys and None values as zero.
        return int(snapshot.get(key, 0) or 0)

    shards_done = _count("tokenized_shards")
    rows_done = _count("tokenized_rows")
    tokens_done = _count("tokenized_tokens")
    shards_total = _count("written_shards")
    return (
        '<div class="token-widget">'
        '<div class="token-stats">'
        f"<span><strong>{tokens_done}</strong> text tokens</span>"
        f"<span><strong>{rows_done}</strong> tokenized rows</span>"
        f"<span><strong>{shards_done}/{shards_total}</strong> tokenized shards</span>"
        "</div>"
        '<div class="token-note">Live shard tokenization uses tiktoken on the parquet <code>text</code> column.</div>'
        "</div>"
    )
def render_qvp_widget_md(snapshot: dict[str, Any]) -> str:
    """Render queue/visited/parsed counters as a Markdown snippet."""
    metrics = [
        ("Queue", snapshot.get("fetch_queue", 0)),
        ("Visited", snapshot.get("fetch_succeeded", 0)),
        ("Parsed", snapshot.get("parsed_pages", 0)),
    ]
    lines = ["### Live Metrics"]
    # Missing or None values display as zero.
    lines.extend(f"- {label}: `{int(value or 0)}`" for label, value in metrics)
    return "\n".join(lines)
def validate_hf_requirements(enable_hf_upload: bool, hf_repo_id: str, hf_token: str) -> None:
    """Raise ValueError when upload is enabled but repo id or token is blank."""
    if not enable_hf_upload:
        # Upload disabled: nothing to validate.
        return
    repo_missing = not hf_repo_id.strip()
    token_missing = not hf_token.strip()
    if repo_missing:
        raise ValueError("HF repo is required when upload is enabled.")
    if token_missing:
        raise ValueError("HF token is required when upload is enabled.")
def preflight_validate_start(
    enable_hf_upload: bool,
    hf_repo_id: str,
    hf_token: str,
) -> None:
    """Fail fast with a UI-visible error if HF upload is enabled but misconfigured."""
    if not bool(enable_hf_upload):
        return
    credentials_incomplete = not hf_repo_id.strip() or not hf_token.strip()
    if credentials_incomplete:
        raise gr.Error("HF upload is enabled. Enter both HF Repo ID and HF Token first.")
def build_crawler_config(
    *,
    seed_urls_input: Any,
    max_links_per_page: int,
    request_timeout_seconds: float,
    max_response_bytes: int,
    shard_size_rows: int,
    max_shards: int,
    enable_hf_upload: bool,
    upload_incomplete_shards: bool,
    hf_repo_id: str,
    hf_token: str,
    hf_private_repo: bool,
    hf_path_prefix: str,
    total_workers: int,
) -> CrawlerConfig:
    """Validate UI inputs and assemble a CrawlerConfig for one run.

    Raises ValueError (via validate_hf_requirements) when HF upload is
    enabled without both a repo id and a token.
    """
    validate_hf_requirements(enable_hf_upload, hf_repo_id, hf_token)
    # Shards are always written next to this file.
    shards_dir = Path(__file__).resolve().parent / "shards"
    prefix = hf_path_prefix.strip() or "crawl_shards"
    return CrawlerConfig(
        seed_urls=collect_seed_urls(seed_urls_input),
        max_links_per_page=int(max_links_per_page),
        request_timeout_seconds=float(request_timeout_seconds),
        max_response_bytes=int(max_response_bytes),
        shard_size_rows=int(shard_size_rows),
        max_shards=int(max_shards),
        output_dir=shards_dir,
        enable_hf_upload=bool(enable_hf_upload),
        upload_incomplete_shards=bool(upload_incomplete_shards),
        hf_repo_id=hf_repo_id.strip(),
        hf_token=hf_token.strip(),
        hf_private_repo=bool(hf_private_repo),
        hf_path_prefix=prefix,
        total_workers=int(total_workers),
    )
@dataclass
class RunState:
    """Mutable status of the current (or most recent) crawl run.

    The @dataclass decorator is required: CrawlerRunManager.poll() builds
    copies with keyword arguments (``RunState(run_id=..., running=...)``),
    which a plain class without a generated __init__ would reject.
    """

    run_id: int = 0
    running: bool = False
    started_at: str = ""
    finished_at: str = ""
    stop_requested: bool = False
    last_error: str = ""
class CrawlerRunManager:
    """Owns the background crawl thread and exposes thread-safe start/stop/poll.

    One crawl runs at a time on a daemon thread that drives its own asyncio
    event loop.  All mutable state (the RunState, the log ring buffer and
    the last stats snapshot) is guarded by ``self._lock``.
    """

    def __init__(self) -> None:
        self._lock = threading.Lock()
        # Background thread executing _run_crawler, or None when idle.
        self._thread: threading.Thread | None = None
        # Event loop owned by the background thread; used for threadsafe stop.
        self._loop: asyncio.AbstractEventLoop | None = None
        self._crawler: AsyncCrawler | None = None
        self._state = RunState()
        # Bounded log ring buffer rendered in the UI.
        self._logs: deque[str] = deque(maxlen=600)
        # Most recent stats snapshot; outlives the crawler object itself.
        self._last_snapshot: dict[str, Any] | None = None

    def start(self, config: CrawlerConfig) -> str:
        """Launch a new crawl thread; refuses while one is already running."""
        with self._lock:
            if self._thread is not None and self._thread.is_alive():
                return "A crawl is already running. Stop it before starting another one."
            # Reset per-run state under the lock before spawning the thread.
            self._state.run_id += 1
            self._state.running = True
            self._state.started_at = utc_now_iso()
            self._state.finished_at = ""
            self._state.stop_requested = False
            self._state.last_error = ""
            self._last_snapshot = None
            self._logs.clear()
            run_id = self._state.run_id
            self._logs.append(
                f"[{utc_now_iso()}] Started run #{run_id} with {config.total_workers} workers "
                f"({config.fetch_workers} fetch / {config.parser_workers} parser)."
            )
            self._thread = threading.Thread(
                target=self._run_crawler,
                args=(run_id, config),
                daemon=True,
                name=f"crawler-run-{run_id}",
            )
            self._thread.start()
            return f"Run #{run_id} started."

    def stop(self) -> str:
        """Request a cooperative stop of the active run, if any."""
        with self._lock:
            if self._thread is None or not self._thread.is_alive():
                return "No active crawl to stop."
            self._state.stop_requested = True
            # Capture references under the lock; signal outside it.
            crawler = self._crawler
            loop = self._loop
            run_id = self._state.run_id
            self._logs.append(f"[{utc_now_iso()}] Stop requested for run #{run_id}")
        # Prefer a threadsafe call into the crawl's own event loop.
        if crawler is not None and loop is not None and loop.is_running():
            loop.call_soon_threadsafe(crawler.request_stop, "user_requested_stop")
        elif crawler is not None:
            crawler.request_stop("user_requested_stop")
        return f"Stop signal sent to run #{run_id}."

    def _run_crawler(self, run_id: int, config: CrawlerConfig) -> None:
        """Thread target: run one crawl to completion on a private event loop."""
        loop: asyncio.AbstractEventLoop | None = None
        try:
            crawler = AsyncCrawler(config)
            if hasattr(asyncio, "Runner"):
                # Python 3.11+: asyncio.Runner manages loop lifecycle.
                with asyncio.Runner() as runner:  # type: ignore[attr-defined]
                    loop = runner.get_loop()
                    with self._lock:
                        # Publish only if this is still the current run.
                        if self._state.run_id == run_id:
                            self._crawler = crawler
                            self._loop = loop
                    runner.run(crawler.run())
            else:
                # Older Pythons: manage the loop manually.
                loop = asyncio.new_event_loop()
                asyncio.set_event_loop(loop)
                with self._lock:
                    if self._state.run_id == run_id:
                        self._crawler = crawler
                        self._loop = loop
                loop.run_until_complete(crawler.run())
            final_snapshot = self._snapshot_from_crawler(crawler)
            with self._lock:
                if self._state.run_id == run_id:
                    self._last_snapshot = final_snapshot
                    self._logs.append(f"[{utc_now_iso()}] Run #{run_id} completed")
        except Exception:
            # Keep the full traceback so the UI can display it.
            error_text = traceback.format_exc(limit=20)
            with self._lock:
                self._state.last_error = error_text
                self._logs.append(f"[{utc_now_iso()}] Run #{run_id} crashed")
        finally:
            with self._lock:
                if self._state.run_id == run_id:
                    self._state.running = False
                    self._state.finished_at = utc_now_iso()
                    self._crawler = None
                    self._loop = None
            if loop is not None and not loop.is_closed():
                loop.close()
                with contextlib.suppress(Exception):
                    asyncio.set_event_loop(None)

    def _snapshot_from_crawler(self, crawler: AsyncCrawler) -> dict[str, Any]:
        """Collect a point-in-time stats dict from a live crawler."""
        stats = crawler.stats
        return {
            "timestamp": utc_now_iso(),
            "workers_total": crawler.config.total_workers,
            "workers_split": f"{crawler.config.fetch_workers}/{crawler.config.parser_workers}",
            "stop_reason": crawler.stop_reason or "-",
            "fetch_succeeded": stats.fetch_succeeded,
            "parsed_pages": stats.parsed_pages,
            "written_shards": stats.written_shards,
            "tokenized_shards": stats.tokenized_shards,
            "tokenized_rows": stats.tokenized_rows,
            "tokenized_tokens": stats.tokenized_tokens,
            "fetch_queue": safe_queue_size(crawler.fetch_queue),
            "parse_queue": safe_queue_size(crawler.parse_queue),
            "record_queue": safe_queue_size(crawler.record_queue),
            "stop_event": crawler.stop_event.is_set(),
        }

    def poll(self) -> tuple[str, dict[str, Any], str]:
        """Return (status markdown, latest stats snapshot, joined log text)."""
        with self._lock:
            crawler = self._crawler
            # Copy the state so rendering happens without holding the lock.
            state = RunState(
                run_id=self._state.run_id,
                running=self._state.running,
                started_at=self._state.started_at,
                finished_at=self._state.finished_at,
                stop_requested=self._state.stop_requested,
                last_error=self._state.last_error,
            )
        if crawler is not None:
            # Snapshot outside the lock (may touch crawler internals), then publish.
            snapshot = self._snapshot_from_crawler(crawler)
            with self._lock:
                self._last_snapshot = snapshot
        with self._lock:
            # Fall back to an all-zero snapshot before the first run.
            latest = self._last_snapshot or {
                "timestamp": utc_now_iso(),
                "workers_total": 0,
                "workers_split": "-",
                "stop_reason": "-",
                "fetch_succeeded": 0,
                "parsed_pages": 0,
                "written_shards": 0,
                "tokenized_shards": 0,
                "tokenized_rows": 0,
                "tokenized_tokens": 0,
                "fetch_queue": 0,
                "parse_queue": 0,
                "record_queue": 0,
                "stop_event": False,
            }
            logs_text = "\n".join(self._logs)
        status_lines = [
            "### Crawler Status",
            f"- Run ID: `{state.run_id}`",
            f"- Running: `{state.running}`",
            f"- Stop requested: `{state.stop_requested}`",
            f"- Started at (UTC): `{state.started_at or '-'}`",
            f"- Finished at (UTC): `{state.finished_at or '-'}`",
        ]
        if state.last_error:
            status_lines.append("- Last error:")
            status_lines.append("```text")
            status_lines.append(state.last_error.strip())
            status_lines.append("```")
        return "\n".join(status_lines), latest, logs_text
# Process-wide singleton: every UI callback below shares this one manager.
RUN_MANAGER = CrawlerRunManager()
def _format_dashboard_response(
    status: str,
    snapshot: dict[str, Any],
    logs: str,
) -> tuple[str, str, str, str]:
    """Bundle poll results into the (status, metrics, logs, tokenization) outputs."""
    metrics_md = render_qvp_widget_md(snapshot)
    token_html = render_tokenization_widget_html(snapshot)
    return (status, metrics_md, logs, token_html)
def _start_crawl(
    *,
    total_workers: int,
    seed_urls_input: Any,
    max_links_per_page: int,
    request_timeout_seconds: float,
    max_response_bytes: int,
    shard_size_rows: int,
    max_shards: int,
    enable_hf_upload: bool,
    upload_incomplete_shards: bool,
    hf_repo_id: str,
    hf_token: str,
    hf_private_repo: bool,
    hf_path_prefix: str,
) -> tuple[str, str, str, str]:
    """Build a config, launch a crawl, and return the refreshed dashboard."""
    try:
        config = build_crawler_config(
            seed_urls_input=seed_urls_input,
            max_links_per_page=max_links_per_page,
            request_timeout_seconds=request_timeout_seconds,
            max_response_bytes=max_response_bytes,
            shard_size_rows=shard_size_rows,
            max_shards=max_shards,
            enable_hf_upload=enable_hf_upload,
            upload_incomplete_shards=upload_incomplete_shards,
            hf_repo_id=hf_repo_id,
            hf_token=hf_token,
            hf_private_repo=hf_private_repo,
            hf_path_prefix=hf_path_prefix,
            total_workers=total_workers,
        )
    except ValueError as exc:
        # Surface configuration problems as a Gradio-visible error.
        raise gr.Error(str(exc)) from exc
    start_message = RUN_MANAGER.start(config)
    status, snapshot, logs = RUN_MANAGER.poll()
    return _format_dashboard_response(f"{status}\n\n{start_message}", snapshot, logs)
def start_crawl_standard(
    seed_urls_input: Any,
    max_links_per_page: int,
    request_timeout_seconds: float,
    max_response_bytes: int,
    shard_size_rows: int,
    max_shards: int,
    enable_hf_upload: bool,
    upload_incomplete_shards: bool,
    hf_repo_id: str,
    hf_token: str,
    hf_private_repo: bool,
    hf_path_prefix: str,
) -> tuple[str, str, str, str]:
    """UI handler: start a crawl with the standard worker count."""
    shared_kwargs = dict(
        seed_urls_input=seed_urls_input,
        max_links_per_page=max_links_per_page,
        request_timeout_seconds=request_timeout_seconds,
        max_response_bytes=max_response_bytes,
        shard_size_rows=shard_size_rows,
        max_shards=max_shards,
        enable_hf_upload=enable_hf_upload,
        upload_incomplete_shards=upload_incomplete_shards,
        hf_repo_id=hf_repo_id,
        hf_token=hf_token,
        hf_private_repo=hf_private_repo,
        hf_path_prefix=hf_path_prefix,
    )
    return _start_crawl(total_workers=NORMAL_TOTAL_WORKERS, **shared_kwargs)
def start_crawl_super(
    seed_urls_input: Any,
    max_links_per_page: int,
    request_timeout_seconds: float,
    max_response_bytes: int,
    shard_size_rows: int,
    max_shards: int,
    enable_hf_upload: bool,
    upload_incomplete_shards: bool,
    hf_repo_id: str,
    hf_token: str,
    hf_private_repo: bool,
    hf_path_prefix: str,
) -> tuple[str, str, str, str]:
    """UI handler: start a crawl with the "super mode" worker count."""
    shared_kwargs = dict(
        seed_urls_input=seed_urls_input,
        max_links_per_page=max_links_per_page,
        request_timeout_seconds=request_timeout_seconds,
        max_response_bytes=max_response_bytes,
        shard_size_rows=shard_size_rows,
        max_shards=max_shards,
        enable_hf_upload=enable_hf_upload,
        upload_incomplete_shards=upload_incomplete_shards,
        hf_repo_id=hf_repo_id,
        hf_token=hf_token,
        hf_private_repo=hf_private_repo,
        hf_path_prefix=hf_path_prefix,
    )
    return _start_crawl(total_workers=SUPER_TOTAL_WORKERS, **shared_kwargs)
def stop_crawl() -> tuple[str, str, str, str]:
    """UI handler: request a stop and return the refreshed dashboard."""
    stop_message = RUN_MANAGER.stop()
    status, snapshot, logs = RUN_MANAGER.poll()
    combined_status = f"{status}\n\n{stop_message}"
    return _format_dashboard_response(combined_status, snapshot, logs)
def poll_dashboard() -> tuple[str, str, str, str]:
    """Timer/refresh handler: re-render the dashboard from current state."""
    # poll() yields (status, snapshot, logs) in the order the formatter expects.
    return _format_dashboard_response(*RUN_MANAGER.poll())
def render_seed_widget(seed_urls_input: Any) -> str:
    """Textbox-friendly wrapper around the plain-text seed summary renderer."""
    summary = render_seed_summary_text(seed_urls_input)
    return summary
def noop_event(*_args: Any) -> None:
    """Placeholder event handler: accepts anything, does nothing."""
    return None
def toggle_hf_fields(enable_hf_upload: bool) -> tuple[Any, Any, Any, Any, Any]:
    """Show or hide the five HF-upload fields together.

    Returns the same visibility update for repo id, token, private flag,
    path prefix and incomplete-shard checkbox.
    """
    visibility = gr.update(visible=enable_hf_upload)
    return (visibility,) * 5
def build_ui() -> gr.Blocks:
    """Assemble the crawler dashboard as a Gradio Blocks app.

    Layout: theme picker + mode summary, seed-URL editor with a live summary
    and tokenization panel, crawl tuning sliders, an optional Hugging Face
    upload accordion, start/super/stop/refresh controls, and status/metrics/log
    widgets refreshed by a 1-second timer.

    Returns:
        The fully wired ``gr.Blocks`` application (not yet queued/launched).
    """
    defaults = CrawlerConfig(
        seed_urls=[
            "https://en.wikipedia.org/wiki/Main_Page",
            "https://docs.python.org/3/",
            "https://developer.mozilla.org/en-US/",
            "https://www.nasa.gov/",
        ]
    )
    default_seed_text = "\n".join(defaults.seed_urls)
    with gr.Blocks(title="DataMuncherLabs AutoWS") as demo:
        gr.Markdown("# DataMuncherLabs AutoWS")
        gr.Markdown("Async web crawler dashboard with live parquet text tokenization.")
        with gr.Row():
            theme_name = gr.Dropdown(
                choices=[
                    "red",
                    "blue",
                    "light",
                    "dark",
                    "green",
                    "sunset",
                    "ocean",
                    "amber",
                    "graphite",
                    "mint",
                ],
                value="dark",
                label="Theme",
                interactive=True,
            )
            # Derive the advertised totals from the imported crawler constants
            # so the UI cannot drift from the actual worker counts.
            # NOTE(review): the fetch/parse split shown below is still
            # hard-coded; confirm it matches the crawler's internal split.
            gr.Markdown(
                f"- Standard mode: **{NORMAL_TOTAL_WORKERS} threads** (`10 fetch`, `2 parse`)\n"
                f"- Super mode: **{SUPER_TOTAL_WORKERS} threads** (`20 fetch`, `4 parse`)"
            )
        with gr.Row():
            with gr.Column(scale=2):
                seed_urls_input = gr.Textbox(
                    lines=10,
                    value=default_seed_text,
                    interactive=True,
                    label="Seed URL List (one URL per line)",
                    placeholder="https://example.com",
                )
                seed_widget_html = gr.Textbox(
                    label="Seed URL Summary",
                    value=render_seed_summary_text(default_seed_text),
                    lines=10,
                    interactive=False,
                )
                token_widget_html = gr.HTML(
                    label="Live Tokenization",
                    value=render_tokenization_widget_html({}),
                )
            with gr.Column(scale=1):
                # Sliders are clamped to the crawler's hard limits so the UI
                # cannot submit an out-of-range configuration.
                shard_size_rows = gr.Slider(
                    label=f"Shard Size Rows (max {MAX_SHARD_ROWS})",
                    minimum=100,
                    maximum=MAX_SHARD_ROWS,
                    step=100,
                    value=min(defaults.shard_size_rows, MAX_SHARD_ROWS),
                )
                max_shards = gr.Slider(
                    label=f"Shard Limit (1-{MAX_SHARDS})",
                    minimum=1,
                    maximum=MAX_SHARDS,
                    step=1,
                    value=min(defaults.max_shards, MAX_SHARDS),
                )
                max_links_per_page = gr.Slider(
                    label="Max Links Per Page",
                    minimum=10,
                    maximum=1000,
                    step=10,
                    value=defaults.max_links_per_page,
                )
                request_timeout_seconds = gr.Slider(
                    label="Request Timeout (seconds)",
                    minimum=3,
                    maximum=60,
                    step=1,
                    value=defaults.request_timeout_seconds,
                )
                max_response_bytes = gr.Slider(
                    label="Max Response Bytes",
                    minimum=500_000,
                    maximum=8_000_000,
                    step=100_000,
                    value=defaults.max_response_bytes,
                )
                with gr.Accordion("Hugging Face Upload", open=False):
                    enable_hf_upload = gr.Checkbox(
                        label="Upload shards to my HF repo",
                        value=False,
                    )
                    # The five fields below start hidden and are toggled as a
                    # group by the checkbox (see toggle_hf_fields wiring).
                    hf_repo_id = gr.Textbox(
                        label="HF Repo ID",
                        placeholder="username/dataset-name",
                        visible=False,
                    )
                    hf_token = gr.Textbox(
                        label="HF Token (write permissions)",
                        type="password",
                        placeholder="hf_xxx",
                        visible=False,
                    )
                    hf_private_repo = gr.Checkbox(
                        label="Private HF Repo",
                        value=False,
                        visible=False,
                    )
                    hf_path_prefix = gr.Textbox(
                        label="HF Path Prefix",
                        value="crawl_shards",
                        visible=False,
                    )
                    upload_incomplete_shards = gr.Checkbox(
                        label="Upload incomplete shard buffers",
                        value=False,
                        visible=False,
                    )
        with gr.Row():
            # Button labels reuse the imported worker totals so they stay
            # consistent with the crawler configuration.
            start_button = gr.Button(
                f"Start Crawl ({NORMAL_TOTAL_WORKERS} Threads)", variant="primary"
            )
            super_button = gr.Button(
                f"Super Mode ({SUPER_TOTAL_WORKERS} Threads)", variant="primary"
            )
            stop_button = gr.Button("Stop Crawl", variant="stop")
            refresh_button = gr.Button("Refresh")
        status_md = gr.Markdown("### Crawler Status\n- Run ID: `0`\n- Running: `False`")
        qvp_md = gr.Markdown("### Live Metrics\n- Queue: `0`\n- Visited: `0`\n- Parsed: `0`")
        logs_box = gr.Textbox(label="Run Log", lines=12, interactive=False)
        # Input/output bindings shared by both start buttons and the pollers.
        start_inputs = [
            seed_urls_input,
            max_links_per_page,
            request_timeout_seconds,
            max_response_bytes,
            shard_size_rows,
            max_shards,
            enable_hf_upload,
            upload_incomplete_shards,
            hf_repo_id,
            hf_token,
            hf_private_repo,
            hf_path_prefix,
        ]
        outputs = [status_md, qvp_md, logs_box, token_widget_html]
        # Validation runs unqueued first; the crawl start is chained after it.
        start_button.click(
            preflight_validate_start,
            inputs=[enable_hf_upload, hf_repo_id, hf_token],
            outputs=[],
            queue=False,
        ).then(start_crawl_standard, inputs=start_inputs, outputs=outputs)
        super_button.click(
            preflight_validate_start,
            inputs=[enable_hf_upload, hf_repo_id, hf_token],
            outputs=[],
            queue=False,
        ).then(start_crawl_super, inputs=start_inputs, outputs=outputs)
        stop_button.click(stop_crawl, inputs=[], outputs=outputs)
        refresh_button.click(poll_dashboard, inputs=[], outputs=outputs)
        enable_hf_upload.change(
            toggle_hf_fields,
            inputs=enable_hf_upload,
            outputs=[
                hf_repo_id,
                hf_token,
                hf_private_repo,
                hf_path_prefix,
                upload_incomplete_shards,
            ],
        )
        seed_urls_input.change(
            fn=render_seed_widget,
            inputs=[seed_urls_input],
            outputs=[seed_widget_html],
            queue=False,
        )
        # Theme switching happens purely client-side via THEME_JS.
        theme_name.change(
            fn=noop_event,
            inputs=[theme_name],
            outputs=[],
            js=THEME_JS,
            queue=False,
        )
        demo.load(fn=noop_event, inputs=[], outputs=[], js=SETTING_HELP_JS, queue=False)
        demo.load(
            fn=noop_event,
            inputs=[],
            outputs=[],
            js='() => { document.documentElement.setAttribute("data-crawler-theme", "dark"); }',
            queue=False,
        )
        demo.load(
            fn=render_seed_widget,
            inputs=[seed_urls_input],
            outputs=[seed_widget_html],
            queue=False,
        )
        demo.load(fn=poll_dashboard, inputs=[], outputs=outputs)
        enable_hf_upload.change(
            fn=noop_event,
            inputs=[enable_hf_upload],
            outputs=[],
            js=SETTING_HELP_JS,
            queue=False,
        )
        upload_incomplete_shards.change(
            fn=noop_event,
            inputs=[upload_incomplete_shards],
            outputs=[],
            js=SETTING_HELP_JS,
            queue=False,
        )
        # Drives live status/metrics/log refresh once per second.
        timer = gr.Timer(value=1.0)
        timer.tick(fn=poll_dashboard, inputs=[], outputs=outputs)
    return demo
# Build the app at import time: presumably required so deployment entry points
# (e.g. Hugging Face Spaces / `gradio app.py`) can discover a module-level
# `demo` object — confirm against the hosting setup before moving into main().
demo = build_ui()
def main() -> None:
    """Queue the app and launch it with only the kwargs this gradio accepts.

    Different gradio releases expose different ``launch()`` parameters, so each
    optional argument is forwarded only when the installed signature has it.
    Factories keep construction lazy, matching the original conditional logic
    (e.g. the theme object is never built when ``theme`` is unsupported).
    """
    app = demo.queue(default_concurrency_limit=32)
    supported = inspect.signature(app.launch).parameters
    optional_kwargs: dict[str, Any] = {
        "css": lambda: APP_CSS,
        "theme": lambda: gr.themes.Default(primary_hue="green"),
        "ssr_mode": lambda: False,
    }
    launch_kwargs = {
        name: make() for name, make in optional_kwargs.items() if name in supported
    }
    app.launch(**launch_kwargs)


if __name__ == "__main__":
    main()