# WebCrawler · Dataset Builder — Gradio Space (Hugging Face Spaces)
| import gradio as gr | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from urllib.parse import urljoin, urlparse | |
| import time | |
| import json | |
| import csv | |
| import io | |
| import re | |
| from collections import deque | |
| from langdetect import detect, DetectorFactory | |
| from langdetect.lang_detect_exception import LangDetectException | |
| DetectorFactory.seed = 42 | |
| LANGUAGE_NAMES = { | |
| "en": "English", "fr": "French", "de": "German", "es": "Spanish", | |
| "it": "Italian", "pt": "Portuguese", "nl": "Dutch", "pl": "Polish", | |
| "ru": "Russian", "zh-cn": "Chinese (Simplified)", "zh-tw": "Chinese (Traditional)", | |
| "ja": "Japanese", "ko": "Korean", "ar": "Arabic", "hi": "Hindi", | |
| "tr": "Turkish", "sv": "Swedish", "da": "Danish", "fi": "Finnish", | |
| "no": "Norwegian", "cs": "Czech", "hu": "Hungarian", "ro": "Romanian", | |
| "uk": "Ukrainian", "vi": "Vietnamese", "th": "Thai", "id": "Indonesian", | |
| "ms": "Malay", "bg": "Bulgarian", "hr": "Croatian", "sk": "Slovak", | |
| "sl": "Slovenian", "lt": "Lithuanian", "lv": "Latvian", "et": "Estonian", | |
| "he": "Hebrew", "fa": "Persian", "bn": "Bengali", "ur": "Urdu", | |
| "sw": "Swahili", "ca": "Catalan", "af": "Afrikaans", | |
| } | |
| HEADERS = { | |
| "User-Agent": "Mozilla/5.0 (compatible; DatasetCrawler/1.0; +https://huggingface.co/spaces)", | |
| "Accept-Language": "en-US,en;q=0.9", | |
| } | |
| def clean_text(text): | |
| text = re.sub(r'\s+', ' ', text).strip() | |
| text = re.sub(r'[^\S\n]+', ' ', text) | |
| return text | |
| def extract_text_from_page(html, url): | |
| soup = BeautifulSoup(html, "html.parser") | |
| for tag in soup(["script", "style", "nav", "footer", "header", "aside", "noscript", "form"]): | |
| tag.decompose() | |
| paragraphs = [] | |
| for tag in soup.find_all(["p", "article", "section", "main", "h1", "h2", "h3", "blockquote", "li"]): | |
| text = clean_text(tag.get_text(separator=" ")) | |
| if len(text) > 50: | |
| paragraphs.append(text) | |
| title_tag = soup.find("title") | |
| title = clean_text(title_tag.get_text()) if title_tag else "" | |
| return title, paragraphs | |
| def detect_language(text): | |
| try: | |
| return detect(text[:500]) | |
| except LangDetectException: | |
| return "unknown" | |
| def topic_matches(text, keywords): | |
| if not keywords: | |
| return True | |
| text_lower = text.lower() | |
| return any(kw.lower() in text_lower for kw in keywords) | |
| def crawl( | |
| seed_urls_text, | |
| target_language, | |
| topic_keywords_text, | |
| max_pages, | |
| max_depth, | |
| delay, | |
| min_text_length, | |
| progress=gr.Progress(track_tqdm=False), | |
| ): | |
| seed_urls = [u.strip() for u in seed_urls_text.strip().split("\n") if u.strip()] | |
| if not seed_urls: | |
| return "β οΈ Please provide at least one seed URL.", "", None, None | |
| topic_keywords = [k.strip() for k in topic_keywords_text.split(",") if k.strip()] if topic_keywords_text.strip() else [] | |
| lang_filter = target_language if target_language != "any" else None | |
| visited = set() | |
| queue = deque() | |
| for url in seed_urls: | |
| queue.append((url, 0)) | |
| collected = [] | |
| logs = [] | |
| page_count = 0 | |
| progress(0, desc="Starting crawl...") | |
| while queue and page_count < max_pages: | |
| url, depth = queue.popleft() | |
| if url in visited or depth > max_depth: | |
| continue | |
| visited.add(url) | |
| try: | |
| resp = requests.get(url, headers=HEADERS, timeout=10) | |
| if "text/html" not in resp.headers.get("content-type", ""): | |
| continue | |
| resp.encoding = resp.apparent_encoding | |
| html = resp.text | |
| except Exception as e: | |
| logs.append(f"β Failed: {url} β {e}") | |
| continue | |
| title, paragraphs = extract_text_from_page(html, url) | |
| full_text = " ".join(paragraphs) | |
| if len(full_text) < min_text_length: | |
| logs.append(f"β Skipped (too short): {url}") | |
| continue | |
| detected_lang = detect_language(full_text) | |
| if lang_filter and detected_lang != lang_filter: | |
| logs.append(f"β Skipped (lang={detected_lang}): {url}") | |
| continue | |
| if not topic_matches(full_text, topic_keywords): | |
| logs.append(f"β Skipped (topic mismatch): {url}") | |
| continue | |
| collected.append({ | |
| "url": url, | |
| "title": title, | |
| "language": detected_lang, | |
| "word_count": len(full_text.split()), | |
| "paragraphs": paragraphs, | |
| "text": full_text, | |
| }) | |
| page_count += 1 | |
| logs.append(f"β [{page_count}/{max_pages}] {title[:60] or url} (lang={detected_lang}, words={len(full_text.split())})") | |
| progress(page_count / max_pages, desc=f"Crawled {page_count}/{max_pages} pages") | |
| # Enqueue links | |
| if depth < max_depth: | |
| try: | |
| soup = BeautifulSoup(html, "html.parser") | |
| base_domain = urlparse(url).netloc | |
| for a in soup.find_all("a", href=True): | |
| href = urljoin(url, a["href"]) | |
| parsed = urlparse(href) | |
| if parsed.scheme in ("http", "https") and parsed.netloc == base_domain: | |
| clean = parsed._replace(fragment="").geturl() | |
| if clean not in visited: | |
| queue.append((clean, depth + 1)) | |
| except: | |
| pass | |
| time.sleep(delay) | |
| # Build outputs | |
| stats = f"""## π Crawl Complete | |
| | Metric | Value | | |
| |--------|-------| | |
| | Pages crawled | {page_count} | | |
| | URLs visited | {len(visited)} | | |
| | Text samples collected | {len(collected)} | | |
| | Total words | {sum(d['word_count'] for d in collected):,} | | |
| | Language filter | {LANGUAGE_NAMES.get(lang_filter, lang_filter) if lang_filter else 'Any'} | | |
| | Topic keywords | {', '.join(topic_keywords) if topic_keywords else 'None (all topics)'} | | |
| """ | |
| log_text = "\n".join(logs[-200:]) # last 200 log lines | |
| # JSON output | |
| json_data = json.dumps( | |
| [{"url": d["url"], "title": d["title"], "language": d["language"], "text": d["text"]} for d in collected], | |
| ensure_ascii=False, | |
| indent=2 | |
| ) | |
| # CSV output | |
| csv_buf = io.StringIO() | |
| writer = csv.DictWriter(csv_buf, fieldnames=["url", "title", "language", "word_count", "text"]) | |
| writer.writeheader() | |
| for d in collected: | |
| writer.writerow({"url": d["url"], "title": d["title"], "language": d["language"], "word_count": d["word_count"], "text": d["text"][:5000]}) | |
| csv_data = csv_buf.getvalue() | |
| # Save files | |
| json_path = "/tmp/crawled_dataset.json" | |
| csv_path = "/tmp/crawled_dataset.csv" | |
| with open(json_path, "w", encoding="utf-8") as f: | |
| f.write(json_data) | |
| with open(csv_path, "w", encoding="utf-8") as f: | |
| f.write(csv_data) | |
| preview_rows = [] | |
| for d in collected[:5]: | |
| preview_rows.append([d["url"], d["title"][:50], d["language"], d["word_count"], d["text"][:200] + "..."]) | |
| return stats, log_text, json_path, csv_path, preview_rows | |
| # ββ UI βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| THEME = gr.themes.Base( | |
| primary_hue="emerald", | |
| secondary_hue="teal", | |
| neutral_hue="zinc", | |
| font=[gr.themes.GoogleFont("IBM Plex Mono"), gr.themes.GoogleFont("IBM Plex Sans"), "sans-serif"], | |
| ).set( | |
| body_background_fill="#0f1117", | |
| body_text_color="#e2e8f0", | |
| block_background_fill="#1a1f2e", | |
| block_border_color="#2d3748", | |
| input_background_fill="#0d1117", | |
| input_border_color="#374151", | |
| ) | |
| css = """ | |
| :root { | |
| --accent: #10b981; | |
| --accent-dim: #065f46; | |
| --bg-card: #1a1f2e; | |
| --text-muted: #6b7280; | |
| } | |
| .gradio-container { max-width: 1200px !important; margin: auto; } | |
| h1.title { | |
| font-family: 'IBM Plex Mono', monospace !important; | |
| font-size: 2rem; | |
| color: #10b981; | |
| letter-spacing: -0.03em; | |
| text-align: center; | |
| margin: 1rem 0 0.25rem; | |
| } | |
| .subtitle { | |
| text-align: center; | |
| color: #6b7280; | |
| font-family: 'IBM Plex Sans', sans-serif; | |
| font-size: 0.9rem; | |
| margin-bottom: 1.5rem; | |
| } | |
| .section-label { | |
| font-family: 'IBM Plex Mono', monospace; | |
| font-size: 0.7rem; | |
| text-transform: uppercase; | |
| letter-spacing: 0.1em; | |
| color: #10b981; | |
| margin-bottom: 0.25rem; | |
| } | |
| .crawl-btn { | |
| background: linear-gradient(135deg, #10b981, #059669) !important; | |
| color: white !important; | |
| font-family: 'IBM Plex Mono', monospace !important; | |
| font-size: 1rem !important; | |
| letter-spacing: 0.05em !important; | |
| border-radius: 4px !important; | |
| height: 48px !important; | |
| } | |
| .crawl-btn:hover { | |
| background: linear-gradient(135deg, #059669, #047857) !important; | |
| transform: translateY(-1px); | |
| box-shadow: 0 4px 20px rgba(16,185,129,0.3) !important; | |
| } | |
| .stop-btn { | |
| font-family: 'IBM Plex Mono', monospace !important; | |
| } | |
| footer { display: none !important; } | |
| """ | |
| lang_choices = [("Any Language", "any")] + [(f"{v} ({k})", k) for k, v in sorted(LANGUAGE_NAMES.items(), key=lambda x: x[1])] | |
| with gr.Blocks(title="WebCrawler Β· Dataset Builder") as demo: | |
| gr.HTML(""" | |
| <h1 class='title'>βΈ WebCrawler / Dataset Builder</h1> | |
| <p class='subtitle'>Crawl the web and extract text datasets filtered by language or topic β ready for NLP & LLM training.</p> | |
| """) | |
| with gr.Row(): | |
| with gr.Column(scale=1): | |
| gr.HTML("<div class='section-label'>π Seed URLs</div>") | |
| seed_urls = gr.Textbox( | |
| label="Seed URLs (one per line)", | |
| placeholder="https://example.com\nhttps://another-site.org", | |
| lines=5, | |
| value="https://en.wikipedia.org/wiki/Artificial_intelligence", | |
| ) | |
| gr.HTML("<div class='section-label'>π€ Language Filter</div>") | |
| target_lang = gr.Dropdown( | |
| label="Target Language", | |
| choices=lang_choices, | |
| value="any", | |
| ) | |
| gr.HTML("<div class='section-label'>π·οΈ Topic Keywords (optional)</div>") | |
| topic_kw = gr.Textbox( | |
| label="Keywords (comma-separated)", | |
| placeholder="machine learning, neural network, AI", | |
| lines=2, | |
| ) | |
| with gr.Column(scale=1): | |
| gr.HTML("<div class='section-label'>βοΈ Crawl Settings</div>") | |
| max_pages = gr.Slider(label="Max Pages", minimum=1, maximum=500, value=20, step=1) | |
| max_depth = gr.Slider(label="Max Depth", minimum=0, maximum=5, value=2, step=1) | |
| delay = gr.Slider(label="Delay Between Requests (s)", minimum=0.1, maximum=5.0, value=0.5, step=0.1) | |
| min_len = gr.Slider(label="Min Text Length (chars)", minimum=100, maximum=5000, value=300, step=100) | |
| with gr.Row(): | |
| run_btn = gr.Button("βΆ START CRAWL", elem_classes="crawl-btn", variant="primary") | |
| stop_btn = gr.Button("βΉ Stop", elem_classes="stop-btn", variant="stop") | |
| with gr.Tabs(): | |
| with gr.Tab("π Summary"): | |
| stats_md = gr.Markdown("*Results will appear here after crawling.*") | |
| with gr.Tab("π Preview"): | |
| preview_table = gr.Dataframe( | |
| headers=["URL", "Title", "Lang", "Words", "Text Preview"], | |
| label="First 5 Results", | |
| wrap=True, | |
| ) | |
| with gr.Tab("π Logs"): | |
| log_box = gr.Textbox(label="Crawl Log", lines=20, max_lines=30) | |
| with gr.Tab("πΎ Download"): | |
| gr.Markdown("### Download your dataset") | |
| with gr.Row(): | |
| json_file = gr.File(label="π JSON Dataset", file_types=[".json"]) | |
| csv_file = gr.File(label="π CSV Dataset", file_types=[".csv"]) | |
| crawl_event = run_btn.click( | |
| fn=crawl, | |
| inputs=[seed_urls, target_lang, topic_kw, max_pages, max_depth, delay, min_len], | |
| outputs=[stats_md, log_box, json_file, csv_file, preview_table], | |
| ) | |
| stop_btn.click(fn=None, cancels=[crawl_event]) | |
| if __name__ == "__main__": | |
| demo.launch(theme=THEME, css=css) | |