import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, urldefrag
import os
import zipfile
import io
import mimetypes
import re
import base64
import tempfile
from collections import deque

INLINE_SIZE_LIMIT = 25 * 1024  # assets at or below 25 KB are inlined (data URIs / <style> / <script>)


def sanitize_filename(url, page=False):
    """Map a URL to a safe local filename; HTML pages get a flattened .html name, other assets keep or gain an extension."""
    parsed = urlparse(url)
    path = parsed.path if parsed.path and parsed.path != "/" else "/index"
    if page:
        # HTML pages: flatten the URL path into a single .html file at the zip root
        if path.endswith("/"):
            path += "index"
        filename = path.lstrip("/") + ".html"
    else:
        filename = os.path.basename(path)
        if not filename:
            filename = "file"
        ext = os.path.splitext(filename)[1]
        if not ext:
            filename += ".res"
    filename = re.sub(r'[^a-zA-Z0-9_.-]', '_', filename)
    return filename


def make_data_uri(content, mimetype):
    b64 = base64.b64encode(content).decode('utf-8')
    return f'data:{mimetype};base64,{b64}'


def download_asset(url, session):
    """Fetch an asset and return (bytes, mimetype); (None, None) on any failure."""
    try:
        resp = session.get(url, timeout=20, stream=True)
        resp.raise_for_status()
        content = resp.content
        mimetype = resp.headers.get("Content-Type") or mimetypes.guess_type(url)[0] or "application/octet-stream"
        return content, mimetype
    except Exception:
        return None, None


def process_css(css_content, base_url, session, asset_folder, zip_file, asset_map, status_msgs):
    """Rewrite url(...) references in a stylesheet: inline small assets, save larger ones into the zip."""
    urls = re.findall(r'url\((["\']?)(.*?)\1\)', css_content)
    for _, asset_url in urls:
        if asset_url.startswith('data:'):
            continue
        real_url = urljoin(base_url, asset_url)
        if real_url in asset_map:
            local_path = asset_map[real_url]
        else:
            content, mimetype = download_asset(real_url, session)
            if content:
                if len(content) <= INLINE_SIZE_LIMIT:
                    data_uri = make_data_uri(content, mimetype)
                    css_content = css_content.replace(asset_url, data_uri)
                    status_msgs.append(f"Inlined CSS asset: {real_url}")
                    asset_map[real_url] = data_uri
                    continue
                else:
                    filename = sanitize_filename(real_url)
                    local_path = f"{asset_folder}/{filename}"
                    zip_file.writestr(local_path, content)
                    asset_map[real_url] = local_path
                    status_msgs.append(f"Saved CSS asset: {real_url}")
            else:
                continue
        css_content = css_content.replace(asset_url, asset_map[real_url])
    return css_content


def fix_links(soup, base_url, url_to_local, domain):
    # Rewrite internal <a> hrefs to the locally saved HTML files
    for tag in soup.find_all('a', href=True):
        href = tag['href']
        parsed = urlparse(urljoin(base_url, href))
        if parsed.scheme.startswith('http') and parsed.netloc == domain:
            canonical_url = urldefrag(parsed.geturl())[0]
            if canonical_url in url_to_local:
                tag['href'] = url_to_local[canonical_url]
    # Form actions, iframe srcs, etc. could be rewritten the same way (not implemented here)
    return soup


def clone_site_multipage(start_url, max_depth=2, max_pages=20):
    status_msgs = []
    session = requests.Session()
    session.headers.update({'User-Agent': 'Mozilla/5.0 (compatible; SiteCloner/3.0)'})
    asset_folder = "assets"
    visited = set()
    url_queue = deque()
    url_queue.append((start_url, 0))
    domain = urlparse(start_url).netloc
    url_to_local = dict()
    asset_map = dict()   # asset URL -> local path, data URI, or inlined text
    html_pages = dict()  # page URL -> (raw html, soup)

    # --- Crawl: breadth-first over internal links, bounded by depth and page count ---
    while url_queue and len(visited) < max_pages:
        url, depth = url_queue.popleft()
        canonical_url = urldefrag(url)[0]
        if canonical_url in visited or depth > max_depth:
            continue
        try:
            resp = session.get(url, timeout=20)
            resp.raise_for_status()
        except Exception as e:
            status_msgs.append(f"❌ Failed to fetch: {url} ({e})")
            continue
        visited.add(canonical_url)
        soup = BeautifulSoup(resp.text, "html.parser")
        local_html_path = sanitize_filename(canonical_url, page=True)
        url_to_local[canonical_url] = local_html_path
        html_pages[canonical_url] = (resp.text, soup)
        status_msgs.append(f"Fetched: {url}")
        # Enqueue newly discovered internal links
        for tag in soup.find_all('a', href=True):
            href = tag['href']
            parsed = urlparse(urljoin(url, href))
            final_url = urldefrag(parsed.geturl())[0]
            if parsed.scheme.startswith('http') and parsed.netloc == domain and final_url not in visited:
                url_queue.append((final_url, depth + 1))

    # --- Build the zip: rewrite each page so its assets are inlined or point into assets/ ---
    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zip_file:
        for page_url, (raw_html, soup) in html_pages.items():
            base_url = page_url

            # --- CSS ---
            for tag in soup.find_all("link", rel="stylesheet"):
                href = tag.get("href")
                if href:
                    css_url = urljoin(base_url, href)
                    if css_url in asset_map:
                        # Already processed for an earlier page: either a saved file path or the inlined CSS text
                        if asset_map[css_url].startswith(f"{asset_folder}/"):
                            tag['href'] = asset_map[css_url]
                        else:
                            style_tag = soup.new_tag("style")
                            style_tag.string = asset_map[css_url]
                            tag.replace_with(style_tag)
                        continue
                    content, mimetype = download_asset(css_url, session)
                    if content:
                        try:
                            css_text = content.decode('utf-8', errors='replace')
                            new_css = process_css(css_text, css_url, session, asset_folder, zip_file, asset_map, status_msgs)
                            if len(new_css.encode('utf-8')) <= INLINE_SIZE_LIMIT:
                                style_tag = soup.new_tag("style")
                                style_tag.string = new_css
                                tag.replace_with(style_tag)
                                asset_map[css_url] = new_css
                                status_msgs.append(f"Inlined CSS: {css_url}")
                            else:
                                filename = sanitize_filename(css_url)
                                local_path = f"{asset_folder}/{filename}"
                                zip_file.writestr(local_path, new_css)
                                tag['href'] = local_path
                                asset_map[css_url] = local_path
                                status_msgs.append(f"Saved CSS: {css_url}")
                        except Exception as e:
                            status_msgs.append(f"❌ CSS error: {css_url} ({e})")
                    else:
                        status_msgs.append(f"❌ Failed CSS: {css_url}")

            # --- JS ---
            for tag in soup.find_all("script", src=True):
                src = tag.get("src")
                if src:
                    js_url = urljoin(base_url, src)
                    if js_url in asset_map:
                        # Reuse the earlier result: a saved file path or the inlined script text
                        if asset_map[js_url].startswith(f"{asset_folder}/"):
                            tag['src'] = asset_map[js_url]
                        else:
                            script_tag = soup.new_tag("script")
                            script_tag.string = asset_map[js_url]
                            tag.replace_with(script_tag)
                        continue
                    content, mimetype = download_asset(js_url, session)
                    if content:
                        if len(content) <= INLINE_SIZE_LIMIT:
                            js_text = content.decode('utf-8', errors='replace')
                            script_tag = soup.new_tag("script")
                            script_tag.string = js_text
                            tag.replace_with(script_tag)
                            asset_map[js_url] = js_text
                            status_msgs.append(f"Inlined JS: {js_url}")
                        else:
                            filename = sanitize_filename(js_url)
                            local_path = f"{asset_folder}/{filename}"
                            zip_file.writestr(local_path, content)
                            tag['src'] = local_path
                            asset_map[js_url] = local_path
                            status_msgs.append(f"Saved JS: {js_url}")
                    else:
                        status_msgs.append(f"❌ Failed JS: {js_url}")

            # --- Images ---
            def handle_img_attr(tag, attr):
                res_url = tag.get(attr)
                if res_url:
                    if attr == "srcset":
                        # Keep only the first candidate of the srcset
                        res_url = res_url.split(",")[0].split()[0]
                    if res_url.startswith('data:'):
                        return
                    full_url = urljoin(base_url, res_url)
                    if full_url in asset_map:
                        tag[attr] = asset_map[full_url]
                        return
                    content, mimetype = download_asset(full_url, session)
                    if content:
                        if len(content) <= INLINE_SIZE_LIMIT:
                            data_uri = make_data_uri(content, mimetype)
                            tag[attr] = data_uri
                            asset_map[full_url] = data_uri
                            status_msgs.append(f"Inlined image: {full_url}")
                        else:
                            filename = sanitize_filename(full_url)
                            local_path = f"{asset_folder}/{filename}"
                            zip_file.writestr(local_path, content)
                            tag[attr] = local_path
                            asset_map[full_url] = local_path
                            status_msgs.append(f"Saved image: {full_url}")
                    else:
                        status_msgs.append(f"❌ Failed image: {full_url}")

            for tag in soup.find_all(["img", "source"]):
                handle_img_attr(tag, "src")
                handle_img_attr(tag, "srcset")

            # --- Favicon & icons ---
            for tag in soup.find_all("link", rel=lambda val: val and ("icon" in val or "shortcut" in val)):
                href = tag.get("href")
                if href:
                    icon_url = urljoin(base_url, href)
                    if icon_url in asset_map:
                        tag['href'] = asset_map[icon_url]
                        continue
                    content, mimetype = download_asset(icon_url, session)
                    if content:
                        if len(content) <= INLINE_SIZE_LIMIT:
                            data_uri = make_data_uri(content, mimetype)
                            tag['href'] = data_uri
                            asset_map[icon_url] = data_uri
                            status_msgs.append(f"Inlined icon: {icon_url}")
                        else:
                            filename = sanitize_filename(icon_url)
                            local_path = f"{asset_folder}/{filename}"
                            zip_file.writestr(local_path, content)
                            tag['href'] = local_path
                            asset_map[icon_url] = local_path
                            status_msgs.append(f"Saved icon: {icon_url}")
                    else:
                        status_msgs.append(f"❌ Failed icon: {icon_url}")

            # --- Inline style attributes (background images) ---
            for tag in soup.find_all(style=True):
                style = tag['style']
                bg_urls = re.findall(r'url\((["\']?)(.*?)\1\)', style)
                for _, bg_url in bg_urls:
                    if bg_url.startswith('data:'):
                        continue
                    full_url = urljoin(base_url, bg_url)
                    if full_url in asset_map:
                        tag['style'] = tag['style'].replace(bg_url, asset_map[full_url])
                        continue
                    content, mimetype = download_asset(full_url, session)
                    if content:
                        if len(content) <= INLINE_SIZE_LIMIT:
                            data_uri = make_data_uri(content, mimetype)
                            tag['style'] = tag['style'].replace(bg_url, data_uri)
                            asset_map[full_url] = data_uri
                            status_msgs.append(f"Inlined BG: {full_url}")
                        else:
                            filename = sanitize_filename(full_url)
                            local_path = f"{asset_folder}/{filename}"
                            zip_file.writestr(local_path, content)
                            tag['style'] = tag['style'].replace(bg_url, local_path)
                            asset_map[full_url] = local_path
                            status_msgs.append(f"Saved BG: {full_url}")
                    else:
                        status_msgs.append(f"❌ Failed BG: {full_url}")

            # --- Video/audio assets ---
            for tag in soup.find_all(["video", "audio", "source"]):
                handle_img_attr(tag, "src")

            # Rewrite internal links to the locally saved pages
            fix_links(soup, base_url, url_to_local, domain)

            # Save the modified HTML
            new_html = str(soup)
            zip_file.writestr(url_to_local[page_url], new_html)

    zip_buffer.seek(0)
    report = f"✅ Crawled {len(html_pages)} page(s).\n---\n" + "\n".join(status_msgs[-50:])  # last 50 log lines
    return report, zip_buffer


with gr.Blocks() as demo:
    gr.Markdown(
        """
# 🌐 Full Website Cloner (Multi-Page)
- Recursively clones all internal pages (up to the depth and page limits).
- Downloads HTML, CSS, JS, images, favicons, and more.
- Inlines small assets and rewrites all links to local paths.
- Outputs a ready-to-run zip.

For learning/demo purposes. Works best on simple/static sites.
"""
    )
    url_input = gr.Textbox(label="Website URL", placeholder="https://example.com")
    depth_input = gr.Slider(label="Crawl Depth", minimum=1, maximum=4, value=2, step=1)
    page_input = gr.Slider(label="Max Pages", minimum=1, maximum=30, value=10, step=1)
    clone_btn = gr.Button("Clone & Download ZIP")
    output_file = gr.File(label="Download ZIP")
    status = gr.Textbox(label="Status/Info", interactive=False, lines=18)

    def wrapper(url, depth, maxpages):
        msg, zip_buffer = clone_site_multipage(url, int(depth), int(maxpages))
        # gr.File expects a file path, so persist the in-memory zip to a temporary file
        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".zip")
        tmp.write(zip_buffer.getvalue())
        tmp.close()
        return tmp.name, msg

    clone_btn.click(wrapper, [url_input, depth_input, page_input], [output_file, status])

if __name__ == "__main__":
    demo.launch()
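
# Example of using the cloner without the Gradio UI (a sketch, kept commented out so it does not
# run on import; "https://example.com", the depth/page limits, and "clone.zip" are placeholders,
# not values used by the app above):
#
#     report, zip_buffer = clone_site_multipage("https://example.com", max_depth=1, max_pages=5)
#     with open("clone.zip", "wb") as f:
#         f.write(zip_buffer.getvalue())
#     print(report)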