import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, urldefrag
import os
import zipfile
import io
import mimetypes
import re
import base64
import tempfile
from collections import deque

INLINE_SIZE_LIMIT = 25 * 1024  # 25 KB
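
# Turn a URL into a safe local filename: HTML pages keep their path and get a
# ".html" suffix; other assets are reduced to a sanitized basename.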
def sanitize_filename(url, page=False):
    parsed = urlparse(url)
    path = parsed.path if parsed.path and parsed.path != "/" else "/index"
    if page:
        # For HTML pages: folder + index.html
        if path.endswith("/"):
            path += "index"
        filename = path.lstrip("/") + ".html"
    else:
        filename = os.path.basename(path)
        if not filename:
            filename = "file"
        ext = os.path.splitext(filename)[1]
        if not ext:
            filename += ".res"
        filename = re.sub(r'[^a-zA-Z0-9_.-]', '_', filename)
    return filename

def make_data_uri(content, mimetype):
    b64 = base64.b64encode(content).decode('utf-8')
    return f'data:{mimetype};base64,{b64}'
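
# Fetch a single asset; returns (content_bytes, mimetype), or (None, None) on any error.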
def download_asset(url, session):
    try:
        resp = session.get(url, timeout=20, stream=True)
        resp.raise_for_status()
        content = resp.content
        mimetype = resp.headers.get("Content-Type") or mimetypes.guess_type(url)[0] or "application/octet-stream"
        return content, mimetype
    except Exception:
        return None, None
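
# Rewrite url(...) references inside a stylesheet: assets up to INLINE_SIZE_LIMIT
# are inlined as data URIs, larger ones are saved into the assets/ folder.
# Note: saved references are written as "assets/<name>", which resolves from pages
# at the zip root but not from stylesheets that are themselves saved under assets/.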
def process_css(css_content, base_url, session, asset_folder, zip_file, asset_map, status_msgs):
    urls = re.findall(r'url\((["\']?)(.*?)\1\)', css_content)
    for _, asset_url in urls:
        if asset_url.startswith('data:'): continue
        real_url = urljoin(base_url, asset_url)
        if real_url in asset_map:
            local_path = asset_map[real_url]
        else:
            content, mimetype = download_asset(real_url, session)
            if content:
                if len(content) <= INLINE_SIZE_LIMIT:
                    data_uri = make_data_uri(content, mimetype)
                    css_content = css_content.replace(asset_url, data_uri)
                    status_msgs.append(f"Inlined CSS asset: {real_url}")
                    asset_map[real_url] = data_uri
                    continue
                else:
                    filename = sanitize_filename(real_url)
                    local_path = f"{asset_folder}/{filename}"
                    zip_file.writestr(local_path, content)
                    asset_map[real_url] = local_path
                    status_msgs.append(f"Saved CSS asset: {real_url}")
            else:
                continue
        css_content = css_content.replace(asset_url, asset_map[real_url])
    return css_content
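
# Point internal <a href> links at the locally saved copies of the crawled pages.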
def fix_links(soup, base_url, url_to_local, domain):
    # Fix all "a" hrefs to local HTML files if internal
    for tag in soup.find_all('a', href=True):
        href = tag['href']
        parsed = urlparse(urljoin(base_url, href))
        if parsed.scheme.startswith('http') and parsed.netloc == domain:
            canonical_url = urldefrag(parsed.geturl())[0]
            if canonical_url in url_to_local:
                tag['href'] = url_to_local[canonical_url]
    # Also fix form action, iframe src, etc. if desired (not implemented here)
    return soup
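
# Main entry point: phase 1 crawls internal pages breadth-first up to the given
# depth/page limits; phase 2 downloads or inlines each page's assets, rewrites
# links, and writes everything into an in-memory zip archive.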
def clone_site_multipage(start_url, max_depth=2, max_pages=20):
    status_msgs = []
    session = requests.Session()
    session.headers.update({'User-Agent': 'Mozilla/5.0 (compatible; SiteCloner/3.0)'})
    asset_folder = "assets"
    visited = set()
    url_queue = deque()
    url_queue.append((start_url, 0))
    domain = urlparse(start_url).netloc
    url_to_local = dict()
    asset_map = dict()   # Map asset URLs to local paths or data URIs
    html_pages = dict()  # url => (html str, soup)

    # Crawl
    while url_queue and len(visited) < max_pages:
        url, depth = url_queue.popleft()
        canonical_url = urldefrag(url)[0]
        if canonical_url in visited or depth > max_depth:
            continue
        try:
            resp = session.get(url, timeout=20)
            resp.raise_for_status()
        except Exception as e:
            status_msgs.append(f"❌ Failed to fetch: {url} ({e})")
            continue
        visited.add(canonical_url)
        soup = BeautifulSoup(resp.text, "html.parser")
        local_html_path = sanitize_filename(canonical_url, page=True)
        url_to_local[canonical_url] = local_html_path
        html_pages[canonical_url] = (resp.text, soup)
        status_msgs.append(f"Fetched: {url}")
        # Enqueue new internal links
        for tag in soup.find_all('a', href=True):
            href = tag['href']
            parsed = urlparse(urljoin(url, href))
            final_url = urldefrag(parsed.geturl())[0]
            if parsed.scheme.startswith('http') and parsed.netloc == domain and final_url not in visited:
                url_queue.append((final_url, depth + 1))

    # Prepare zip
    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zip_file:
        for page_url, (raw_html, soup) in html_pages.items():
            base_url = page_url
            # --- CSS ---
            for tag in soup.find_all("link", rel="stylesheet"):
                href = tag.get("href")
                if href:
                    css_url = urljoin(base_url, href)
                    if css_url in asset_map:
                        cached = asset_map[css_url]
                        if cached.startswith(f"{asset_folder}/"):
                            cached_path = cached
                            tag['href'] = cached_path
                        else:
                            # Cached value is the processed CSS text itself; inline it.
                            style_tag = soup.new_tag("style")
                            style_tag.string = cached
                            tag.replace_with(style_tag)
                        continue
                    content, mimetype = download_asset(css_url, session)
                    if content:
                        try:
                            css_text = content.decode('utf-8', errors='replace')
                            new_css = process_css(css_text, css_url, session, asset_folder, zip_file, asset_map, status_msgs)
                            if len(new_css.encode('utf-8')) <= INLINE_SIZE_LIMIT:
                                style_tag = soup.new_tag("style")
                                style_tag.string = new_css
                                tag.replace_with(style_tag)
                                asset_map[css_url] = new_css
                                status_msgs.append(f"Inlined CSS: {css_url}")
                            else:
                                filename = sanitize_filename(css_url)
                                local_path = f"{asset_folder}/{filename}"
                                zip_file.writestr(local_path, new_css)
                                tag['href'] = local_path
                                asset_map[css_url] = local_path
                                status_msgs.append(f"Saved CSS: {css_url}")
                        except Exception as e:
                            status_msgs.append(f"❌ CSS error: {css_url} ({e})")
                    else:
                        status_msgs.append(f"❌ Failed CSS: {css_url}")
            # --- JS ---
            for tag in soup.find_all("script", src=True):
                src = tag.get("src")
                if src:
                    js_url = urljoin(base_url, src)
                    if js_url in asset_map:
                        cached = asset_map[js_url]
                        if cached.startswith(f"{asset_folder}/"):
                            tag['src'] = cached
                        else:
                            # Cached value is the script source itself; inline it.
                            script_tag = soup.new_tag("script")
                            script_tag.string = cached
                            tag.replace_with(script_tag)
                        continue
                    content, mimetype = download_asset(js_url, session)
                    if content:
                        if len(content) <= INLINE_SIZE_LIMIT:
                            js_text = content.decode('utf-8', errors='replace')
                            script_tag = soup.new_tag("script")
                            script_tag.string = js_text
                            tag.replace_with(script_tag)
                            asset_map[js_url] = js_text
                            status_msgs.append(f"Inlined JS: {js_url}")
                        else:
                            filename = sanitize_filename(js_url)
                            local_path = f"{asset_folder}/{filename}"
                            zip_file.writestr(local_path, content)
                            tag['src'] = local_path
                            asset_map[js_url] = local_path
                            status_msgs.append(f"Saved JS: {js_url}")
                    else:
                        status_msgs.append(f"❌ Failed JS: {js_url}")
            # --- Images ---
            def handle_img_attr(tag, attr):
                res_url = tag.get(attr)
                if res_url:
                    if attr == "srcset":
                        res_url = res_url.split(",")[0].split()[0]
                    if res_url.startswith('data:'): return
                    full_url = urljoin(base_url, res_url)
                    if full_url in asset_map:
                        tag[attr] = asset_map[full_url]
                        return
                    content, mimetype = download_asset(full_url, session)
                    if content:
                        if len(content) <= INLINE_SIZE_LIMIT:
                            data_uri = make_data_uri(content, mimetype)
                            tag[attr] = data_uri
                            asset_map[full_url] = data_uri
                            status_msgs.append(f"Inlined image: {full_url}")
                        else:
                            filename = sanitize_filename(full_url)
                            local_path = f"{asset_folder}/{filename}"
                            zip_file.writestr(local_path, content)
                            tag[attr] = local_path
                            asset_map[full_url] = local_path
                            status_msgs.append(f"Saved image: {full_url}")
                    else:
                        status_msgs.append(f"❌ Failed image: {full_url}")
            for tag in soup.find_all(["img", "source"]):
                handle_img_attr(tag, "src")
                handle_img_attr(tag, "srcset")
            # --- Favicon & icons ---
            for tag in soup.find_all("link", rel=lambda val: val and ("icon" in val or "shortcut" in val)):
                href = tag.get("href")
                if href:
                    icon_url = urljoin(base_url, href)
                    if icon_url in asset_map:
                        tag['href'] = asset_map[icon_url]
                        continue
                    content, mimetype = download_asset(icon_url, session)
                    if content:
                        if len(content) <= INLINE_SIZE_LIMIT:
                            data_uri = make_data_uri(content, mimetype)
                            tag['href'] = data_uri
                            asset_map[icon_url] = data_uri
                            status_msgs.append(f"Inlined icon: {icon_url}")
                        else:
                            filename = sanitize_filename(icon_url)
                            local_path = f"{asset_folder}/{filename}"
                            zip_file.writestr(local_path, content)
                            tag['href'] = local_path
                            asset_map[icon_url] = local_path
                            status_msgs.append(f"Saved icon: {icon_url}")
                    else:
                        status_msgs.append(f"❌ Failed icon: {icon_url}")
            # --- Inline style attributes (background images) ---
            for tag in soup.find_all(style=True):
                style = tag['style']
                bg_urls = re.findall(r'url\((["\']?)(.*?)\1\)', style)
                for _, bg_url in bg_urls:
                    if bg_url.startswith('data:'): continue
                    full_url = urljoin(base_url, bg_url)
                    if full_url in asset_map:
                        tag['style'] = tag['style'].replace(bg_url, asset_map[full_url])
                        continue
                    content, mimetype = download_asset(full_url, session)
                    if content:
                        if len(content) <= INLINE_SIZE_LIMIT:
                            data_uri = make_data_uri(content, mimetype)
                            tag['style'] = tag['style'].replace(bg_url, data_uri)
                            asset_map[full_url] = data_uri
                            status_msgs.append(f"Inlined BG: {full_url}")
                        else:
                            filename = sanitize_filename(full_url)
                            local_path = f"{asset_folder}/{filename}"
                            zip_file.writestr(local_path, content)
                            tag['style'] = tag['style'].replace(bg_url, local_path)
                            asset_map[full_url] = local_path
                            status_msgs.append(f"Saved BG: {full_url}")
                    else:
                        status_msgs.append(f"❌ Failed BG: {full_url}")
            # --- Video/audio assets ---
            for tag in soup.find_all(["video", "audio", "source"]):
                handle_img_attr(tag, "src")
            # Rewrite internal links to local files
            fix_links(soup, base_url, url_to_local, domain)
            # Save modified HTML
            new_html = str(soup)
            zip_file.writestr(url_to_local[page_url], new_html)
    zip_buffer.seek(0)
    report = f"✅ Crawled {len(html_pages)} page(s).\n---\n" + "\n".join(status_msgs[-50:])  # last 50 logs
    return report, zip_buffer
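
# Gradio front end: URL plus crawl-depth/page-count limits in, zip file + log out.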
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # 🌐 Full Website Cloner (Multi-Page)
        - Recursively clones all internal pages (up to the depth and page limits).
        - Downloads HTML, CSS, JS, images, favicons, and more.
        - Inlines small assets and rewrites internal links to the local copies.
        - Outputs a ready-to-run zip. For learning/demo purposes; best for simple/static sites.
        """
    )
    url_input = gr.Textbox(label="Website URL", placeholder="https://example.com")
    depth_input = gr.Slider(label="Crawl Depth", minimum=1, maximum=4, value=2, step=1)
    page_input = gr.Slider(label="Max Pages", minimum=1, maximum=30, value=10, step=1)
    clone_btn = gr.Button("Clone & Download ZIP")
    output_file = gr.File(label="Download ZIP")
    status = gr.Textbox(label="Status/Info", interactive=False, lines=18)

    def wrapper(url, depth, maxpages):
        msg, zip_obj = clone_site_multipage(url, int(depth), int(maxpages))
        # gr.File expects a file path, so persist the in-memory zip to a temp file.
        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".zip")
        tmp.write(zip_obj.getvalue())
        tmp.close()
        return tmp.name, msg

    clone_btn.click(wrapper, [url_input, depth_input, page_input], [output_file, status])

if __name__ == "__main__":
    demo.launch()