import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, urldefrag
import os
import zipfile
import io
import mimetypes
import re
import base64
import tempfile
from collections import deque

INLINE_SIZE_LIMIT = 25 * 1024  # 25 KB
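
# Turn a URL into a safe local filename: HTML pages keep their path and get a
# ".html" suffix; other assets are reduced to a sanitized basename.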
def sanitize_filename(url, page=False):
    parsed = urlparse(url)
    path = parsed.path if parsed.path and parsed.path != "/" else "/index"
    if page:
        # For HTML pages: folder + index.html
        if path.endswith("/"):
            path += "index"
        filename = path.lstrip("/") + ".html"
    else:
        filename = os.path.basename(path)
        if not filename:
            filename = "file"
        ext = os.path.splitext(filename)[1]
        if not ext:
            filename += ".res"
        filename = re.sub(r'[^a-zA-Z0-9_.-]', '_', filename)
    return filename

def make_data_uri(content, mimetype):
    b64 = base64.b64encode(content).decode('utf-8')
    return f'data:{mimetype};base64,{b64}'
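
# Fetch a single asset; returns (content_bytes, mimetype), or (None, None) on any error.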
def download_asset(url, session):
    try:
        resp = session.get(url, timeout=20, stream=True)
        resp.raise_for_status()
        content = resp.content
        mimetype = resp.headers.get("Content-Type") or mimetypes.guess_type(url)[0] or "application/octet-stream"
        return content, mimetype
    except Exception:
        return None, None
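
# Rewrite url(...) references inside a stylesheet: assets up to INLINE_SIZE_LIMIT
# are inlined as data URIs, larger ones are saved into the assets/ folder.
# Note: saved references are written as "assets/<name>", which resolves from pages
# at the zip root but not from stylesheets that are themselves saved under assets/.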
def process_css(css_content, base_url, session, asset_folder, zip_file, asset_map, status_msgs):
    urls = re.findall(r'url\((["\']?)(.*?)\1\)', css_content)
    for _, asset_url in urls:
        if asset_url.startswith('data:'): continue
        real_url = urljoin(base_url, asset_url)
        if real_url in asset_map:
            local_path = asset_map[real_url]
        else:
            content, mimetype = download_asset(real_url, session)
            if content:
                if len(content) <= INLINE_SIZE_LIMIT:
                    data_uri = make_data_uri(content, mimetype)
                    css_content = css_content.replace(asset_url, data_uri)
                    status_msgs.append(f"Inlined CSS asset: {real_url}")
                    asset_map[real_url] = data_uri
                    continue
                else:
                    filename = sanitize_filename(real_url)
                    local_path = f"{asset_folder}/{filename}"
                    zip_file.writestr(local_path, content)
                    asset_map[real_url] = local_path
                    status_msgs.append(f"Saved CSS asset: {real_url}")
            else:
                continue
        css_content = css_content.replace(asset_url, asset_map[real_url])
    return css_content
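
# Point internal <a href> links at the locally saved copies of the crawled pages.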
def fix_links(soup, base_url, url_to_local, domain):
    # Fix all "a" hrefs to local HTML files if internal
    for tag in soup.find_all('a', href=True):
        href = tag['href']
        parsed = urlparse(urljoin(base_url, href))
        if parsed.scheme.startswith('http') and parsed.netloc == domain:
            canonical_url = urldefrag(parsed.geturl())[0]
            if canonical_url in url_to_local:
                tag['href'] = url_to_local[canonical_url]
    # Also fix form action, iframe src, etc. if desired (not implemented here)
    return soup
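
# Main entry point: phase 1 crawls internal pages breadth-first up to the given
# depth/page limits; phase 2 downloads or inlines each page's assets, rewrites
# links, and writes everything into an in-memory zip archive.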
def clone_site_multipage(start_url, max_depth=2, max_pages=20):
    status_msgs = []
    session = requests.Session()
    session.headers.update({'User-Agent': 'Mozilla/5.0 (compatible; SiteCloner/3.0)'})
    asset_folder = "assets"
    visited = set()
    url_queue = deque()
    url_queue.append((start_url, 0))
    domain = urlparse(start_url).netloc
    url_to_local = dict()
    asset_map = dict()   # Map asset URLs to local paths or data URIs
    html_pages = dict()  # url => (html str, soup)

    # Crawl
    while url_queue and len(visited) < max_pages:
        url, depth = url_queue.popleft()
        canonical_url = urldefrag(url)[0]
        if canonical_url in visited or depth > max_depth:
            continue
        try:
            resp = session.get(url, timeout=20)
            resp.raise_for_status()
        except Exception as e:
            status_msgs.append(f"❌ Failed to fetch: {url} ({e})")
            continue
        visited.add(canonical_url)
        soup = BeautifulSoup(resp.text, "html.parser")
        local_html_path = sanitize_filename(canonical_url, page=True)
        url_to_local[canonical_url] = local_html_path
        html_pages[canonical_url] = (resp.text, soup)
        status_msgs.append(f"Fetched: {url}")
        # Enqueue new internal links
        for tag in soup.find_all('a', href=True):
            href = tag['href']
            parsed = urlparse(urljoin(url, href))
            final_url = urldefrag(parsed.geturl())[0]
            if parsed.scheme.startswith('http') and parsed.netloc == domain and final_url not in visited:
                url_queue.append((final_url, depth + 1))

    # Prepare zip
    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zip_file:
        for page_url, (raw_html, soup) in html_pages.items():
            base_url = page_url
            # --- CSS ---
            for tag in soup.find_all("link", rel="stylesheet"):
                href = tag.get("href")
                if href:
                    css_url = urljoin(base_url, href)
                    if css_url in asset_map:
                        cached = asset_map[css_url]
                        if cached.startswith(f"{asset_folder}/"):
                            cached_path = cached
                            tag['href'] = cached_path
                        else:
                            # Cached value is the processed CSS text itself; inline it.
                            style_tag = soup.new_tag("style")
                            style_tag.string = cached
                            tag.replace_with(style_tag)
                        continue
                    content, mimetype = download_asset(css_url, session)
                    if content:
                        try:
                            css_text = content.decode('utf-8', errors='replace')
                            new_css = process_css(css_text, css_url, session, asset_folder, zip_file, asset_map, status_msgs)
                            if len(new_css.encode('utf-8')) <= INLINE_SIZE_LIMIT:
                                style_tag = soup.new_tag("style")
                                style_tag.string = new_css
                                tag.replace_with(style_tag)
                                asset_map[css_url] = new_css
                                status_msgs.append(f"Inlined CSS: {css_url}")
                            else:
                                filename = sanitize_filename(css_url)
                                local_path = f"{asset_folder}/{filename}"
                                zip_file.writestr(local_path, new_css)
                                tag['href'] = local_path
                                asset_map[css_url] = local_path
                                status_msgs.append(f"Saved CSS: {css_url}")
                        except Exception as e:
                            status_msgs.append(f"❌ CSS error: {css_url} ({e})")
                    else:
                        status_msgs.append(f"❌ Failed CSS: {css_url}")
            # --- JS ---
            for tag in soup.find_all("script", src=True):
                src = tag.get("src")
                if src:
                    js_url = urljoin(base_url, src)
                    if js_url in asset_map:
                        cached = asset_map[js_url]
                        if cached.startswith(f"{asset_folder}/"):
                            tag['src'] = cached
                        else:
                            # Cached value is the script source itself; inline it.
                            script_tag = soup.new_tag("script")
                            script_tag.string = cached
                            tag.replace_with(script_tag)
                        continue
                    content, mimetype = download_asset(js_url, session)
                    if content:
                        if len(content) <= INLINE_SIZE_LIMIT:
                            js_text = content.decode('utf-8', errors='replace')
                            script_tag = soup.new_tag("script")
                            script_tag.string = js_text
                            tag.replace_with(script_tag)
                            asset_map[js_url] = js_text
                            status_msgs.append(f"Inlined JS: {js_url}")
                        else:
                            filename = sanitize_filename(js_url)
                            local_path = f"{asset_folder}/{filename}"
                            zip_file.writestr(local_path, content)
                            tag['src'] = local_path
                            asset_map[js_url] = local_path
                            status_msgs.append(f"Saved JS: {js_url}")
                    else:
                        status_msgs.append(f"❌ Failed JS: {js_url}")
            # --- Images ---
            def handle_img_attr(tag, attr):
                res_url = tag.get(attr)
                if res_url:
                    if attr == "srcset":
                        res_url = res_url.split(",")[0].split()[0]
                    if res_url.startswith('data:'): return
                    full_url = urljoin(base_url, res_url)
                    if full_url in asset_map:
                        tag[attr] = asset_map[full_url]
                        return
                    content, mimetype = download_asset(full_url, session)
                    if content:
                        if len(content) <= INLINE_SIZE_LIMIT:
                            data_uri = make_data_uri(content, mimetype)
                            tag[attr] = data_uri
                            asset_map[full_url] = data_uri
                            status_msgs.append(f"Inlined image: {full_url}")
                        else:
                            filename = sanitize_filename(full_url)
                            local_path = f"{asset_folder}/{filename}"
                            zip_file.writestr(local_path, content)
                            tag[attr] = local_path
                            asset_map[full_url] = local_path
                            status_msgs.append(f"Saved image: {full_url}")
                    else:
                        status_msgs.append(f"❌ Failed image: {full_url}")
            for tag in soup.find_all(["img", "source"]):
                handle_img_attr(tag, "src")
                handle_img_attr(tag, "srcset")
            # --- Favicon & icons ---
            for tag in soup.find_all("link", rel=lambda val: val and ("icon" in val or "shortcut" in val)):
                href = tag.get("href")
                if href:
                    icon_url = urljoin(base_url, href)
                    if icon_url in asset_map:
                        tag['href'] = asset_map[icon_url]
                        continue
                    content, mimetype = download_asset(icon_url, session)
                    if content:
                        if len(content) <= INLINE_SIZE_LIMIT:
                            data_uri = make_data_uri(content, mimetype)
                            tag['href'] = data_uri
                            asset_map[icon_url] = data_uri
                            status_msgs.append(f"Inlined icon: {icon_url}")
                        else:
                            filename = sanitize_filename(icon_url)
                            local_path = f"{asset_folder}/{filename}"
                            zip_file.writestr(local_path, content)
                            tag['href'] = local_path
                            asset_map[icon_url] = local_path
                            status_msgs.append(f"Saved icon: {icon_url}")
                    else:
                        status_msgs.append(f"❌ Failed icon: {icon_url}")
            # --- Inline style attributes (background images) ---
            for tag in soup.find_all(style=True):
                style = tag['style']
                bg_urls = re.findall(r'url\((["\']?)(.*?)\1\)', style)
                for _, bg_url in bg_urls:
                    if bg_url.startswith('data:'): continue
                    full_url = urljoin(base_url, bg_url)
                    if full_url in asset_map:
                        tag['style'] = tag['style'].replace(bg_url, asset_map[full_url])
                        continue
                    content, mimetype = download_asset(full_url, session)
                    if content:
                        if len(content) <= INLINE_SIZE_LIMIT:
                            data_uri = make_data_uri(content, mimetype)
                            tag['style'] = tag['style'].replace(bg_url, data_uri)
                            asset_map[full_url] = data_uri
                            status_msgs.append(f"Inlined BG: {full_url}")
                        else:
                            filename = sanitize_filename(full_url)
                            local_path = f"{asset_folder}/{filename}"
                            zip_file.writestr(local_path, content)
                            tag['style'] = tag['style'].replace(bg_url, local_path)
                            asset_map[full_url] = local_path
                            status_msgs.append(f"Saved BG: {full_url}")
                    else:
                        status_msgs.append(f"❌ Failed BG: {full_url}")
            # --- Video/audio assets ---
            for tag in soup.find_all(["video", "audio", "source"]):
                handle_img_attr(tag, "src")
            # Rewrite internal links to local files
            fix_links(soup, base_url, url_to_local, domain)
            # Save modified HTML
            new_html = str(soup)
            zip_file.writestr(url_to_local[page_url], new_html)
    zip_buffer.seek(0)
    report = f"✅ Crawled {len(html_pages)} page(s).\n---\n" + "\n".join(status_msgs[-50:])  # last 50 logs
    return report, zip_buffer
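
# Gradio front end: URL plus crawl-depth/page-count limits in, zip file + log out.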
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # 🌐 Full Website Cloner (Multi-Page)
        - Recursively clones all internal pages (up to the depth and page limits).
        - Downloads HTML, CSS, JS, images, favicons, and more.
        - Inlines small assets and rewrites internal links to the local copies.
        - Outputs a ready-to-run zip. For learning/demo purposes; best for simple/static sites.
        """
    )
    url_input = gr.Textbox(label="Website URL", placeholder="https://example.com")
    depth_input = gr.Slider(label="Crawl Depth", minimum=1, maximum=4, value=2, step=1)
    page_input = gr.Slider(label="Max Pages", minimum=1, maximum=30, value=10, step=1)
    clone_btn = gr.Button("Clone & Download ZIP")
    output_file = gr.File(label="Download ZIP")
    status = gr.Textbox(label="Status/Info", interactive=False, lines=18)

    def wrapper(url, depth, maxpages):
        msg, zip_obj = clone_site_multipage(url, int(depth), int(maxpages))
        # gr.File expects a file path, so persist the in-memory zip to a temp file.
        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".zip")
        tmp.write(zip_obj.getvalue())
        tmp.close()
        return tmp.name, msg

    clone_btn.click(wrapper, [url_input, depth_input, page_input], [output_file, status])

if __name__ == "__main__":
    demo.launch()