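"""Multi-page website cloner: crawls a site, localizes its assets, and packs
everything into a downloadable zip, served through a Gradio UI."""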
import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, urldefrag
import os
import tempfile
import zipfile
import io
import mimetypes
import re
import base64
from collections import deque
INLINE_SIZE_LIMIT = 25 * 1024  # assets at or below 25 KB are inlined instead of saved separately
def sanitize_filename(url, page=False):
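    """Map a URL to a filesystem-safe local filename.

    Pages get an ``.html`` name derived from the URL path; other resources keep
    their basename (with a ``.res`` fallback extension). Path separators are
    flattened to underscores, e.g. ``https://example.com/blog/`` -> ``blog_index.html``.
    """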
    parsed = urlparse(url)
    path = parsed.path if parsed.path and parsed.path != "/" else "/index"
    if page:
        # For HTML pages: derive an .html filename from the URL path
        if path.endswith("/"):
            path += "index"
        filename = path.lstrip("/") + ".html"
    else:
        filename = os.path.basename(path)
        if not filename:
            filename = "file"
        ext = os.path.splitext(filename)[1]
        if not ext:
            filename += ".res"
    filename = re.sub(r'[^a-zA-Z0-9_.-]', '_', filename)
    return filename
def make_data_uri(content, mimetype):
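    """Encode raw bytes as a base64 data: URI with the given MIME type."""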
    b64 = base64.b64encode(content).decode('utf-8')
    return f'data:{mimetype};base64,{b64}'
def download_asset(url, session):
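    """Fetch a single asset; return (bytes, mimetype), or (None, None) on any failure."""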
    try:
        resp = session.get(url, timeout=20, stream=True)
        resp.raise_for_status()
        content = resp.content
        mimetype = resp.headers.get("Content-Type") or mimetypes.guess_type(url)[0] or "application/octet-stream"
        mimetype = mimetype.split(";")[0].strip()  # drop "; charset=..." so data: URIs stay clean
        return content, mimetype
    except Exception:
        return None, None
def process_css(css_content, base_url, session, asset_folder, zip_file, asset_map, status_msgs):
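    """Localize url(...) references inside a stylesheet.

    Small referenced assets are inlined as data: URIs; larger ones are written
    into the zip under ``asset_folder``. Note: rewritten paths are relative to
    the HTML page (where small stylesheets end up inlined), not to a stylesheet
    saved inside ``asset_folder`` itself.
    """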
    urls = re.findall(r'url\((["\']?)(.*?)\1\)', css_content)
    for _, asset_url in urls:
        if asset_url.startswith('data:'):
            continue
        real_url = urljoin(base_url, asset_url)
        if real_url in asset_map:
            local_path = asset_map[real_url]
        else:
            content, mimetype = download_asset(real_url, session)
            if content:
                if len(content) <= INLINE_SIZE_LIMIT:
                    data_uri = make_data_uri(content, mimetype)
                    css_content = css_content.replace(asset_url, data_uri)
                    status_msgs.append(f"Inlined CSS asset: {real_url}")
                    asset_map[real_url] = data_uri
                    continue
                else:
                    filename = sanitize_filename(real_url)
                    local_path = f"{asset_folder}/{filename}"
                    zip_file.writestr(local_path, content)
                    asset_map[real_url] = local_path
                    status_msgs.append(f"Saved CSS asset: {real_url}")
            else:
                continue
        css_content = css_content.replace(asset_url, asset_map[real_url])
    return css_content
def fix_links(soup, base_url, url_to_local, domain):
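    """Rewrite internal <a href> links to point at the locally saved HTML files."""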
    for tag in soup.find_all('a', href=True):
        href = tag['href']
        parsed = urlparse(urljoin(base_url, href))
        if parsed.scheme.startswith('http') and parsed.netloc == domain:
            canonical_url = urldefrag(parsed.geturl())[0]
            if canonical_url in url_to_local:
                tag['href'] = url_to_local[canonical_url]
    # Form actions, iframe src, etc. could be handled similarly (not implemented here)
    return soup
def clone_site_multipage(start_url, max_depth=2, max_pages=20):
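    """Breadth-first crawl of start_url's domain, localizing assets page by page.

    Returns a (status report, in-memory zip buffer) pair. Crawling stops at
    max_depth link hops or max_pages fetched pages, whichever comes first.
    """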
    status_msgs = []
    session = requests.Session()
    session.headers.update({'User-Agent': 'Mozilla/5.0 (compatible; SiteCloner/3.0)'})
    asset_folder = "assets"
    visited = set()
    url_queue = deque()
    url_queue.append((start_url, 0))
    domain = urlparse(start_url).netloc
    url_to_local = dict()  # page URL => local HTML filename
    asset_map = dict()  # asset URL => local path, data URI, or inlined text
    html_pages = dict()  # page URL => (html str, soup)
    # Crawl
    while url_queue and len(visited) < max_pages:
        url, depth = url_queue.popleft()
        canonical_url = urldefrag(url)[0]
        if canonical_url in visited or depth > max_depth:
            continue
        try:
            resp = session.get(url, timeout=20)
            resp.raise_for_status()
        except Exception as e:
            status_msgs.append(f"❌ Failed to fetch: {url} ({e})")
            continue
        visited.add(canonical_url)
        soup = BeautifulSoup(resp.text, "html.parser")
        local_html_path = sanitize_filename(canonical_url, page=True)
        url_to_local[canonical_url] = local_html_path
        html_pages[canonical_url] = (resp.text, soup)
        status_msgs.append(f"Fetched: {url}")
        # Enqueue newly discovered internal links
        for tag in soup.find_all('a', href=True):
            href = tag['href']
            parsed = urlparse(urljoin(url, href))
            final_url = urldefrag(parsed.geturl())[0]
            if parsed.scheme.startswith('http') and parsed.netloc == domain and final_url not in visited:
                url_queue.append((final_url, depth + 1))
    # Prepare zip
    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zip_file:
        for page_url, (raw_html, soup) in html_pages.items():
            base_url = page_url
            # --- CSS ---
            for tag in soup.find_all("link", rel="stylesheet"):
                href = tag.get("href")
                if href:
                    css_url = urljoin(base_url, href)
                    if css_url in asset_map:
                        cached = asset_map[css_url]
                        if cached.startswith(f"{asset_folder}/"):
                            # Already saved: point at the local copy
                            tag['href'] = cached
                        else:
                            # Already inlined: cached value is the processed CSS text
                            style_tag = soup.new_tag("style")
                            style_tag.string = cached
                            tag.replace_with(style_tag)
                        continue
                    content, mimetype = download_asset(css_url, session)
                    if content:
                        try:
                            css_text = content.decode('utf-8', errors='replace')
                            new_css = process_css(css_text, css_url, session, asset_folder, zip_file, asset_map, status_msgs)
                            if len(new_css.encode('utf-8')) <= INLINE_SIZE_LIMIT:
                                style_tag = soup.new_tag("style")
                                style_tag.string = new_css
                                tag.replace_with(style_tag)
                                asset_map[css_url] = new_css
                                status_msgs.append(f"Inlined CSS: {css_url}")
                            else:
                                filename = sanitize_filename(css_url)
                                local_path = f"{asset_folder}/{filename}"
                                zip_file.writestr(local_path, new_css)
                                tag['href'] = local_path
                                asset_map[css_url] = local_path
                                status_msgs.append(f"Saved CSS: {css_url}")
                        except Exception as e:
                            status_msgs.append(f"❌ CSS error: {css_url} ({e})")
                    else:
                        status_msgs.append(f"❌ Failed CSS: {css_url}")
            # --- JS ---
            for tag in soup.find_all("script", src=True):
                src = tag.get("src")
                if src:
                    js_url = urljoin(base_url, src)
                    if js_url in asset_map:
                        cached = asset_map[js_url]
                        if cached.startswith(f"{asset_folder}/"):
                            tag['src'] = cached
                        else:
                            # Already inlined: cached value is the JS source text
                            script_tag = soup.new_tag("script")
                            script_tag.string = cached
                            tag.replace_with(script_tag)
                        continue
                    content, mimetype = download_asset(js_url, session)
                    if content:
                        if len(content) <= INLINE_SIZE_LIMIT:
                            js_text = content.decode('utf-8', errors='replace')
                            script_tag = soup.new_tag("script")
                            script_tag.string = js_text
                            tag.replace_with(script_tag)
                            asset_map[js_url] = js_text
                            status_msgs.append(f"Inlined JS: {js_url}")
                        else:
                            filename = sanitize_filename(js_url)
                            local_path = f"{asset_folder}/{filename}"
                            zip_file.writestr(local_path, content)
                            tag['src'] = local_path
                            asset_map[js_url] = local_path
                            status_msgs.append(f"Saved JS: {js_url}")
                    else:
                        status_msgs.append(f"❌ Failed JS: {js_url}")
            # --- Images ---
            def handle_img_attr(tag, attr):
                # Localize one URL-bearing attribute (src or srcset) on a tag
                res_url = tag.get(attr)
                if res_url:
                    if attr == "srcset":
                        # Simplification: keep only the first srcset candidate
                        res_url = res_url.split(",")[0].split()[0]
                    if res_url.startswith('data:'):
                        return
                    full_url = urljoin(base_url, res_url)
                    if full_url in asset_map:
                        tag[attr] = asset_map[full_url]
                        return
                    content, mimetype = download_asset(full_url, session)
                    if content:
                        if len(content) <= INLINE_SIZE_LIMIT:
                            data_uri = make_data_uri(content, mimetype)
                            tag[attr] = data_uri
                            asset_map[full_url] = data_uri
                            status_msgs.append(f"Inlined image: {full_url}")
                        else:
                            filename = sanitize_filename(full_url)
                            local_path = f"{asset_folder}/{filename}"
                            zip_file.writestr(local_path, content)
                            tag[attr] = local_path
                            asset_map[full_url] = local_path
                            status_msgs.append(f"Saved image: {full_url}")
                    else:
                        status_msgs.append(f"❌ Failed image: {full_url}")
for tag in soup.find_all(["img", "source"]):
handle_img_attr(tag, "src")
handle_img_attr(tag, "srcset")
            # --- Favicon & icons ---
            for tag in soup.find_all("link", rel=lambda val: val and ("icon" in val or "shortcut" in val)):
                href = tag.get("href")
                if href:
                    icon_url = urljoin(base_url, href)
                    if icon_url in asset_map:
                        tag['href'] = asset_map[icon_url]
                        continue
                    content, mimetype = download_asset(icon_url, session)
                    if content:
                        if len(content) <= INLINE_SIZE_LIMIT:
                            data_uri = make_data_uri(content, mimetype)
                            tag['href'] = data_uri
                            asset_map[icon_url] = data_uri
                            status_msgs.append(f"Inlined icon: {icon_url}")
                        else:
                            filename = sanitize_filename(icon_url)
                            local_path = f"{asset_folder}/{filename}"
                            zip_file.writestr(local_path, content)
                            tag['href'] = local_path
                            asset_map[icon_url] = local_path
                            status_msgs.append(f"Saved icon: {icon_url}")
                    else:
                        status_msgs.append(f"❌ Failed icon: {icon_url}")
            # --- Inline style attributes (background images) ---
            for tag in soup.find_all(style=True):
                style = tag['style']
                bg_urls = re.findall(r'url\((["\']?)(.*?)\1\)', style)
                for _, bg_url in bg_urls:
                    if bg_url.startswith('data:'):
                        continue
                    full_url = urljoin(base_url, bg_url)
                    if full_url in asset_map:
                        tag['style'] = tag['style'].replace(bg_url, asset_map[full_url])
                        continue
                    content, mimetype = download_asset(full_url, session)
                    if content:
                        if len(content) <= INLINE_SIZE_LIMIT:
                            data_uri = make_data_uri(content, mimetype)
                            tag['style'] = tag['style'].replace(bg_url, data_uri)
                            asset_map[full_url] = data_uri
                            status_msgs.append(f"Inlined BG: {full_url}")
                        else:
                            filename = sanitize_filename(full_url)
                            local_path = f"{asset_folder}/{filename}"
                            zip_file.writestr(local_path, content)
                            tag['style'] = tag['style'].replace(bg_url, local_path)
                            asset_map[full_url] = local_path
                            status_msgs.append(f"Saved BG: {full_url}")
                    else:
                        status_msgs.append(f"❌ Failed BG: {full_url}")
            # --- Video/audio assets ---
            for tag in soup.find_all(["video", "audio", "source"]):
                handle_img_attr(tag, "src")
            # Rewrite internal links to local files
            fix_links(soup, base_url, url_to_local, domain)
            # Save modified HTML
            new_html = str(soup)
            zip_file.writestr(url_to_local[page_url], new_html)
    zip_buffer.seek(0)
    report = f"✅ Crawled {len(html_pages)} page(s).\n---\n" + "\n".join(status_msgs[-50:])  # last 50 log lines
    return report, zip_buffer
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # 🌐 Full Website Cloner (Multi-Page)
        - Recursively clones all internal pages (up to the depth and page limits).
        - Downloads HTML, CSS, JS, images, favicons, and more.
        - Inlines small assets and rewrites all links to point at the local copies.
        - Outputs a ready-to-run zip. For learning/demo use; works best on simple, static sites.
        """
    )
    url_input = gr.Textbox(label="Website URL", placeholder="https://example.com")
    depth_input = gr.Slider(label="Crawl Depth", minimum=1, maximum=4, value=2, step=1)
    page_input = gr.Slider(label="Max Pages", minimum=1, maximum=30, value=10, step=1)
    clone_btn = gr.Button("Clone & Download ZIP")
    output_file = gr.File(label="Download ZIP")
    status = gr.Textbox(label="Status/Info", interactive=False, lines=18)
    def wrapper(url, depth, maxpages):
        msg, zip_buffer = clone_site_multipage(url, int(depth), int(maxpages))
        # gr.File expects a file path rather than a BytesIO, so spill the zip to a temp file
        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".zip")
        tmp.write(zip_buffer.read())
        tmp.close()
        return tmp.name, msg
    clone_btn.click(wrapper, [url_input, depth_input, page_input], [output_file, status])
if __name__ == "__main__":
    demo.launch()
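    # Note: when running locally you can expose the app on your network with,
    # e.g., demo.launch(server_name="0.0.0.0"); on Hugging Face Spaces the
    # default launch() arguments are picked up automatically.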