import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, urldefrag
import os
import zipfile
import io
import mimetypes
import re
import base64
import tempfile
from collections import deque

INLINE_SIZE_LIMIT = 25 * 1024  # assets at or below 25 KB are inlined (data URIs / <style> / <script>)


def sanitize_filename(url, page=False):
    """Map a URL to a safe local filename; HTML pages get a flattened .html name, other assets keep or gain an extension."""
    parsed = urlparse(url)
    path = parsed.path if parsed.path and parsed.path != "/" else "/index"
    if page:
        # HTML pages: flatten the URL path into a single .html file at the zip root
        if path.endswith("/"):
            path += "index"
        filename = path.lstrip("/") + ".html"
    else:
        filename = os.path.basename(path)
        if not filename:
            filename = "file"
        ext = os.path.splitext(filename)[1]
        if not ext:
            filename += ".res"
    filename = re.sub(r'[^a-zA-Z0-9_.-]', '_', filename)
    return filename


def make_data_uri(content, mimetype):
    b64 = base64.b64encode(content).decode('utf-8')
    return f'data:{mimetype};base64,{b64}'


def download_asset(url, session):
    """Fetch an asset and return (bytes, mimetype); (None, None) on any failure."""
    try:
        resp = session.get(url, timeout=20, stream=True)
        resp.raise_for_status()
        content = resp.content
        mimetype = resp.headers.get("Content-Type") or mimetypes.guess_type(url)[0] or "application/octet-stream"
        return content, mimetype
    except Exception:
        return None, None


def process_css(css_content, base_url, session, asset_folder, zip_file, asset_map, status_msgs):
    """Rewrite url(...) references in a stylesheet: inline small assets, save larger ones into the zip."""
    urls = re.findall(r'url\((["\']?)(.*?)\1\)', css_content)
    for _, asset_url in urls:
        if asset_url.startswith('data:'):
            continue
        real_url = urljoin(base_url, asset_url)
        if real_url in asset_map:
            local_path = asset_map[real_url]
        else:
            content, mimetype = download_asset(real_url, session)
            if content:
                if len(content) <= INLINE_SIZE_LIMIT:
                    data_uri = make_data_uri(content, mimetype)
                    css_content = css_content.replace(asset_url, data_uri)
                    status_msgs.append(f"Inlined CSS asset: {real_url}")
                    asset_map[real_url] = data_uri
                    continue
                else:
                    filename = sanitize_filename(real_url)
                    local_path = f"{asset_folder}/{filename}"
                    zip_file.writestr(local_path, content)
                    asset_map[real_url] = local_path
                    status_msgs.append(f"Saved CSS asset: {real_url}")
            else:
                continue
        css_content = css_content.replace(asset_url, asset_map[real_url])
    return css_content


def fix_links(soup, base_url, url_to_local, domain):
    # Rewrite internal <a> hrefs to the locally saved HTML files
    for tag in soup.find_all('a', href=True):
        href = tag['href']
        parsed = urlparse(urljoin(base_url, href))
        if parsed.scheme.startswith('http') and parsed.netloc == domain:
            canonical_url = urldefrag(parsed.geturl())[0]
            if canonical_url in url_to_local:
                tag['href'] = url_to_local[canonical_url]
    # Form actions, iframe srcs, etc. could be rewritten the same way (not implemented here)
    return soup


def clone_site_multipage(start_url, max_depth=2, max_pages=20):
    status_msgs = []
    session = requests.Session()
    session.headers.update({'User-Agent': 'Mozilla/5.0 (compatible; SiteCloner/3.0)'})
    asset_folder = "assets"
    visited = set()
    url_queue = deque()
    url_queue.append((start_url, 0))
    domain = urlparse(start_url).netloc
    url_to_local = dict()
    asset_map = dict()   # asset URL -> local path, data URI, or inlined text
    html_pages = dict()  # page URL -> (raw html, soup)

    # --- Crawl: breadth-first over internal links, bounded by depth and page count ---
    while url_queue and len(visited) < max_pages:
        url, depth = url_queue.popleft()
        canonical_url = urldefrag(url)[0]
        if canonical_url in visited or depth > max_depth:
            continue
        try:
            resp = session.get(url, timeout=20)
            resp.raise_for_status()
        except Exception as e:
            status_msgs.append(f"❌ Failed to fetch: {url} ({e})")
            continue
        visited.add(canonical_url)
        soup = BeautifulSoup(resp.text, "html.parser")
        local_html_path = sanitize_filename(canonical_url, page=True)
        url_to_local[canonical_url] = local_html_path
        html_pages[canonical_url] = (resp.text, soup)
        status_msgs.append(f"Fetched: {url}")
        # Enqueue newly discovered internal links
        for tag in soup.find_all('a', href=True):
            href = tag['href']
            parsed = urlparse(urljoin(url, href))
            final_url = urldefrag(parsed.geturl())[0]
            if parsed.scheme.startswith('http') and parsed.netloc == domain and final_url not in visited:
                url_queue.append((final_url, depth + 1))

    # --- Build the zip: rewrite each page so its assets are inlined or point into assets/ ---
    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zip_file:
        for page_url, (raw_html, soup) in html_pages.items():
            base_url = page_url

            # --- CSS ---
            for tag in soup.find_all("link", rel="stylesheet"):
                href = tag.get("href")
                if href:
                    css_url = urljoin(base_url, href)
                    if css_url in asset_map:
                        # Already processed for an earlier page: either a saved file path or the inlined CSS text
                        if asset_map[css_url].startswith(f"{asset_folder}/"):
                            tag['href'] = asset_map[css_url]
                        else:
                            style_tag = soup.new_tag("style")
                            style_tag.string = asset_map[css_url]
                            tag.replace_with(style_tag)
                        continue
                    content, mimetype = download_asset(css_url, session)
                    if content:
                        try:
                            css_text = content.decode('utf-8', errors='replace')
                            new_css = process_css(css_text, css_url, session, asset_folder, zip_file, asset_map, status_msgs)
                            if len(new_css.encode('utf-8')) <= INLINE_SIZE_LIMIT:
                                style_tag = soup.new_tag("style")
                                style_tag.string = new_css
                                tag.replace_with(style_tag)
                                asset_map[css_url] = new_css
                                status_msgs.append(f"Inlined CSS: {css_url}")
                            else:
                                filename = sanitize_filename(css_url)
                                local_path = f"{asset_folder}/{filename}"
                                zip_file.writestr(local_path, new_css)
                                tag['href'] = local_path
                                asset_map[css_url] = local_path
                                status_msgs.append(f"Saved CSS: {css_url}")
                        except Exception as e:
                            status_msgs.append(f"❌ CSS error: {css_url} ({e})")
                    else:
                        status_msgs.append(f"❌ Failed CSS: {css_url}")

            # --- JS ---
            for tag in soup.find_all("script", src=True):
                src = tag.get("src")
                if src:
                    js_url = urljoin(base_url, src)
                    if js_url in asset_map:
                        # Reuse the earlier result: a saved file path or the inlined script text
                        if asset_map[js_url].startswith(f"{asset_folder}/"):
                            tag['src'] = asset_map[js_url]
                        else:
                            script_tag = soup.new_tag("script")
                            script_tag.string = asset_map[js_url]
                            tag.replace_with(script_tag)
                        continue
                    content, mimetype = download_asset(js_url, session)
                    if content:
                        if len(content) <= INLINE_SIZE_LIMIT:
                            js_text = content.decode('utf-8', errors='replace')
                            script_tag = soup.new_tag("script")
                            script_tag.string = js_text
                            tag.replace_with(script_tag)
                            asset_map[js_url] = js_text
                            status_msgs.append(f"Inlined JS: {js_url}")
                        else:
                            filename = sanitize_filename(js_url)
                            local_path = f"{asset_folder}/{filename}"
                            zip_file.writestr(local_path, content)
                            tag['src'] = local_path
                            asset_map[js_url] = local_path
                            status_msgs.append(f"Saved JS: {js_url}")
                    else:
                        status_msgs.append(f"❌ Failed JS: {js_url}")

            # --- Images ---
            def handle_img_attr(tag, attr):
                res_url = tag.get(attr)
                if res_url:
                    if attr == "srcset":
                        # Keep only the first candidate of the srcset
                        res_url = res_url.split(",")[0].split()[0]
                    if res_url.startswith('data:'):
                        return
                    full_url = urljoin(base_url, res_url)
                    if full_url in asset_map:
                        tag[attr] = asset_map[full_url]
                        return
                    content, mimetype = download_asset(full_url, session)
                    if content:
                        if len(content) <= INLINE_SIZE_LIMIT:
                            data_uri = make_data_uri(content, mimetype)
                            tag[attr] = data_uri
                            asset_map[full_url] = data_uri
                            status_msgs.append(f"Inlined image: {full_url}")
                        else:
                            filename = sanitize_filename(full_url)
                            local_path = f"{asset_folder}/{filename}"
                            zip_file.writestr(local_path, content)
                            tag[attr] = local_path
                            asset_map[full_url] = local_path
                            status_msgs.append(f"Saved image: {full_url}")
                    else:
                        status_msgs.append(f"❌ Failed image: {full_url}")

            for tag in soup.find_all(["img", "source"]):
                handle_img_attr(tag, "src")
                handle_img_attr(tag, "srcset")

            # --- Favicon & icons ---
            for tag in soup.find_all("link", rel=lambda val: val and ("icon" in val or "shortcut" in val)):
                href = tag.get("href")
                if href:
                    icon_url = urljoin(base_url, href)
                    if icon_url in asset_map:
                        tag['href'] = asset_map[icon_url]
                        continue
                    content, mimetype = download_asset(icon_url, session)
                    if content:
                        if len(content) <= INLINE_SIZE_LIMIT:
                            data_uri = make_data_uri(content, mimetype)
                            tag['href'] = data_uri
                            asset_map[icon_url] = data_uri
                            status_msgs.append(f"Inlined icon: {icon_url}")
                        else:
                            filename = sanitize_filename(icon_url)
                            local_path = f"{asset_folder}/{filename}"
                            zip_file.writestr(local_path, content)
                            tag['href'] = local_path
                            asset_map[icon_url] = local_path
                            status_msgs.append(f"Saved icon: {icon_url}")
                    else:
                        status_msgs.append(f"❌ Failed icon: {icon_url}")

            # --- Inline style attributes (background images) ---
            for tag in soup.find_all(style=True):
                style = tag['style']
                bg_urls = re.findall(r'url\((["\']?)(.*?)\1\)', style)
                for _, bg_url in bg_urls:
                    if bg_url.startswith('data:'):
                        continue
                    full_url = urljoin(base_url, bg_url)
                    if full_url in asset_map:
                        tag['style'] = tag['style'].replace(bg_url, asset_map[full_url])
                        continue
                    content, mimetype = download_asset(full_url, session)
                    if content:
                        if len(content) <= INLINE_SIZE_LIMIT:
                            data_uri = make_data_uri(content, mimetype)
                            tag['style'] = tag['style'].replace(bg_url, data_uri)
                            asset_map[full_url] = data_uri
                            status_msgs.append(f"Inlined BG: {full_url}")
                        else:
                            filename = sanitize_filename(full_url)
                            local_path = f"{asset_folder}/{filename}"
                            zip_file.writestr(local_path, content)
                            tag['style'] = tag['style'].replace(bg_url, local_path)
                            asset_map[full_url] = local_path
                            status_msgs.append(f"Saved BG: {full_url}")
                    else:
                        status_msgs.append(f"❌ Failed BG: {full_url}")

            # --- Video/audio assets ---
            for tag in soup.find_all(["video", "audio", "source"]):
                handle_img_attr(tag, "src")

            # Rewrite internal links to the locally saved pages
            fix_links(soup, base_url, url_to_local, domain)

            # Save the modified HTML
            new_html = str(soup)
            zip_file.writestr(url_to_local[page_url], new_html)

    zip_buffer.seek(0)
    report = f"✅ Crawled {len(html_pages)} page(s).\n---\n" + "\n".join(status_msgs[-50:])  # last 50 log lines
    return report, zip_buffer


with gr.Blocks() as demo:
    gr.Markdown(
        """
# 🌐 Full Website Cloner (Multi-Page)
- Recursively clones all internal pages (up to the depth and page limits).
- Downloads HTML, CSS, JS, images, favicons, and more.
- Inlines small assets and rewrites all links to local paths.
- Outputs a ready-to-run zip.

For learning/demo purposes. Works best on simple/static sites.
"""
    )
    url_input = gr.Textbox(label="Website URL", placeholder="https://example.com")
    depth_input = gr.Slider(label="Crawl Depth", minimum=1, maximum=4, value=2, step=1)
    page_input = gr.Slider(label="Max Pages", minimum=1, maximum=30, value=10, step=1)
    clone_btn = gr.Button("Clone & Download ZIP")
    output_file = gr.File(label="Download ZIP")
    status = gr.Textbox(label="Status/Info", interactive=False, lines=18)

    def wrapper(url, depth, maxpages):
        msg, zip_buffer = clone_site_multipage(url, int(depth), int(maxpages))
        # gr.File expects a file path, so persist the in-memory zip to a temporary file
        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".zip")
        tmp.write(zip_buffer.getvalue())
        tmp.close()
        return tmp.name, msg

    clone_btn.click(wrapper, [url_input, depth_input, page_input], [output_file, status])

if __name__ == "__main__":
    demo.launch()
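
# Example of using the cloner without the Gradio UI (a sketch, kept commented out so it does not
# run on import; "https://example.com", the depth/page limits, and "clone.zip" are placeholders,
# not values used by the app above):
#
#     report, zip_buffer = clone_site_multipage("https://example.com", max_depth=1, max_pages=5)
#     with open("clone.zip", "wb") as f:
#         f.write(zip_buffer.getvalue())
#     print(report)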