<!-- Spaces: jeffgui / jeff-deepsite · Running · File size: 14,835 Bytes -->

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Google AI API Documentation Scraper</title>
    <!-- Third-party assets loaded from CDNs: Tailwind supplies the utility classes
         used throughout the markup; Font Awesome 6.4.0 supplies the fa-* icons. -->
    <script src="https://cdn.tailwindcss.com"></script>
    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css">
    <style>
        /* Terminal-style log panel: monospace text on a dark background.
           pre-wrap keeps the newlines and spaces contained in logged messages. */
        .console-output {
            font-family: 'Courier New', monospace;
            white-space: pre-wrap;
            background-color: #1e1e1e;
        }
        /* Caps the element at 60% of the viewport height and scrolls vertically
           once the content grows beyond that. */
        .scrollable-content {
            max-height: 60vh;
            overflow-y: auto;
        }
    </style>
</head>
<body class="bg-gray-900 text-gray-100 min-h-screen">
    <div class="container mx-auto px-4 py-8">
        <!-- Page header: brand/title on the left, Run and Clear actions on the right. -->
        <header class="flex justify-between items-center mb-6">
            <div class="flex items-center space-x-2">
                <!-- Icons in this header are decorative, so they are hidden from assistive technology. -->
                <i class="fab fa-google text-3xl text-blue-400" aria-hidden="true"></i>
                <h1 class="text-2xl font-bold bg-gradient-to-r from-blue-400 to-green-400 bg-clip-text text-transparent">Google AI API Scraper</h1>
            </div>
            <div class="flex space-x-4">
                <!-- type="button" marks these as in-page actions; the default button type is "submit". -->
                <button id="run-btn" type="button" class="px-4 py-2 bg-green-600 hover:bg-green-700 rounded-lg flex items-center space-x-2 transition-all">
                    <i class="fas fa-download" aria-hidden="true"></i>
                    <span>Run Scraper</span>
                </button>
                <button id="clear-btn" type="button" class="px-4 py-2 bg-gray-700 hover:bg-gray-600 rounded-lg flex items-center space-x-2 transition-all">
                    <i class="fas fa-trash-alt" aria-hidden="true"></i>
                    <span>Clear</span>
                </button>
            </div>
        </header>

        <!-- Console panel: title bar, scrolling log output, and the output-directory field. -->
        <div class="bg-gray-800 rounded-xl overflow-hidden shadow-2xl">
            <div class="bg-gray-700 px-4 py-2 flex justify-between items-center">
                <div class="flex items-center space-x-2">
                    <i class="fas fa-terminal text-green-400" aria-hidden="true"></i>
                    <span>Console Output</span>
                </div>
                <div class="text-sm text-gray-400">Python 3.x</div>
            </div>
            <div class="p-4">
                <!-- role="log" exposes appended messages to assistive technology as a polite live region.
                     The child tags are deliberately written without whitespace between them:
                     .console-output uses white-space: pre-wrap, so source indentation placed
                     between the message rows would be rendered as blank lines. -->
                <div id="console-output" class="console-output p-4 rounded scrollable-content" role="log" aria-label="Console output"
                    ><div class="text-green-400">Ready to scrape https://ai.google.dev/api?lang=python</div
                    ><div class="text-gray-400">Click "Run Scraper" to start downloading documentation...</div
                ></div>
            </div>
            <div class="bg-gray-700 px-4 py-2">
                <!-- "for" ties the label to the input so clicking it focuses the field and
                     screen readers announce the field's name. -->
                <label for="output-dir" class="block text-sm mb-2 text-gray-400">Output Directory</label>
                <input type="text" id="output-dir" value="google_ai_docs" class="w-full bg-gray-800 px-3 py-2 rounded outline-none border border-gray-600 focus:border-blue-500">
            </div>
        </div>

        <!-- Feature cards. Headings are h2 so the outline does not skip a level below the
             page's h1; their appearance comes entirely from the utility classes. -->
        <div class="mt-8 grid grid-cols-1 md:grid-cols-3 gap-6">
            <div class="bg-gray-800 p-6 rounded-xl shadow-lg">
                <div class="flex items-center mb-4">
                    <div class="bg-blue-500 p-2 rounded-lg mr-4">
                        <i class="fas fa-sitemap text-white" aria-hidden="true"></i>
                    </div>
                    <h2 class="text-xl font-semibold">Recursive Scraping</h2>
                </div>
                <p class="text-gray-400">Downloads all linked pages within the Google AI API documentation hierarchy.</p>
            </div>
            <div class="bg-gray-800 p-6 rounded-xl shadow-lg">
                <div class="flex items-center mb-4">
                    <div class="bg-purple-500 p-2 rounded-lg mr-4">
                        <i class="fas fa-file-alt text-white" aria-hidden="true"></i>
                    </div>
                    <h2 class="text-xl font-semibold">Local Storage</h2>
                </div>
                <p class="text-gray-400">Saves complete HTML pages with assets to your specified directory structure.</p>
            </div>
            <div class="bg-gray-800 p-6 rounded-xl shadow-lg">
                <div class="flex items-center mb-4">
                    <div class="bg-green-500 p-2 rounded-lg mr-4">
                        <i class="fas fa-code-branch text-white" aria-hidden="true"></i>
                    </div>
                    <h2 class="text-xl font-semibold">Link Preservation</h2>
                </div>
                <p class="text-gray-400">Rewrites links to work locally while maintaining the original document structure.</p>
            </div>
        </div>
    </div>

    <script>
        // Simulated scraper UI. Once the DOM is ready, the Run and Clear buttons are wired
        // to a timer-driven fake download log; no network requests are made.
        document.addEventListener('DOMContentLoaded', function() {
            const consoleOutput = document.getElementById('console-output');
            const runButton = document.getElementById('run-btn');
            const clearButton = document.getElementById('clear-btn');
            const outputDirInput = document.getElementById('output-dir');

            const DEFAULT_OUTPUT_DIR = 'google_ai_docs';
            // Delay between two consecutive simulated page downloads.
            const PAGE_INTERVAL_MS = 800;

            // Handles of all timers scheduled by the current run, kept so that
            // "Clear" can cancel output that has not been printed yet.
            let pendingTimers = [];
            // True while a simulation is in flight; prevents overlapping runs.
            let isRunning = false;

            /**
             * setTimeout wrapper that remembers the timer handle for later cancellation.
             * @param {Function} callback - Work to run after the delay.
             * @param {number} delayMs - Delay in milliseconds.
             */
            function schedule(callback, delayMs) {
                pendingTimers.push(setTimeout(callback, delayMs));
            }

            /** Cancels every timer scheduled through schedule() and forgets the handles. */
            function cancelPendingTimers() {
                pendingTimers.forEach((timerId) => clearTimeout(timerId));
                pendingTimers = [];
            }

            /**
             * Records whether a run is active and reflects it on the Run button.
             * @param {boolean} running - Whether a simulation is currently in progress.
             */
            function setRunning(running) {
                isRunning = running;
                runButton.disabled = running;
                runButton.classList.toggle('opacity-50', running);
                runButton.classList.toggle('cursor-not-allowed', running);
            }

            /**
             * Appends one line to the console panel and scrolls it into view.
             * The message is assigned via textContent, so it is never parsed as HTML.
             * @param {string} message - Text to display.
             * @param {string} [type='info'] - 'info' | 'success' | 'error' | 'warning';
             *     unknown values fall back to the 'info' colour.
             */
            function addToConsole(message, type = 'info') {
                const colors = {
                    'info': 'text-gray-100',
                    'success': 'text-green-400',
                    'error': 'text-red-400',
                    'warning': 'text-yellow-400'
                };
                const div = document.createElement('div');
                div.className = colors[type] || 'text-gray-100';
                div.textContent = message;
                consoleOutput.appendChild(div);
                consoleOutput.scrollTop = consoleOutput.scrollHeight;
            }

            /**
             * Empties the console and stops any simulation in progress, so no
             * stale output appears after the "cleared" message.
             */
            function clearConsole() {
                cancelPendingTimers();
                setRunning(false);
                consoleOutput.textContent = '';
                addToConsole('Console cleared. Ready to start new session.', 'success');
            }

            /**
             * Starts a simulated scraping run unless one is already in progress.
             * An empty output-directory field falls back to the default directory.
             */
            function runScraper() {
                if (isRunning) {
                    return;
                }
                setRunning(true);

                const outputDir = outputDirInput.value.trim() || DEFAULT_OUTPUT_DIR;
                addToConsole('Starting scraper for https://ai.google.dev/api?lang=python ...', 'info');
                addToConsole(`Output directory: ${outputDir}`, 'info');

                // In a real implementation, this would call a Python backend or use Pyodide
                // For this demo, we'll simulate the scraping process
                schedule(() => {
                    simulateScraping(outputDir);
                }, 500);
            }

            /**
             * Prints a fake download log for a fixed list of pages, one page every
             * PAGE_INTERVAL_MS, followed by a summary that reflects the simulated
             * successes and failures.
             * @param {string} outputDir - Directory name shown in the log messages.
             */
            function simulateScraping(outputDir) {
                const pages = [
                    { url: 'https://ai.google.dev/api?lang=python', title: 'Google AI Python API' },
                    { url: 'https://ai.google.dev/api/python', title: 'Python Client Library' },
                    { url: 'https://ai.google.dev/api/python/quickstart', title: 'Quickstart Guide' },
                    { url: 'https://ai.google.dev/api/python/reference', title: 'API Reference' },
                    { url: 'https://ai.google.dev/api/python/authentication', title: 'Authentication' },
                    { url: 'https://ai.google.dev/api/python/examples', title: 'Code Examples' }
                ];

                // Number of pages whose simulated download succeeded.
                let downloadedCount = 0;

                pages.forEach((page, index) => {
                    schedule(() => {
                        const success = Math.random() > 0.1; // 90% success rate for simulation

                        if (success) {
                            downloadedCount += 1;
                            addToConsole(`Downloaded: ${page.url} → ${outputDir}/${page.title.replace(/\s+/g, '_').toLowerCase()}.html`, 'success');

                            // Simulate downloading assets for the first and last page only
                            if (index === 0 || index === pages.length - 1) {
                                schedule(() => {
                                    addToConsole(`Downloaded assets for ${page.title} (styles.css, script.js, images)`, 'info');
                                }, 200);
                            }
                        } else {
                            addToConsole(`Error downloading: ${page.url} (simulated error)`, 'error');
                        }
                    }, index * PAGE_INTERVAL_MS);
                });

                // The summary is scheduled after the last page (and its asset line) has printed.
                schedule(() => {
                    const failedCount = pages.length - downloadedCount;

                    addToConsole('\nScraping completed!', 'success');
                    if (failedCount === 0) {
                        addToConsole(`All documentation saved to ${outputDir}/ directory`, 'success');
                    } else {
                        addToConsole(`Documentation saved to ${outputDir}/ directory (${failedCount} page(s) failed)`, 'warning');
                    }
                    addToConsole('Total pages downloaded: ' + downloadedCount + ' of ' + pages.length, 'info');
                    addToConsole('Note: This was a simulation. Actual implementation would use:', 'warning');
                    addToConsole('- BeautifulSoup for HTML parsing', 'info');
                    addToConsole('- Requests/httpx for HTTP requests', 'info');
                    addToConsole('- OS module for file handling', 'info');

                    // Every timer of this run has fired; allow the next run.
                    pendingTimers = [];
                    setRunning(false);
                }, pages.length * PAGE_INTERVAL_MS + 500);
            }

            runButton.addEventListener('click', runScraper);
            clearButton.addEventListener('click', clearConsole);
        });
    </script>

    <!-- Actual Python script that would do the real work -->
    <script type="text/python" id="python-script">
# This would be the actual Python implementation

import os
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import concurrent.futures

class GoogleAIDocsScraper:
    """Crawl pages on ai.google.dev and mirror them as local HTML files.

    Starting from ``base_url``, every same-host page reachable through ``<a>``
    links is downloaded once, its links are rewritten so the saved copy can be
    browsed offline, and the result is written into ``output_dir``.

    Attributes:
        base_url: URL the crawl starts from.
        output_dir: Directory the HTML files are written to.
        visited_urls: Fragment-free URLs that have been attempted.
        failed_urls: Subset of ``visited_urls`` that could not be saved.
        session: Shared ``requests.Session`` used for all downloads.
    """

    # Host the crawl is restricted to.
    ALLOWED_HOST = 'ai.google.dev'
    # Extensions of resources that are not HTML pages and must not be crawled.
    SKIPPED_EXTENSIONS = ('.pdf', '.zip', '.png', '.jpg', '.jpeg', '.gif')

    def __init__(self, base_url, output_dir="google_ai_docs"):
        """Create the scraper, its HTTP session and the output directory.

        Args:
            base_url: URL the crawl starts from.
            output_dir: Directory to save pages into; created if missing.
        """
        self.base_url = base_url
        self.output_dir = output_dir
        self.visited_urls = set()
        self.failed_urls = set()
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

        # Create output directory if it doesn't exist
        os.makedirs(self.output_dir, exist_ok=True)

    def _normalize_url(self, url):
        """Return ``url`` without its ``#fragment``.

        Fragments address a position inside a page, not a different page, so
        they are dropped before a URL is compared, queued or downloaded.
        """
        return urlparse(url)._replace(fragment='').geturl()

    def is_valid_url(self, url):
        """Check if URL is valid and should be scraped.

        Args:
            url: Absolute URL to test.

        Returns:
            True when the URL is http(s), is hosted on ``ALLOWED_HOST`` and its
            path does not end in one of ``SKIPPED_EXTENSIONS``. Relative URLs
            and pure ``#fragment`` links have no host and are therefore rejected.
        """
        parsed = urlparse(url)
        if parsed.scheme not in ('http', 'https'):
            return False
        if parsed.netloc != self.ALLOWED_HOST:
            return False
        # Compare the real path extension (case-insensitively) instead of
        # searching the whole URL for a substring such as ".zip".
        extension = os.path.splitext(parsed.path)[1].lower()
        return extension not in self.SKIPPED_EXTENSIONS

    def sanitize_filename(self, url):
        """Create a safe filename from URL.

        The path and the query string both contribute to the name, so URLs
        that differ only by query (``/api?lang=python`` vs ``/api?lang=go``)
        do not overwrite each other. Any character other than letters, digits,
        ``-``, ``.`` and ``_`` is replaced by ``_``.

        Args:
            url: Absolute URL of the page.

        Returns:
            A flat filename ending in ``.html``; the site root maps to
            ``index.html``.
        """
        parsed = urlparse(url)
        name = parsed.path.strip('/') or 'index'
        if parsed.query:
            name = f"{name}_{parsed.query}"
        safe_name = ''.join(ch if ch.isalnum() or ch in '-._' else '_' for ch in name)
        return f"{safe_name}.html"

    def save_page(self, url, content):
        """Save page content to file.

        Args:
            url: URL the content was downloaded from (determines the filename).
            content: HTML text to write.

        Returns:
            Path of the written file inside ``output_dir``.
        """
        filename = self.sanitize_filename(url)
        filepath = os.path.join(self.output_dir, filename)

        # Save HTML
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(content)

        return filepath

    def rewrite_links(self, soup, page_url):
        """Rewrite links to work locally.

        ``<a>`` links to pages this scraper downloads are pointed at the local
        file (keeping any ``#fragment``). Every other reference — external
        links and ``img``/``link``/``script`` assets, which are not downloaded —
        is made absolute so it still resolves when the saved file is opened
        from disk. Pure ``#fragment`` links are left untouched.

        Args:
            soup: Parsed document; modified in place.
            page_url: URL of the page, used to resolve relative references.
        """
        for tag, attr in [('a', 'href'), ('img', 'src'), ('link', 'href'), ('script', 'src')]:
            for element in soup.find_all(tag, {attr: True}):
                url = element[attr].strip()

                # Same-page anchors already work in the saved copy.
                if not url or url.startswith('#'):
                    continue

                # Resolve first: validating the raw relative URL would reject
                # every relative link because it carries no host.
                absolute_url = urljoin(page_url, url)

                if tag == 'a' and self.is_valid_url(absolute_url):
                    local_name = self.sanitize_filename(absolute_url)
                    fragment = urlparse(absolute_url).fragment
                    element[attr] = f"{local_name}#{fragment}" if fragment else local_name
                else:
                    element[attr] = absolute_url

    def _extract_links(self, soup, page_url):
        """Return the crawlable, fragment-free URLs linked from a page, in document order."""
        found = []
        seen = set()
        for link in soup.find_all('a', href=True):
            target = self._normalize_url(urljoin(page_url, link['href'].strip()))
            if target in seen or not self.is_valid_url(target):
                continue
            seen.add(target)
            found.append(target)
        return found

    def _fetch_and_save(self, url):
        """Download one page, save its rewritten copy and return its outgoing links."""
        response = self.session.get(url, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Collect links BEFORE rewriting: afterwards the hrefs are local
        # filenames and would resolve to URLs that do not exist on the site.
        links = self._extract_links(soup, url)

        # Rewrite links before saving
        self.rewrite_links(soup, url)

        # Save the page
        saved_path = self.save_page(url, str(soup))
        print(f"Successfully saved: {url} -> {saved_path}")
        return links

    def scrape_page(self, url):
        """Scrape a page and every not-yet-visited page reachable from it.

        The traversal is depth-first in document order, but uses an explicit
        stack instead of recursion so that long link chains cannot exhaust the
        interpreter's recursion limit. A page that fails to download or save is
        reported, recorded in ``failed_urls`` and skipped.

        Args:
            url: URL to start from; does nothing if it was already visited.
        """
        pending = [self._normalize_url(url)]

        while pending:
            current = pending.pop()
            if current in self.visited_urls:
                continue

            self.visited_urls.add(current)

            try:
                links = self._fetch_and_save(current)
            except Exception as e:
                # Best effort: one broken page must not abort the whole crawl.
                self.failed_urls.add(current)
                print(f"Error scraping {current}: {str(e)}")
                continue

            # Reversed so that the first link on the page is popped first.
            pending.extend(
                link for link in reversed(links) if link not in self.visited_urls
            )

    def scrape_all(self):
        """Start recursive scraping from ``base_url`` and print a summary."""
        print(f"Starting recursive scraping from {self.base_url}")
        start_time = time.time()

        self.scrape_page(self.base_url)

        print(f"\nScraping completed in {time.time() - start_time:.2f} seconds")
        # Count only pages that were actually saved, not every attempted URL.
        print(f"Total pages scraped: {len(self.visited_urls) - len(self.failed_urls)}")
        if self.failed_urls:
            print(f"Pages failed: {len(self.failed_urls)}")
        print(f"Output directory: {os.path.abspath(self.output_dir)}")

if __name__ == "__main__":
    # Script entry point: mirror the Python flavour of the API docs into
    # the google_ai_docs directory (relative to the working directory).
    BASE_URL, OUTPUT_DIR = "https://ai.google.dev/api?lang=python", "google_ai_docs"

    scraper = GoogleAIDocsScraper(base_url=BASE_URL, output_dir=OUTPUT_DIR)
    scraper.scrape_all()
    </script>

    <script>
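        // In a real implementation, you could use Pyodide to run the Python code
        // from the #python-script block above. The commented-out function below is
        // only a placeholder showing where that integration would go; it also
        // requires loading pyodide.js, which this page does not currently include.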
        
        /* 
        async function loadPyodideAndRun() {
            let pyodide = await loadPyodide({
                indexURL: "https://cdn.jsdelivr.net/pyodide/v0.23.1/full/"
            });
            
            const pythonCode = document.getElementById('python-script').textContent;
            try {
                await pyodide.loadPackage(['requests', 'beautifulsoup4']);
                const result = await pyodide.runPythonAsync(pythonCode);
                console.log(result);
            } catch (error) {
                console.error("Error running Python:", error);
            }
        }
        
        document.getElementById('run-btn').addEventListener('click', loadPyodideAndRun);
        */
    </script>
<!-- "Made with DeepSite" badge. Links that open a new tab carry rel="noopener noreferrer" so the opened page gets no window.opener handle; the logo declares its rendered size. -->
<p style="border-radius: 8px; text-align: center; font-size: 12px; color: #fff; margin-top: 16px;position: fixed; left: 8px; bottom: 8px; z-index: 10; background: rgba(0, 0, 0, 0.8); padding: 4px 8px;">Made with <img src="https://enzostvs-deepsite.hf.space/logo.svg" alt="DeepSite Logo" width="16" height="16" style="width: 16px; height: 16px; vertical-align: middle;display:inline-block;margin-right:3px;filter:brightness(0) invert(1);"><a href="https://enzostvs-deepsite.hf.space" style="color: #fff;text-decoration: underline;" target="_blank" rel="noopener noreferrer">DeepSite</a> - <a href="https://enzostvs-deepsite.hf.space?remix=jeffgui/jeff-deepsite" style="color: #fff;text-decoration: underline;" target="_blank" rel="noopener noreferrer">🧬 Remix</a></p></body>
</html>