<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Google AI API Documentation Scraper</title>
<script src="https://cdn.tailwindcss.com"></script>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css">
<style>
.console-output {
font-family: 'Courier New', monospace;
white-space: pre-wrap;
background-color: #1e1e1e;
}
.scrollable-content {
max-height: 60vh;
overflow-y: auto;
}
</style>
</head>
<body class="bg-gray-900 text-gray-100 min-h-screen">
<div class="container mx-auto px-4 py-8">
<div class="flex justify-between items-center mb-6">
<div class="flex items-center space-x-2">
<i class="fab fa-google text-3xl text-blue-400"></i>
<h1 class="text-2xl font-bold bg-gradient-to-r from-blue-400 to-green-400 bg-clip-text text-transparent">Google AI API Scraper</h1>
</div>
<div class="flex space-x-4">
<button id="run-btn" class="px-4 py-2 bg-green-600 hover:bg-green-700 rounded-lg flex items-center space-x-2 transition-all">
<i class="fas fa-download"></i>
<span>Run Scraper</span>
</button>
<button id="clear-btn" class="px-4 py-2 bg-gray-700 hover:bg-gray-600 rounded-lg flex items-center space-x-2 transition-all">
<i class="fas fa-trash-alt"></i>
<span>Clear</span>
</button>
</div>
</div>
<div class="bg-gray-800 rounded-xl overflow-hidden shadow-2xl">
<div class="bg-gray-700 px-4 py-2 flex justify-between items-center">
<div class="flex items-center space-x-2">
<i class="fas fa-terminal text-green-400"></i>
<span>Console Output</span>
</div>
<div class="text-sm text-gray-400">Python 3.x</div>
</div>
<div class="p-4">
<div id="console-output" class="console-output p-4 rounded scrollable-content">
<div class="text-green-400">Ready to scrape https://ai.google.dev/api?lang=python</div>
<div class="text-gray-400">Click "Run Scraper" to start downloading documentation...</div>
</div>
</div>
<div class="bg-gray-700 px-4 py-2">
<label class="block text-sm mb-2 text-gray-400">Output Directory</label>
<input type="text" id="output-dir" value="google_ai_docs" class="w-full bg-gray-800 px-3 py-2 rounded outline-none border border-gray-600 focus:border-blue-500">
</div>
</div>
<div class="mt-8 grid grid-cols-1 md:grid-cols-3 gap-6">
<div class="bg-gray-800 p-6 rounded-xl shadow-lg">
<div class="flex items-center mb-4">
<div class="bg-blue-500 p-2 rounded-lg mr-4">
<i class="fas fa-sitemap text-white"></i>
</div>
<h3 class="text-xl font-semibold">Recursive Scraping</h3>
</div>
<p class="text-gray-400">Downloads all linked pages within the Google AI API documentation hierarchy.</p>
</div>
<div class="bg-gray-800 p-6 rounded-xl shadow-lg">
<div class="flex items-center mb-4">
<div class="bg-purple-500 p-2 rounded-lg mr-4">
<i class="fas fa-file-alt text-white"></i>
</div>
<h3 class="text-xl font-semibold">Local Storage</h3>
</div>
<p class="text-gray-400">Saves complete HTML pages with assets to your specified directory structure.</p>
</div>
<div class="bg-gray-800 p-6 rounded-xl shadow-lg">
<div class="flex items-center mb-4">
<div class="bg-green-500 p-2 rounded-lg mr-4">
<i class="fas fa-code-branch text-white"></i>
</div>
<h3 class="text-xl font-semibold">Link Preservation</h3>
</div>
<p class="text-gray-400">Rewrites links to work locally while maintaining the original document structure.</p>
</div>
</div>
</div>
<script>
document.addEventListener('DOMContentLoaded', function() {
const consoleOutput = document.getElementById('console-output');
const runButton = document.getElementById('run-btn');
const clearButton = document.getElementById('clear-btn');
const outputDirInput = document.getElementById('output-dir');
function addToConsole(message, type = 'info') {
const colors = {
'info': 'text-gray-100',
'success': 'text-green-400',
'error': 'text-red-400',
'warning': 'text-yellow-400'
};
const div = document.createElement('div');
div.className = colors[type] || 'text-gray-100';
div.textContent = message;
consoleOutput.appendChild(div);
consoleOutput.scrollTop = consoleOutput.scrollHeight;
}
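// Example usage: addToConsole('Saved 3 pages', 'success') appends a
// green (text-green-400) line to the console panel and auto-scrolls
// to the newest entry.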
function clearConsole() {
consoleOutput.innerHTML = '<div class="text-green-400">Console cleared. Ready to start new session.</div>';
}
function runScraper() {
const outputDir = outputDirInput.value.trim() || 'google_ai_docs';
addToConsole('Starting scraper for https://ai.google.dev/api?lang=python ...', 'info');
addToConsole(`Output directory: ${outputDir}`, 'info');
// In a real implementation, this would call a Python backend or use Pyodide
// For this demo, we'll simulate the scraping process
setTimeout(() => {
simulateScraping(outputDir);
}, 500);
}
function simulateScraping(outputDir) {
const pages = [
{ url: 'https://ai.google.dev/api?lang=python', title: 'Google AI Python API' },
{ url: 'https://ai.google.dev/api/python', title: 'Python Client Library' },
{ url: 'https://ai.google.dev/api/python/quickstart', title: 'Quickstart Guide' },
{ url: 'https://ai.google.dev/api/python/reference', title: 'API Reference' },
{ url: 'https://ai.google.dev/api/python/authentication', title: 'Authentication' },
{ url: 'https://ai.google.dev/api/python/examples', title: 'Code Examples' }
];
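// Note: these URLs are fixtures for the simulation only; the real
// scraper below discovers links at runtime via soup.find_all('a').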
pages.forEach((page, index) => {
setTimeout(() => {
const success = Math.random() > 0.1; // 90% success rate for simulation
if (success) {
addToConsole(`Downloaded: ${page.url} -> ${outputDir}/${page.title.replace(/\s+/g, '_').toLowerCase()}.html`, 'success');
// Simulate downloading assets
if (index === 0 || index === pages.length - 1) {
setTimeout(() => {
addToConsole(`Downloaded assets for ${page.title} (styles.css, script.js, images)`, 'info');
}, 200);
}
} else {
addToConsole(`Error downloading: ${page.url} (simulated error)`, 'error');
}
}, index * 800);
});
setTimeout(() => {
addToConsole('\nScraping completed!', 'success');
addToConsole(`All documentation saved to ${outputDir}/ directory`, 'success');
addToConsole('Total pages downloaded: ' + pages.length, 'info');
addToConsole('Note: This was a simulation. Actual implementation would use:', 'warning');
addToConsole('- BeautifulSoup for HTML parsing', 'info');
addToConsole('- Requests/httpx for HTTP requests', 'info');
addToConsole('- OS module for file handling', 'info');
}, pages.length * 800 + 500);
}
runButton.addEventListener('click', runScraper);
clearButton.addEventListener('click', clearConsole);
});
</script>
<!-- Actual Python script that would do the real work -->
<script type="text/python" id="python-script">
# This would be the actual Python implementation
import os
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
class GoogleAIDocsScraper:
def __init__(self, base_url, output_dir="google_ai_docs"):
self.base_url = base_url
self.output_dir = output_dir
self.visited_urls = set()
self.session = requests.Session()
self.session.headers.update({
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
})
# Create output directory if it doesn't exist
os.makedirs(self.output_dir, exist_ok=True)
    def is_valid_url(self, url):
        """Check whether a URL is in scope for scraping"""
        parsed = urlparse(url)
        return (
            parsed.netloc == 'ai.google.dev' and
            parsed.scheme in ('http', 'https') and
            # Match on the path suffix, not a substring of the whole URL
            not any(parsed.path.endswith(ext) for ext in ('.pdf', '.zip', '.png', '.jpg', '.jpeg', '.gif'))
        )
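    # e.g. is_valid_url('https://ai.google.dev/api/python')      -> True
    #      is_valid_url('https://docs.python.org/3/')            -> False (wrong host)
    #      is_valid_url('https://ai.google.dev/images/logo.png') -> False (binary asset)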
    def sanitize_filename(self, url):
        """Create a safe filename from a URL; the query string is kept
        so that e.g. ?lang=python and ?lang=node do not collide"""
        parsed = urlparse(url)
        path = parsed.path.strip('/') or 'index'
        # urlparse() strips the query from .path, so append it explicitly
        if parsed.query:
            path = f"{path}_{parsed.query}"
        for ch in ('/', '?', '=', '&'):
            path = path.replace(ch, '_')
        return f"{path}.html"
def save_page(self, url, content):
"""Save page content to file"""
filename = self.sanitize_filename(url)
filepath = os.path.join(self.output_dir, filename)
# Save HTML
with open(filepath, 'w', encoding='utf-8') as f:
f.write(content)
return filepath
    def rewrite_links(self, soup, page_url):
        """Rewrite in-scope links to point at the local copies"""
        for tag, attr in [('a', 'href'), ('img', 'src'), ('link', 'href'), ('script', 'src')]:
            for element in soup.find_all(tag, {attr: True}):
                url = element[attr]
                # Skip fragment-only links
                if url.startswith('#'):
                    continue
                # Resolve relative URLs against the page URL *before*
                # validating; otherwise every relative link fails the
                # netloc check and is left untouched
                absolute_url = urljoin(page_url, url)
                if not self.is_valid_url(absolute_url):
                    continue
                # Point the attribute at the local filename the target
                # page will be saved under
                element[attr] = self.sanitize_filename(absolute_url)
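    # e.g. <a href="/api/python/quickstart"> on any scraped page becomes
    #      <a href="api_python_quickstart.html">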
def scrape_page(self, url):
"""Scrape a single page"""
if url in self.visited_urls:
return
self.visited_urls.add(url)
try:
response = self.session.get(url, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
# Rewrite links before saving
self.rewrite_links(soup, url)
# Save the page
saved_path = self.save_page(url, str(soup))
print(f"Successfully saved: {url} -> {saved_path}")
# Find all links on this page and add to queue
for link in soup.find_all('a', href=True):
href = link['href']
absolute_url = urljoin(url, href)
if self.is_valid_url(absolute_url) and absolute_url not in self.visited_urls:
self.scrape_page(absolute_url)
except Exception as e:
print(f"Error scraping {url}: {str(e)}")
def scrape_all(self):
"""Start recursive scraping"""
print(f"Starting recursive scraping from {self.base_url}")
start_time = time.time()
self.scrape_page(self.base_url)
print(f"\nScraping completed in {time.time() - start_time:.2f} seconds")
print(f"Total pages scraped: {len(self.visited_urls)}")
print(f"Output directory: {os.path.abspath(self.output_dir)}")
if __name__ == "__main__":
BASE_URL = "https://ai.google.dev/api?lang=python"
OUTPUT_DIR = "google_ai_docs"
scraper = GoogleAIDocsScraper(BASE_URL, OUTPUT_DIR)
scraper.scrape_all()
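# To run this outside the browser, save this block to a file (filename is
# your choice, e.g. scraper.py), then:
#     pip install requests beautifulsoup4
#     python scraper.py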
</script>
<script>
// In a real implementation, you would use Pyodide to run the Python code
// This is just a placeholder to show where you'd integrate it
/*
async function loadPyodideAndRun() {
let pyodide = await loadPyodide({
indexURL: "https://cdn.jsdelivr.net/pyodide/v0.23.1/full/"
});
const pythonCode = document.getElementById('python-script').textContent;
try {
await pyodide.loadPackage(['requests', 'beautifulsoup4']);
const result = await pyodide.runPythonAsync(pythonCode);
console.log(result);
} catch (error) {
console.error("Error running Python:", error);
}
}
document.getElementById('run-btn').addEventListener('click', loadPyodideAndRun);
*/
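// Caveat: browser-side HTTP is subject to CORS, and Python's 'requests'
// library does not work under Pyodide without a shim such as pyodide-http;
// in practice the scraper above is better run server-side.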
</script>
<p style="border-radius: 8px; text-align: center; font-size: 12px; color: #fff; margin-top: 16px;position: fixed; left: 8px; bottom: 8px; z-index: 10; background: rgba(0, 0, 0, 0.8); padding: 4px 8px;">Made with <img src="https://enzostvs-deepsite.hf.space/logo.svg" alt="DeepSite Logo" style="width: 16px; height: 16px; vertical-align: middle;display:inline-block;margin-right:3px;filter:brightness(0) invert(1);"><a href="https://enzostvs-deepsite.hf.space" style="color: #fff;text-decoration: underline;" target="_blank" >DeepSite</a> - <a href="https://enzostvs-deepsite.hf.space?remix=jeffgui/jeff-deepsite" style="color: #fff;text-decoration: underline;" target="_blank" >🧬 Remix</a></p></body>
</html>