<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Google AI API Documentation Scraper</title>
    <script src="https://cdn.tailwindcss.com"></script>
    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css">
    <style>
        .console-output {
            font-family: 'Courier New', monospace;
            white-space: pre-wrap;
            background-color: #1e1e1e;
        }
        .scrollable-content {
            max-height: 60vh;
            overflow-y: auto;
        }
    </style>
</head>
<body class="bg-gray-900 text-gray-100 min-h-screen">
    <div class="container mx-auto px-4 py-8">
        <div class="flex justify-between items-center mb-6">
            <div class="flex items-center space-x-2">
                <i class="fab fa-google text-3xl text-blue-400"></i>
                <h1 class="text-2xl font-bold bg-gradient-to-r from-blue-400 to-green-400 bg-clip-text text-transparent">Google AI API Scraper</h1>
            </div>
            <div class="flex space-x-4">
                <button id="run-btn" class="px-4 py-2 bg-green-600 hover:bg-green-700 rounded-lg flex items-center space-x-2 transition-all">
                    <i class="fas fa-download"></i>
                    <span>Run Scraper</span>
                </button>
                <button id="clear-btn" class="px-4 py-2 bg-gray-700 hover:bg-gray-600 rounded-lg flex items-center space-x-2 transition-all">
                    <i class="fas fa-trash-alt"></i>
                    <span>Clear</span>
                </button>
            </div>
        </div>
        <div class="bg-gray-800 rounded-xl overflow-hidden shadow-2xl">
            <div class="bg-gray-700 px-4 py-2 flex justify-between items-center">
                <div class="flex items-center space-x-2">
                    <i class="fas fa-terminal text-green-400"></i>
                    <span>Console Output</span>
                </div>
                <div class="text-sm text-gray-400">Python 3.x</div>
            </div>
            <div class="p-4">
                <div id="console-output" class="console-output p-4 rounded scrollable-content">
                    <div class="text-green-400">Ready to scrape https://ai.google.dev/api?lang=python</div>
                    <div class="text-gray-400">Click "Run Scraper" to start downloading documentation...</div>
                </div>
            </div>
            <div class="bg-gray-700 px-4 py-2">
                <label class="block text-sm mb-2 text-gray-400">Output Directory</label>
                <input type="text" id="output-dir" value="google_ai_docs" class="w-full bg-gray-800 px-3 py-2 rounded outline-none border border-gray-600 focus:border-blue-500">
            </div>
        </div>
        <div class="mt-8 grid grid-cols-1 md:grid-cols-3 gap-6">
            <div class="bg-gray-800 p-6 rounded-xl shadow-lg">
                <div class="flex items-center mb-4">
                    <div class="bg-blue-500 p-2 rounded-lg mr-4">
                        <i class="fas fa-sitemap text-white"></i>
                    </div>
                    <h3 class="text-xl font-semibold">Recursive Scraping</h3>
                </div>
                <p class="text-gray-400">Downloads all linked pages within the Google AI API documentation hierarchy.</p>
            </div>
            <div class="bg-gray-800 p-6 rounded-xl shadow-lg">
                <div class="flex items-center mb-4">
                    <div class="bg-purple-500 p-2 rounded-lg mr-4">
                        <i class="fas fa-file-alt text-white"></i>
                    </div>
                    <h3 class="text-xl font-semibold">Local Storage</h3>
                </div>
                <p class="text-gray-400">Saves complete HTML pages with assets to your specified directory structure.</p>
            </div>
            <div class="bg-gray-800 p-6 rounded-xl shadow-lg">
                <div class="flex items-center mb-4">
                    <div class="bg-green-500 p-2 rounded-lg mr-4">
                        <i class="fas fa-code-branch text-white"></i>
                    </div>
                    <h3 class="text-xl font-semibold">Link Preservation</h3>
                </div>
                <p class="text-gray-400">Rewrites links to work locally while maintaining the original document structure.</p>
            </div>
        </div>
    </div>
    <script>
        document.addEventListener('DOMContentLoaded', function() {
            const consoleOutput = document.getElementById('console-output');
            const runButton = document.getElementById('run-btn');
            const clearButton = document.getElementById('clear-btn');
            const outputDirInput = document.getElementById('output-dir');

            function addToConsole(message, type = 'info') {
                const colors = {
                    'info': 'text-gray-100',
                    'success': 'text-green-400',
                    'error': 'text-red-400',
                    'warning': 'text-yellow-400'
                };
                const div = document.createElement('div');
                div.className = colors[type] || 'text-gray-100';
                div.textContent = message;
                consoleOutput.appendChild(div);
                consoleOutput.scrollTop = consoleOutput.scrollHeight;
            }

            function clearConsole() {
                consoleOutput.innerHTML = '<div class="text-green-400">Console cleared. Ready to start new session.</div>';
            }

            function runScraper() {
                const outputDir = outputDirInput.value.trim() || 'google_ai_docs';
                addToConsole('Starting scraper for https://ai.google.dev/api?lang=python ...', 'info');
                addToConsole(`Output directory: ${outputDir}`, 'info');
                // In a real implementation, this would call a Python backend or use Pyodide
                // (see the sketch just after this function). For this demo, we'll simulate
                // the scraping process.
                setTimeout(() => {
                    simulateScraping(outputDir);
                }, 500);
            }
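
            // Illustrative sketch only (not wired up): if this page were served by a real
            // backend, the button handler could POST to a hypothetical /run-scraper endpoint
            // instead of running the simulation. The endpoint name and response shape are
            // assumptions, not part of the original page.
            async function runScraperViaBackend(outputDir) {
                const response = await fetch('/run-scraper', { // hypothetical endpoint
                    method: 'POST',
                    headers: { 'Content-Type': 'application/json' },
                    body: JSON.stringify({ outputDir })
                });
                if (!response.ok) {
                    throw new Error(`Scraper backend returned ${response.status}`);
                }
                return response.json(); // assumed to resolve to per-page download results
            }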
            function simulateScraping(outputDir) {
                const pages = [
                    { url: 'https://ai.google.dev/api?lang=python', title: 'Google AI Python API' },
                    { url: 'https://ai.google.dev/api/python', title: 'Python Client Library' },
                    { url: 'https://ai.google.dev/api/python/quickstart', title: 'Quickstart Guide' },
                    { url: 'https://ai.google.dev/api/python/reference', title: 'API Reference' },
                    { url: 'https://ai.google.dev/api/python/authentication', title: 'Authentication' },
                    { url: 'https://ai.google.dev/api/python/examples', title: 'Code Examples' }
                ];
                pages.forEach((page, index) => {
                    setTimeout(() => {
                        const success = Math.random() > 0.1; // 90% success rate for simulation
                        if (success) {
                            addToConsole(`Downloaded: ${page.url} → ${outputDir}/${page.title.replace(/\s+/g, '_').toLowerCase()}.html`, 'success');
                            // Simulate downloading assets
                            if (index === 0 || index === pages.length - 1) {
                                setTimeout(() => {
                                    addToConsole(`Downloaded assets for ${page.title} (styles.css, script.js, images)`, 'info');
                                }, 200);
                            }
                        } else {
                            addToConsole(`Error downloading: ${page.url} (simulated error)`, 'error');
                        }
                    }, index * 800);
                });
                setTimeout(() => {
                    addToConsole('\nScraping completed!', 'success');
                    addToConsole(`All documentation saved to ${outputDir}/ directory`, 'success');
                    addToConsole('Total pages downloaded: ' + pages.length, 'info');
                    addToConsole('Note: This was a simulation. Actual implementation would use:', 'warning');
                    addToConsole('- BeautifulSoup for HTML parsing', 'info');
                    addToConsole('- Requests/httpx for HTTP requests', 'info');
                    addToConsole('- OS module for file handling', 'info');
                }, pages.length * 800 + 500);
            }

            runButton.addEventListener('click', runScraper);
            clearButton.addEventListener('click', clearConsole);
        });
    </script>
    <!-- Actual Python script that would do the real work -->
    <script type="text/python" id="python-script">
# This would be the actual Python implementation
import os
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse


class GoogleAIDocsScraper:
    def __init__(self, base_url, output_dir="google_ai_docs"):
        self.base_url = base_url
        self.output_dir = output_dir
        self.visited_urls = set()
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        # Create output directory if it doesn't exist
        os.makedirs(self.output_dir, exist_ok=True)

    def is_valid_url(self, url):
        """Check if URL is valid and should be scraped"""
        parsed = urlparse(url)
        return (
            parsed.netloc == 'ai.google.dev' and
            not url.startswith('#') and
            not any(ext in url for ext in ['.pdf', '.zip', '.png', '.jpg', '.jpeg', '.gif'])
        )

    def sanitize_filename(self, url):
        """Create a safe filename from a URL (query string included so variants stay distinct)"""
        parsed = urlparse(url)
        path = parsed.path.strip('/') or 'index'
        if parsed.query:
            path = f"{path}_{parsed.query}"
        path = path.replace('/', '_').replace('?', '_').replace('=', '_').replace('&', '_')
        return f"{path}.html"

    def save_page(self, url, content):
        """Save page content to file"""
        filename = self.sanitize_filename(url)
        filepath = os.path.join(self.output_dir, filename)
        # Save HTML
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(content)
        return filepath
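
    # Illustrative sketch (not part of the original script): a helper that would also
    # download static assets (CSS, JS, images) referenced by a page, matching the
    # "Local Storage" feature card above. The assets/ subdirectory and naming scheme
    # are assumptions for illustration.
    def save_asset(self, asset_url):
        """Download a single asset and return its local relative path, or None on failure."""
        try:
            response = self.session.get(asset_url, timeout=10)
            response.raise_for_status()
            filename = os.path.basename(urlparse(asset_url).path) or 'asset'
            asset_dir = os.path.join(self.output_dir, 'assets')
            os.makedirs(asset_dir, exist_ok=True)
            with open(os.path.join(asset_dir, filename), 'wb') as f:
                f.write(response.content)
            return os.path.join('assets', filename)
        except Exception as e:
            print(f"Error downloading asset {asset_url}: {e}")
            return None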

    def rewrite_links(self, soup, page_url):
        """Rewrite in-scope links so they point at the locally saved copies"""
        for tag, attr in [('a', 'href'), ('img', 'src'), ('link', 'href'), ('script', 'src')]:
            for element in soup.find_all(tag, {attr: True}):
                url = element[attr]
                # Skip pure fragment links
                if url.startswith('#'):
                    continue
                # Resolve relative URLs before deciding whether they are in scope
                absolute_url = urljoin(page_url, url)
                # Rewrite in-scope URLs to local file paths
                if self.is_valid_url(absolute_url):
                    element[attr] = self.sanitize_filename(absolute_url)

    def scrape_page(self, url):
        """Scrape a single page"""
        if url in self.visited_urls:
            return
        self.visited_urls.add(url)
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            # Rewrite links before saving
            self.rewrite_links(soup, url)
            # Save the page
            saved_path = self.save_page(url, str(soup))
            print(f"Successfully saved: {url} -> {saved_path}")
            # Find all links on this page and follow them recursively
            for link in soup.find_all('a', href=True):
                href = link['href']
                absolute_url = urljoin(url, href)
                if self.is_valid_url(absolute_url) and absolute_url not in self.visited_urls:
                    self.scrape_page(absolute_url)
        except Exception as e:
            print(f"Error scraping {url}: {str(e)}")

    def scrape_all(self):
        """Start recursive scraping"""
        print(f"Starting recursive scraping from {self.base_url}")
        start_time = time.time()
        self.scrape_page(self.base_url)
        print(f"\nScraping completed in {time.time() - start_time:.2f} seconds")
        print(f"Total pages scraped: {len(self.visited_urls)}")
        print(f"Output directory: {os.path.abspath(self.output_dir)}")

if __name__ == "__main__":
    BASE_URL = "https://ai.google.dev/api?lang=python"
    OUTPUT_DIR = "google_ai_docs"
    scraper = GoogleAIDocsScraper(BASE_URL, OUTPUT_DIR)
    scraper.scrape_all()
    </script>
    <script>
        // In a real implementation, you would use Pyodide to run the Python code.
        // This is just a placeholder to show where you'd integrate it.
        /*
        async function loadPyodideAndRun() {
            let pyodide = await loadPyodide({
                indexURL: "https://cdn.jsdelivr.net/pyodide/v0.23.1/full/"
            });
            const pythonCode = document.getElementById('python-script').textContent;
            try {
                await pyodide.loadPackage(['requests', 'beautifulsoup4']);
                const result = await pyodide.runPythonAsync(pythonCode);
                console.log(result);
            } catch (error) {
                console.error("Error running Python:", error);
            }
        }
        document.getElementById('run-btn').addEventListener('click', loadPyodideAndRun);
        */
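        // Caveat (general Pyodide behaviour, not something this page implements): code run
        // in the browser writes to Pyodide's in-memory Emscripten filesystem rather than the
        // user's disk, and outgoing HTTP requests are subject to the browser's CORS policy,
        // so a server-side runner is usually the more practical way to execute the scraper.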
    </script>
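    <!--
      To run the embedded Python scraper outside the browser, copy the contents of the
      script tag above into a file (any name works, e.g. scraper.py) and run:
          pip install requests beautifulsoup4
          python scraper.py
    -->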
</body>
</html> |