<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Google AI API Documentation Scraper</title>
    <script src="https://cdn.tailwindcss.com"></script>
    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css">
    <style>
        .console-output {
            font-family: 'Courier New', monospace;
            white-space: pre-wrap;
            background-color: #1e1e1e;
        }
        .scrollable-content {
            max-height: 60vh;
            overflow-y: auto;
        }
    </style>
</head>
<body class="bg-gray-900 text-gray-100 min-h-screen">
    <div class="container mx-auto px-4 py-8">
        <div class="flex justify-between items-center mb-6">
            <div class="flex items-center space-x-2">
                <i class="fab fa-google text-3xl text-blue-400"></i>
                <h1 class="text-2xl font-bold bg-gradient-to-r from-blue-400 to-green-400 bg-clip-text text-transparent">Google AI API Scraper</h1>
            </div>
            <div class="flex space-x-4">
                <button id="run-btn" class="px-4 py-2 bg-green-600 hover:bg-green-700 rounded-lg flex items-center space-x-2 transition-all">
                    <i class="fas fa-download"></i>
                    <span>Run Scraper</span>
                </button>
                <button id="clear-btn" class="px-4 py-2 bg-gray-700 hover:bg-gray-600 rounded-lg flex items-center space-x-2 transition-all">
                    <i class="fas fa-trash-alt"></i>
                    <span>Clear</span>
                </button>
            </div>
        </div>
        <div class="bg-gray-800 rounded-xl overflow-hidden shadow-2xl">
            <div class="bg-gray-700 px-4 py-2 flex justify-between items-center">
                <div class="flex items-center space-x-2">
                    <i class="fas fa-terminal text-green-400"></i>
                    <span>Console Output</span>
                </div>
                <div class="text-sm text-gray-400">Python 3.x</div>
            </div>
            <div class="p-4">
                <div id="console-output" class="console-output p-4 rounded scrollable-content">
                    <div class="text-green-400">Ready to scrape https://ai.google.dev/api?lang=python</div>
                    <div class="text-gray-400">Click "Run Scraper" to start downloading documentation...</div>
                </div>
            </div>
            <div class="bg-gray-700 px-4 py-2">
                <label class="block text-sm mb-2 text-gray-400">Output Directory</label>
                <input type="text" id="output-dir" value="google_ai_docs" class="w-full bg-gray-800 px-3 py-2 rounded outline-none border border-gray-600 focus:border-blue-500">
            </div>
        </div>
        <div class="mt-8 grid grid-cols-1 md:grid-cols-3 gap-6">
            <div class="bg-gray-800 p-6 rounded-xl shadow-lg">
                <div class="flex items-center mb-4">
                    <div class="bg-blue-500 p-2 rounded-lg mr-4">
                        <i class="fas fa-sitemap text-white"></i>
                    </div>
                    <h3 class="text-xl font-semibold">Recursive Scraping</h3>
                </div>
                <p class="text-gray-400">Downloads all linked pages within the Google AI API documentation hierarchy.</p>
            </div>
            <div class="bg-gray-800 p-6 rounded-xl shadow-lg">
                <div class="flex items-center mb-4">
                    <div class="bg-purple-500 p-2 rounded-lg mr-4">
                        <i class="fas fa-file-alt text-white"></i>
                    </div>
                    <h3 class="text-xl font-semibold">Local Storage</h3>
                </div>
                <p class="text-gray-400">Saves complete HTML pages with assets to your specified directory structure.</p>
            </div>
            <div class="bg-gray-800 p-6 rounded-xl shadow-lg">
                <div class="flex items-center mb-4">
                    <div class="bg-green-500 p-2 rounded-lg mr-4">
                        <i class="fas fa-code-branch text-white"></i>
                    </div>
                    <h3 class="text-xl font-semibold">Link Preservation</h3>
                </div>
                <p class="text-gray-400">Rewrites links to work locally while maintaining the original document structure.</p>
            </div>
        </div>
    </div>
    <script>
        document.addEventListener('DOMContentLoaded', function() {
            const consoleOutput = document.getElementById('console-output');
            const runButton = document.getElementById('run-btn');
            const clearButton = document.getElementById('clear-btn');
            const outputDirInput = document.getElementById('output-dir');

            function addToConsole(message, type = 'info') {
                const colors = {
                    'info': 'text-gray-100',
                    'success': 'text-green-400',
                    'error': 'text-red-400',
                    'warning': 'text-yellow-400'
                };
                const div = document.createElement('div');
                div.className = colors[type] || 'text-gray-100';
                div.textContent = message;
                consoleOutput.appendChild(div);
                consoleOutput.scrollTop = consoleOutput.scrollHeight;
            }

            function clearConsole() {
                consoleOutput.innerHTML = '<div class="text-green-400">Console cleared. Ready to start new session.</div>';
            }

            function runScraper() {
                const outputDir = outputDirInput.value.trim() || 'google_ai_docs';
                addToConsole('Starting scraper for https://ai.google.dev/api?lang=python ...', 'info');
                addToConsole(`Output directory: ${outputDir}`, 'info');
                // In a real implementation, this would call a Python backend or use Pyodide
                // (see the sketch just after this function). For this demo, we'll simulate
                // the scraping process.
                setTimeout(() => {
                    simulateScraping(outputDir);
                }, 500);
            }
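
            // Illustrative sketch only (not wired up): if this page were served by a real
            // backend, the button handler could POST to a hypothetical /run-scraper endpoint
            // instead of running the simulation. The endpoint name and response shape are
            // assumptions, not part of the original page.
            async function runScraperViaBackend(outputDir) {
                const response = await fetch('/run-scraper', { // hypothetical endpoint
                    method: 'POST',
                    headers: { 'Content-Type': 'application/json' },
                    body: JSON.stringify({ outputDir })
                });
                if (!response.ok) {
                    throw new Error(`Scraper backend returned ${response.status}`);
                }
                return response.json(); // assumed to resolve to per-page download results
            }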
            function simulateScraping(outputDir) {
                const pages = [
                    { url: 'https://ai.google.dev/api?lang=python', title: 'Google AI Python API' },
                    { url: 'https://ai.google.dev/api/python', title: 'Python Client Library' },
                    { url: 'https://ai.google.dev/api/python/quickstart', title: 'Quickstart Guide' },
                    { url: 'https://ai.google.dev/api/python/reference', title: 'API Reference' },
                    { url: 'https://ai.google.dev/api/python/authentication', title: 'Authentication' },
                    { url: 'https://ai.google.dev/api/python/examples', title: 'Code Examples' }
                ];
                pages.forEach((page, index) => {
                    setTimeout(() => {
                        const success = Math.random() > 0.1; // 90% success rate for simulation
                        if (success) {
                            addToConsole(`Downloaded: ${page.url} → ${outputDir}/${page.title.replace(/\s+/g, '_').toLowerCase()}.html`, 'success');
                            // Simulate downloading assets
                            if (index === 0 || index === pages.length - 1) {
                                setTimeout(() => {
                                    addToConsole(`Downloaded assets for ${page.title} (styles.css, script.js, images)`, 'info');
                                }, 200);
                            }
                        } else {
                            addToConsole(`Error downloading: ${page.url} (simulated error)`, 'error');
                        }
                    }, index * 800);
                });
                setTimeout(() => {
                    addToConsole('\nScraping completed!', 'success');
                    addToConsole(`All documentation saved to ${outputDir}/ directory`, 'success');
                    addToConsole('Total pages downloaded: ' + pages.length, 'info');
                    addToConsole('Note: This was a simulation. Actual implementation would use:', 'warning');
                    addToConsole('- BeautifulSoup for HTML parsing', 'info');
                    addToConsole('- Requests/httpx for HTTP requests', 'info');
                    addToConsole('- OS module for file handling', 'info');
                }, pages.length * 800 + 500);
            }

            runButton.addEventListener('click', runScraper);
            clearButton.addEventListener('click', clearConsole);
        });
    </script>
    <!-- Actual Python script that would do the real work -->
    <script type="text/python" id="python-script">
# This would be the actual Python implementation
import os
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse


class GoogleAIDocsScraper:
    def __init__(self, base_url, output_dir="google_ai_docs"):
        self.base_url = base_url
        self.output_dir = output_dir
        self.visited_urls = set()
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        # Create output directory if it doesn't exist
        os.makedirs(self.output_dir, exist_ok=True)

    def is_valid_url(self, url):
        """Check if URL is valid and should be scraped"""
        parsed = urlparse(url)
        return (
            parsed.netloc == 'ai.google.dev' and
            not url.startswith('#') and
            not any(ext in url for ext in ['.pdf', '.zip', '.png', '.jpg', '.jpeg', '.gif'])
        )

    def sanitize_filename(self, url):
        """Create a safe filename from a URL (query string included so variants stay distinct)"""
        parsed = urlparse(url)
        path = parsed.path.strip('/') or 'index'
        if parsed.query:
            path = f"{path}_{parsed.query}"
        path = path.replace('/', '_').replace('?', '_').replace('=', '_').replace('&', '_')
        return f"{path}.html"

    def save_page(self, url, content):
        """Save page content to file"""
        filename = self.sanitize_filename(url)
        filepath = os.path.join(self.output_dir, filename)
        # Save HTML
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(content)
        return filepath
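
    # Illustrative sketch (not part of the original script): a helper that would also
    # download static assets (CSS, JS, images) referenced by a page, matching the
    # "Local Storage" feature card above. The assets/ subdirectory and naming scheme
    # are assumptions for illustration.
    def save_asset(self, asset_url):
        """Download a single asset and return its local relative path, or None on failure."""
        try:
            response = self.session.get(asset_url, timeout=10)
            response.raise_for_status()
            filename = os.path.basename(urlparse(asset_url).path) or 'asset'
            asset_dir = os.path.join(self.output_dir, 'assets')
            os.makedirs(asset_dir, exist_ok=True)
            with open(os.path.join(asset_dir, filename), 'wb') as f:
                f.write(response.content)
            return os.path.join('assets', filename)
        except Exception as e:
            print(f"Error downloading asset {asset_url}: {e}")
            return None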

    def rewrite_links(self, soup, page_url):
        """Rewrite in-scope links so they point at the locally saved copies"""
        for tag, attr in [('a', 'href'), ('img', 'src'), ('link', 'href'), ('script', 'src')]:
            for element in soup.find_all(tag, {attr: True}):
                url = element[attr]
                # Skip pure fragment links
                if url.startswith('#'):
                    continue
                # Resolve relative URLs before deciding whether they are in scope
                absolute_url = urljoin(page_url, url)
                # Rewrite in-scope URLs to local file paths
                if self.is_valid_url(absolute_url):
                    element[attr] = self.sanitize_filename(absolute_url)

    def scrape_page(self, url):
        """Scrape a single page"""
        if url in self.visited_urls:
            return
        self.visited_urls.add(url)
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            # Rewrite links before saving
            self.rewrite_links(soup, url)
            # Save the page
            saved_path = self.save_page(url, str(soup))
            print(f"Successfully saved: {url} -> {saved_path}")
            # Find all links on this page and follow them recursively
            for link in soup.find_all('a', href=True):
                href = link['href']
                absolute_url = urljoin(url, href)
                if self.is_valid_url(absolute_url) and absolute_url not in self.visited_urls:
                    self.scrape_page(absolute_url)
        except Exception as e:
            print(f"Error scraping {url}: {str(e)}")

    def scrape_all(self):
        """Start recursive scraping"""
        print(f"Starting recursive scraping from {self.base_url}")
        start_time = time.time()
        self.scrape_page(self.base_url)
        print(f"\nScraping completed in {time.time() - start_time:.2f} seconds")
        print(f"Total pages scraped: {len(self.visited_urls)}")
        print(f"Output directory: {os.path.abspath(self.output_dir)}")

if __name__ == "__main__":
    BASE_URL = "https://ai.google.dev/api?lang=python"
    OUTPUT_DIR = "google_ai_docs"
    scraper = GoogleAIDocsScraper(BASE_URL, OUTPUT_DIR)
    scraper.scrape_all()
    </script>
    <script>
        // In a real implementation, you would use Pyodide to run the Python code.
        // This is just a placeholder to show where you'd integrate it.
        /*
        async function loadPyodideAndRun() {
            let pyodide = await loadPyodide({
                indexURL: "https://cdn.jsdelivr.net/pyodide/v0.23.1/full/"
            });
            const pythonCode = document.getElementById('python-script').textContent;
            try {
                await pyodide.loadPackage(['requests', 'beautifulsoup4']);
                const result = await pyodide.runPythonAsync(pythonCode);
                console.log(result);
            } catch (error) {
                console.error("Error running Python:", error);
            }
        }
        document.getElementById('run-btn').addEventListener('click', loadPyodideAndRun);
        */
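        // Caveat (general Pyodide behaviour, not something this page implements): code run
        // in the browser writes to Pyodide's in-memory Emscripten filesystem rather than the
        // user's disk, and outgoing HTTP requests are subject to the browser's CORS policy,
        // so a server-side runner is usually the more practical way to execute the scraper.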
    </script>
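    <!--
      To run the embedded Python scraper outside the browser, copy the contents of the
      script tag above into a file (any name works, e.g. scraper.py) and run:
          pip install requests beautifulsoup4
          python scraper.py
    -->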
</body>
</html> |