from fastapi import FastAPI, HTTPException, Request
from pydantic import HttpUrl
from playwright.async_api import async_playwright
from urllib.parse import urljoin, urlparse
import logging
from fastapi.responses import JSONResponse
from fastapi.templating import Jinja2Templates
from fastapi.staticfiles import StaticFiles
from typing import List, Dict
import asyncio
import os
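
# Operational note (an assumption about the runtime, not stated in this file):
# p.chromium.launch() below requires the Chromium browser bundle to be installed
# in the environment, e.g. via `playwright install chromium`.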

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

app = FastAPI(title="Website Scraper API with Frontend")

# Mount static files
app.mount("/static", StaticFiles(directory="static"), name="static")

# Set up Jinja2 templates
templates = Jinja2Templates(directory="templates")
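
# Note (assumption about deployment layout): a "static" directory must exist for
# the StaticFiles mount above, and "templates/index.html" must be present for the
# home page to render.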

# Maximum number of pages to scrape
MAX_PAGES = 20


async def scrape_page(url: str, visited: set, base_domain: str) -> tuple[Dict, set]:
    """Scrape a single page for text, images, and links using Playwright."""
    try:
        logger.info(f"Starting Playwright for URL: {url}")
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
                viewport={"width": 800, "height": 600},  # Reduced viewport for performance
                bypass_csp=True  # Bypass Content Security Policy
            )
            page = await context.new_page()

            # Retry navigation with fallback
            for attempt in range(2):  # Try up to 2 times
                try:
                    logger.info(f"Navigating to {url} (Attempt {attempt + 1})")
                    await page.goto(url, wait_until="domcontentloaded", timeout=30000)  # 30s timeout
                    break  # Success, exit retry loop
                except Exception as e:
                    logger.warning(f"Navigation attempt {attempt + 1} failed for {url}: {str(e)}")
                    if attempt == 1:  # Last attempt
                        logger.error(f"All navigation attempts failed for {url}")
                        await browser.close()
                        return {}, set()
                    await asyncio.sleep(1)  # Wait before retry

            # Scroll to trigger lazy-loaded images
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            await page.wait_for_timeout(2000)  # Wait for lazy-loaded content

            # Extract text content
            text_content = await page.evaluate(
                """() => document.body.innerText"""
            )
            text_content = ' '.join(text_content.split()) if text_content else ""

            # Extract images from src, data-src, and srcset
            images = await page.evaluate(
                """() => {
                    const imgElements = document.querySelectorAll('img');
                    const imgUrls = new Set();
                    imgElements.forEach(img => {
                        if (img.src) imgUrls.add(img.src);
                        if (img.dataset.src) imgUrls.add(img.dataset.src);
                        if (img.srcset) {
                            img.srcset.split(',').forEach(src => {
                                const url = src.trim().split(' ')[0];
                                if (url) imgUrls.add(url);
                            });
                        }
                    });
                    return Array.from(imgUrls);
                }"""
            )
            images = [urljoin(url, img) for img in images if img]

            # Extract links
            links = await page.evaluate(
                """() => Array.from(document.querySelectorAll('a')).map(a => a.href)"""
            )
            links = set(
                urljoin(url, link)
                for link in links
                if urlparse(urljoin(url, link)).netloc == base_domain
                and urljoin(url, link) not in visited
            )

            await browser.close()
            logger.info(f"Successfully scraped {url}")
            page_data = {
                "url": url,
                "text": text_content,
                "images": images
            }
            return page_data, links
    except Exception as e:
        logger.error(f"Error scraping {url}: {str(e)}")
        return {}, set()
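
# Quick standalone check of scrape_page (a sketch for local testing only, not
# part of the app's request flow; example.com is an arbitrary target):
#
#     data, links = asyncio.run(scrape_page("https://example.com", set(), "example.com"))
#     print(data.get("url"), len(data.get("images", [])), len(links))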


# NOTE: no route decorator appears in the extracted source; "/scrape" is an
# assumed path added here so the crawler is reachable as an HTTP endpoint.
@app.get("/scrape")
async def crawl_website(url: HttpUrl):
    """Crawl the website starting from the given URL and return scraped data for up to MAX_PAGES pages as JSON."""
    try:
        logger.info(f"Starting crawl for {url}")
        visited = set()
        to_visit = {str(url)}
        base_domain = urlparse(str(url)).netloc
        results = []
        while to_visit and len(visited) < MAX_PAGES:
            current_url = to_visit.pop()
            if current_url in visited:
                continue
            logger.info(f"Scraping: {current_url}")
            visited.add(current_url)
            page_data, new_links = await scrape_page(current_url, visited, base_domain)
            if page_data:
                results.append(page_data)
            to_visit.update(new_links)
            # Small delay to avoid overwhelming the target server
            await asyncio.sleep(0.5)
        logger.info(f"Crawl completed for {url}")
        return JSONResponse(content={"pages": results})
    except Exception as e:
        logger.error(f"Scraping failed for {url}: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Scraping failed: {str(e)}")
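
# Example response body for the crawl above (illustrative values only):
#
#     {"pages": [{"url": "https://example.com",
#                 "text": "Example Domain ...",
#                 "images": ["https://example.com/image.png"]}]}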


# NOTE: as above, no decorator is present in the extracted source; serving the
# frontend at "/" is an assumption.
@app.get("/")
async def serve_home(request: Request):
    """Serve the frontend HTML page."""
    logger.info("Serving home page")
    return templates.TemplateResponse("index.html", {"request": request})


if __name__ == "__main__":
    logger.info("Starting FastAPI server on port 7860")
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
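
# Example request once the server is running (uses the assumed /scrape path
# noted above; adjust if the real route differs):
#
#     curl "http://localhost:7860/scrape?url=https://example.com"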