# app/api/scraping.py
"""
Scraping and Rating API Endpoints
=================================
FastAPI endpoints for web scraping and data rating functionality.
Provides a comprehensive API for managing scraping jobs, monitoring progress,
and retrieving rating data.
"""
import logging
from typing import List, Optional, Dict, Any
from datetime import datetime
from fastapi import APIRouter, HTTPException, BackgroundTasks, Query
from pydantic import BaseModel, Field
from enum import Enum
from ..services.scraping_service import ScrapingService, ScrapingStrategy
from ..services.rating_service import RatingService
logger = logging.getLogger(__name__)
# Initialize services
scraping_service = ScrapingService()
rating_service = RatingService()
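# NOTE: module-level singletons; both services are created once at import
# time and shared by every request handled by this router.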
# Request/Response Models
class ScrapingStrategyEnum(str, Enum):
"""Available scraping strategies for API"""
GENERAL = "general"
LEGAL_DOCUMENTS = "legal_documents"
NEWS_ARTICLES = "news_articles"
ACADEMIC_PAPERS = "academic_papers"
GOVERNMENT_SITES = "government_sites"
CUSTOM = "custom"
class ScrapingRequest(BaseModel):
"""Request model for starting a scraping job"""
urls: List[str] = Field(..., description="List of URLs to scrape")
strategy: ScrapingStrategyEnum = Field(
default=ScrapingStrategyEnum.GENERAL, description="Scraping strategy to use")
keywords: Optional[List[str]] = Field(
default=None, description="Keywords to filter content")
content_types: Optional[List[str]] = Field(
default=None, description="Content types to focus on")
max_depth: int = Field(default=1, ge=1, le=5,
description="Maximum depth for recursive scraping")
delay_between_requests: float = Field(
default=1.0, ge=0.1, le=10.0, description="Delay between requests in seconds")
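# Illustrative request body for ScrapingRequest (values are examples only;
# field names and constraints follow the model above):
#
#   {
#       "urls": ["https://example.com/laws/contract-act"],
#       "strategy": "legal_documents",
#       "keywords": ["contract", "liability"],
#       "content_types": ["article"],
#       "max_depth": 2,
#       "delay_between_requests": 1.0
#   }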
class ScrapingJobResponse(BaseModel):
"""Response model for scraping job"""
job_id: str
status: str
total_items: int
completed_items: int
failed_items: int
progress: float
created_at: str
strategy: str
class ScrapedItemResponse(BaseModel):
"""Response model for scraped item"""
id: str
url: str
title: str
content: str
metadata: Dict[str, Any]
timestamp: str
source_url: str
rating_score: float
processing_status: str
error_message: Optional[str]
strategy_used: str
content_hash: str
word_count: int
language: str
domain: str
class RatingSummaryResponse(BaseModel):
"""Response model for rating summary"""
total_rated: int
average_score: float
score_range: Dict[str, float]
average_confidence: float
rating_level_distribution: Dict[str, int]
criteria_averages: Dict[str, float]
recent_ratings_24h: int
class ScrapingStatisticsResponse(BaseModel):
"""Response model for scraping statistics"""
total_items: int
status_distribution: Dict[str, int]
language_distribution: Dict[str, int]
average_rating: float
active_jobs: int
total_jobs: int
# Create router
router = APIRouter()
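# Sketch of how this router might be mounted in the main application
# (the import path and the "/api/scraping" prefix are assumptions; adjust
# to the project's actual entry point):
#
#   from fastapi import FastAPI
#   from app.api.scraping import router as scraping_router
#
#   app = FastAPI()
#   app.include_router(scraping_router, prefix="/api/scraping", tags=["scraping"])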
@router.post("/scrape", response_model=Dict[str, str])
async def start_scraping_job(request: ScrapingRequest, background_tasks: BackgroundTasks):
"""
Start a new scraping job
- **urls**: List of URLs to scrape
- **strategy**: Scraping strategy to use
- **keywords**: Optional keywords to filter content
- **content_types**: Optional content types to focus on
- **max_depth**: Maximum depth for recursive scraping (1-5)
- **delay_between_requests**: Delay between requests in seconds (0.1-10.0)
"""
try:
# Convert strategy enum to service enum
strategy_map = {
ScrapingStrategyEnum.GENERAL: ScrapingStrategy.GENERAL,
ScrapingStrategyEnum.LEGAL_DOCUMENTS: ScrapingStrategy.LEGAL_DOCUMENTS,
ScrapingStrategyEnum.NEWS_ARTICLES: ScrapingStrategy.NEWS_ARTICLES,
ScrapingStrategyEnum.ACADEMIC_PAPERS: ScrapingStrategy.ACADEMIC_PAPERS,
ScrapingStrategyEnum.GOVERNMENT_SITES: ScrapingStrategy.GOVERNMENT_SITES,
ScrapingStrategyEnum.CUSTOM: ScrapingStrategy.CUSTOM
}
strategy = strategy_map[request.strategy]
# Start scraping job
job_id = await scraping_service.start_scraping_job(
urls=request.urls,
strategy=strategy,
keywords=request.keywords,
content_types=request.content_types,
max_depth=request.max_depth,
delay=request.delay_between_requests
)
logger.info(
f"Started scraping job {job_id} with {len(request.urls)} URLs")
return {
"job_id": job_id,
"status": "started",
"message": f"Scraping job started successfully with {len(request.urls)} URLs"
}
except Exception as e:
logger.error(f"Error starting scraping job: {e}")
raise HTTPException(
status_code=500, detail=f"Failed to start scraping job: {str(e)}")
@router.get("/scrape/status", response_model=List[ScrapingJobResponse])
async def get_scraping_jobs_status():
"""
Get status of all scraping jobs
Returns list of all active and recent scraping jobs with their progress.
"""
try:
jobs = await scraping_service.get_all_jobs()
return [ScrapingJobResponse(**job) for job in jobs if job is not None]
except Exception as e:
logger.error(f"Error getting scraping jobs status: {e}")
raise HTTPException(
status_code=500, detail=f"Failed to get scraping jobs status: {str(e)}")
@router.get("/scrape/status/{job_id}", response_model=ScrapingJobResponse)
async def get_scraping_job_status(job_id: str):
"""
Get status of a specific scraping job
- **job_id**: ID of the scraping job to check
"""
try:
job_status = await scraping_service.get_job_status(job_id)
if not job_status:
raise HTTPException(
status_code=404, detail=f"Scraping job {job_id} not found")
return ScrapingJobResponse(**job_status)
except HTTPException:
raise
except Exception as e:
logger.error(f"Error getting scraping job status: {e}")
raise HTTPException(
status_code=500, detail=f"Failed to get scraping job status: {str(e)}")
@router.get("/scrape/items", response_model=List[ScrapedItemResponse])
async def get_scraped_items(
job_id: Optional[str] = Query(None, description="Filter by job ID"),
limit: int = Query(100, ge=1, le=1000,
description="Maximum number of items to return"),
offset: int = Query(0, ge=0, description="Number of items to skip")
):
"""
Get scraped items with optional filtering
- **job_id**: Optional job ID to filter items
- **limit**: Maximum number of items to return (1-1000)
- **offset**: Number of items to skip for pagination
"""
try:
items = await scraping_service.get_scraped_items(
job_id=job_id,
limit=limit,
offset=offset
)
return [ScrapedItemResponse(**item) for item in items]
except Exception as e:
logger.error(f"Error getting scraped items: {e}")
raise HTTPException(
status_code=500, detail=f"Failed to get scraped items: {str(e)}")
@router.get("/scrape/statistics", response_model=ScrapingStatisticsResponse)
async def get_scraping_statistics():
"""
Get comprehensive scraping statistics
Returns overall statistics about scraped items, jobs, and system health.
"""
try:
stats = await scraping_service.get_scraping_statistics()
return ScrapingStatisticsResponse(**stats)
except Exception as e:
logger.error(f"Error getting scraping statistics: {e}")
raise HTTPException(
status_code=500, detail=f"Failed to get scraping statistics: {str(e)}")
@router.post("/rating/rate/{item_id}")
async def rate_specific_item(item_id: str):
"""
Rate a specific scraped item
- **item_id**: ID of the item to rate
"""
try:
# Get item data
items = await scraping_service.get_scraped_items(limit=1000)
item_data = None
for item in items:
if item['id'] == item_id:
item_data = item
break
if not item_data:
raise HTTPException(
status_code=404, detail=f"Item {item_id} not found")
# Rate the item
rating_result = await rating_service.rate_item(item_data)
return {
"item_id": item_id,
"rating_result": rating_result.to_dict(),
"message": f"Item {item_id} rated successfully"
}
except HTTPException:
raise
except Exception as e:
logger.error(f"Error rating item {item_id}: {e}")
raise HTTPException(
status_code=500, detail=f"Failed to rate item: {str(e)}")
@router.post("/rating/rate-all")
async def rate_all_unrated_items():
"""
Rate all unrated scraped items
Automatically rates all items that haven't been rated yet.
"""
try:
# Get all items
items = await scraping_service.get_scraped_items(limit=1000)
unrated_items = [item for item in items if item['rating_score'] == 0.0]
rated_count = 0
failed_count = 0
for item in unrated_items:
try:
await rating_service.rate_item(item)
rated_count += 1
except Exception as e:
logger.error(f"Failed to rate item {item['id']}: {e}")
failed_count += 1
return {
"total_items": len(unrated_items),
"rated_count": rated_count,
"failed_count": failed_count,
"message": f"Rated {rated_count} items, {failed_count} failed"
}
except Exception as e:
logger.error(f"Error rating all items: {e}")
raise HTTPException(
status_code=500, detail=f"Failed to rate all items: {str(e)}")
@router.get("/rating/summary", response_model=RatingSummaryResponse)
async def get_rating_summary():
"""
Get comprehensive rating summary
Returns overall statistics about rated items, score distributions, and criteria averages.
"""
try:
summary = await rating_service.get_rating_summary()
return RatingSummaryResponse(**summary)
except Exception as e:
logger.error(f"Error getting rating summary: {e}")
raise HTTPException(
status_code=500, detail=f"Failed to get rating summary: {str(e)}")
@router.get("/rating/history/{item_id}")
async def get_item_rating_history(item_id: str):
"""
Get rating history for a specific item
- **item_id**: ID of the item to get history for
"""
try:
history = await rating_service.get_item_rating_history(item_id)
return {
"item_id": item_id,
"history": history,
"total_changes": len(history)
}
except Exception as e:
logger.error(f"Error getting rating history for item {item_id}: {e}")
raise HTTPException(
status_code=500, detail=f"Failed to get rating history: {str(e)}")
@router.post("/rating/re-evaluate/{item_id}")
async def re_evaluate_item(item_id: str):
"""
Re-evaluate a specific item
- **item_id**: ID of the item to re-evaluate
"""
try:
rating_result = await rating_service.re_evaluate_item(item_id)
if not rating_result:
raise HTTPException(
status_code=404, detail=f"Item {item_id} not found")
return {
"item_id": item_id,
"rating_result": rating_result.to_dict(),
"message": f"Item {item_id} re-evaluated successfully"
}
except HTTPException:
raise
except Exception as e:
logger.error(f"Error re-evaluating item {item_id}: {e}")
raise HTTPException(
status_code=500, detail=f"Failed to re-evaluate item: {str(e)}")
@router.get("/rating/low-quality")
async def get_low_quality_items(
threshold: float = Query(
0.4, ge=0.0, le=1.0, description="Quality threshold"),
limit: int = Query(
50, ge=1, le=200, description="Maximum number of items to return")
):
"""
Get items with low quality ratings
- **threshold**: Quality threshold (0.0-1.0)
- **limit**: Maximum number of items to return (1-200)
"""
try:
items = await rating_service.get_low_quality_items(threshold=threshold, limit=limit)
return {
"threshold": threshold,
"total_items": len(items),
"items": items
}
except Exception as e:
logger.error(f"Error getting low quality items: {e}")
raise HTTPException(
status_code=500, detail=f"Failed to get low quality items: {str(e)}")
@router.delete("/scrape/cleanup")
async def cleanup_old_jobs(days: int = Query(7, ge=1, le=30, description="Days to keep jobs")):
"""
Clean up old completed jobs
- **days**: Number of days to keep jobs (1-30)
"""
try:
await scraping_service.cleanup_old_jobs(days=days)
return {
"message": f"Cleaned up jobs older than {days} days",
"days": days
}
except Exception as e:
logger.error(f"Error cleaning up old jobs: {e}")
raise HTTPException(
status_code=500, detail=f"Failed to cleanup old jobs: {str(e)}")
@router.get("/health")
async def scraping_health_check():
"""
Health check for scraping and rating services
Returns status of both scraping and rating services.
"""
try:
# Check scraping service
scraping_stats = await scraping_service.get_scraping_statistics()
# Check rating service
rating_summary = await rating_service.get_rating_summary()
return {
"status": "healthy",
"timestamp": datetime.now().isoformat(),
"services": {
"scraping": {
"active_jobs": scraping_stats.get('active_jobs', 0),
"total_items": scraping_stats.get('total_items', 0)
},
"rating": {
"total_rated": rating_summary.get('total_rated', 0),
"average_score": rating_summary.get('average_score', 0)
}
}
}
except Exception as e:
logger.error(f"Health check failed: {e}")
return {
"status": "unhealthy",
"timestamp": datetime.now().isoformat(),
"error": str(e)
}