Spaces:

gaur3009
/

Scaper_search

Runtime error

Scaper_search / scraper.py

Update scraper.py

ee2e25a verified 26 days ago

1.2 kB

	import logging
	import re

	import requests
	from bs4 import BeautifulSoup

	# Matches any HTML/XML-style tag so leftover markup can be stripped from text.
	TAG_CLEANER = re.compile(r"<[^>]+>")
	# Matches a run of whitespace; compiled once, like TAG_CLEANER, instead of
	# handing the pattern to ``re.sub`` on every call.
	_WHITESPACE_RUN = re.compile(r"\s+")


	def clean_text(text):
	    """Strip tag-like markup from *text* and normalize its whitespace.

	    Every ``<...>`` span is removed, each run of whitespace (including
	    newlines and tabs) is collapsed to a single space, and leading/trailing
	    whitespace is trimmed.

	    Args:
	        text: The string to clean. ``None`` or an empty string is accepted.

	    Returns:
	        The cleaned string; ``""`` when *text* is ``None`` or empty.
	    """
	    # Guard against None/empty input so callers get "" instead of a TypeError.
	    if not text:
	        return ""
	    without_tags = TAG_CLEANER.sub("", text)
	    return _WHITESPACE_RUN.sub(" ", without_tags).strip()

	def scrape_url(url, max_chars=5000):
	    """Fetch *url* and return its main textual content as cleaned plain text.

	    The page is parsed with the ``lxml`` parser. The first of the selectors
	    ``article``, ``main``, ``.article-body``, ``.post-content`` that yields
	    non-empty text is used; if none does, the text of all ``<p>`` elements
	    is joined with spaces instead. The result is passed through
	    ``clean_text``.

	    Args:
	        url: Address of the page to fetch.
	        max_chars: Maximum length of the returned content (non-negative).
	            Applied to every extraction path; ``None`` disables the cap.

	    Returns:
	        The extracted text, or a message starting with ``"⚠️ Error:"`` when
	        the request, parsing, or extraction fails. Failures are logged as
	        warnings rather than raised.
	    """
	    headers = {
	        'User-Agent': 'Mozilla/5.0 (compatible; ResearchBot/1.0)',
	        'Accept-Language': 'en-US,en;q=0.9',
	    }
	    try:
	        response = requests.get(url, timeout=8, headers=headers)
	        response.raise_for_status()

	        # requests assumes ISO-8859-1 for text responses whose Content-Type
	        # header has no charset, which garbles UTF-8 pages. In that case
	        # hand the raw bytes to BeautifulSoup so it can detect the encoding
	        # itself; an explicitly declared charset is still trusted.
	        content_type = response.headers.get('Content-Type', '')
	        if 'charset' in content_type.lower():
	            markup = response.text
	        else:
	            markup = response.content
	        soup = BeautifulSoup(markup, 'lxml')

	        # Try semantic containers first.
	        for selector in ('article', 'main', '.article-body', '.post-content'):
	            if element := soup.select_one(selector):
	                text = clean_text(element.get_text())
	                # An empty container (e.g. a bare <main>) must not
	                # short-circuit the remaining selectors or the fallback.
	                if text:
	                    return text[:max_chars]

	        # Fallback: aggregate every paragraph on the page.
	        paragraphs = soup.find_all('p')
	        content = " ".join(p.get_text().strip() for p in paragraphs)
	        return clean_text(content)[:max_chars]

	    except Exception:
	        # Boundary handler: callers expect a string back, never an
	        # exception, but the cause is logged so failures stay diagnosable.
	        logging.getLogger(__name__).warning(
	            "Failed to scrape %s", url, exc_info=True
	        )
	        return f"⚠️ Error: Could not retrieve content from {url}"