Spaces:
Runtime error
Runtime error
import requests | |
from bs4 import BeautifulSoup | |
import re | |
# Matches any markup tag: "<", one or more characters that are not ">", then ">".
TAG_CLEANER = re.compile(r"<[^>]+>")


def clean_text(text):
    """Return *text* with markup tags removed and whitespace normalized.

    Tags matched by ``TAG_CLEANER`` are deleted outright (replaced with the
    empty string, so text on either side of a tag is joined directly).
    Every run of whitespace is then collapsed to a single space and
    leading/trailing whitespace is trimmed.
    """
    tagless = TAG_CLEANER.sub('', text)
    single_spaced = re.sub(r'\s+', ' ', tagless)
    return single_spaced.strip()
def scrape_url(url):
    """Fetch *url* and return the page's main text content.

    The page is requested with a bot User-Agent and an 8-second timeout,
    and HTTP error statuses are treated as failures. Content is then
    looked up in two stages:

    1. The first of ``article``, ``main``, ``.article-body`` and
       ``.post-content`` that yields non-empty cleaned text is returned.
    2. Otherwise the text of every ``<p>`` element is joined with spaces,
       cleaned, and truncated to 5000 characters.

    Returns:
        The extracted text, or a human-readable error string beginning
        with a warning sign when anything goes wrong. This function does
        not raise: every ``Exception`` is logged and converted into that
        error string.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (compatible; ResearchBot/1.0)',
            'Accept-Language': 'en-US,en;q=0.9',
        }
        response = requests.get(url, timeout=8, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')

        # Try semantic containers first. A container that matches but holds
        # no text (e.g. an empty <main>) must not short-circuit the search,
        # otherwise the paragraph fallback below would never get a chance.
        # NOTE(review): this path is not length-capped, unlike the fallback;
        # confirm whether callers expect the 5000-character limit here too.
        for selector in ('article', 'main', '.article-body', '.post-content'):
            if element := soup.select_one(selector):
                text = clean_text(element.get_text())
                if text:
                    return text

        # Fallback: aggregate every paragraph on the page.
        paragraphs = soup.find_all('p')
        content = " ".join(p.get_text().strip() for p in paragraphs)
        return clean_text(content)[:5000]
    except Exception as exc:
        # Best-effort boundary: callers get a string instead of an exception,
        # but the cause is logged rather than silently discarded.
        import logging

        logging.getLogger(__name__).warning(
            "scrape_url failed for %s: %s", url, exc
        )
        # "\u26a0\ufe0f" is the warning-sign emoji, written as escapes so the
        # message cannot be corrupted by a mis-decoded source file.
        return f"\u26a0\ufe0f Error: Could not retrieve content from {url}"