Spaces:
Sleeping
Sleeping
from bs4 import BeautifulSoup | |
import requests | |
from requests.exceptions import HTTPError | |
class WebScrapingService:
    """Fetch a web page and extract the text of its paragraphs."""

    # Elements removed from the parsed document before paragraph text is
    # collected, so that <p> tags nested inside them are not included.
    _UNWANTED_TAGS = ('footer', 'script', 'style', 'noscript')

    def __init__(self):
        """Create the service; it holds no state."""
        pass

    def scrape_text_from_url(self, url, *, timeout=10.0):
        """Return the text of all ``<p>`` elements found at *url*.

        The page is downloaded with ``requests``, parsed with
        BeautifulSoup's ``html.parser``, stripped of footer/script/style/
        noscript elements, and the text of the remaining ``<p>`` elements
        is joined with single spaces.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before giving up.
                Passed straight to ``requests.get``.

        Returns:
            The joined paragraph text with leading and trailing whitespace
            removed; an empty string when the page has no ``<p>`` elements.

        Raises:
            HTTPError: If the request fails, the server answers with a
                4xx/5xx status, or the response cannot be processed. The
                underlying exception is attached as ``__cause__``.
        """
        try:
            # Without a timeout a stalled server would block forever.
            response = requests.get(url, timeout=timeout)
            # Do not scrape the body of an error page as if it were content.
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')

            # extract() stays safe even when a matched element is nested
            # inside another matched element that was already detached.
            for element in soup.find_all(list(self._UNWANTED_TAGS)):
                element.extract()

            text = ' '.join(p.text for p in soup.find_all('p'))
            return text.strip()
        except HTTPError:
            # Already the exception type callers expect; do not re-wrap.
            raise
        except Exception as e:
            # Callers catch HTTPError for every failure mode; keep that
            # contract but preserve the original exception for debugging.
            raise HTTPError(e) from e