Questo / web_scraping_service /beautiful_scrape.py
shubhendu-ghosh-DS
summarized the google searches
193bf75
raw
history blame
711 Bytes
from bs4 import BeautifulSoup
import requests
from requests.exceptions import HTTPError
class WebScrapingService:
    """Download a web page and reduce it to the text of its ``<p>`` elements."""

    # Tags removed from the parsed document before paragraph text is
    # collected, so that any <p> nested inside them is not included.
    UNWANTED_ELEMENTS = ('footer', 'script', 'style', 'noscript')

    def __init__(self):
        pass

    def scrape_text_from_url(self, url, timeout=10):
        """Return the space-joined text of every ``<p>`` element at *url*.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before giving up.
                Passed straight to ``requests.get``; without it a stalled
                server would block the caller indefinitely.

        Returns:
            The paragraph texts joined by single spaces, with leading and
            trailing whitespace stripped. Empty string if the page has no
            ``<p>`` elements.

        Raises:
            HTTPError: If the request fails, the server answers with a
                4xx/5xx status, or the page cannot be parsed. Any other
                exception is wrapped in ``HTTPError`` with the original
                attached as ``__cause__``.
        """
        try:
            response = requests.get(url, timeout=timeout)
            # Fail on error statuses instead of scraping the error page
            # and returning it as if it were real content.
            response.raise_for_status()
        except HTTPError:
            # Already the advertised type; re-raise untouched so the
            # attached ``response`` stays available to the caller.
            raise
        except Exception as e:
            raise HTTPError(e) from e
        else:
            try:
                soup = BeautifulSoup(response.text, 'html.parser')
                for tag in self.UNWANTED_ELEMENTS:
                    for el in soup.find_all(tag):
                        el.extract()
                text = ' '.join(p.text for p in soup.find_all('p'))
                return text.strip()
            except Exception as e:
                # Keep the single-exception-type contract for callers.
                raise HTTPError(e) from e