Spaces:
Runtime error
Runtime error
from bs4 import BeautifulSoup | |
import requests | |
import re | |
def scrape_hes(url: str, timeout: float = 30) -> str:
    """Download an article page and return the text of its paragraphs.

    The page is requested with an iPad-style ``User-Agent`` header and
    parsed with the ``html.parser`` backend. The text of every ``<p>`` tag
    inside the first ``<div>`` carrying the ``article-content`` class is
    collected, one paragraph per line, and whitespace-only gaps between
    lines are collapsed.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout a stalled server would block forever.

    Returns:
        The paragraphs' text joined by single newlines, with leading and
        trailing whitespace removed. An empty string is returned when the
        page has no ``article-content`` div.

    Exceptions raised by ``requests.get`` (connection errors, timeouts)
    are not caught here and propagate to the caller.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}
    page = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(page.content, "html.parser")

    # First <div> whose CSS class list contains "article-content".
    article_content = soup.find("div", class_="article-content")
    if article_content is None:
        # No article container on this page: nothing to extract.
        return ""

    # Each paragraph is prefixed with a newline, exactly as the original
    # accumulation loop did, so the regex below sees the same input.
    article_text = "".join(
        "\n" + paragraph.text.strip()
        for paragraph in article_content.find_all("p")
    )

    # Remove empty space: a newline followed by any run of whitespace
    # (blank lines, indentation) becomes a single newline.
    article_text = re.sub(r'\n[\t\n\s]+\n*', r"\n", article_text)
    return article_text.strip()