Spaces:
Runtime error
Runtime error
File size: 1,032 Bytes
55af729 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 |
from bs4 import BeautifulSoup
import requests
import re
def scrape_hes(url, timeout=10):
    """Download an article page and return the text of its paragraphs.

    The page is fetched with ``requests`` and parsed with BeautifulSoup's
    ``html.parser``. The first ``<div>`` carrying the CSS class
    ``article-content`` is located, the stripped text of every ``<p>`` inside
    it is collected one paragraph per line, and whitespace-only gaps between
    lines are collapsed.

    Args:
        url: Address of the article page to download.
        timeout: Seconds to wait on the server before ``requests`` gives up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The article text as a single string with paragraphs separated by
        newlines, or an empty string when the page contains no
        ``article-content`` div.

    Raises:
        requests.exceptions.RequestException: If the download fails or
            times out.
    """
    # Send an iPad browser User-Agent rather than the default requests one
    # (presumably the site rejects or alters responses for unknown clients
    # -- confirm against the target site).
    headers = {
        'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148',
    }
    page = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(page.content, "html.parser")

    # find() returns None when nothing matches; guard so a page without an
    # article body yields empty text instead of an AttributeError.
    article_content = soup.find("div", class_="article-content")
    if article_content is None:
        return ""

    # Every paragraph is prefixed with a newline; the leading one is removed
    # by the final strip().
    article_text = "".join(
        "\n" + paragraph.text.strip()
        for paragraph in article_content.find_all("p")
    )

    # Remove blank lines: a newline followed by any run of whitespace
    # (empty paragraphs, indentation) collapses to a single newline.
    article_text = re.sub(r'\n[\t\n\s]+\n*', r"\n", article_text)
    return article_text.strip()
|