Spaces:
Runtime error
Runtime error
File size: 2,906 Bytes
4c404f5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 |
from selenium import webdriver
from selenium.webdriver import FirefoxOptions
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
def load_driver():
    """Create and return a Firefox WebDriver configured to run headless."""
    print("Loading driver...")
    firefox_options = FirefoxOptions()
    # No visible browser window is opened.
    firefox_options.add_argument("--headless")
    return webdriver.Firefox(options=firefox_options)
def parse_review(html):
    """Extract review texts, room name and stay date from one review block.

    Args:
        html: Markup of a single review item (``scrape_page`` passes the
            element's ``outerHTML``).

    Returns:
        A dict with the keys ``"positive"``, ``"negative"``, ``"room"`` and
        ``"time"``. Each value is ``None`` when the corresponding part is
        not present in the markup. ``"time"`` is the text of the stay-date
        list with every ``" ·"`` replaced by ``", "``.

    Raises:
        ValueError: If a review row carries a label other than the two
            expected Russian labels.
        AttributeError: If a review row lacks its label or body span.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Review text
    positive_review = None
    negative_review = None
    for row in soup.find_all("div", class_="c-review__row"):
        # Rows that contain the translation-loader span are skipped.
        if row.find("span", class_="c-review__translation-loader"):
            continue
        delimiter = row.find("span", class_="bui-u-sr-only").text.strip()
        review_text = row.find("span", class_='c-review__body').text.strip()
        if delimiter == "Понравилось":  # "Liked"
            positive_review = review_text
        elif delimiter == "Не понравилось":  # "Disliked"
            negative_review = review_text
        else:
            raise ValueError(f"Unexpected review row label: {delimiter!r}")

    # Room name: both the info row and its body may be missing.
    room_info = soup.find('div', class_='c-review-block__room-info-row')
    room_body = (
        room_info.find('div', class_='bui-list__body')
        if room_info is not None
        else None
    )
    room_name = room_body.get_text(strip=True) if room_body is not None else None

    # Number of nights + date. Guarded like the room lookup so that a review
    # without a stay-date list yields None instead of raising AttributeError.
    stay_date_info = soup.find('ul', class_='c-review-block__stay-date')
    if stay_date_info is not None:
        date_info = stay_date_info.get_text(strip=True).replace(" ·", ", ")
    else:
        date_info = None

    return {
        "positive": positive_review,
        "negative": negative_review,
        "room": room_name,
        "time": date_info,
    }
def scrape_page(driver: webdriver.Firefox, url: str, page_count: int = 5, wait_time: int = 1):
    """Collect parsed reviews from up to ``page_count`` pages of a review list.

    Args:
        driver: Firefox WebDriver used to load and navigate the page.
        url: Address of the page holding the reviews, e.g.
            ``https://www.booking.com/hotel/th/queen-boutique.ru.html#tab-reviews``.
        page_count: Maximum number of review pages to read.
        wait_time: Seconds to sleep after the review blocks are present on
            each page, before reading them.

    Returns:
        A list with one ``parse_review`` result dict per review block, in
        page order. Scraping stops early, keeping what was collected, when
        no "next page" control is found.

    Raises:
        selenium.common.exceptions.TimeoutException: If no review block
            appears within 10 seconds on a page.
    """
    # Imported locally so the block does not depend on a change to the
    # file-level import section.
    from selenium.common.exceptions import NoSuchElementException, WebDriverException

    review_infos = []
    driver.get(url)
    print("page loaded")
    for i in range(page_count):
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "review_list_new_item_block"))
        )
        time.sleep(wait_time)

        # Remove cookie banner. The script is null-safe so a missing banner
        # is not an error; removal stays best-effort, but only WebDriver
        # failures are suppressed (not KeyboardInterrupt/SystemExit).
        try:
            driver.execute_script(
                "const banner = document.getElementById('onetrust-banner-sdk');"
                " if (banner) { banner.remove(); }"
            )
        except WebDriverException:
            pass

        elems = driver.find_elements(By.CLASS_NAME, "review_list_new_item_block")
        for elem in elems:
            review_infos.append(parse_review(elem.get_attribute('outerHTML')))
        print(f"Done page {i+1} of {page_count}")

        # Do not navigate past the last requested page.
        if i + 1 >= page_count:
            break
        # With fewer pages than requested there is no "next" control; stop
        # and return what was gathered instead of raising.
        try:
            pagenext = driver.find_element(By.CLASS_NAME, "pagenext")
        except NoSuchElementException:
            break
        pagenext.click()
    return review_infos