"""Scrape the user reviews of one IMDb title into a pandas DataFrame.

The script opens the title's review page in headless Chrome, clicks the
"load more" button repeatedly to pull further reviews into the page, parses
every ``div.review-container`` element and collects the fields into
``review_df``. It finally drops into ``pdb`` so the frame can be inspected.
"""
import pdb
import time
import warnings

import numpy as np
import pandas as pd
from scrapy.selector import Selector
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from tqdm import tqdm

warnings.filterwarnings("ignore")

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(options=chrome_options)


def _first_match(selector, query):
    """Return the first result of CSS ``query`` on ``selector``.

    ``extract_first()`` yields ``None`` when nothing matches. If the lookup
    itself raises, NaN is returned so that one bad field does not discard the
    whole review (best-effort, as in the original script).
    """
    try:
        return selector.css(query).extract_first()
    except Exception:
        # np.nan rather than np.NaN: the latter alias was removed in NumPy 2.0.
        return np.nan


if __name__ == '__main__':
    url = "https://www.imdb.com/title/tt1517268/reviews/?ref_=tt_ov_rt"

    rating_list = []
    review_date_list = []
    review_title_list = []
    author_list = []
    review_list = []
    review_url_list = []
    error_url_list = []
    error_msg_list = []

    try:
        driver.get(url)
        sel = Selector(text=driver.page_source)

        # The header text starts with the total review count, e.g. "1,234 Reviews".
        header_text = sel.css('.lister .header span::text').extract_first()
        # A missing header means the count is unknown: request no extra pages
        # instead of crashing on None.
        review_counts = (
            header_text.replace(',', '').split(' ')[0] if header_text else '0'
        )
        # NOTE(review): 25 is presumably the number of reviews each
        # "load more" click adds -- confirm against the live page.
        more_review_pages = int(review_counts) // 25

        for _ in tqdm(range(more_review_pages)):
            try:
                driver.find_element(By.ID, 'load-more-trigger').click()
            except Exception:
                # Best effort: the button may be missing or not clickable yet.
                # Catching Exception (not a bare except) keeps Ctrl-C working.
                continue

        reviews = driver.find_elements(By.CSS_SELECTOR, 'div.review-container')

        for container in tqdm(reviews):
            try:
                sel2 = Selector(text=container.get_attribute('innerHTML'))

                rating = _first_match(
                    sel2, '.rating-other-user-rating span::text')
                review = _first_match(sel2, '.text.show-more__control::text')
                review_date = _first_match(sel2, '.review-date::text')
                author = _first_match(sel2, '.display-name-link a::text')
                review_title = _first_match(sel2, 'a.title::text')
                review_url = _first_match(sel2, 'a.title::attr(href)')

                # Append only after every field has been read so the lists
                # always stay the same length.
                rating_list.append(rating)
                review_date_list.append(review_date)
                review_title_list.append(review_title)
                author_list.append(author)
                review_list.append(review)
                review_url_list.append(review_url)
            except Exception as e:
                error_url_list.append(url)
                error_msg_list.append(e)

        review_df = pd.DataFrame({
            'Review_Date': review_date_list,
            'Author': author_list,
            'Rating': rating_list,
            'Review_Title': review_title_list,
            'Review': review_list,
            # Fixed: was the scalar `review_url` from the last iteration, which
            # pandas broadcast to every row (NameError when no reviews exist).
            'Review_Url': review_url_list,
        })

        # Deliberate breakpoint: the script does not persist review_df, so this
        # is where it is inspected interactively.
        pdb.set_trace()
    finally:
        # Always shut the browser down so headless Chrome does not linger.
        driver.quit()