# 1st_langchain / download.py
# jfeng1115's picture
# init commit
# 72a5f6e
# raw
# history blame
# 2.98 kB
import pdb
import numpy as np
import pandas as pd
from scrapy.selector import Selector
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
from tqdm import tqdm
import warnings
import pdb
# Silence every Python warning for the duration of the script.
warnings.filterwarnings("ignore")

# Build a headless Chrome session shared by the rest of the script. The
# switches are applied in the same order as before.
chrome_options = webdriver.ChromeOptions()
for _chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    chrome_options.add_argument(_chrome_flag)

driver = webdriver.Chrome(options=chrome_options)
if __name__ == '__main__':
    # Scrape the user reviews of one IMDb title into ``review_df`` and then
    # stop in pdb so the result can be inspected interactively.
    #
    # Relies on the module-level ``driver`` (a Selenium Chrome session).

    # Local imports: only this entry point needs Selenium's wait helpers.
    from selenium.common.exceptions import TimeoutException, WebDriverException
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    url = "https://www.imdb.com/title/tt1517268/reviews/?ref_=tt_ov_rt"
    load_more_timeout_s = 10  # max seconds to wait for the load-more button

    # One list per output column.
    rating_list = []
    review_date_list = []
    review_title_list = []
    author_list = []
    review_list = []
    review_url_list = []
    # Reviews whose HTML could not be read are recorded here instead of
    # aborting the whole run.
    error_url_list = []
    error_msg_list = []

    # Target list -> CSS query evaluated inside each review container.
    field_queries = (
        (rating_list, '.rating-other-user-rating span::text'),
        (review_date_list, '.review-date::text'),
        (review_title_list, 'a.title::text'),
        (author_list, '.display-name-link a::text'),
        (review_list, '.text.show-more__control::text'),
        (review_url_list, 'a.title::attr(href)'),
    )

    try:
        driver.get(url)

        # The header text is expected to start with the total review count
        # (thousands separated by commas). A missing header yields '' here
        # rather than crashing on ``None.replace``.
        sel = Selector(text=driver.page_source)
        count_text = sel.css('.lister .header span::text').extract_first() or ''
        review_counts = count_text.replace(',', '').split(' ')[0]
        try:
            # The script assumes each "load more" click adds 25 reviews.
            more_review_pages = int(review_counts) // 25
        except ValueError:
            # No usable count: scrape only what is already on the page.
            more_review_pages = 0

        load_more_button = (By.ID, 'load-more-trigger')
        for _ in tqdm(range(more_review_pages)):
            try:
                # Wait until the button can actually be clicked; presumably
                # it is unavailable while the previous batch is loading, and
                # clicking blindly would silently drop that page.
                WebDriverWait(driver, load_more_timeout_s).until(
                    EC.element_to_be_clickable(load_more_button)
                ).click()
            except TimeoutException:
                # Button never became clickable again: treat as "no more
                # reviews to load" and stop instead of timing out repeatedly.
                break
            except WebDriverException:
                # Best effort, as before: a failed click skips this round.
                continue

        reviews = driver.find_elements(By.CSS_SELECTOR, 'div.review-container')
        for d in tqdm(reviews):
            try:
                sel2 = Selector(text=d.get_attribute('innerHTML'))
            except Exception as e:
                error_url_list.append(url)
                error_msg_list.append(e)
                continue

            # ``extract_first`` returns its default (it does not raise) when
            # nothing matches, so missing fields become NaN here. Every list
            # gets exactly one entry per review, keeping the columns aligned.
            for column, query in field_queries:
                column.append(sel2.css(query).extract_first(default=np.nan))

        review_df = pd.DataFrame({
            'Review_Date': review_date_list,
            'Author': author_list,
            'Rating': rating_list,
            'Review_Title': review_title_list,
            'Review': review_list,
            # Was the scalar ``review_url`` (last review only); use the list.
            'Review_Url': review_url_list,
        })

        # Intentional breakpoint: the script's only output is this
        # interactive session with ``review_df`` in scope.
        pdb.set_trace()
    finally:
        # Always shut the browser down so no Chrome process is left behind.
        driver.quit()