# NOTE(review): removed extraction-artifact lines ("Spaces:", "Sleeping", "Sleeping")
# that were not valid Python — they appear to be pasted notebook/status output.
import grequests
from bs4 import BeautifulSoup
import pandas as pd
import time
import csv
from tqdm import tqdm
#product_url = "https://www.amazon.co.uk/Smiths-Savoury-Snacks-Favourites-24/product-reviews/B07X2M1D16/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews"
#product_url = 'https://www.amazon.co.uk/product-reviews/B0B21DW5DL/ref=cm_cr_arp_d_viewopt_sr?ie=UTF8&reviewerType=all_review'
# Request headers sent with every page fetch. The Accept-language value asks
# Amazon for English-only content (the comment below notes this filters out
# non-English reviews); the User-agent mimics desktop Safari so the request
# is not served a bot/captcha page.
# Fix: removed the trailing " | |" extraction residue that made this dict a
# syntax error; every key and value string is otherwise unchanged.
custom_headers = {
    # Eliminating non-english reviews
    "Accept-language": "en;q=1.0",
    "Accept-Encoding": "gzip, deflate, br",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "User-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15",
}
def get_soup(response):
    """Parse an HTTP response body into a BeautifulSoup document.

    Parameters
    ----------
    response : requests.Response-like
        Object exposing ``status_code`` and ``text``.

    Returns
    -------
    BeautifulSoup or None
        The parsed document, or None (after printing an error) for any
        non-200 status code.
    """
    if response.status_code == 200:
        return BeautifulSoup(response.text, "html.parser")
    print("Error in getting webpage")
    return None
def get_reviews(soup):
    """Extract review texts from a parsed Amazon review page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML containing ``div.review`` elements.

    Returns
    -------
    list[str]
        Review bodies with newline characters removed. Reviews whose
        ``span.review-text`` element is missing are skipped.
    """
    scraped_reviews = []
    for review in soup.select("div.review"):
        r_content_element = review.select_one("span.review-text")
        # Bug fix: the original assigned None when the text element was
        # missing and then unconditionally called .replace() on it,
        # raising AttributeError. Skip such reviews instead.
        if r_content_element is None:
            continue
        scraped_reviews.append(r_content_element.text.replace('\n', ''))
    return scraped_reviews
def scrape_reviews(base_url):
    """Scrape all reviews for a product, iterating every star-rating filter.

    For each of the five star filters, pages are fetched in order until the
    "Next page" button is disabled.

    Parameters
    ----------
    base_url : str
        Amazon product-reviews URL; star/page filters are appended with '&',
        so it must already contain a query string.

    Returns
    -------
    list[str]
        All review texts collected across every filter and page.
    """
    all_reviews = []
    star_ratings = ['one', 'two', 'three', 'four', 'five']
    for star in tqdm(star_ratings):
        page_number = 1
        while True:
            url = f"{base_url}&filterByStar={star}_star&&pageNumber={page_number}"
            # NOTE(review): nonstandard grequests usage (.send().response on a
            # single request) — confirm this is intended over plain requests.get.
            response = grequests.get(url, headers=custom_headers).send().response
            soup = get_soup(response)
            if not soup:
                # Bug fix: the original used `continue` here, which re-fetched
                # the SAME page forever on a persistent error (page_number was
                # never advanced). Break to move on to the next star rating,
                # matching the original comment's stated intent.
                break
            all_reviews.extend(get_reviews(soup))
            # Amazon serves a valid page for any pageNumber, so pagination must
            # stop when the "Next page" button is disabled, not on empty results.
            if soup.find("li", class_="a-disabled a-last"):
                break
            page_number += 1
    return all_reviews