"""Fetch a page through the Zyte extract API and parse product data from it.

The parsers look for element ids such as ``productTitle``,
``feature-bullets`` and ``productOverview_feature_div``.
NOTE(review): these look like Amazon product-detail-page markup -- confirm.
"""

import os
import re
from base64 import b64decode
from typing import Any, Dict, Optional

import requests
from bs4 import BeautifulSoup

Z_KEY = os.environ.get('ZYTE_KEY')
PAGE_NOT_FOUND_STR = 'page not found'

_ZYTE_EXTRACT_URL = "https://api.zyte.com/v1/extract"
# Compiled once at import time instead of on every parse_tech_table() call.
_TECH_SPEC_ID_RE = re.compile(r'productDetails_techSpec.*')
_UNWANTED_BULLET = 'Make sure this fits by entering your model number.'
# Left-to-right mark that shows up inside some table cells.
_LRM = '\u200e'


def _clean_cell(text: str) -> str:
    """Return *text* without left-to-right marks and surrounding whitespace."""
    return text.replace(_LRM, '').strip()


def zyte_call(url: str, timeout: Optional[float] = None) -> bytes:
    """Fetch *url* via the Zyte extract endpoint and return the raw body.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the API before giving up. ``None``
            (the default) waits indefinitely, as the original code did.

    Returns:
        The base64-decoded ``httpResponseBody`` field of the API response.

    Raises:
        requests.HTTPError: The API answered with a 4xx/5xx status.
        requests.RequestException: The request itself failed.
        KeyError: The JSON response has no ``httpResponseBody`` field.
    """
    api_response = requests.post(
        _ZYTE_EXTRACT_URL,
        auth=(Z_KEY, ""),
        json={
            "url": url,
            "httpResponseBody": True,
        },
        timeout=timeout,
    )
    # Fail with the HTTP status instead of an opaque KeyError further down.
    api_response.raise_for_status()
    return b64decode(api_response.json()["httpResponseBody"])


def get_asin_pdp(soup: BeautifulSoup) -> Optional[Dict[str, Any]]:
    """Extract ASIN, title, feature bullets and spec tables from a page.

    Args:
        soup: Parsed HTML document.

    Returns:
        ``None`` when the page ``<title>`` contains ``PAGE_NOT_FOUND_STR``
        (case-insensitive). Otherwise a dict with the keys:

        * ``'asin'``: last path segment of the canonical link, or ``None``.
        * ``'title'``: stripped ``#productTitle`` text, or ``None``.
        * ``'feature_bullets'``: list of bullet strings, or ``None`` when
          there is no ``#feature-bullets`` element.
        * ``'tech_data'``: merged key/value, tech-spec and A+ table data.
    """
    # Check if 404. A document without <title> is not treated as a 404.
    title_tag = soup.find('title')
    if title_tag is not None and PAGE_NOT_FOUND_STR in title_tag.text.lower():
        return None

    # Get ASIN: a missing canonical link or a missing href both yield None.
    canonical = soup.find('link', rel='canonical')
    href = canonical.get('href') if canonical is not None else None
    asin = href.split('/')[-1] if href is not None else None

    # Get title
    title_span = soup.find('span', id="productTitle")
    title = title_span.text.strip() if title_span else None

    # Get feature-bullets
    bullets_div = soup.find('div', id="feature-bullets")
    if bullets_div:
        bullet_spans = bullets_div.find_all('span', class_='a-list-item')
        feature_bullets = [span.text.strip() for span in bullet_spans]
        # Remove unwanted bullets
        feature_bullets = [b for b in feature_bullets if b != _UNWANTED_BULLET]
    else:
        feature_bullets = None

    # Get KV, tech, A+ tables.
    # Merge with override key hierarchy: A+ > tech > KV
    # (later unpacked dicts win on duplicate keys).
    kv_res = parse_kv_table(soup)
    tech_res = parse_tech_table(soup)
    ap_data = parse_ap_table(soup)
    tech_data = {**kv_res, **tech_res, **ap_data}

    return {
        'asin': asin,
        'title': title,
        'feature_bullets': feature_bullets,
        'tech_data': tech_data,
    }


def parse_kv_table(soup: BeautifulSoup) -> Dict[str, str]:
    """Parse the first table inside ``#productOverview_feature_div``.

    Args:
        soup: Parsed HTML document.

    Returns:
        Mapping of first-cell text to second-cell text for every row with at
        least two ``<td>`` cells; empty when the div or table is missing.
    """
    kv_res: Dict[str, str] = {}
    overview = soup.find('div', id='productOverview_feature_div')
    if overview is None:
        return kv_res
    table = overview.find('table')
    if table is None:
        return kv_res

    for row in table.find_all('tr'):
        cells = row.find_all('td')
        # Rows with fewer than two cells used to raise an uncaught IndexError.
        if len(cells) < 2:
            continue
        kv_res[cells[0].text.strip()] = cells[1].text.strip()
    return kv_res


def parse_tech_table(soup: BeautifulSoup) -> Dict[str, str]:
    """Parse every table whose id matches ``productDetails_techSpec.*``.

    Args:
        soup: Parsed HTML document.

    Returns:
        Mapping of ``<th>`` text to cleaned ``<td>`` text, one entry per row
        that has both cells. Later rows overwrite earlier duplicate keys.
    """
    tech_res: Dict[str, str] = {}
    for table in soup.find_all('table', id=_TECH_SPEC_ID_RE):
        for row in table.find_all('tr'):
            header = row.find('th')
            cell = row.find('td')
            # Rows without both cells used to raise AttributeError.
            if header is None or cell is None:
                continue
            tech_res[header.text.strip()] = _clean_cell(cell.text)
    return tech_res


def parse_ap_table(soup: BeautifulSoup) -> Dict[str, str]:
    """Parse every table nested in a ``<div id="tech">`` element.

    Args:
        soup: Parsed HTML document.

    Returns:
        Mapping of cleaned first-cell text to cleaned second-cell text for
        every row with at least two ``<td>`` cells.
    """
    ap_res: Dict[str, str] = {}
    for div in soup.find_all('div', id='tech'):
        for table in div.find_all('table'):
            for row in table.find_all('tr'):
                cells = row.find_all('td')
                # Header-only and single-cell rows carry no key/value pair.
                if len(cells) < 2:
                    continue
                ap_res[_clean_cell(cells[0].text)] = _clean_cell(cells[1].text)
    return ap_res