iarbel's picture
add error handling
1896c1d
raw
history blame contribute delete
No virus
3.31 kB
import re
import os
import requests
from base64 import b64decode
from bs4 import BeautifulSoup
from typing import Dict, Optional
# Zyte API key, read from the ZYTE_KEY environment variable (None when unset).
# zyte_call() sends it as the basic-auth username.
Z_KEY = os.getenv("ZYTE_KEY")

# Marker searched for in the lower-cased page <title> by get_asin_pdp() to
# recognise a "not found" page.
PAGE_NOT_FOUND_STR = "page not found"
def zyte_call(url: str, timeout: float = 300.0) -> bytes:
    """Fetch the raw HTTP response body of ``url`` through the Zyte extract API.

    Posts ``url`` to ``https://api.zyte.com/v1/extract`` with
    ``httpResponseBody`` enabled, authenticating with ``Z_KEY`` as the
    basic-auth username, then base64-decodes the ``httpResponseBody`` field
    of the JSON reply.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds ``requests`` waits for the API before giving up.
            ``requests`` applies no timeout by default, so without this a
            stalled connection would block the caller indefinitely.

    Returns:
        The decoded response body as bytes.

    Raises:
        requests.exceptions.HTTPError: If the API answers with a 4xx/5xx
            status.
        requests.exceptions.Timeout: If no response arrives within
            ``timeout`` seconds.
        KeyError: If a successful reply carries no ``httpResponseBody`` field.
    """
    api_response = requests.post(
        "https://api.zyte.com/v1/extract",
        auth=(Z_KEY, ""),
        json={
            "url": url,
            "httpResponseBody": True,
        },
        timeout=timeout,
    )
    # Surface API failures (bad key, quota, upstream error) as an HTTP error
    # instead of an unrelated KeyError / JSON decode error further down.
    api_response.raise_for_status()
    http_response_body: bytes = b64decode(
        api_response.json()["httpResponseBody"])
    return http_response_body
def get_asin_pdp(soup: BeautifulSoup) -> Optional[Dict[str, str]]:
    """Extract ASIN, title, feature bullets and spec tables from a parsed page.

    Every field is extracted on a best-effort basis: an element that is
    missing from the page yields ``None`` (or an empty dict for the tables)
    rather than an exception.

    Args:
        soup: Parsed product page.

    Returns:
        ``None`` when the page ``<title>`` contains ``PAGE_NOT_FOUND_STR``.
        Otherwise a dict with the keys:

        * ``'asin'``: last path segment of the canonical link's ``href``,
          or ``None`` if there is no usable canonical link.
        * ``'title'``: stripped text of ``span#productTitle``, or ``None``.
        * ``'feature_bullets'``: list of stripped bullet texts from
          ``div#feature-bullets``, or ``None`` when that div is absent.
        * ``'tech_data'``: merged dict of the KV, tech and A+ tables.

        NOTE(review): the declared value type ``str`` is narrower than what
        is actually returned (list / dict / None values); the annotation is
        kept unchanged for interface compatibility.
    """
    # Check if 404. A page without a <title> cannot be identified as the
    # not-found page, so it falls through to best-effort parsing instead of
    # raising AttributeError.
    title_tag = soup.find('title')
    if title_tag is not None and PAGE_NOT_FOUND_STR in title_tag.text.lower():
        return None

    # Get ASIN. TypeError: no canonical <link> was found (find() gave None);
    # KeyError: the <link> exists but has no href attribute.
    try:
        asin = soup.find('link', rel='canonical')['href'].split('/')[-1]
    except (TypeError, KeyError):
        asin = None

    # Get title
    title_span = soup.find('span', id="productTitle")
    title = title_span.text.strip() if title_span else None

    # Get feature-bullets
    bullets_div = soup.find('div', id="feature-bullets")
    if bullets_div:
        stripped = (
            item.text.strip()
            for item in bullets_div.find_all('span', class_='a-list-item')
        )
        # Remove unwanted bullets
        feature_bullets = [
            text for text in stripped
            if text != 'Make sure this fits by entering your model number.'
        ]
    else:
        feature_bullets = None

    # Get KV, tech, A+ tables. Merge with override key hierarchy: A+ > tech > KV
    # (later unpacked dicts win on duplicate keys).
    kv_res = parse_kv_table(soup)
    tech_res = parse_tech_table(soup)
    ap_data = parse_ap_table(soup)
    tech_data = {**kv_res, **tech_res, **ap_data}

    return {
        'asin': asin,
        'title': title,
        'feature_bullets': feature_bullets,
        'tech_data': tech_data,
    }
def parse_kv_table(soup: BeautifulSoup) -> Dict[str, str]:
    """Read the key/value table inside ``div#productOverview_feature_div``.

    Each ``<tr>`` of the first ``<table>`` in that div contributes one entry:
    the stripped text of its first ``<td>`` is the key and the stripped text
    of its second ``<td>`` is the value.

    Args:
        soup: Parsed product page.

    Returns:
        Mapping of key text to value text. Empty when the div or its table
        is missing.
    """
    kv_res: Dict[str, str] = {}

    overview = soup.find('div', id='productOverview_feature_div')
    table = overview.find('table') if overview is not None else None
    if table is None:
        return kv_res

    for row in table.find_all('tr'):
        cells = row.find_all('td')
        # Rows without both a key and a value cell (e.g. header or spacer
        # rows) are skipped instead of raising IndexError.
        if len(cells) < 2:
            continue
        kv_res[cells[0].text.strip()] = cells[1].text.strip()
    return kv_res
def parse_tech_table(soup: BeautifulSoup) -> Dict[str, str]:
    """Read every ``<table>`` whose id matches ``productDetails_techSpec.*``.

    Each ``<tr>`` contributes one entry: the stripped text of its ``<th>`` is
    the key and the cleaned text of its ``<td>`` is the value. When a key
    appears more than once, the last occurrence wins.

    Args:
        soup: Parsed product page.

    Returns:
        Mapping of header text to cell text. Empty when no matching table is
        found.
    """
    tech_res: Dict[str, str] = {}
    tables = soup.find_all('table', id=re.compile('productDetails_techSpec.*'))
    for tab in tables:
        for row in tab.find_all('tr'):
            header = row.find('th')
            cell = row.find('td')
            # Skip rows lacking either cell rather than failing with
            # AttributeError on None.
            if header is None or cell is None:
                continue
            key = header.text.strip()
            # U+200E (left-to-right mark) is not whitespace, so it has to be
            # removed explicitly before the surrounding whitespace is stripped.
            value = cell.text.replace('\u200e', '').strip()
            tech_res[key] = value
    return tech_res
def parse_ap_table(soup: BeautifulSoup) -> Dict[str, str]:
    """Read the tables nested inside every ``div`` with ``id="tech"``.

    Each ``<tr>`` that has at least two ``<td>`` cells contributes one entry:
    the cleaned text of the first cell is the key and the cleaned text of the
    second cell is the value. When a key appears more than once, the last
    occurrence wins.

    Args:
        soup: Parsed product page.

    Returns:
        Mapping of key text to value text. Empty when no such div or table
        is present.
    """
    ap_res: Dict[str, str] = {}
    for div in soup.find_all('div', id='tech'):
        for tab in div.find_all('table'):
            for row in tab.find_all('tr'):
                cells = row.find_all('td')
                # Need both a key and a value cell; a single-cell row would
                # otherwise raise IndexError on cells[1].
                if len(cells) < 2:
                    continue
                # U+200E (left-to-right mark) is not whitespace, so it is
                # removed explicitly before stripping.
                key = cells[0].text.replace('\u200e', '').strip()
                value = cells[1].text.replace('\u200e', '').strip()
                ap_res[key] = value
    return ap_res