File size: 3,118 Bytes
00f57d4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 |
import re
import requests
from base64 import b64decode
from bs4 import BeautifulSoup
from typing import Dict
# Credential passed as the HTTP basic-auth username by zyte_call (presumably
# the Zyte API key). It is an empty string in source.
# NOTE(review): presumably this has to be set before zyte_call can
# authenticate -- confirm, and consider loading it from the environment
# instead of committing it.
Z_KEY = ''
def zyte_call(url: str, *, timeout: float = 300.0) -> bytes:
    """Fetch the raw HTTP response body of *url* through the Zyte extract API.

    Args:
        url: Address of the page to retrieve.
        timeout: Seconds ``requests`` waits for the API before giving up.
            The default is a deliberately generous upper bound whose only
            purpose is to keep a stalled connection from blocking forever.

    Returns:
        The page body, base64-decoded from the ``httpResponseBody`` field of
        the API's JSON reply.

    Raises:
        requests.HTTPError: The API answered with a 4xx/5xx status.
        requests.Timeout: No answer arrived within *timeout* seconds.
        KeyError: The JSON reply carries no ``httpResponseBody`` field.
    """
    api_response = requests.post(
        "https://api.zyte.com/v1/extract",
        auth=(Z_KEY, ""),
        json={
            "url": url,
            "httpResponseBody": True,
        },
        timeout=timeout,
    )
    # Surface the real HTTP status rather than a confusing KeyError or JSON
    # decoding error further down when the API rejects the request.
    api_response.raise_for_status()
    return b64decode(api_response.json()["httpResponseBody"])
def get_asin_pdp(soup: BeautifulSoup) -> Dict[str, object]:
    """Collect ASIN, title, feature bullets and spec tables from a parsed page.

    Args:
        soup: Parsed HTML of a product detail page.

    Returns:
        A dict with four keys:

        * ``asin`` -- last ``/``-separated segment of the canonical link's
          ``href``, or ``None`` when the link or its ``href`` is missing.
        * ``title`` -- stripped text of ``span#productTitle``, or ``None``.
        * ``feature_bullets`` -- stripped bullet texts from
          ``div#feature-bullets`` (minus the "fits your model" boilerplate),
          or ``None`` when that div is absent.
        * ``tech_data`` -- merge of the three table parsers' results.
    """
    # ASIN: taken from the canonical URL. A missing <link> or a <link> without
    # an href both yield None instead of raising.
    canonical = soup.find('link', rel='canonical')
    href = canonical.get('href') if canonical is not None else None
    asin = href.split('/')[-1] if href is not None else None

    # Title
    title_tag = soup.find('span', id="productTitle")
    title = title_tag.text.strip() if title_tag is not None else None

    # Feature bullets
    bullets_div = soup.find('div', id="feature-bullets")
    if bullets_div is None:
        feature_bullets = None
    else:
        bullet_texts = [
            item.text.strip()
            for item in bullets_div.find_all('span', class_='a-list-item')
        ]
        # Drop the boilerplate bullet that is not a product feature.
        feature_bullets = [
            text
            for text in bullet_texts
            if text != 'Make sure this fits by entering your model number.'
        ]

    # KV, tech and A+ tables. Later dicts win on duplicate keys, which gives
    # the override hierarchy A+ > tech > KV.
    kv_res = parse_kv_table(soup)
    tech_res = parse_tech_table(soup)
    ap_data = parse_ap_table(soup)
    tech_data = {**kv_res, **tech_res, **ap_data}

    return {
        'asin': asin,
        'title': title,
        'feature_bullets': feature_bullets,
        'tech_data': tech_data,
    }
def parse_kv_table(soup: BeautifulSoup) -> Dict[str, str]:
    """Read the key/value table inside ``div#productOverview_feature_div``.

    Args:
        soup: Parsed HTML of a product detail page.

    Returns:
        Mapping of stripped first-cell text to stripped second-cell text for
        every table row that has at least two ``<td>`` cells. Empty when the
        div or its table is not present.
    """
    kv_res = {}

    overview = soup.find('div', id='productOverview_feature_div')
    table = overview.find('table') if overview is not None else None
    if table is None:
        return kv_res

    for row in table.find_all('tr'):
        cells = row.find_all('td')
        # A row without a label/value pair of cells cannot be turned into an
        # entry; skip it rather than fail with IndexError.
        if len(cells) < 2:
            continue
        kv_res[cells[0].text.strip()] = cells[1].text.strip()

    return kv_res
def parse_tech_table(soup: BeautifulSoup) -> Dict[str, str]:
    """Read every table whose id matches ``productDetails_techSpec``.

    Args:
        soup: Parsed HTML of a product detail page.

    Returns:
        Mapping of each row's stripped ``<th>`` text to its ``<td>`` text with
        U+200E (left-to-right mark) removed and surrounding whitespace
        stripped. Rows lacking either cell are skipped. Empty when no table
        matches.
    """
    tech_res = {}
    for table in soup.find_all('table', id=re.compile('productDetails_techSpec.*')):
        for row in table.find_all('tr'):
            header = row.find('th')
            cell = row.find('td')
            # Rows without both a header and a value cell carry no usable
            # pair; skip them instead of failing on None.text.
            if header is None or cell is None:
                continue
            key = header.text.strip()
            value = cell.text.replace('\u200e', '').strip()
            tech_res[key] = value
    return tech_res
def parse_ap_table(soup: BeautifulSoup) -> Dict[str, str]:
    """Read key/value rows from every table nested in a ``div#tech`` element.

    Args:
        soup: Parsed HTML of a product detail page.

    Returns:
        Mapping of first-cell text to second-cell text for each row that has
        at least two ``<td>`` cells. Both texts have U+200E (left-to-right
        mark) removed and surrounding whitespace stripped. Empty when no such
        div or table exists.
    """
    ap_res = {}
    for div in soup.find_all('div', id='tech'):
        for table in div.find_all('table'):
            for row in table.find_all('tr'):
                cells = row.find_all('td')
                # Need both a key cell and a value cell; header-only or
                # single-cell rows are skipped rather than raising IndexError.
                if len(cells) < 2:
                    continue
                key = cells[0].text.replace('\u200e', '').strip()
                value = cells[1].text.replace('\u200e', '').strip()
                ap_res[key] = value
    return ap_res