import re
from base64 import b64decode
from typing import Any, Dict

import requests
from bs4 import BeautifulSoup


# Zyte API key; zyte_call() sends it as the HTTP Basic-auth username.
# Left empty in source — set it before making any API call.
Z_KEY = ''


def zyte_call(url: str, timeout: float = 300.0) -> bytes:
    """Download the raw body of *url* through the Zyte API extract endpoint.

    Args:
        url: Address of the page to fetch.
        timeout: Value handed to ``requests`` as the connect/read timeout,
            in seconds. The default is deliberately generous.

    Returns:
        The page's HTTP response body, decoded from the base64 string in the
        ``httpResponseBody`` field of the API's JSON answer.

    Raises:
        requests.HTTPError: If the API replies with a 4xx/5xx status.
        requests.Timeout: If the API does not answer within *timeout*.
    """
    api_response = requests.post(
        "https://api.zyte.com/v1/extract",
        auth=(Z_KEY, ""),
        json={
            "url": url,
            "httpResponseBody": True,
        },
        # Without an explicit timeout, requests waits forever on a stalled
        # connection.
        timeout=timeout,
    )
    # Fail with the real HTTP error instead of a KeyError on a JSON error
    # payload that has no "httpResponseBody" field.
    api_response.raise_for_status()
    return b64decode(api_response.json()["httpResponseBody"])


def get_asin_pdp(soup: "BeautifulSoup") -> Dict[str, Any]:
    """Collect ASIN, title, feature bullets and spec tables from a parsed page.

    Args:
        soup: Parsed HTML document of a product detail page.

    Returns:
        A dict with four keys:

        * ``asin`` - last ``/``-separated segment of the canonical link's
          ``href``, or None when the link or its ``href`` is missing.
        * ``title`` - stripped text of the ``span#productTitle`` element, or
          None when it is missing.
        * ``feature_bullets`` - stripped texts of the ``a-list-item`` spans
          inside ``div#feature-bullets`` with the "Make sure this fits"
          boilerplate entry removed, or None when the div is missing.
        * ``tech_data`` - merge of ``parse_kv_table``, ``parse_tech_table``
          and ``parse_ap_table``; on duplicate keys the later table wins.
    """
    canonical = soup.find('link', rel='canonical')
    # .get() yields None for a missing attribute; indexing the tag would
    # raise KeyError, which the previous TypeError-only handler missed.
    href = canonical.get('href') if canonical is not None else None
    asin = href.split('/')[-1] if href is not None else None

    title_tag = soup.find('span', id="productTitle")
    title = title_tag.text.strip() if title_tag is not None else None

    bullets_div = soup.find('div', id="feature-bullets")
    if bullets_div is not None:
        bullet_texts = (
            item.text.strip()
            for item in bullets_div.find_all('span', class_='a-list-item')
        )
        feature_bullets = [
            text
            for text in bullet_texts
            if text != 'Make sure this fits by entering your model number.'
        ]
    else:
        feature_bullets = None

    # Later sources override earlier ones when the same key appears twice.
    tech_data = {
        **parse_kv_table(soup),
        **parse_tech_table(soup),
        **parse_ap_table(soup),
    }

    return {
        'asin': asin,
        'title': title,
        'feature_bullets': feature_bullets,
        'tech_data': tech_data,
    }


def parse_kv_table(soup: "BeautifulSoup") -> Dict[str, str]:
    """Read the key/value rows of the product overview table.

    Looks for the first ``<table>`` inside ``div#productOverview_feature_div``
    and maps the stripped text of each row's first ``<td>`` to the stripped
    text of its second ``<td>``.

    Args:
        soup: Parsed HTML document to search.

    Returns:
        The extracted mapping. Empty when the div or its table is missing.
        Rows with fewer than two ``<td>`` cells are skipped.
    """
    overview = soup.find('div', id='productOverview_feature_div')
    table = overview.find('table') if overview is not None else None
    if table is None:
        return {}

    kv_res = {}
    for row in table.find_all('tr'):
        cells = row.find_all('td')
        # Rows without a key/value pair would otherwise raise IndexError.
        if len(cells) < 2:
            continue
        kv_res[cells[0].text.strip()] = cells[1].text.strip()
    return kv_res


def parse_tech_table(soup: "BeautifulSoup") -> Dict[str, str]:
    """Read header/value rows from every technical-specification table.

    Scans all ``<table>`` elements whose ``id`` matches
    ``productDetails_techSpec.*`` and maps the stripped text of each row's
    first ``<th>`` to the text of its first ``<td>``, with U+200E
    (left-to-right mark) characters removed and surrounding whitespace
    stripped.

    Args:
        soup: Parsed HTML document to search.

    Returns:
        The extracted mapping. Empty when no matching table exists. Rows
        lacking a ``<th>`` or a ``<td>`` are skipped. When a key repeats, the
        last occurrence wins.
    """
    tech_res = {}
    spec_tables = soup.find_all(
        'table', id=re.compile(r'productDetails_techSpec.*')
    )
    for table in spec_tables:
        for row in table.find_all('tr'):
            header = row.find('th')
            cell = row.find('td')
            # Rows missing either cell would otherwise raise AttributeError.
            if header is None or cell is None:
                continue
            tech_res[header.text.strip()] = (
                cell.text.replace('\u200e', '').strip()
            )
    return tech_res


def parse_ap_table(soup: "BeautifulSoup") -> Dict[str, str]:
    """Read key/value rows from the tables inside every ``div#tech``.

    For each ``<tr>`` of each ``<table>`` found in a ``<div id="tech">``, maps
    the text of the first ``<td>`` to the text of the second ``<td>``. Both
    have U+200E (left-to-right mark) characters removed and surrounding
    whitespace stripped.

    Args:
        soup: Parsed HTML document to search.

    Returns:
        The extracted mapping. Empty when no such div or table exists. Rows
        with fewer than two ``<td>`` cells are skipped. When a key repeats,
        the last occurrence wins.
    """
    ap_res = {}
    for div in soup.find_all('div', id='tech'):
        for table in div.find_all('table'):
            for row in table.find_all('tr'):
                cells = row.find_all('td')
                # A single-cell row would otherwise raise IndexError on
                # cells[1].
                if len(cells) < 2:
                    continue
                key = cells[0].text.replace('\u200e', '').strip()
                value = cells[1].text.replace('\u200e', '').strip()
                ap_res[key] = value
    return ap_res