Spaces:
Runtime error
Runtime error
File size: 1,502 Bytes
4014562 d4218cc 4014562 d4218cc 4014562 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 |
from bs4 import BeautifulSoup as bs
import requests
from typing import Dict, List, Optional
from fake_http_header import FakeHttpHeader
class Scraper:
    """Fetch an Amazon product page and pull out its title, bullets and categories.

    :meth:`get_product` is the entry point; :meth:`sanity_url` is the URL
    pre-check it applies before any network access.
    """

    # Upper bound (seconds) on the HTTP request so a stalled connection
    # cannot block the caller forever.
    _REQUEST_TIMEOUT_SECONDS = 30

    def __init__(self):
        ...

    def sanity_url(self, url: str) -> bool:
        """Return True when *url* is a string containing ``'amazon'``.

        This is a case-sensitive substring test, not a hostname check.
        Non-string input such as ``None`` is rejected instead of raising.
        """
        return isinstance(url, str) and 'amazon' in url

    @staticmethod
    def _first_text(soup, tag_name: str, element_id: str) -> Optional[str]:
        """Return the text of the first ``tag_name`` element with that id, or None."""
        element = soup.find(tag_name, attrs={"id": element_id})
        return None if element is None else element.text

    def get_product(self, url: str) -> Dict:
        """Download the product page at *url* and extract its details.

        Side effect: the raw response body is written to
        ``webpage_out.html`` in the current working directory (this looks
        like a debugging snapshot; it is kept because other code may read it).

        Returns:
            * the string ``'Invalid URL'`` when :meth:`sanity_url` rejects
              *url* (no request is made);
            * the string ``'Error Loading Link'`` when the HTTP status is
              not 200;
            * a dict holding only a ``'description'`` bot notice when an
              expected page element is missing and the page mentions
              ``captcha``;
            * otherwise ``{'description': ..., 'labels': ...}`` where the
              description is the title and the feature bullets joined by a
              newline, and the labels are the lower-cased breadcrumb
              entries. An element missing from the page contributes an
              empty string (or an empty label list) instead of failing.

        Note that despite the ``Dict`` annotation the two error cases
        return plain strings; callers must check for them.

        Raises:
            Whatever ``requests.get`` raises (connection errors, timeouts).
        """
        if not self.sanity_url(url):
            return 'Invalid URL'

        response = requests.get(
            url,
            headers=FakeHttpHeader().as_header_dict(),
            timeout=self._REQUEST_TIMEOUT_SECONDS,
        )

        # Write the bytes exactly as received: decoding and re-encoding
        # through the locale's text codec could raise on a non-UTF-8 body.
        with open('webpage_out.html', 'wb') as snapshot:
            snapshot.write(response.content)

        if response.status_code != 200:
            return 'Error Loading Link'

        # No parser is named, so bs4 picks the best one installed.
        soup = bs(response.content)
        title = self._first_text(soup, "span", 'productTitle')
        breadcrumbs = self._first_text(
            soup, "div", 'wayfinding-breadcrumbs_feature_div')
        bullets = self._first_text(soup, "div", 'featurebullets_feature_div')

        # The captcha test must look at the raw response bytes: attribute
        # access on the soup is shorthand for a tag lookup, not the body.
        incomplete = any(
            part is None for part in (title, breadcrumbs, bullets))
        if incomplete and b'captcha' in response.content:
            return {'description' : 'Detected as a Bot. Please Try Again Later. Till then, you can continue to type in your description, or manually copy from Amazon.'}

        title = '' if title is None else title.strip()
        desc = (
            '' if bullets is None
            else bullets.replace('About this item', '').strip()
        )
        if breadcrumbs is None:
            categories = []
        else:
            # One breadcrumb per line; entries shorter than three characters
            # (separators, blanks) are dropped.
            stripped_lines = (
                line.strip() for line in breadcrumbs.strip().split('\n'))
            categories = [
                entry.lower() for entry in stripped_lines if len(entry) >= 3]

        return {'description' : f'{title}\n{desc}', 'labels' : categories}