Spaces:

Pranjal2041
/

SemSup-XC

Runtime error

SemSup-XC / fetch_prod.py

Fix Error Msg

6c841a9 over 1 year ago

No virus

1.5 kB

	from bs4 import BeautifulSoup as bs
	import requests
	from typing import Dict, List, Optional

	from fake_http_header import FakeHttpHeader

	class Scraper:
	    """Fetch an Amazon product page and pull out its title, bullets and breadcrumbs."""

	    # Seconds to wait for the remote server; requests waits forever by default.
	    REQUEST_TIMEOUT = 10

	    # The raw response body is mirrored here so a failed scrape can be inspected.
	    DEBUG_DUMP_PATH = 'webpage_out.html'

	    # Returned when the page looks like a bot check instead of a product page.
	    BOT_MESSAGE = 'Detected as a Bot. Please Try Again Later. Till then, you can continue to type in your description, or manually copy from Amazon.'

	    def __init__(self):
	        ...

	    def sanity_url(self, url: str) -> bool:
	        """Return True when ``url`` contains the substring ``'amazon'``.

	        NOTE(review): this is a loose substring test, not a host check, so
	        e.g. ``https://example.com/amazon`` passes. Kept as-is so that no
	        previously accepted URL starts being rejected.
	        """
	        return 'amazon' in url

	    @staticmethod
	    def _clean_categories(raw: str) -> List[str]:
	        """Split breadcrumb text into lower-cased category names.

	        Lines shorter than three characters after stripping (blank lines and
	        the separator glyphs between breadcrumbs) are dropped.
	        """
	        pieces = (line.strip() for line in raw.strip().split('\n'))
	        return [piece.lower() for piece in pieces if len(piece) >= 3]

	    @staticmethod
	    def _section_text(soup, tag: str, element_id: str) -> Optional[str]:
	        """Return the text of the first ``<tag id=element_id>``, or None if absent."""
	        node = soup.find(tag, attrs={'id': element_id})
	        return None if node is None else node.text

	    def _dump_debug_copy(self, raw_html: bytes) -> None:
	        """Best-effort write of the raw response body to ``DEBUG_DUMP_PATH``."""
	        try:
	            # Bytes mode: no decode/encode round trip that could raise on
	            # pages that are not valid UTF-8 or on non-UTF-8 locales.
	            with open(self.DEBUG_DUMP_PATH, 'wb') as dump:
	                dump.write(raw_html)
	        except OSError:
	            # The dump is diagnostic only (e.g. read-only filesystem); it
	            # must not abort the scrape.
	            pass

	    def get_product(self, url: str) -> Dict:
	        """Download ``url`` and extract the product description and labels.

	        Args:
	            url: Address of the product page; must pass ``sanity_url``.

	        Returns:
	            On a parsed page, ``{'description': '<title>\\n<bullets>',
	            'labels': [<breadcrumb categories>]}``. Sections missing from
	            the page contribute an empty string / empty list.
	            If a section is missing and the page mentions a captcha, a dict
	            holding only a ``'description'`` key with a bot-detection notice.
	            For backward compatibility two failures are reported as plain
	            strings rather than dicts: ``'Invalid URL'`` when ``sanity_url``
	            rejects the URL, and ``'Error Loading Link'`` when the request
	            fails or the status code is not 200.
	        """
	        if not self.sanity_url(url):
	            return 'Invalid URL'

	        try:
	            response = requests.get(
	                url,
	                headers=FakeHttpHeader().as_header_dict(),
	                timeout=self.REQUEST_TIMEOUT,
	            )
	        except requests.RequestException:
	            # Connection errors, timeouts, malformed URLs, ...
	            return 'Error Loading Link'

	        self._dump_debug_copy(response.content)

	        if response.status_code != 200:
	            return 'Error Loading Link'

	        # Keep ``response`` and ``soup`` as separate names: the captcha check
	        # below needs the raw body, not the parsed tree.
	        soup = bs(response.content, 'html.parser')
	        title = self._section_text(soup, 'span', 'productTitle')
	        breadcrumbs = self._section_text(soup, 'div', 'wayfinding-breadcrumbs_feature_div')
	        bullets = self._section_text(soup, 'div', 'featurebullets_feature_div')

	        incomplete = title is None or breadcrumbs is None or bullets is None
	        if incomplete and b'captcha' in response.content.lower():
	            return {'description': self.BOT_MESSAGE}

	        # Missing sections degrade to empty values instead of raising.
	        title = (title or '').strip()
	        desc = (bullets or '').replace('About this item', '').strip()
	        categories = self._clean_categories(breadcrumbs or '')
	        return {'description': f'{title}\n{desc}', 'labels': categories}