John Yang committed · Commit 7a50274
1 Parent(s): 4b9c9b6
Revert to working version
- .gitignore +1 -7
- predict_help.py +11 -11
.gitignore CHANGED
@@ -1,7 +1 @@
-*.
-*.pyc
-*.txt
-
-.DS_Store
-
-run.py
+*.pyc
predict_help.py CHANGED
@@ -17,6 +17,7 @@ class Page(Enum):
 
 HEADER_ = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36'
 DEBUG_HTML = "temp.html"
+VERBOSE = True
 NUM_PROD_LIMIT = 10
 
 API = '85956985fae328bfe5a759a2984448d2'
@@ -26,11 +27,11 @@ def get_url(url):
     return proxy_url
 
 # Query -> Search Result ASINs
-def parse_results(query, page_num=None, verbose=True):
+def parse_results(query, page_num=None):
     url = 'https://www.amazon.com/s?k=' + query.replace(" ", "+")
     if page_num is not None:
         url += "&page=" + str(page_num)
-    if verbose:
+    if VERBOSE:
         print("Search Results URL:", url)
     webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
     soup = BeautifulSoup(webpage.content, 'html.parser')
@@ -51,26 +52,25 @@ def parse_results(query, page_num=None, verbose=True):
         result = {
             'asin': asin,
             'Title': title.text.strip(),
-            'Price': price.text.strip().strip("$")
+            'Price': price.text.strip().strip("$")
         }
         results.append(result)
-    if verbose:
+    if VERBOSE:
         print("Scraped", len(results), "products")
     return results
 
 # Scrape information of each product
-def parse_item_page(asin, verbose=True):
+def parse_item_page(asin):
     product_dict = {}
     product_dict["asin"] = asin
 
     url = f"https://www.amazon.com/dp/{asin}"
-    if verbose:
+    if VERBOSE:
         print("Item Page URL:", url)
     begin = time.time()
     webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
     end = time.time()
-    if verbose:
-        print("Item page scraping took", end-begin, "seconds")
+    print("Item page scraping took", end-begin, "seconds")
     soup = BeautifulSoup(webpage.content, "html.parser")
 
     # Title
@@ -195,9 +195,9 @@ def convert_dict_to_actions(page_type, products=None, asin=None, page_num=None,
     if page_type == Page.RESULTS:
         info["valid"] = ['click[back to search]']
         if products is None or page_num is None or num_prods is None:
-            print(
-            print(
-            print(
+            print(page_num)
+            print(num_prods)
+            print(products)
             raise Exception('Provide `products`, `num_prods`, `page_num` to get `results` valid actions')
         # Decide whether to add `next >` as clickable based on # of search results
         if num_prods > 10: