Commit 69177fb by John Yang (1 parent: 631c491)

Refactoring `verbose` arg

Files changed:
- app.py: +33 -32
- predict_help.py: +17 -18
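This commit drops the module-level VERBOSE switch in predict_help.py in favor of a `verbose` keyword argument on each scraping helper, has app.py forward its own `verbose` argument to those helpers and gate its debug/timing prints behind it, and changes the `click[buy now]` return value from a concatenated string to a dict. A minimal sketch of the logging pattern being adopted (illustrative function and variable names, not the actual diff):

    # Before: one module-level flag controls logging for every caller
    VERBOSE = True

    def scrape(query):
        if VERBOSE:
            print("fetching", query)

    # After: each call site decides, with the old behavior as the default
    def scrape(query, verbose=True):
        if verbose:
            print("fetching", query)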
app.py
CHANGED
@@ -172,13 +172,11 @@ def run_episode(goal, env, verbose=True):
             page_type = Page.SEARCH
 
         elif action == 'click[buy now]':
-            return_value =
+            return_value = {}
             if env == 'amazon':
-                asin_url = f"https://www.amazon.com/dp/{asin}"
-                return_value = "Product URL: " + asin_url
+                return_value['Product URL'] = f"https://www.amazon.com/dp/{asin}"
                 if len(clicked_options) > 0:
-                    options_str = ', '.join(list(clicked_options))
-                    return_value += "\nSelected Options: " + options_str
+                    return_value['Selected Options'] = ', '.join(list(clicked_options))
             if env == 'webshop':
                 query_str = "+".join(search_terms.split())
                 options_str = json.dumps(options)
@@ -186,13 +184,12 @@ def run_episode(goal, env, verbose=True):
                     f'{WEBSHOP_URL}/item_page/{WEBSHOP_SESSION}/'
                     f'{asin}/{query_str}/{page_num}/{options_str}'
                 )
-                return_value = "Product URL: " + asin_url
+                return_value['Product URL'] = asin_url
             if env == 'ebay':
-                asin_url = f"https://www.ebay.com/itm/{asin}"
-                return_value = "Product URL: " + asin_url
+                asin_url = f"https://www.ebay.com/itm/{asin}"
+                return_value['Product URL'] = asin_url
                 if len(clicked_options) > 0:
-                    options_str = ', '.join(list(clicked_options))
-                    return_value += "\nSelected Options: " + options_str
+                    return_value['Selected Options'] = ', '.join(list(clicked_options))
             return return_value
 
         elif prev_page_type == Page.ITEM_PAGE:
@@ -216,16 +213,19 @@ def run_episode(goal, env, verbose=True):
         if page_type == Page.RESULTS:
             if search_terms in search_results_cache:
                 data = search_results_cache[search_terms]
+                if verbose:
+                    print(f"Loading cached results page for\"{search_terms}\"")
             else:
                 begin = time.time()
                 if env == 'amazon':
-                    data = parse_results_amz(search_terms, page_num)
+                    data = parse_results_amz(search_terms, page_num, verbose)
                 if env == 'webshop':
-                    data = parse_results_ws(search_terms, page_num)
+                    data = parse_results_ws(search_terms, page_num, verbose)
                 if env == 'ebay':
-                    data = parse_results_ebay(search_terms, page_num)
+                    data = parse_results_ebay(search_terms, page_num, verbose)
                 end = time.time()
-                print(f"Parsing search results took {end-begin} seconds")
+                if verbose:
+                    print(f"Parsing search results took {end-begin} seconds")
 
                 search_results_cache[search_terms] = data
                 num_prods = len(data)
@@ -233,18 +233,20 @@ def run_episode(goal, env, verbose=True):
                 title_to_asin_map[d['Title']] = d['asin']
         elif page_type == Page.ITEM_PAGE or page_type == Page.SUB_PAGE:
             if asin in product_map:
-                print("Loading cached item page for", asin)
+                if verbose:
+                    print("Loading cached item page for", asin)
                 data = product_map[asin]
             else:
                 begin = time.time()
                 if env == 'amazon':
-                    data = parse_item_page_amz(asin)
+                    data = parse_item_page_amz(asin, verbose)
                 if env == 'webshop':
-                    data = parse_item_page_ws(asin, search_terms, page_num, options)
+                    data = parse_item_page_ws(asin, search_terms, page_num, options, verbose)
                 if env == 'ebay':
-                    data = parse_item_page_ebay(asin)
+                    data = parse_item_page_ebay(asin, verbose)
                 end = time.time()
-                print("Parsing item page took", end-begin, "seconds")
+                if verbose:
+                    print("Parsing item page took", end-begin, "seconds")
                 product_map[asin] = data
         elif page_type == Page.SEARCH:
             if verbose:
@@ -260,23 +262,23 @@ def run_episode(goal, env, verbose=True):
         html_str = dict_to_fake_html(data, page_type, asin, sub_page_type, options, product_map, goal)
         obs = convert_html_to_text(html_str, simple=False, clicked_options=clicked_options, visited_asins=visited_asins)
         end = time.time()
-        print("[Page Info -> WebShop HTML -> Observation] took", end-begin, "seconds")
+        if verbose:
+            print("[Page Info -> WebShop HTML -> Observation] took", end-begin, "seconds")
 
         # Dict of Info -> Valid Action State (Info)
         begin = time.time()
         prod_arg = product_map if page_type == Page.ITEM_PAGE else data
         info = convert_dict_to_actions(page_type, prod_arg, asin, page_num, num_prods)
         end = time.time()
-        print("Extracting available actions took", end-begin, "seconds")
+        if verbose:
+            print("Extracting available actions took", end-begin, "seconds")
 
-        if i ==
-            return_value =
+        if i == 50:
+            return_value = {}
             if env == 'amazon':
-                asin_url = f"https://www.amazon.com/dp/{asin}"
-                return_value = "Product URL: " + asin_url
+                return_value['Product URL'] = f"https://www.amazon.com/dp/{asin}"
                 if len(clicked_options) > 0:
-                    options_str = ', '.join(list(clicked_options))
-                    return_value += "\nSelected Options: " + options_str
+                    return_value['Selected Options'] = ', '.join(list(clicked_options))
             if env == 'webshop':
                 query_str = "+".join(search_terms.split())
                 options_str = json.dumps(options)
@@ -284,13 +286,12 @@ def run_episode(goal, env, verbose=True):
                     f'{WEBSHOP_URL}/item_page/{WEBSHOP_SESSION}/'
                     f'{asin}/{query_str}/{page_num}/{options_str}'
                 )
-                return_value = "Product URL: " + asin_url
+                return_value['Product URL'] = asin_url
             if env == 'ebay':
-                asin_url = f"https://www.ebay.com/itm/{asin}"
-                return_value = "Product URL: " + asin_url
+                asin_url = f"https://www.ebay.com/itm/{asin}"
+                return_value['Product URL'] = asin_url
                 if len(clicked_options) > 0:
-                    options_str = ', '.join(list(clicked_options))
-                    return_value += "\nSelected Options: " + options_str
+                    return_value['Selected Options'] = ', '.join(list(clicked_options))
             return return_value
 
 gr.Interface(fn=run_episode,\
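With this change, run_episode hands the Gradio interface a dict instead of a formatted string when the agent clicks buy now (or when the step limit at i == 50 is hit). Roughly, the returned value now has the shape sketched below; the ASIN and option values are made up for illustration, and 'Selected Options' is only set when options were actually clicked:

    # Hypothetical return value for env='amazon'
    {
        'Product URL': 'https://www.amazon.com/dp/B000000000',
        'Selected Options': 'blue, small'
    }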
predict_help.py
CHANGED
@@ -17,7 +17,6 @@ class Page(Enum):
 
 HEADER_ = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36'
 DEBUG_HTML = "temp.html"
-VERBOSE = True
 NUM_PROD_LIMIT = 10
 
 WEBSHOP_URL = "http://3.83.245.205:3000"
@@ -29,11 +28,11 @@ def get_url(url):
     proxy_url = 'http://api.scraperapi.com/?' + urlencode(payload)
     return proxy_url
 
-def parse_results_ebay(query, page_num=None):
+def parse_results_ebay(query, page_num=None, verbose=True):
     query_string = '+'.join(query.split())
     page_num = 1 if page_num is None else page_num
     url = f'https://www.ebay.com/sch/i.html?_nkw={query_string}&_pgn={page_num}'
-    if VERBOSE:
+    if verbose:
         print(f"Search Results URL: {url}")
     webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
     soup = BeautifulSoup(webpage.text, 'html.parser')
@@ -61,21 +60,21 @@ def parse_results_ebay(query, page_num=None):
             "Title": title,
             "Price": price
         })
-    if VERBOSE:
+    if verbose:
         print(f"Scraped {len(results)} products")
     return results
 
-def parse_item_page_ebay(asin):
+def parse_item_page_ebay(asin, verbose=True):
     product_dict = {}
     product_dict["asin"] = asin
 
     url = f"https://www.ebay.com/itm/{asin}"
-    if VERBOSE:
+    if verbose:
         print(f"Item Page URL: {url}")
     begin = time.time()
     webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
     end = time.time()
-    if VERBOSE:
+    if verbose:
         print(f"Item page scraping took {end-begin} seconds")
     soup = BeautifulSoup(webpage.content, "html.parser")
 
@@ -148,14 +147,14 @@ def parse_item_page_ebay(asin):
     return product_dict
 
 
-def parse_results_ws(query, page_num=None):
+def parse_results_ws(query, page_num=None, verbose=True):
     query_string = '+'.join(query.split())
     page_num = 1 if page_num is None else page_num
     url = (
         f'{WEBSHOP_URL}/search_results/{WEBSHOP_SESSION}/'
         f'{query_string}/{page_num}'
     )
-    if VERBOSE:
+    if verbose:
         print(f"Search Results URL: {url}")
     webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
     soup = BeautifulSoup(webpage.content, 'html.parser')
@@ -185,11 +184,11 @@ def parse_results_ws(query, page_num=None):
             "Price": price
         })
 
-    if VERBOSE:
+    if verbose:
         print(f"Scraped {len(results)} products")
     return results
 
-def parse_item_page_ws(asin, query, page_num, options):
+def parse_item_page_ws(asin, query, page_num, options, verbose=True):
     product_dict = {}
     product_dict["asin"] = asin
 
@@ -199,7 +198,7 @@ def parse_item_page_ws(asin, query, page_num, options):
         f'{WEBSHOP_URL}/item_page/{WEBSHOP_SESSION}/'
         f'{asin}/{query_string}/{page_num}/{options_string}'
     )
-    if VERBOSE:
+    if verbose:
         print("Item Page URL: ", url)
     webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
     soup = BeautifulSoup(webpage.content, 'html.parser')
@@ -258,11 +257,11 @@ def parse_item_page_ws(asin, query, page_num, options):
     return product_dict
 
 # Query -> Search Result ASINs
-def parse_results_amz(query, page_num=None):
+def parse_results_amz(query, page_num=None, verbose=True):
     url = 'https://www.amazon.com/s?k=' + query.replace(" ", "+")
     if page_num is not None:
         url += "&page=" + str(page_num)
-    if VERBOSE:
+    if verbose:
         print(f"Search Results URL: {url}")
     webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
     soup = BeautifulSoup(webpage.content, 'html.parser')
@@ -286,22 +285,22 @@ def parse_results_amz(query, page_num=None):
             'Price': price.text.strip().strip("$")
         }
         results.append(result)
-    if VERBOSE:
+    if verbose:
         print("Scraped", len(results), "products")
     return results
 
 # Scrape information of each product
-def parse_item_page_amz(asin):
+def parse_item_page_amz(asin, verbose=True):
     product_dict = {}
     product_dict["asin"] = asin
 
     url = f"https://www.amazon.com/dp/{asin}"
-    if VERBOSE:
+    if verbose:
         print("Item Page URL:", url)
     begin = time.time()
     webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
     end = time.time()
-    if VERBOSE:
+    if verbose:
         print(f"Item page scraping took {end-begin} seconds")
     soup = BeautifulSoup(webpage.content, "html.parser")
 
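Callers can now silence or enable the scraper logging per call instead of editing the old VERBOSE constant. A minimal usage sketch, assuming the function signatures and result keys shown in the diff above (the query string is made up):

    from predict_help import parse_results_amz, parse_item_page_amz

    # Scrape the search results quietly, then log details only for the item page
    results = parse_results_amz("espresso machine", page_num=1, verbose=False)
    if results:
        item = parse_item_page_amz(results[0]['asin'], verbose=True)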