John Yang committed · Commit 5d16b15 · 1 Parent(s): a8fcac2

eBay parsing working commit

Files changed (2):
  1. app.py +17 -0
  2. predict_help.py +123 -3
app.py CHANGED
@@ -7,6 +7,7 @@ from predict_help import (
     Page, convert_dict_to_actions, convert_html_to_text,
     parse_results_amz, parse_item_page_amz,
     parse_results_ws, parse_item_page_ws,
+    parse_results_ebay, parse_item_page_ebay,
     WEBSHOP_URL, WEBSHOP_SESSION
 )
 
@@ -181,6 +182,12 @@ def run_episode(goal, verbose=True, env='amazon'):
             f'{asin}/{query_str}/{page_num}/{options_str}'
         )
         return_value = "Product URL: " + asin_url
+        if env == 'ebay':
+            asin_url = f"https://www.ebay.com/itm/{asin}"
+            return_value = "Product URL: " + asin_url
+        if len(clicked_options) > 0:
+            options_str = ', '.join(list(clicked_options))
+            return_value += "\nSelected Options: " + options_str
         return return_value
 
     elif prev_page_type == Page.ITEM_PAGE:
@@ -210,6 +217,8 @@ def run_episode(goal, verbose=True, env='amazon'):
             data = parse_results_amz(search_terms, page_num)
         if env == 'webshop':
             data = parse_results_ws(search_terms, page_num)
+        if env == 'ebay':
+            data = parse_results_ebay(search_terms, page_num)
         end = time.time()
         print("Parsing search results took", end-begin, "seconds")
 
@@ -227,6 +236,8 @@ def run_episode(goal, verbose=True, env='amazon'):
             data = parse_item_page_amz(asin)
         if env == 'webshop':
             data = parse_item_page_ws(asin, search_terms, page_num, options)
+        if env == 'ebay':
+            data = parse_item_page_ebay(asin)
         end = time.time()
         print("Parsing item page took", end-begin, "seconds")
         product_map[asin] = data
@@ -269,6 +280,12 @@ def run_episode(goal, verbose=True, env='amazon'):
             f'{asin}/{query_str}/{page_num}/{options_str}'
         )
         return_value = "Product URL: " + asin_url
+        if env == 'ebay':
+            asin_url = f"https://www.ebay.com/itm/{asin}"
+            return_value = "Product URL: " + asin_url
+        if len(clicked_options) > 0:
+            options_str = ', '.join(list(clicked_options))
+            return_value += "\nSelected Options: " + options_str
         return return_value
 
 gr.Interface(fn=run_episode,\
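
Note: for eBay, the click handler now answers with a direct item URL plus any selected options. A standalone sketch of that assembly, using a hypothetical item ID and option set (set iteration order may vary):

asin = "256909912345"                 # hypothetical eBay item ID
clicked_options = {"Blue", "XL"}      # hypothetical selections

asin_url = f"https://www.ebay.com/itm/{asin}"
return_value = "Product URL: " + asin_url
if len(clicked_options) > 0:
    options_str = ', '.join(list(clicked_options))
    return_value += "\nSelected Options: " + options_str
print(return_value)
# Product URL: https://www.ebay.com/itm/256909912345
# Selected Options: XL, Blue       (set order is arbitrary)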
 
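With these hooks in place, 'ebay' joins 'amazon' and 'webshop' as an accepted value of run_episode's env argument, whether invoked through the Gradio UI or directly. A minimal direct call, with a hypothetical goal string (assumes the app's dependencies and network access):

run_episode("find a wireless keyboard under $30", verbose=True, env='ebay')
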
predict_help.py CHANGED
@@ -1,7 +1,7 @@
 from bs4 import BeautifulSoup
 from bs4.element import Comment
 from enum import Enum
-import time
+import re, time
 from urllib.parse import urlencode
 
 import json, requests, torch
@@ -19,6 +19,7 @@ HEADER_ = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (K
 DEBUG_HTML = "temp.html"
 VERBOSE = True
 NUM_PROD_LIMIT = 10
+
 WEBSHOP_URL = "http://3.83.245.205:3000"
 WEBSHOP_SESSION = "abc"
 
@@ -28,6 +29,125 @@ def get_url(url):
     proxy_url = 'http://api.scraperapi.com/?' + urlencode(payload)
     return proxy_url
 
+def parse_results_ebay(query, page_num=None):
+    query_string = '+'.join(query.split())
+    page_num = 1 if page_num is None else page_num
+    url = f'https://www.ebay.com/sch/i.html?_nkw={query_string}&_pgn={page_num}'
+    if VERBOSE:
+        print(f"Search Results URL: {url}")
+    webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
+    soup = BeautifulSoup(webpage.text, 'html.parser')
+    products = soup.select('.s-item__wrapper.clearfix')
+
+    results = []
+    for item in products[:NUM_PROD_LIMIT]:
+        title = item.select_one('.s-item__title').text.strip()
+        if "shop on ebay" in title.lower():
+            # Skip the "Shop on eBay" placeholder result
+            continue
+        link = item.select_one('.s-item__link')['href']
+        asin = link.split("?")[0][len("https://www.ebay.com/itm/"):]
+
+        try:
+            price = item.select_one('.s-item__price').text
+            if "to" in price:
+                prices = price.split(" to ")
+                price = [p.strip("$") for p in prices]
+        except:
+            price = None
+
+        results.append({
+            "asin": asin,
+            "Title": title,
+            "Price": price
+        })
+    if VERBOSE:
+        print(f"Scraped {len(results)} products")
+    return results
+
+def parse_item_page_ebay(asin):
+    product_dict = {}
+    product_dict["asin"] = asin
+
+    url = f"https://www.ebay.com/itm/{asin}"
+    if VERBOSE:
+        print(f"Item Page URL: {url}")
+    begin = time.time()
+    webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
+    end = time.time()
+    if VERBOSE:
+        print(f"Item page scraping took {end-begin} seconds")
+    soup = BeautifulSoup(webpage.content, "html.parser")
+
+    # Title
+    try:
+        product_dict["Title"] = soup.find('h1', {'class': 'x-item-title__mainTitle'}).text.strip()
+    except:
+        product_dict["Title"] = "N/A"
+
+    # Price: get the price string, then extract the decimal numbers from it
+    try:
+        price_str = soup.find('div', {'class': 'mainPrice'}).text
+        prices = re.findall(r'\d*\.?\d+', price_str)
+        product_dict["Price"] = prices[0]
+    except:
+        product_dict["Price"] = "N/A"
+
+    # Main Image
+    try:
+        img_div = soup.find('div', {'id': 'mainImgHldr'})
+        img_link = img_div.find('img', {'id': 'icImg'})["src"]
+        product_dict["MainImage"] = img_link
+    except:
+        product_dict["MainImage"] = ""
+
+    # Rating
+    try:
+        rating = soup.find('span', {'class': 'reviews-star-rating'})["title"].split()[0]
+    except:
+        rating = None
+    product_dict["Rating"] = rating
+
+    # Options
+    options, options_to_images = {}, {}  # TODO: is options_to_images possible on eBay?
+    try:
+        option_blocks = soup.findAll('select', {'class': 'msku-sel'})
+        for block in option_blocks:
+            name = block["name"].strip().strip(":")
+            option_tags = block.findAll("option")
+            opt_list = []
+            for option_tag in option_tags:
+                if "select" not in option_tag.text.lower():
+                    # Do not include the "- select -" (i.e. not selected) choice
+                    opt_list.append(option_tag.text)
+            options[name] = opt_list
+    except:
+        options = {}
+    product_dict["options"], product_dict["option_to_image"] = options, options_to_images
+
+    # Description
+    desc = None
+    try:
+        # eBay descriptions are shown in `iframe`s
+        desc_link = soup.find('iframe', {'id': 'desc_ifr'})["src"]
+        desc_webpage = requests.get(desc_link, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
+        desc_soup = BeautifulSoup(desc_webpage.content, "html.parser")
+        desc = ' '.join(desc_soup.text.split())
+    except:
+        desc = "N/A"
+    product_dict["Description"] = desc
+
+    # Features
+    features = None
+    try:
+        features = soup.find('div', {'class': 'x-about-this-item'}).text
+    except:
+        features = "N/A"
+    product_dict["BulletPoints"] = features
+
+    return product_dict
+
+
 def parse_results_ws(query, page_num=None):
     query_string = '+'.join(query.split())
     page_num = 1 if page_num is None else page_num
@@ -181,7 +301,8 @@ def parse_item_page_amz(asin):
     begin = time.time()
     webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
     end = time.time()
-    print("Item page scraping took", end-begin, "seconds")
+    if VERBOSE:
+        print(f"Item page scraping took {end-begin} seconds")
     soup = BeautifulSoup(webpage.content, "html.parser")
 
     # Title
@@ -225,7 +346,6 @@ def parse_item_page_amz(asin):
         desc_div = desc_body.find(name="div", attrs={"id": "productDescription"})
         desc_ps = desc_div.findAll(name="p")
         desc = " ".join([p.text for p in desc_ps])
-
     except AttributeError:
         desc = "N/A"
     product_dict["Description"] = desc.strip()
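
A hedged end-to-end sketch of the two new parsers (requires network access; eBay's markup changes often, so the selectors above may need updating, and the query here is hypothetical):

results = parse_results_ebay("wireless earbuds", page_num=1)
if results:
    product = parse_item_page_ebay(results[0]["asin"])
    print(product["Title"], product["Price"], product["options"])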
 
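Two pricing details are worth noting. First, in search results a range listing is kept as a [low, high] pair of strings. A standalone sketch with a hypothetical .s-item__price value:

price = "$10.99 to $15.99"   # hypothetical range listing
if "to" in price:
    prices = price.split(" to ")
    price = [p.strip("$") for p in prices]
print(price)   # ['10.99', '15.99']
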
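Second, on the item page re.findall pulls every decimal number out of the price text, and prices[0] keeps the first one, i.e. the low end of a range. A standalone sketch with a hypothetical mainPrice string:

import re

price_str = "US $10.99 to US $15.99"   # hypothetical mainPrice text
prices = re.findall(r'\d*\.?\d+', price_str)
print(prices)      # ['10.99', '15.99']
print(prices[0])   # '10.99'
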
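Finally, the option parsing walks each msku-sel <select> block and drops the "- select -" placeholder. A standalone sketch against static HTML that imitates eBay's variation markup (the markup is an assumption, not captured from a live page):

from bs4 import BeautifulSoup

html = '''
<select class="msku-sel" name="Color:">
  <option>- Select -</option>
  <option>Blue</option>
  <option>Red</option>
</select>
'''
soup = BeautifulSoup(html, "html.parser")
options = {}
for block in soup.findAll('select', {'class': 'msku-sel'}):
    name = block["name"].strip().strip(":")             # "Color:" -> "Color"
    opt_list = [o.text for o in block.findAll("option")
                if "select" not in o.text.lower()]      # skip "- Select -"
    options[name] = opt_list
print(options)   # {'Color': ['Blue', 'Red']}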