import json
from datetime import datetime

import requests
import pandas as pd
from bs4 import BeautifulSoup

from src.utils import timer_decorator, generate_proxy_url
BASE_URL = 'https://listado.mercadolibre.com.co'
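# Scrape MercadoLibre Colombia search results for a product term.
# `pages` may be an integer number of result pages, or 'all' to keep
# following the "next page" link until it disappears.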
@timer_decorator
def main(product, pages):
    list_df = []
    initial_df, follow = organize_page_data(product=product)
    list_df.append(initial_df)
    if pages == 'all':
        # Keep following the "next page" link until pagination runs out.
        while follow is not None:
            follow_df, follow = organize_page_data(url=follow)
            follow_df.rename(columns={None: product}, inplace=True)
            list_df.append(follow_df)
    elif isinstance(pages, int):
        # Subtract 1 because the first page has already been scraped.
        for _ in range(pages - 1):
            if follow is None:
                break
            follow_df, follow = organize_page_data(url=follow)
            follow_df.rename(columns={None: product}, inplace=True)
            list_df.append(follow_df)
    final_data = pd.concat(list_df)
    output = json.loads(final_data.to_json(orient='records'))
    return output
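# Build a DataFrame with one row per listing on a single results page and
# return it together with the URL of the next page (None on the last page).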
def organize_page_data(url: str = BASE_URL, product: str = None):
    s = get_soup_by_url(url=url, product=product)
    products = get_all_product_names_for_page(s)
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    follow = None
    try:
        follow = get_follow_page(s)
    except (IndexError, KeyError):
        # The last results page has no "next" link.
        print('follow page not found')
    prices = get_all_product_prices_for_page(s)
    urls = get_all_product_urls_for_page(s)
    years = get_year(s)
    kilometros = get_km(s)
    locations = get_location(s)
    output_dict = {'product': products,
                   'price': prices,
                   'link': urls,
                   'years': years,
                   'kilometraje': kilometros,
                   'locations': locations,
                   '_created': now}
    return pd.DataFrame(output_dict), follow
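# Fetch a results page and parse it with BeautifulSoup. Requests are routed
# through the proxy returned by src.utils.generate_proxy_url (assumed to be a
# proxy URL string usable for both http and https).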
def get_soup_by_url(url, product: str = None):
    proxy = generate_proxy_url()
    proxies = {'http': proxy,
               'https': proxy}
    if product is not None:
        url = f'{url}/{product}'
    r = requests.get(url=url, proxies=proxies)
    s = BeautifulSoup(r.content, 'html.parser')
    return s
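# The helpers below read listing fields from MercadoLibre's "ui-search-*"
# CSS classes; if the site markup changes, these selectors need updating.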
def get_all_product_names_for_page(s):
    product_names = s.find_all('h2', attrs={"class": "ui-search-item__title"})
    product_names = [v.text for v in product_names]
    return product_names
def get_all_product_prices_for_page(s):
    divs = s.find_all('div', attrs={"class": "ui-search-result__wrapper"})
    prices = [div.find_all('span', attrs={"class": "andes-money-amount__fraction"})[0].text.replace('.', '')
              for div in divs]
    return prices
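# The "next page" URL lives in the pagination button marked
# andes-pagination__button--next; indexing [0] raises IndexError when the
# button is absent (i.e. on the last page).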
def get_follow_page(s):
    follow_page = [div.find('a')['href']
                   for div in s.find_all('li', attrs={"class": "andes-pagination__button andes-pagination__button--next"})
                   if div.find('a') is not None][0]
    return follow_page
def get_all_product_urls_for_page(s):
    product_url = s.find_all('a', attrs={"class": "ui-search-item__group__element ui-search-link__title-card ui-search-link"})
    product_url = [h.get('href') for h in product_url]
    return product_url
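# get_year and get_km assume the card attribute list alternates between the
# vehicle's year and its mileage, so even indices are years and odd ones km.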
def get_year(s):
    soup = s.find_all('li', attrs={'class': 'ui-search-card-attributes__attribute'})
    year = [x.text for x in soup[::2]]
    return year
def get_km(s):
    soup = s.find_all('li', attrs={'class': 'ui-search-card-attributes__attribute'})
    km = [x.text for x in soup[1::2]]
    return km
def get_location(s):
    soup = s.find_all('span', attrs={'class': 'ui-search-item__group__element ui-search-item__location'})
    location = [x.text for x in soup]
    return location
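# Quick manual check: scrape the first results page for 'carros'.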
if __name__ == '__main__':
    data = main(product='carros', pages=1)
    print(data)