|
import requests |
|
from bs4 import BeautifulSoup |
|
import pandas as pd |
|
from src.utils import timer_decorator, generate_proxy_url |
|
import json |
|
from datetime import datetime |
|
|
|
|
|
BASE_URL = 'https://listado.mercadolibre.com.co' |
|
|
|
@timer_decorator
def main(product, pages):
    """Scrape MercadoLibre listing pages for *product* and return the rows.

    Parameters
    ----------
    product : str
        Search term appended to the base listing URL (e.g. 'carros').
    pages : 'all' or int
        'all' follows pagination until no next page exists; an int scrapes
        at most that many pages (the initial page counts as page 1).

    Returns
    -------
    list[dict]
        One dict per listing, as produced by DataFrame.to_json(orient='records').
    """
    list_df = []
    initial_df, follow = organize_page_data(product=product)
    list_df.append(initial_df)
    if pages == 'all':
        # Keep following the "next" link; organize_page_data returns
        # follow=None once the last page is reached.
        while follow is not None:
            follow_df, follow = organize_page_data(url=follow)
            list_df.append(follow_df)
    elif isinstance(pages, int):
        for _ in range(pages - 1):
            # Check BEFORE fetching: a None follow URL means the previous
            # page was the last one, and requests.get(None) would crash.
            if follow is None:
                break
            follow_df, follow = organize_page_data(url=follow)
            list_df.append(follow_df)
    final_data = pd.concat(list_df)
    # Round-trip through JSON to get plain Python types (str keys/values).
    output = json.loads(final_data.to_json(orient='records'))
    return output
|
|
|
def organize_page_data(url: str = BASE_URL, product=None):
    """Scrape one results page and return its data plus the next-page URL.

    Parameters
    ----------
    url : str
        Full page URL to fetch (defaults to the listing base URL).
    product : str, optional
        Search term appended to *url* by get_soup_by_url when given.

    Returns
    -------
    tuple[pd.DataFrame, str | None]
        A DataFrame with one row per listing on the page, and the URL of
        the next results page (None when this is the last page).
    """
    s = get_soup_by_url(url=url, product=product)
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    follow = None
    try:
        follow = get_follow_page(s)
    except IndexError:
        # get_follow_page indexes [0] into the list of "next" buttons;
        # an empty list (last page) raises IndexError.
        print('follow page not found')
    output_dict = {'product': get_all_product_names_for_page(s),
                   'price': get_all_product_prices_for_page(s),
                   'link': get_all_product_urls_for_page(s),
                   'years': get_year(s),
                   'kilometraje': get_km(s),
                   'locations': get_location(s),
                   # Timestamp every row so scrape runs can be told apart.
                   '_created': now}
    return pd.DataFrame(output_dict), follow
|
|
|
|
|
def get_soup_by_url(url, product: str = None, timeout: float = 30):
    """Fetch a page through a rotating proxy and parse it with BeautifulSoup.

    Parameters
    ----------
    url : str
        Base URL to fetch.
    product : str, optional
        When given, appended to *url* as an extra path segment.
    timeout : float
        Seconds before the HTTP request is aborted (new, backward-compatible
        parameter; prevents the scraper hanging forever on a dead proxy).

    Returns
    -------
    BeautifulSoup
        Parsed HTML of the response body.
    """
    proxy = generate_proxy_url()
    proxies = {'http': proxy,
               'https': proxy}
    if product is not None:
        url = f'{url}/{product}'
    r = requests.get(url=url, proxies=proxies, timeout=timeout)
    s = BeautifulSoup(r.content, 'html.parser')
    return s
|
|
|
|
|
def get_all_product_names_for_page(s):
    """Return the title text of every product card in the parsed page *s*."""
    title_tags = s.find_all('h2', attrs={"class": "ui-search-item__title"})
    names = []
    for tag in title_tags:
        names.append(tag.text)
    return names
|
|
|
def get_all_product_prices_for_page(s):
    """Return the price (thousands separator stripped) of each result card."""
    wrappers = s.find_all('div', attrs={"class": "ui-search-result__wrapper"})
    prices = []
    for wrapper in wrappers:
        # First money-amount span inside the card holds the integer price;
        # '.' is the thousands separator on the Colombian site.
        fraction = wrapper.find_all('span', attrs={"class": "andes-money-amount__fraction"})[0]
        prices.append(fraction.text.replace('.', ''))
    return prices
|
|
|
def get_follow_page(s):
    """Return the href of the first 'next page' pagination button.

    Raises IndexError when the page has no next-page link (last page).
    """
    buttons = s.find_all('li', attrs={"class": "andes-pagination__button andes-pagination__button--next"})
    hrefs = []
    for button in buttons:
        anchor = button.find('a')
        if anchor is not None:
            hrefs.append(anchor['href'])
    return hrefs[0]
|
|
|
def get_all_product_urls_for_page(s):
    """Return the detail-page href of every product card title link."""
    anchors = s.find_all('a', attrs={"class": "ui-search-item__group__element ui-search-link__title-card ui-search-link"})
    urls = []
    for anchor in anchors:
        urls.append(anchor.get('href'))
    return urls
|
|
|
def get_year(s):
    """Return the model year of each card.

    Each card renders two attribute <li> tags (year, then mileage), so the
    even-indexed entries of the flat list are the years.
    """
    attributes = s.find_all('li', attrs={'class': 'ui-search-card-attributes__attribute'})
    return [tag.text for idx, tag in enumerate(attributes) if idx % 2 == 0]
|
def get_km(s):
    """Return the mileage of each card.

    Each card renders two attribute <li> tags (year, then mileage), so the
    odd-indexed entries of the flat list are the mileage values.
    """
    attributes = s.find_all('li', attrs={'class': 'ui-search-card-attributes__attribute'})
    return [tag.text for idx, tag in enumerate(attributes) if idx % 2 == 1]
|
def get_location(s):
    """Return the seller location text shown on each product card."""
    location_spans = s.find_all('span', attrs={'class': 'ui-search-item__group__element ui-search-item__location'})
    locations = []
    for span in location_spans:
        locations.append(span.text)
    return locations
|
|
|
|
|
if __name__ == '__main__':
    # Manual smoke run: scrape a single page of car listings and dump it.
    print(main(product='carros', pages=1))