"""Helpers to scrape spare-part data from supplier websites and to
convert/inspect Excel workbooks in memory."""

import re
import urllib.request  # kept: may be used elsewhere in the project
from io import BytesIO
from urllib.request import Request, urlopen  # Request kept for compatibility

import pandas as pd
from bs4 import BeautifulSoup
from pyxlsb import open_workbook as open_xlsb  # kept: may be used elsewhere

# Search-URL prefixes: [0] hespareparts, [1] offroadeq.
rutas_websearch = [
    'https://en.hespareparts.com/search/?search=',
    'https://offroadeq.com/parts-search/',
]


def _normaliza_id(idx):
    """Return *idx* as a string with dashes and spaces removed (site id format)."""
    return str(idx).replace('-', '').replace(' ', '')


def extrae_dato_web(idx):
    """Scrape the offroadeq part page for *idx*.

    Returns a list of the page's <dd> values (values that parse as numbers
    after stripping 'lbs'/'$' are coerced to float), followed by the part
    description taken from the first <h2> heading (text after the first '-').

    Raises on network errors or if the page has no <h2> heading.
    """
    datx = []
    urlg = rutas_websearch[1] + _normaliza_id(idx) + '/'
    soup = BeautifulSoup(urlopen(urlg).read(), 'html.parser')

    titulo = soup.find_all('h2')[0]
    etiquetas = soup.find_all('dt')
    valores = [kj.text for kj in soup.find_all('dd')]

    # Pages without an "Alternate for" row expose fewer than 3 <dt> tags;
    # pad both lists so the output layout stays constant across pages.
    if len(etiquetas) < 3:
        etiquetas = ['Alt NA'] + etiquetas
        valores = ['Alternate NA'] + valores

    for _, valor in zip(etiquetas, valores):
        try:
            datx.append(float(valor.replace('lbs', '').replace('$', '')))
        except ValueError:
            # Non-numeric field (e.g. a part number) -- keep the raw text.
            datx.append(valor)

    datx.append(titulo.text.split('-')[1:][0])
    return datx


def extrae_web(idx):
    """Scrape hespareparts for part *idx* via the site search.

    Returns a 6-tuple
    ``(title, depth_m, width_m, height_m, volume_m3, description)``.
    Dimensions are converted from millimetres to metres; a missing
    dimension is reported as the sentinel ``-1.0``. On any scraping
    failure (no search hit, network error, missing fields) the sentinel
    tuple ``('Not Available', -1.0, -1.0, -1.0, -1.0, 'NA')`` is returned.
    """
    urlz = rutas_websearch[0] + _normaliza_id(idx) + '/'
    try:
        soup = BeautifulSoup(urlopen(urlz).read(), 'html.parser')
        # Follow the first search hit to the part's detail page.
        primer_enlace = soup.find_all('a', {'class': 'link-dark'})[0]
        soup = BeautifulSoup(urlopen(primer_enlace['href']).read(), 'html.parser')
        encabezados = soup.find_all('h1')
        print(encabezados)

        dims = []
        for tipo in ('depth', 'width', 'height'):
            try:
                texto = soup.find_all('span', {'itemprop': tipo})[0].text
                numeros = re.findall(r'[0-9.]+', texto)
            except IndexError:
                numeros = []
            if not numeros:
                # Dimension tag absent or contained no digits: original code
                # silently lost the whole record here; report -1 sentinel instead.
                numeros = [-1.0]
            dims.append(float(numeros[0]) / 1000)  # mm -> m

        descripcion = soup.find_all('div', {'itemprop': 'description'})[0].text
        descripcion = descripcion.replace('\r', '').replace('\n', ' ')

        # Title is the heading text after the first ' - ' separator.
        ggtx = encabezados[0].text
        titulo = ggtx[ggtx.find(' - ') + 3:]

        vol = dims[0] * dims[1] * dims[2]
        return titulo, dims[0], dims[1], dims[2], vol, descripcion
    except Exception:
        # Best-effort scraper: any failure yields the sentinel record.
        return 'Not Available', -1.0, -1.0, -1.0, -1.0, 'NA'


def extrae_alternate(idx):
    """Return the 'Alternate for' part number for *idx* from offroadeq.

    Returns None when the first definition-list row is not an
    'Alternate for' entry. Raises on network errors or if the page
    has no <dt>/<dd> tags.
    """
    urlg = rutas_websearch[1] + _normaliza_id(idx) + '/'
    soup = BeautifulSoup(urlopen(urlg).read(), 'html.parser')
    dt1 = soup.find_all('dt')[0].text
    print(dt1)
    dt2 = soup.find_all('dd')[0].text
    if dt1 == 'Alternate for':
        return dt2
    return None


def convierte_excel(df):
    """Serialize DataFrame *df* to an in-memory .xlsx file.

    Column A is formatted with two decimal places. Returns the raw
    bytes of the workbook, suitable for a download response.
    """
    output = BytesIO()
    # Context manager guarantees the writer is finalized even if
    # to_excel raises (the original leaked the writer on failure).
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
        df.to_excel(writer, index=False, sheet_name='data_extraida')
        workbook = writer.book
        worksheet = writer.sheets['data_extraida']
        formato = workbook.add_format({'num_format': '0.00'})
        worksheet.set_column('A:A', None, formato)
    return output.getvalue()


def encuentra_hoja(df):
    """Return the name of the last sheet in workbook *df* that has
    exactly one column, or None when no sheet qualifies.

    *df* is anything ``pd.ExcelFile`` accepts (path or buffer).
    """
    xls_file = pd.ExcelFile(df)
    ds = None  # fixed: was unbound when no sheet matched
    for nombre in xls_file.sheet_names:
        hoja = pd.read_excel(df, sheet_name=nombre)
        if len(hoja.columns) == 1:
            ds = nombre  # keep the LAST match, as the original did
    return ds