# encoparts / funcs.py
import re
from io import BytesIO
from urllib.request import urlopen

import pandas as pd
from bs4 import BeautifulSoup
# Search endpoints: [0] hespareparts.com keyword search, [1] offroadeq.com part lookup
rutas_websearch = ['https://en.hespareparts.com/search/?search=', 'https://offroadeq.com/parts-search/']
def extrae_dato_web(idx):
    """Scrape the attribute list (weight, price, etc.) for a part from offroadeq.com."""
    datx = []
    idxx = str(idx).replace('-', '').replace(' ', '')
    urlg = rutas_websearch[1] + idxx + '/'
    htmlg = urlopen(urlg).read()
    soup = BeautifulSoup(htmlg, 'html.parser')
    lista0 = soup.find_all('h2')[0]
    lista1 = soup.find_all('dt')                # attribute labels
    lista2 = soup.find_all('dd')                # attribute values
    lista21 = [kj.text for kj in lista2]
    # Pages without an "Alternate for" entry carry fewer <dt>/<dd> pairs;
    # pad both lists so every result has the same shape.
    if len(lista1) < 3:
        lista1 = ['Alt NA'] + lista1
        lista21 = ['Alternate NA'] + lista21
    for _, j in zip(lista1, lista21):
        # Numeric fields carry 'lbs' or '$' markers; keep raw text otherwise.
        try:
            datx.append(float(j.replace('lbs', '').replace('$', '')))
        except ValueError:
            datx.append(j)
    datx.append(lista0.text.split('-')[1:][0])  # part name after the first hyphen
    return datx
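
# Usage sketch for extrae_dato_web: the part number below is hypothetical, and
# the call performs a live request to offroadeq.com, so the list contents
# depend on whatever the page serves at request time:
#
#   fila = extrae_dato_web('4T-6915')
#   # -> e.g. ['Alternate NA', <weight as float>, <price as float>, '<part name>']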
def extrae_web(idx):
    """Scrape name, dimensions (m), volume (m3) and description from hespareparts.com."""
    idxx = str(idx).replace('-', '').replace(' ', '')
    urlz = rutas_websearch[0] + idxx + '/'
    try:
        htmlz = urlopen(urlz).read()
        soup = BeautifulSoup(htmlz, 'html.parser')
        # Follow the first search hit to the part's detail page.
        lista = soup.find_all('a', {'class': 'link-dark'})
        ls = lista[0]
        page = urlopen(ls['href'])
        html = page.read()
        soup = BeautifulSoup(html, 'html.parser')
        gg = soup.find_all('h1')
        print(gg)  # debug: page title
        dd = []
        for typex in ['depth', 'width', 'height']:
            # Dimensions are published in mm; convert to metres.
            try:
                aa = soup.find_all('span', {'itemprop': typex})[0].text
                bb = re.findall('[0-9.]+', aa)
                dd.append(float(bb[0]) / 1000)
            except (IndexError, ValueError):
                dd.append(-1.0)  # sentinel for a missing dimension
        cc = soup.find_all('div', {'itemprop': 'description'})[0].text
        cc1 = cc.replace('\r', '').replace('\n', ' ')
        ggtx = gg[0].text
        posx = ggtx.find(' - ')
        ggx = ggtx[posx + 3:]  # part name follows the ' - ' separator
        vol = dd[0] * dd[1] * dd[2]
        dd0, dd1, dd2 = dd[0], dd[1], dd[2]
    except Exception:
        ggx, dd0, dd1, dd2, vol, cc1 = 'Not Available', -1.0, -1.0, -1.0, -1.0, 'NA'
    return ggx, dd0, dd1, dd2, vol, cc1
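
# Usage sketch for extrae_web (live request to hespareparts.com; hypothetical
# part number). The tuple is safe to unpack even when scraping fails, because
# every field falls back to a sentinel value:
#
#   nombre, d0, d1, d2, vol, desc = extrae_web('4T-6915')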
def extrae_alternate(idx):
    """Return the alternate part number listed on offroadeq.com, or None."""
    idxx = str(idx).replace('-', '').replace(' ', '')
    urlg = rutas_websearch[1] + idxx + '/'
    htmlg = urlopen(urlg).read()
    soup = BeautifulSoup(htmlg, 'html.parser')
    dt1 = soup.find_all('dt')[0].text
    print(dt1)  # debug: first attribute label
    dt2 = soup.find_all('dd')[0].text
    if dt1 == 'Alternate for':
        return dt2
    return None
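
# Usage sketch for extrae_alternate (live request; hypothetical part number).
# Yields the alternate part number as text, or None when the page's first
# <dt> label is not 'Alternate for':
#
#   alt = extrae_alternate('4T-6915')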
def convierte_excel(df):
    """Serialize a DataFrame to in-memory .xlsx bytes (e.g. for a download button)."""
    output = BytesIO()
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
        df.to_excel(writer, index=False, sheet_name='data_extraida')
        workbook = writer.book
        worksheet = writer.sheets['data_extraida']
        format1 = workbook.add_format({'num_format': '0.00'})  # two decimal places
        worksheet.set_column('A:A', None, format1)
    processed_data = output.getvalue()
    return processed_data
def encuentra_hoja(df):
    """Return the name of the last sheet with exactly one column, or None."""
    xls_file = pd.ExcelFile(df)
    sh_names = xls_file.sheet_names
    ds = None  # avoids a NameError when no sheet qualifies
    for sn in sh_names:
        d = pd.read_excel(df, sheet_name=sn)
        if len(d.columns) == 1:
            ds = sn
    return ds
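
# Minimal offline check of the two Excel helpers, assuming xlsxwriter and
# openpyxl are available (pandas needs the latter to read .xlsx); the file
# name and sample data are illustrative only.
if __name__ == '__main__':
    df_demo = pd.DataFrame({'codigo': ['4T6915', '1R0750']})
    xlsx_bytes = convierte_excel(df_demo)  # in-memory .xlsx bytes
    with open('demo_data.xlsx', 'wb') as fh:
        fh.write(xlsx_bytes)
    # The demo workbook's only sheet has exactly one column, so it matches.
    print(encuentra_hoja('demo_data.xlsx'))  # -> 'data_extraida'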