jcmachicao committed
Commit b6c448a
1 Parent(s): 9973f7b

Upload 6 files

Files changed (6)
  1. app.py +81 -0
  2. codigos_prueba.xlsx +0 -0
  3. encopartslogo.jpg +0 -0
  4. funcs.py +94 -0
  5. gdmklogo.png +0 -0
  6. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,81 @@
+ # -*- coding: utf-8 -*-
+
+ import streamlit as st
+ import pandas as pd
+ from datetime import datetime
+ from funcs import extrae_dato_web, extrae_web, extrae_alternate, convierte_excel
+
+ c1, c2 = st.columns([6, 6])
+ with c2:
+     st.image('encopartslogo.jpg', width=300, caption='https://encoparts.com/')
+
+ st.title('Generación de Tablas de Datos de Extracción')
+ st.subheader('Carga de Datos')
+ selec = st.radio('Seleccione: ', [None, 'Carga por Texto con Comas', 'Carga por Archivo Excel'])
+ items = None
+
+ if selec is None:
+     st.write('Por favor seleccione una opción válida de carga.')
+ else:
+     if selec == 'Carga por Texto con Comas':
+         st.write(selec)
+         codigos = st.text_input('Escriba o pegue aquí los códigos separados por comas: ')
+         if st.button('Proceder'):
+             items = [c.strip() for c in codigos.split(',')]  # strip stray spaces around each code
+     else:
+         st.write(selec)
+         file = st.file_uploader('Seleccione un archivo: ')
+         if file is not None:
+             codigosf = pd.read_excel(file)
+             st.write('Filas, Columnas de Data de Prueba: ', codigosf.shape)
+             namcol = codigosf.columns[0]  # only the first column is used
+             items = pd.Series(codigosf[namcol]).astype(str)
+
+ if selec is not None and items is not None:
+     st.write(items)
+     datos_tot = []
+     st.write('Por favor espere mientras se extraen los datos...')
+     for it in items:
+         extrae_med = extrae_web(it)
+         extrae_dat = extrae_dato_web(it)
+         itxx = it[:-4] + '-' + it[-4:]  # re-insert the dash before the last 4 characters
+         datos = [it, itxx] + list(extrae_med) + list(extrae_dat)
+         datos_tot.append(datos)
+
+     dtdf = pd.DataFrame(datos_tot)
+     dtdf.columns = ['part_no_', 'part_no',
+                     'descrip_en', 'length_m', 'width_m', 'height_m', 'vol_m3', 'compatible',
+                     'alternate', 'precio_bm_us', 'peso_lb', 'descr']
+     now = datetime.now()
+     date_time = now.strftime('%m_%d_%Y_%H_%M_%S')  # timestamp for the download file names
+     dtdf['peso_kg'] = dtdf.peso_lb * 0.453592  # pounds to kilograms
+
+     dtdf2 = dtdf[['part_no_', 'part_no', 'descr', 'length_m', 'width_m', 'height_m', 'vol_m3', 'peso_kg', 'precio_bm_us', 'alternate', 'compatible']]
+
+     df_xlsx = convierte_excel(dtdf2)
+     st.download_button(label='📩 Descargar XLSX', data=df_xlsx,
+                        file_name='df_' + date_time + '.xlsx')
+
+     csv = dtdf2.to_csv(index=False)
+     st.download_button(label='📩 Descargar CSV', data=csv,
+                        file_name='df_' + date_time + '.csv')
+ else:
+     st.write('Cuando seleccione la opción, por favor cargue datos y proceda.')
+
+ c1, c2, c3 = st.columns([4, 4, 4])
+ with c3:
+     st.image('gdmklogo.png', width=100, caption='Diseñado por GestioDinámica 2022')
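
The Excel upload branch above only ever reads the first column of the uploaded sheet (codigosf.columns[0]); codigos_prueba.xlsx below is the bundled sample input. A minimal sketch of producing a compatible file of your own (the file name, column header, and part codes here are hypothetical; only the column position matters):

    import pandas as pd

    # hypothetical sample input: one part code per row in the first column
    pd.DataFrame({'codigo': ['1230905', '2456375', '1R0750']}).to_excel(
        'codigos_ejemplo.xlsx', index=False)
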
codigos_prueba.xlsx ADDED
Binary file (8.6 kB)
encopartslogo.jpg ADDED
funcs.py ADDED
@@ -0,0 +1,94 @@
+ from bs4 import BeautifulSoup
+ import pandas as pd
+ import re
+ from io import BytesIO
+ from urllib.request import urlopen
+
+ rutas_websearch = ['https://en.hespareparts.com/search/?search=', 'https://offroadeq.com/parts-search/']
+
+ def extrae_dato_web(idx):
+     # Scrapes offroadeq.com and returns [alternate, precio_bm_us, peso_lb, descr]
+     datx = []
+     idxx = str(idx).replace('-', '').replace(' ', '')
+     urlg = rutas_websearch[1] + idxx + '/'
+     htmlg = urlopen(urlg).read()
+     soup = BeautifulSoup(htmlg, 'html.parser')
+     lista0 = soup.find_all('h2')[0]
+     lista1 = soup.find_all('dt')
+     lista2 = soup.find_all('dd')
+     if len(lista1) < 3:
+         # no "Alternate for" entry on the page: pad with placeholders
+         # so every row keeps a constant width
+         lista1 = ['Alt NA'] + list(lista1)
+         lista2 = ['Alternate NA'] + list(lista2)
+     for i, j in zip(lista1, lista2):
+         try:
+             datx.append(float(j.text.replace('lbs', '').replace('$', '')))
+         except (AttributeError, ValueError):
+             datx.append(j.text if hasattr(j, 'text') else j)
+     datx.append(lista0.text.split('-')[1:][0])  # description after the dash in the <h2>
+     return datx
+
+ def extrae_web(idx):
+     # Searches hespareparts.com and returns
+     # (descrip_en, length_m, width_m, height_m, vol_m3, description text)
+     idxx = str(idx).replace('-', '').replace(' ', '')
+     urlz = rutas_websearch[0] + idxx + '/'
+     try:
+         htmlz = urlopen(urlz).read()
+         soup = BeautifulSoup(htmlz, 'html.parser')
+         lista = soup.find_all('a', {'class': 'link-dark'})
+         ls = lista[0]
+         page = urlopen(ls['href'])  # follow the first search result
+         html = page.read()
+         soup = BeautifulSoup(html, 'html.parser')
+         gg = soup.find_all('h1')
+
+         dd = []
+         for typex in ['depth', 'width', 'height']:
+             try:
+                 aa = soup.find_all('span', {'itemprop': typex})[0].text
+                 bb = re.findall('[0-9.]+', aa)
+             except IndexError:
+                 bb = [-1.0]  # dimension not published
+             dd.append(float(bb[0]) / 1000)  # mm -> m
+
+         cc = soup.find_all('div', {'itemprop': 'description'})[0].text
+         cc1 = cc.replace('\r', '').replace('\n', ' ')
+
+         ggtx = gg[0].text
+         posx = ggtx.find(' - ')
+         ggx = ggtx[posx + 3:]  # description after ' - ' in the <h1>
+         vol = dd[0] * dd[1] * dd[2]
+         dd0, dd1, dd2 = dd[0], dd[1], dd[2]
+
+     except Exception:
+         # no search hit or unexpected page layout: return sentinel values
+         ggx, dd0, dd1, dd2, vol, cc1 = 'Not Available', -1.0, -1.0, -1.0, -1.0, 'NA'
+
+     return ggx, dd0, dd1, dd2, vol, cc1
+
+ def extrae_alternate(idx):
+     # Returns the alternate part number published on offroadeq.com, if any
+     idxx = str(idx).replace('-', '').replace(' ', '')
+     urlg = rutas_websearch[1] + idxx + '/'
+     htmlg = urlopen(urlg).read()
+     soup = BeautifulSoup(htmlg, 'html.parser')
+     dt1 = soup.find_all('dt')[0].text
+     dt2 = soup.find_all('dd')[0].text
+     if dt1 == 'Alternate for':
+         return dt2
+
+ def convierte_excel(df):
+     # Serializes the DataFrame to in-memory XLSX bytes for st.download_button
+     output = BytesIO()
+     writer = pd.ExcelWriter(output, engine='xlsxwriter')
+     df.to_excel(writer, index=False, sheet_name='data_extraida')
+     workbook = writer.book
+     worksheet = writer.sheets['data_extraida']
+     format1 = workbook.add_format({'num_format': '0.00'})
+     worksheet.set_column('A:A', None, format1)
+     writer.close()  # close() saves the workbook; no separate save() call needed
+     return output.getvalue()
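
A minimal usage sketch of the two scrapers above, mirroring how app.py assembles one row per code (live network access to both sites is required; the part code is a hypothetical example):

    from funcs import extrae_dato_web, extrae_web

    it = '1230905'  # hypothetical part code
    # extrae_web -> (descrip_en, length_m, width_m, height_m, vol_m3, description)
    extrae_med = extrae_web(it)
    # extrae_dato_web -> roughly [alternate, precio_bm_us, peso_lb, descr]
    extrae_dat = extrae_dato_web(it)
    print(list(extrae_med) + list(extrae_dat))
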
gdmklogo.png ADDED
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ beautifulsoup4==4.11.1
+ pandas==1.3.4
+ openpyxl==3.0.10
+ pyxlsb==1.0.9
+ streamlit
+ xlsxwriter
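
The last two entries are unpinned but required at runtime: app.py imports streamlit, and convierte_excel selects engine='xlsxwriter'. After pip install -r requirements.txt, the app starts with streamlit run app.py. As a quick offline check of the Excel helper, the returned bytes can be round-tripped through pandas (a sketch; the sample frame is hypothetical):

    import pandas as pd
    from io import BytesIO
    from funcs import convierte_excel

    df = pd.DataFrame({'part_no': ['123-0905'], 'peso_kg': [1.36]})
    xlsx_bytes = convierte_excel(df)
    # read back with openpyxl (pinned above) to confirm the sheet is intact
    print(pd.read_excel(BytesIO(xlsx_bytes), sheet_name='data_extraida'))
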