databases_demo / BeautifulSoup.py
raaraya's picture
Update BeautifulSoup.py
cf39ec2
import urllib.request as req
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
import streamlit as st
from io import BytesIO
class BeautifulSoup_st:
    """Streamlit helper that scrapes a web page and exports what it finds.

    The methods are meant to be called in order: ``load_page`` stores
    ``self.url``, ``filter_content`` reads ``self.url`` and stores
    ``self.df``, and ``save`` reads ``self.df``.
    """

    # The parser is named explicitly so the parse tree does not depend on
    # which optional parsers are installed, and bs4 does not have to guess.
    _PARSER = 'html.parser'

    def __init__(self, width=600, height=400):
        """Store the iframe dimensions.

        Args:
            width: Iframe width. Kept on the instance; no method of this
                class reads it.
            height: Iframe height. Kept on the instance; no method of this
                class reads it.
        """
        self.iframe_width = width
        self.iframe_height = height

    def _fetch_soup(self):
        """Download ``self.url`` and return the parsed document."""
        # The context manager closes the HTTP response once it is parsed.
        with req.urlopen(self.url) as response:
            return bs(response, self._PARSER)

    @staticmethod
    def _tag_names(html):
        """Return the sorted, unique tag names opened in the ``html`` text."""
        return sorted({match[1:] for match in re.findall(r'<\w+', html)})

    @staticmethod
    def _attribute_names(fragments):
        """Return the sorted, unique ``name=`` tokens found in ``fragments``."""
        joined = ' '.join(fragments)
        return sorted({match[:-1] for match in re.findall(r'\w+=', joined)})

    @staticmethod
    def _attribute_values(fragments, attribute):
        """Return, per fragment, the ``str()`` of the regex matches of
        ``attribute="..."`` (for example ``"['x']"`` or ``'[]'``)."""
        pattern = re.compile(f'.*{re.escape(str(attribute))}="([^"]*)".*')
        return [str(pattern.findall(fragment)) for fragment in fragments]

    def load_page(self, url):
        """Remember ``url`` and optionally show the page's HTML.

        Args:
            url: Address of the page. ``"watch?v="`` is rewritten to
                ``"embed/"`` before the address is stored in ``self.url``.
        """
        self.url = url.replace("watch?v=", "embed/")
        soup = self._fetch_soup()
        if st.checkbox('Codigo HTML'):
            st.code(soup.prettify())

    def filter_content(self, data_type='p'):
        """Let the user filter the page content and show it as a table.

        The resulting DataFrame is stored in ``self.df``. Requires
        ``load_page`` to have been called so that ``self.url`` exists.

        Args:
            data_type: Tag name preselected in the tag selectbox when the
                page contains it; otherwise the first option is preselected.
        """
        soup = self._fetch_soup()
        # A document without <body> would otherwise fail on ``None``.
        root = soup.body if soup.body is not None else soup

        # Collect every tag name present in the page.
        tag_options = self._tag_names(str(root))
        default_index = (
            tag_options.index(data_type) if data_type in tag_options else 0
        )
        data_type = st.selectbox(
            'Tipo de item', options=tag_options, index=default_index)

        # Keep only the elements of the chosen tag.
        elements = root.find_all(data_type)
        fragments = [str(element) for element in elements]

        second_filter = st.checkbox('Filtrar por atributo')
        data_atribute = None
        if second_filter:
            # Narrow down further by the chosen attribute.
            data_atribute = st.selectbox(
                'Atributo', options=self._attribute_names(fragments))
            df = pd.DataFrame(
                self._attribute_values(fragments, data_atribute))
        else:
            df = pd.DataFrame(fragments)

        transform_to_text = st.checkbox('Obtener texto contenido')
        if transform_to_text:
            if second_filter:
                # ``Tag.get`` yields None when the attribute is absent; a
                # substring test on the tag text followed by ``item[attr]``
                # could raise KeyError (e.g. "id" inside class="hidden").
                df = pd.DataFrame(
                    [element.get(data_atribute) for element in elements])
            else:
                # Text nodes are joined with '#', newlines dropped, and the
                # result split again so each text node becomes one column.
                rows = [
                    element.get_text('#').replace('\n', '').split('#')
                    for element in elements
                ]
                df = pd.DataFrame(rows)

        self.df = df
        st.table(df)

    def save(self):
        """Offer ``self.df`` for download as CSV and as Excel.

        Requires ``filter_content`` to have been called so that ``self.df``
        exists.
        """
        c1, c2, c3 = st.columns([7, 1, 1])
        c1.write('\n**Formato de descarga**')

        # CSV download.
        csv = self.df.to_csv(header=False, index=False).encode('utf-8')
        c2.download_button(label="csv", data=csv, file_name='DB.csv')

        # Excel download. The workbook is only written into the buffer when
        # the writer is closed, so the buffer is read after the ``with``.
        output = BytesIO()
        with pd.ExcelWriter(output, engine='xlsxwriter') as excel_file:
            self.df.to_excel(excel_file, header=False, index=False)
        c3.download_button(
            label="excel", data=output.getvalue(), file_name='DB.xlsx')
#https://docs.python.org/3/library/random.html