import urllib.request as req
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
import streamlit as st
from io import BytesIO
class BeautifulSoup_st:
    def __init__(self, width=600, height=400):
        # Iframe size, stored for embedding (not used by the methods below)
        self.iframe_width = width
        self.iframe_height = height

    def load_page(self, url):
        # Display the page; YouTube watch URLs are rewritten to their embeddable form
        self.url = url.replace("watch?v=", "embed/")
        page = req.urlopen(self.url)
        soup = bs(page, 'html.parser')
        html_code = st.checkbox('HTML code')
        if html_code:
            st.code(soup.prettify())
    def filter_content(self, data_type='p'):
        page = req.urlopen(self.url)
        soup = bs(page, 'html.parser')
        # Collect every tag name that appears in the page body
        page_content = str(soup.body)
        data_type_options = [i[1:] for i in set(re.findall(r'<\w+', page_content))]
        data_type = st.selectbox('Item type', options=data_type_options)
        # Filter by the chosen tag
        content_filtered_1 = [str(i) for i in soup.body.findAll(data_type)]
        second_filter = st.checkbox('Filter by attribute')
        # Optionally narrow down by the chosen attribute
        if second_filter:
            content_str_filter = ' '.join(content_filtered_1)
            attribute_options = [i[:-1] for i in set(re.findall(r'\w+=', content_str_filter))]
            data_attribute = st.selectbox('Attribute', options=attribute_options)
            content_filtered_2 = [str(re.findall(f'.*{data_attribute}="([^"]*)".*', i)) for i in content_filtered_1]
            df = pd.DataFrame(content_filtered_2)
        else:
            df = pd.DataFrame(content_filtered_1)
        transform_to_text = st.checkbox('Extract contained text')
        if transform_to_text:
            if second_filter:
                # Keep only the value of the chosen attribute for each matching tag
                content = soup.body.findAll(data_type)
                result = []
                for item in content:
                    opening_tag = str(item).split('>')[0]
                    value = None
                    if data_attribute in opening_tag:
                        value = item[data_attribute]
                    result.append(value)
                df = pd.DataFrame(result)
            else:
                # Keep only the text inside each tag, split into one column per chunk
                content = soup.body.findAll(data_type)
                texts = []
                for item in content:
                    text = item.getText('#').replace('\n', '')
                    texts.append(text)
                data = [i.split('#') for i in texts]
                df = pd.DataFrame(data)
        self.df = df
        st.table(df)
    def save(self):
        c1, c2, c3 = st.columns([7, 1, 1])
        c1.write('''
        **Download format**''')
        # Download as csv
        csv = self.df.to_csv(header=False, index=False).encode('utf-8')
        c2.download_button(
            label="csv",
            data=csv,
            file_name='DB.csv')
        # Download as excel; the writer must be closed so the workbook is flushed to the buffer
        output = BytesIO()
        excel_file = pd.ExcelWriter(output, engine='xlsxwriter')
        self.df.to_excel(excel_file, header=False, index=False)
        excel_file.close()
        data = output.getvalue()
        c3.download_button(
            label="excel",
            data=data,
            file_name='DB.xlsx')
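
# A minimal usage sketch, assuming the class above is run as a Streamlit app.
# The entry point and the text_input widget are illustrative additions, not part
# of the original class; the default URL reuses the Python random-module docs
# link left as a trailing comment in the original source.
if __name__ == '__main__':
    scraper = BeautifulSoup_st()
    url = st.text_input('URL', value='https://docs.python.org/3/library/random.html')
    if url:
        scraper.load_page(url)       # fetch the page and optionally show its HTML
        scraper.filter_content()     # pick a tag, filter, and preview the table
        scraper.save()               # offer csv / excel downloads of the result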