Spaces:
Sleeping
Sleeping
File size: 988 Bytes
4609ad5 4d03e45 c5000c6 8f55f96 c5000c6 8f7a569 20d2097 19ad3ef c5000c6 a81a773 c5000c6 20d2097 c5000c6 19ad3ef 20d2097 19ad3ef 90f6dc0 c5000c6 1d048de 4609ad5 20d2097 c5000c6 19ad3ef c5000c6 33c8aaa 1a412ef 33c8aaa 1a412ef 33c8aaa 19ad3ef a6e9c25 5b6a512 1a412ef 6a80262 e241312 a6e9c25 19ad3ef 559e650 c5000c6 20d2097 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 |
import requests
import bs4
import lxml
import pandas as pd
import streamlit as st
import os
# Default path of the input workbook. setdefault keeps any value already
# present in the environment (e.g. set by the deployment) instead of
# unconditionally overwriting it the way a plain assignment would.
os.environ.setdefault("INPUT_DATA", "Input/Sites.xlsx")
def get_inputData(filePath):
    """Load the site list workbook at *filePath* into a DataFrame.

    Uses the openpyxl engine explicitly so .xlsx files are read
    regardless of the pandas default.
    """
    return pd.read_excel(io=filePath, engine='openpyxl')
def open_url(url, timeout=30):
    """Fetch *url* with an HTTP GET and return the Response object.

    Parameters
    ----------
    url : str
        The address to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout, requests.get can block forever on an
        unresponsive host.

    Returns
    -------
    requests.Response
        The raw response; callers inspect .text for the page body.
    """
    result = requests.get(url, timeout=timeout)
    return result
def do_scrape(result):
    """Extract the text of every <h3> element from an HTTP response.

    Parameters
    ----------
    result : requests.Response
        A response whose .text holds the HTML page to parse.

    Returns
    -------
    list[str]
        The text content of each <h3> tag, in document order
        (empty list when the page has none).
    """
    soup = bs4.BeautifulSoup(result.text, "lxml")
    # Comprehension replaces the manual append loop (same order, same items).
    return [tag.getText() for tag in soup.select('h3')]
def display_data():
    """Scrape every site listed in the input workbook and render results.

    Reads the workbook path from the INPUT_DATA environment variable,
    fetches each row's SiteURL, scrapes its <h3> headings, and shows one
    Streamlit dataframe (columns: Source, Places) per site.
    """
    filePath = os.environ.get('INPUT_DATA')
    df = get_inputData(filePath)
    for index, row in df.iterrows():
        result = open_url(row['SiteURL'])
        places = do_scrape(result)
        # Build all rows first, then construct the frame once: repeated
        # pd.concat inside the loop copies the frame each iteration
        # (accidentally quadratic). An empty `places` still yields an
        # empty DataFrame, matching the previous behavior.
        records = [{'Source': row['SiteName'], 'Places': p} for p in places]
        pl = pd.DataFrame(records)
        st.dataframe(pl)
|