# Scrape place names (h3 headings) from the sites listed in an Excel
# input file and display the results in a Streamlit app.
# NOTE(review): the original paste carried Streamlit-Cloud log residue
# ("Spaces:" / "Sleeping") here — not part of the script.
# Standard library
import os

# Third-party
import bs4
import lxml  # noqa: F401 — backend for BeautifulSoup's "lxml" parser
import pandas as pd
import requests
import streamlit as st

# Default location of the input workbook; set here to silence a warning
# when the variable is not provided by the environment.
os.environ["INPUT_DATA"] = "Input/Sites.xlsx"
def get_inputData(filePath):
    """Read the sites workbook into a DataFrame.

    Parameters
    ----------
    filePath : str
        Path to an .xlsx file; expected to contain at least the
        'SiteName' and 'SiteURL' columns used by display_data().

    Returns
    -------
    pandas.DataFrame
        The first sheet of the workbook.
    """
    # engine='openpyxl' is required for .xlsx files with some pandas builds.
    return pd.read_excel(io=filePath, engine='openpyxl')
def open_url(url, timeout=30):
    """Fetch *url* and return the requests Response object.

    Parameters
    ----------
    url : str
        The page to retrieve.
    timeout : float, optional
        Seconds to wait for the server (connect + read). Without a
        timeout, requests.get can block indefinitely on a dead host.

    Returns
    -------
    requests.Response
    """
    return requests.get(url, timeout=timeout)
def do_scrape(result):
    """Extract the text of every <h3> element from a fetched page.

    Parameters
    ----------
    result : requests.Response
        Response returned by open_url(); only .text is used.

    Returns
    -------
    list[str]
        Text content of each <h3> tag, in document order (empty list
        if the page has none).
    """
    soup = bs4.BeautifulSoup(result.text, "lxml")
    # Comprehension replaces the original append loop (same order/content).
    return [tag.getText() for tag in soup.select('h3')]
def display_data():
    """Scrape every site in the input workbook and render one table per site.

    Reads the workbook path from the INPUT_DATA environment variable,
    fetches each row's SiteURL, scrapes its <h3> headings, and shows a
    ('Source', 'Places') DataFrame in Streamlit for each site.
    """
    filePath = os.environ.get('INPUT_DATA')
    df = get_inputData(filePath)
    for index, row in df.iterrows():
        result = open_url(row['SiteURL'])
        places = do_scrape(result)
        # Build all rows first, then construct the DataFrame once —
        # pd.concat inside the loop is quadratic in the number of places.
        rows = [{'Source': row['SiteName'], 'Places': p} for p in places]
        pl = pd.DataFrame(rows, columns=['Source', 'Places'])
        st.dataframe(pl)