aashwinik's picture
Update Workflow/Scrape.py
90f6dc0 verified
raw history blame
No virus
988 Bytes
import requests
import bs4
import lxml
import pandas as pd
import streamlit as st
import os
# Publish the path of the input workbook through the INPUT_DATA environment
# variable; display_data() reads it back from there.
# NOTE(review): the previous comment said this was "to silence warning" --
# which warning is not evident from this file; confirm before relying on it.
os.environ.update({"INPUT_DATA": "Input/Sites.xlsx"})
def get_inputData(filePath):
    """Load the Excel workbook at ``filePath`` into a pandas DataFrame.

    The file is parsed with the ``openpyxl`` engine and the resulting
    DataFrame is returned as-is, without any further processing.
    """
    return pd.read_excel(filePath, engine="openpyxl")
def open_url(url, timeout=10.0):
    """Fetch ``url`` with an HTTP GET request and return the response.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. The value
            is forwarded to ``requests.get``; pass ``None`` to wait
            indefinitely, which was the behavior before this parameter
            existed.

    Returns:
        The response object produced by ``requests.get``. The HTTP status
        code is not inspected here; callers receive error responses too.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    # Without an explicit timeout, requests may block forever on an
    # unresponsive host, which would hang the whole app.
    return requests.get(url, timeout=timeout)
def do_scrape(result):
    """Return the text of every ``<h3>`` element found in a fetched page.

    ``result`` must expose the page markup as ``result.text``. The markup is
    parsed with the ``lxml`` parser, and the text of each element matched by
    the ``h3`` selector is collected into a list, in the order the selector
    yields them.
    """
    page = bs4.BeautifulSoup(result.text, "lxml")
    return [heading.get_text() for heading in page.select("h3")]
def display_data():
    """Scrape every site listed in the input workbook and display the results.

    The workbook path is taken from the ``INPUT_DATA`` environment variable
    and loaded with ``get_inputData``. Each row must provide a ``SiteURL``
    and a ``SiteName`` column. For every row the page is fetched with
    ``open_url``, its headings are extracted with ``do_scrape``, and a
    two-column table (``Source``, ``Places``) is rendered through
    ``st.dataframe`` -- one table per site.
    """
    filePath = os.environ.get('INPUT_DATA')
    df = get_inputData(filePath)
    for _, row in df.iterrows():
        result = open_url(row['SiteURL'])
        places = do_scrape(result)
        # Build the table in a single step. Growing it with pd.concat once
        # per place recopies the whole frame on each iteration (quadratic)
        # and starts by concatenating onto an empty frame.
        # A site with no matches now yields an empty table that still has
        # the two column headers.
        pl = pd.DataFrame({
            'Source': [row['SiteName']] * len(places),
            'Places': places,
        })
        st.dataframe(pl)