import os

import bs4
import pandas as pd
import requests
import streamlit as st

# Default input workbook; set INPUT_DATA in the environment to override.
# (bs4 loads the "lxml" parser by name, so lxml only needs to be installed,
# not imported.)
os.environ.setdefault("INPUT_DATA", "Input/Sites.xlsx")
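
# Illustrative layout of the input workbook (the row values below are made
# up; only the 'SiteName' and 'SiteURL' column names are what the code reads):
#
#   | SiteName    | SiteURL                    |
#   |-------------|----------------------------|
#   | ExampleSite | https://example.com/places |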


def get_inputData(filePath):
    """Read the sites workbook; it must provide 'SiteName' and 'SiteURL' columns."""
    return pd.read_excel(io=filePath, engine="openpyxl")

def open_url(url):
    """Fetch a page, failing fast on network or HTTP errors."""
    result = requests.get(url, timeout=10)
    result.raise_for_status()
    return result

def do_scrape(result):
    """Extract the text of every <h3> heading on the page.

    Assumes the place names on the target sites appear in <h3> tags.
    """
    soup = bs4.BeautifulSoup(result.text, "lxml")
    return [tag.getText(strip=True) for tag in soup.select("h3")]

def display_data():
    filePath = os.environ.get("INPUT_DATA")
    df = get_inputData(filePath)

    # Accumulate one (Source, Places) row per scraped heading across all
    # sites, then render a single table.
    rows = []
    for _, row in df.iterrows():
        result = open_url(row["SiteURL"])
        for p in do_scrape(result):
            rows.append({"Source": row["SiteName"], "Places": p})

    pl = pd.DataFrame(rows, columns=["Source", "Places"])
    st.dataframe(pl)
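

# Entry point: a minimal sketch assuming the script is launched with
# `streamlit run <this file>`. Streamlit executes the module as __main__ on
# each rerun, so the guard still fires there while keeping plain imports
# side-effect free.
if __name__ == "__main__":
    display_data()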