mpi_data_store / pages / file_web_source_collection.py
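"""Streamlit page for collecting web data sources.

Scans user-supplied URLs for linked pages, categorizes the discovered links
as internal or external, lets the user pick internal links to keep, and
offers the selection as a CSV download.
"""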
import streamlit as st
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

def find_linked_urls(url):
    """Fetch a page and return the set of href values found in its <a> tags."""
    try:
        # A timeout keeps one slow host from hanging the whole scan.
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            links = soup.find_all('a')
            return {link.get('href') for link in links if link.get('href') is not None}
        st.write(f"Failed to retrieve {url} (status code {response.status_code})")
    except Exception as e:
        st.write(f"An error occurred with {url}: {e}")
    return set()

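# Illustrative example (hypothetical page): if the fetched HTML contains
# <a href="/about">, <a href="https://example.org/">, and a bare <a> with no
# href, find_linked_urls returns {'/about', 'https://example.org/'};
# relative hrefs stay relative here and are resolved later by
# convert_to_absolute_urls().
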
def convert_to_absolute_urls(base_url, links):
    """Resolve relative links against base_url; urljoin leaves absolute URLs unchanged."""
    return {urljoin(base_url, link) for link in links}

def categorize_links(base_url, links):
    """Split links into internal and external sets by comparing network locations."""
    internal_links, external_links = set(), set()
    base_netloc = urlparse(base_url).netloc
    for link in links:
        if urlparse(link).netloc == base_netloc:
            internal_links.add(link)
        else:
            external_links.add(link)
    return internal_links, external_links

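# Illustrative example (hypothetical URLs): with base_url
# 'https://example.com', a link to 'https://example.com/docs' is internal
# and 'https://other.net/page' is external, since only the network location
# ('example.com') is compared; paths and schemes are ignored.
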
def main():
    st.title("Data Source Configuration")

    # Persist scan results across Streamlit reruns so selections survive
    # widget interactions.
    if 'scanned_urls' not in st.session_state:
        st.session_state['scanned_urls'] = {}

    st.subheader("Scan Websites for URLs")
    url_input = st.text_area("Enter URLs to scan, separated by new lines:")
    # Split the input on newlines and drop blank entries.
    url_list = [url.strip() for url in url_input.strip().split('\n') if url.strip()]
    scan_button_clicked = st.button("Scan URLs")

    if scan_button_clicked or st.session_state['scanned_urls']:
        if scan_button_clicked:
            for url in url_list:
                unique_urls = find_linked_urls(url)
                absolute_urls = convert_to_absolute_urls(url, unique_urls)
                internal_links, external_links = categorize_links(url, absolute_urls)
                st.session_state['scanned_urls'][url] = {"internal": internal_links, "external": external_links}

        selected_urls = []
        for base_url, links in st.session_state['scanned_urls'].items():
            st.write(f"Base URL: {base_url}")
            include_all_internal = st.checkbox(f"Include all internal links from {base_url}", key=f"all_{base_url}")
            if include_all_internal:
                selected_urls.extend(links["internal"])
            else:
                # Key each checkbox by base URL and link so the same link found
                # under two base URLs does not raise a duplicate-widget error.
                selected_internal = [link for link in links["internal"] if st.checkbox(link, key=f"{base_url}_{link}")]
                selected_urls.extend(selected_internal)
            if links["external"]:
                st.write("External links:")
                for link in links["external"]:
                    st.write(link)

        if selected_urls:
            df_selected_urls = pd.DataFrame(selected_urls, columns=['Selected URLs'])
            st.write(df_selected_urls)
            st.session_state['selected_urls'] = df_selected_urls

            # Convert the DataFrame to CSV for download.
            csv = df_selected_urls.to_csv(index=False).encode('utf-8')
            st.download_button(
                label="Download selected URLs as CSV",
                data=csv,
                file_name='selected_urls.csv',
                mime='text/csv',
            )

if __name__ == "__main__":
    main()
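
# To try this page locally (assuming Streamlit and the other dependencies
# are installed, e.g. `pip install streamlit pandas requests beautifulsoup4`):
#
#     streamlit run pages/file_web_source_collection.py
#
# In a multi-page Streamlit app, files placed under pages/ are also picked
# up automatically as separate pages.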