import streamlit as st
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse


def find_linked_urls(url):
    """Fetch a page and return the set of raw href values from its <a> tags."""
    try:
        response = requests.get(url, timeout=10)  # timeout so one slow host can't hang the scan
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            links = soup.find_all('a')
            return {link.get('href') for link in links if link.get('href') is not None}
        st.write(f"Failed to retrieve {url} (status code {response.status_code})")
    except Exception as e:
        st.write(f"An error occurred with {url}: {e}")
    return set()  # always return a set so callers never have to handle None


def convert_to_absolute_urls(base_url, links):
    """Resolve relative hrefs against the page they were found on.

    urljoin leaves already-absolute URLs unchanged, so every link can be
    passed through it without a special case.
    """
    return {urljoin(base_url, link) for link in links}


def categorize_links(base_url, links):
    """Split links into same-host (internal) and other-host (external) sets."""
    internal_links, external_links = set(), set()
    base_netloc = urlparse(base_url).netloc
    for link in links:
        if urlparse(link).netloc == base_netloc:
            internal_links.add(link)
        else:
            external_links.add(link)
    return internal_links, external_links


def main():
    st.title("Data Source Configuration")
    st.subheader("Scan Websites for URLs")

    url_input = st.text_area("Enter URLs to scan, one per line:")
    url_list = [url.strip() for url in url_input.splitlines() if url.strip()]

    # Streamlit re-runs this script on every widget interaction, so results
    # rendered only while st.button() returns True would vanish the moment a
    # checkbox was toggled. Persisting the scan in st.session_state keeps the
    # checkboxes (and the save button below) working across reruns.
    if st.button("Scan URLs"):
        all_links = {}
        for url in url_list:
            unique_urls = find_linked_urls(url)
            absolute_urls = convert_to_absolute_urls(url, unique_urls)
            internal_links, external_links = categorize_links(url, absolute_urls)
            all_links[url] = {"internal": internal_links, "external": external_links}
        st.session_state["all_links"] = all_links

    selected_urls = []
    for base_url, links in st.session_state.get("all_links", {}).items():
        st.write(f"Base URL: {base_url}")
        include_all_internal = st.checkbox(
            f"Include all internal links from {base_url}", key=f"all_{base_url}"
        )
        if include_all_internal:
            selected_urls.extend(links["internal"])
        else:
            # sorted() keeps the checkbox order stable between reruns
            selected_urls.extend(
                link for link in sorted(links["internal"]) if st.checkbox(link, key=link)
            )
        # External links are displayed for information only; they are not selectable.
        if links["external"]:
            st.write("External links:")
            for link in sorted(links["external"]):
                st.write(link)

    # Show the current selection as a DataFrame and offer to save it. Because
    # the scan results live in session state, this button is no longer nested
    # inside the "Scan URLs" branch and can actually fire.
    if selected_urls:
        df_selected_urls = pd.DataFrame(selected_urls, columns=['Selected URLs'])
        st.write(df_selected_urls)
        if st.button("Save Selected URLs to CSV"):
            df_selected_urls.to_csv('selected_urls.csv', index=False)
            st.success("Saved selected URLs to selected_urls.csv")


if __name__ == "__main__":
    main()
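
# --- Usage sketch (added for illustration; the filename app.py and the pip
# command are assumptions, not part of the original script) ---
# Save the script above as app.py and launch it with Streamlit's runner:
#
#   pip install streamlit pandas requests beautifulsoup4
#   streamlit run app.py
#
# Then paste one URL per line into the text area, click "Scan URLs", tick the
# internal links to keep, and click "Save Selected URLs to CSV" to write
# selected_urls.csv next to the script.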