import streamlit as st
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from datetime import datetime


def find_linked_urls_and_title(url):
    """Fetch a page, returning the set of href values found and the page title."""
    try:
        # timeout added so a slow or unresponsive host cannot hang the scan indefinitely
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            links = soup.find_all('a')
            urls = {link.get('href') for link in links if link.get('href') is not None}
            title_tag = soup.find('title')
            page_title = title_tag.text if title_tag else 'No Title Found'
            return urls, page_title
        else:
            st.write(f"Failed to retrieve {url}")
            return set(), 'No Title Found'
    except Exception as e:
        st.write(f"An error occurred with {url}: {e}")
        return set(), 'No Title Found'


def convert_to_absolute_urls(base_url, links):
    # Resolve relative hrefs against the page they were found on; absolute URLs pass through unchanged.
    return {urljoin(base_url, link) if not link.startswith('http') else link for link in links}


def categorize_links(base_url, links):
    # Split links by whether they share the scanned page's domain.
    internal_links, external_links = set(), set()
    for link in links:
        if urlparse(link).netloc == urlparse(base_url).netloc:
            internal_links.add(link)
        else:
            external_links.add(link)
    return internal_links, external_links


def display_editable_table(df):
    # num_rows="dynamic" allows the user to add and delete rows in the editor
    edited_df = st.data_editor(data=df, key="data_editor_key", num_rows="dynamic")
    return edited_df


def prepare_dataframe(df):
    if "Ignore" not in df.columns:
        df["Ignore"] = False  # Initialize all values as False
    return df


def store_data(df):
    st.session_state['data'] = df


def main():
    # menu()
    st.title("Data Source Configuration")

    # Initialize 'scanned_urls' with all columns, including 'Ignore'
    if 'scanned_urls' not in st.session_state:
        st.session_state['scanned_urls'] = pd.DataFrame(
            columns=['URL', 'Type', 'Page Name', 'Scanned DateTime', 'Ignore'])

    st.subheader("Scan Websites for URLs")
    url_input = st.text_area("Enter URLs to scan, separated by new lines:", "https://fubarlabs.org")
    url_list = [url.strip() for url in url_input.strip().split('\n') if url.strip()]

    scan_button_clicked = st.button("Scan URLs")
    if scan_button_clicked:
        for url in url_list:
            unique_urls, page_title = find_linked_urls_and_title(url)
            scan_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            absolute_urls = convert_to_absolute_urls(url, unique_urls)
            internal_links, external_links = categorize_links(url, absolute_urls)
            # Build one row per discovered link; 'link' is used here to avoid
            # shadowing the 'url' currently being scanned. 'Ignore' starts False.
            new_entries = pd.DataFrame(
                [(link, 'Internal', page_title, scan_datetime, False) for link in internal_links] +
                [(link, 'External', page_title, scan_datetime, False) for link in external_links],
                columns=['URL', 'Type', 'Page Name', 'Scanned DateTime', 'Ignore'])
            # Note: drop_duplicates() compares all columns, so re-scanning the same
            # site with a new timestamp will still append new rows.
            st.session_state['scanned_urls'] = pd.concat(
                [st.session_state['scanned_urls'], new_entries]).drop_duplicates().reset_index(drop=True)
            store_data(st.session_state['scanned_urls'])

    if not st.session_state['scanned_urls'].empty:
        # Prepare the dataframe; this now includes the 'Ignore' column from the start
        prepared_df = prepare_dataframe(st.session_state['scanned_urls'])

        # Display the editable table with an "Ignore" column
        edited_df = display_editable_table(prepared_df)
        if edited_df is not None:
            st.session_state['scanned_urls'] = edited_df

        # Access the edits made to the table
        if "data_editor_key" in st.session_state:
            edits = st.session_state["data_editor_key"]
            st.write("Edits made to the table:")
            st.write(edits)

    if st.button('Proceed to Data Organization'):
        st.switch_page('pages/02_data_organization.py')


if __name__ == "__main__":
    main()