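"""Streamlit page: scan websites for their linked URLs and review the results in an editable table."""
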
import streamlit as st
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from datetime import datetime

def find_linked_urls_and_title(url):
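    """Fetch *url* and return (set of raw href values found on the page, page title)."""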
    try:
        response = requests.get(url, timeout=10)  # avoid hanging indefinitely on unresponsive hosts
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            links = soup.find_all('a')
            urls = {link.get('href') for link in links if link.get('href') is not None}
            title_tag = soup.find('title')
            page_title = title_tag.text if title_tag else 'No Title Found'
            return urls, page_title
        else:
            st.write(f"Failed to retrieve {url}")
            return set(), 'No Title Found'
    except Exception as e:
        st.write(f"An error occurred with {url}: {e}")
        return set(), 'No Title Found'

def convert_to_absolute_urls(base_url, links):
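    """Resolve every link in *links* against *base_url*, returning absolute URLs."""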
    # urljoin leaves already-absolute URLs untouched, so no scheme check is needed;
    # it also correctly resolves relative paths that merely start with "http"
    return {urljoin(base_url, link) for link in links}

def categorize_links(base_url, links):
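    """Split links into those on the same host as *base_url* and everything else."""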
    internal_links, external_links = set(), set()
    base_netloc = urlparse(base_url).netloc  # parse the base host once, not per link
    for link in links:
        if urlparse(link).netloc == base_netloc:
            internal_links.add(link)
        else:
            external_links.add(link)
    return internal_links, external_links

def display_editable_table(df):
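    """Render the URL table in an editable grid and return the edited dataframe."""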
    # num_rows="dynamic" lets the user add and delete rows in the editor
    edited_df = st.data_editor(data=df, key="data_editor_key", num_rows="dynamic")
    return edited_df

def prepare_dataframe(df):
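    """Ensure the dataframe has an 'Ignore' checkbox column, defaulting to False."""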
    if "Ignore" not in df.columns:
        df["Ignore"] = False  # Initialize all values as False
    return df

def store_data(df):
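    # Mirror the table into st.session_state['data'] (presumably consumed by later pages)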
    st.session_state['data'] = df

def main():
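    """Render the Data Source Configuration page."""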
    #menu()

    st.title("Data Source Configuration")
    
    # Initialize 'scanned_urls' with all columns, including 'Ignore'
    if 'scanned_urls' not in st.session_state:
        st.session_state['scanned_urls'] = pd.DataFrame(columns=['URL', 'Type', 'Page Name', 'Scanned DateTime', 'Ignore'])
    
    st.subheader("Scan Websites for URLs")
    url_input = st.text_area("Enter URLs to scan, separated by new lines:", "https://fubarlabs.org")
    url_list = [url.strip() for url in url_input.strip().split('\n') if url.strip()]
    scan_button_clicked = st.button("Scan URLs")
    
    if scan_button_clicked:
        for url in url_list:
            unique_urls, page_title = find_linked_urls_and_title(url)
            scan_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            absolute_urls = convert_to_absolute_urls(url, unique_urls)
            internal_links, external_links = categorize_links(url, absolute_urls)
            
            # Build rows for this scan; use `link` as the comprehension variable so it
            # does not shadow the outer `url` being scanned
            new_entries = pd.DataFrame([(link, 'Internal', page_title, scan_datetime, False) for link in internal_links] +
                                       [(link, 'External', page_title, scan_datetime, False) for link in external_links],
                                       columns=['URL', 'Type', 'Page Name', 'Scanned DateTime', 'Ignore'])
            combined = pd.concat([st.session_state['scanned_urls'], new_entries])
            # Dedupe on URL (keeping the newest scan) so re-scanning a site updates rows
            # instead of accumulating near-duplicates that differ only in the timestamp
            st.session_state['scanned_urls'] = combined.drop_duplicates(subset=['URL'], keep='last').reset_index(drop=True)
            store_data(st.session_state['scanned_urls'])

    if not st.session_state['scanned_urls'].empty:
        # prepare_dataframe guarantees the 'Ignore' column exists before rendering
        prepared_df = prepare_dataframe(st.session_state['scanned_urls'])
        
        # Display the editable table with an "Ignore" column
        edited_df = display_editable_table(prepared_df)
        
        if edited_df is not None:
            st.session_state['scanned_urls'] = edited_df

        # With a key set, st.session_state["data_editor_key"] holds the editor's change
        # log (edited_rows / added_rows / deleted_rows), not the full dataframe
        if "data_editor_key" in st.session_state:
            edits = st.session_state["data_editor_key"]
            st.write("Edits made to the table:")
            st.write(edits)

        if st.button('Proceed to Data Organization'):
            st.switch_page('pages/02_data_organization.py')

if __name__ == "__main__":
    main()