"""Streamlit app that builds a dataset card for a HuggingFace dataset.

The app loads the ``train`` split of a dataset, asks OpenAI to analyze a
small sample, lets the user pick columns to visualize, and renders a
downloadable README.md.
"""

import json

import pandas as pd
import streamlit as st
from datasets import load_dataset
from huggingface_hub import HfApi, login
from openai import OpenAI

# Import our utility functions
from utils.analysis import analyze_dataset_with_openai, generate_dataset_card
from utils.visualization import create_distribution_plot, create_wordcloud

# Initialize session state variables. Streamlit re-executes this script on
# every interaction, so values are only seeded when they are missing.
if "openai_analysis" not in st.session_state:
    st.session_state.openai_analysis = None
if "df" not in st.session_state:
    st.session_state.df = None
if "dataset_name" not in st.session_state:
    st.session_state.dataset_name = None
if "selected_dist_columns" not in st.session_state:
    st.session_state.selected_dist_columns = []
if "selected_wordcloud_columns" not in st.session_state:
    st.session_state.selected_wordcloud_columns = []

st.set_page_config(
    page_title="Dataset Card Generator",
    page_icon="📊",
    layout="wide",
)


def initialize_openai_client(api_key):
    """Return an ``OpenAI`` client configured with *api_key*.

    If the plain constructor raises, the error is printed and construction
    is retried with an explicit ``User-Agent`` header; an exception from
    that second attempt propagates to the caller.
    """
    try:
        # Basic initialization without any proxy settings
        return OpenAI(api_key=api_key)
    except Exception as e:
        print(f"Error initializing OpenAI client: {e}")
        # If that fails, try with default configuration
        return OpenAI(
            api_key=api_key,
            default_headers={"User-Agent": "Dataset-Card-Generator"},
        )


def load_and_analyze_dataset(dataset_name):
    """Load the ``train`` split of *dataset_name* and analyze a sample.

    On a successful load the DataFrame and dataset name are stored in
    ``st.session_state``. The first five rows are then sent to
    ``analyze_dataset_with_openai``; the result is stored in
    ``st.session_state.openai_analysis``.

    A load failure is reported via the status widget and ``st.error``. An
    analysis failure leaves the dataset loaded but keeps the status widget
    in its error state and ``openai_analysis`` set to ``None``.
    """
    progress_container = st.empty()

    with progress_container.container():
        with st.status("Loading dataset...", expanded=True) as status:
            try:
                # Load dataset
                status.write("📥 Loading dataset from HuggingFace...")
                dataset = load_dataset(dataset_name, split="train")
                df = pd.DataFrame(dataset)
            except Exception as e:
                status.update(label=f"❌ Error: {str(e)}", state="error")
                st.error(f"Failed to load dataset: {str(e)}")
                return

            st.session_state.df = df
            st.session_state.dataset_name = dataset_name
            # Drop any analysis left over from a previously loaded dataset
            # so a failed analysis below cannot pair stale results with
            # the new DataFrame.
            st.session_state.openai_analysis = None

            # Initialize OpenAI analysis
            try:
                status.write("🤖 Analyzing dataset ...")
                client = initialize_openai_client(st.session_state.openai_key)
                sample_data = dataset[:5]
                # default=str keeps these debug dumps from raising on
                # values json cannot encode (timestamps, bytes, ...).
                print(
                    "Sample data:",
                    json.dumps(sample_data, indent=2, default=str),
                )

                analysis = analyze_dataset_with_openai(client, sample_data)
                print(
                    "Analysis result:",
                    json.dumps(analysis, indent=2, default=str),
                )
                st.session_state.openai_analysis = analysis
            except Exception as e:
                print(f"Analysis error: {str(e)}")
                status.update(label=f"❌ Error: {str(e)}", state="error")
            else:
                # Only report success when the analysis actually succeeded;
                # otherwise the error label above must stay visible.
                status.update(
                    label="✅ Dataset loaded and analyzed successfully!",
                    state="complete",
                )


def _first_value_is_list(series):
    """Return True when *series* is non-empty and its first value is a list."""
    return len(series) > 0 and isinstance(series.iloc[0], list)


def display_dataset_analysis():
    """Render the dataset preview and the visualization column pickers.

    Does nothing when no DataFrame is loaded. The selected columns are
    written to ``st.session_state`` and, when the generate button is
    pressed with at least one selection, the dataset card is produced.
    """
    df = st.session_state.df
    if df is None:
        return

    st.header("Dataset Analysis")

    # Dataset preview
    with st.expander("📊 Dataset Preview", expanded=True):
        st.dataframe(df.head(), use_container_width=True)

    # Column selection for visualizations
    st.subheader("Select Visualization Fields")

    col1, col2 = st.columns(2)

    with col1:
        # Distribution plot selection
        st.session_state.selected_dist_columns = st.multiselect(
            "Distribution Plots (max 2)",
            options=df.columns.tolist(),
            format_func=lambda x: get_column_type_description(df, x),
            max_selections=2,
            help="Select columns to show value distributions. List columns will show frequency of individual items.",
        )

    with col2:
        # Word cloud selection; the helper guards against an empty
        # DataFrame, where .iloc[0] would raise IndexError.
        text_columns = [
            col
            for col in df.columns
            if df[col].dtype == "object" or _first_value_is_list(df[col])
        ]

        st.session_state.selected_wordcloud_columns = st.multiselect(
            "Word Clouds (max 2)",
            options=text_columns,
            format_func=lambda x: get_column_type_description(df, x),
            max_selections=2,
            help="Select text columns to generate word clouds",
        )

    # Add some spacing
    st.markdown("---")

    # Generate card button
    if st.button("Generate Dataset Card", type="primary", use_container_width=True):
        if not (
            st.session_state.selected_dist_columns
            or st.session_state.selected_wordcloud_columns
        ):
            st.warning(
                "Please select at least one visualization before generating the card."
            )
            return

        generate_and_display_card()


def generate_and_display_card():
    """Build the visualizations and dataset card, then render them.

    Requires ``st.session_state.openai_analysis``; shows an error and
    returns when it is missing. Any exception raised while generating is
    printed, shown with ``st.error`` and re-raised.
    """
    if not st.session_state.openai_analysis:
        st.error(
            "Dataset analysis not available. Please try loading the dataset again."
        )
        return

    with st.status("Generating dataset card...", expanded=True) as status:
        try:
            # Create visualizations
            status.write("📊 Creating distribution plots...")
            distribution_plots = {}
            for col in st.session_state.selected_dist_columns:
                print(f"Generating distribution plot for {col}")
                distribution_plots[col] = create_distribution_plot(
                    st.session_state.df, col
                )
                print(f"Successfully created plot for {col}")

            status.write("🔤 Generating word clouds...")
            wordcloud_plots = {}
            for col in st.session_state.selected_wordcloud_columns:
                print(f"Generating word cloud for {col}")
                wordcloud_plots[col] = create_wordcloud(st.session_state.df, col)
                print(f"Successfully created word cloud for {col}")

            # Generate dataset card content
            status.write("📝 Composing dataset card...")
            dataset_info = {"dataset_name": st.session_state.dataset_name}

            readme_content = generate_dataset_card(
                dataset_info=dataset_info,
                distribution_plots=distribution_plots,
                wordcloud_plots=wordcloud_plots,
                openai_analysis=st.session_state.openai_analysis,
                df=st.session_state.df,
            )

            # Display results
            status.update(label="✅ Dataset card generated!", state="complete")

            # Display the markdown with images
            st.markdown(readme_content, unsafe_allow_html=True)

            # Add download button
            st.download_button(
                label="⬇️ Download Dataset Card",
                data=readme_content,
                file_name="README.md",
                mime="text/markdown",
                use_container_width=True,
            )

        except Exception as e:
            print(f"Error in generate_and_display_card: {str(e)}")
            st.error(f"Error generating dataset card: {str(e)}")
            # Bare raise keeps the original traceback intact.
            raise


def get_column_type_description(data, column):
    """Return *column* labelled with a user-friendly type description.

    The label is ``(list)`` when the first value is a list, ``(numeric)``
    for int64/float64 columns, ``(text/categorical)`` otherwise, and
    ``(unknown)`` when the column cannot be inspected.
    """
    try:
        if _first_value_is_list(data[column]):
            return f"{column} (list)"
        elif data[column].dtype in ["int64", "float64"]:
            return f"{column} (numeric)"
        else:
            return f"{column} (text/categorical)"
    except Exception:
        # Used as a widget format_func: a label must always come back.
        return f"{column} (unknown)"


def get_api_keys():
    """Return ``(hf_token, openai_key)`` from Streamlit secrets.

    Returns ``(None, None)`` when the secrets file or either key is
    missing.
    """
    # Try to get from secrets first
    try:
        hf_token = st.secrets["api_keys"]["huggingface"]
        openai_key = st.secrets["api_keys"]["openai"]
        return hf_token, openai_key
    except Exception:
        return None, None


def get_secrets():
    """Return ``(hf_token, openai_key)`` from secrets.toml if it exists.

    Missing keys yield empty strings; any error while reading secrets is
    printed and ``("", "")`` is returned.
    """
    try:
        hf_token = st.secrets.get("api_keys", {}).get("huggingface", "")
        openai_key = st.secrets.get("api_keys", {}).get("openai", "")
        return hf_token, openai_key
    except Exception as e:
        print(f"No secrets file found or error reading secrets: {e}")
        return "", ""


def main():
    """Run the app: collect credentials, load a dataset, show the analysis."""
    st.title("📊 Dataset Card Generator")
    st.markdown(
        """
    Generate beautiful documentation for your HuggingFace datasets with automated analysis,
    visualizations, and formatted dataset cards.
    """
    )

    # Get secrets if available
    default_hf_token, default_openai_key = get_api_keys()

    # Authentication section in sidebar
    with st.sidebar:
        st.header("🔑 Authentication")

        # Both inputs are always masked: keys preloaded from secrets must
        # not be rendered in plain text to whoever opens the page.
        # OpenAI API key (required)
        openai_key = st.text_input(
            "OpenAI API Key",
            value=default_openai_key or "",
            type="password",
            help="Required: Your OpenAI API key for dataset analysis",
        )

        # HuggingFace token (optional)
        hf_token = st.text_input(
            "HuggingFace Token (optional)",
            value=default_hf_token or "",
            type="password",
            help="Optional: Only required for private datasets",
        )

        if not openai_key:
            st.info("👆 Please enter your OpenAI API key to get started.")
            return

        try:
            # Only attempt HF login if token is provided
            if hf_token:
                login(hf_token)
                st.success("✅ HuggingFace authentication successful!")
            st.session_state.openai_key = openai_key
            st.success("✅ OpenAI API key set!")
        except Exception as e:
            st.error(f"❌ Authentication error: {str(e)}")
            return

    # Main content area
    dataset_name = st.text_input(
        "Enter HuggingFace Dataset Name",
        placeholder="username/dataset",
        help="Enter the full path to your HuggingFace dataset (e.g., 'username/dataset')",
    )

    if dataset_name:
        if st.button("Load Dataset", type="primary"):
            load_and_analyze_dataset(dataset_name)

        if st.session_state.df is not None:
            display_dataset_analysis()


if __name__ == "__main__":
    main()