Spaces:
Running
Running
File size: 10,782 Bytes
eb03925 fdea16e eb03925 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 |
import json
import pandas as pd
import streamlit as st
from datasets import load_dataset
from huggingface_hub import HfApi, login
from openai import OpenAI
# Import our utility functions
from utils.analysis import analyze_dataset_with_openai, generate_dataset_card
from utils.visualization import create_distribution_plot, create_wordcloud
# Seed every session-state key the app reads with a safe default so that
# later code can access them without existence checks.
_SESSION_DEFAULTS = {
    "openai_analysis": None,
    "df": None,
    "dataset_name": None,
    "selected_dist_columns": [],
    "selected_wordcloud_columns": [],
}
for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default

st.set_page_config(
    page_title="Dataset Card Generator",
    page_icon="π",
    layout="wide",
)
def initialize_openai_client(api_key):
    """Build an OpenAI client for *api_key*.

    First tries a plain construction; if that raises, logs the error and
    retries once with an explicit User-Agent header.
    """
    fallback_headers = {"User-Agent": "Dataset-Card-Generator"}
    try:
        # Plain construction, no proxy or header overrides.
        return OpenAI(api_key=api_key)
    except Exception as err:
        print(f"Error initializing OpenAI client: {err}")
        # Retry with an explicit default header configuration.
        return OpenAI(api_key=api_key, default_headers=fallback_headers)
def load_and_analyze_dataset(dataset_name):
    """Load a HuggingFace dataset's train split and run the OpenAI analysis.

    Side effects: populates ``st.session_state.df``, ``.dataset_name`` and
    ``.openai_analysis`` and renders a progress/status UI.

    Args:
        dataset_name: Full Hub path of the dataset (e.g. "user/dataset").
    """
    progress_container = st.empty()
    with progress_container.container():
        with st.status("Loading dataset...", expanded=True) as status:
            try:
                # Load the train split from the Hub into a DataFrame.
                status.write("π₯ Loading dataset from HuggingFace...")
                dataset = load_dataset(dataset_name, split="train")
                df = pd.DataFrame(dataset)
                st.session_state.df = df
                st.session_state.dataset_name = dataset_name
            except Exception as e:
                status.update(label=f"β Error: {str(e)}", state="error")
                st.error(f"Failed to load dataset: {str(e)}")
                return
            try:
                # Analyze a small sample of rows with OpenAI.
                status.write("π€ Analyzing dataset ...")
                client = initialize_openai_client(st.session_state.openai_key)
                sample_data = dataset[:5]
                print("Sample data:", json.dumps(sample_data, indent=2))
                analysis = analyze_dataset_with_openai(client, sample_data)
                print("Analysis result:", json.dumps(analysis, indent=2))
                st.session_state.openai_analysis = analysis
            except Exception as e:
                print(f"Analysis error: {str(e)}")
                status.update(label=f"β Error: {str(e)}", state="error")
                # BUG FIX: return here — the original fell through and
                # overwrote the error status with a "complete" success label.
                return
            # BUG FIX: the success label string literal was split across two
            # source lines (a syntax error); re-joined onto one line.
            status.update(
                label="β Dataset loaded and analyzed successfully!",
                state="complete",
            )
def display_dataset_analysis():
    """Render the analysis UI: preview, visualization pickers, generate button.

    No-op when no dataset has been loaded into session state yet.
    """
    df = st.session_state.df
    if df is None:
        return
    st.header("Dataset Analysis")
    # Dataset preview (first rows).
    with st.expander("π Dataset Preview", expanded=True):
        st.dataframe(df.head(), use_container_width=True)
    # Column selection for visualizations.
    st.subheader("Select Visualization Fields")
    col1, col2 = st.columns(2)
    with col1:
        # Distribution plot selection (any column type).
        st.session_state.selected_dist_columns = st.multiselect(
            "Distribution Plots (max 2)",
            options=df.columns.tolist(),
            format_func=lambda x: get_column_type_description(df, x),
            max_selections=2,
            help="Select columns to show value distributions. List columns will show frequency of individual items.",
        )
    with col2:
        # Word cloud selection: object-dtype columns, plus columns whose
        # first value is a list.
        # BUG FIX: guard iloc[0] — on an empty DataFrame the original raised
        # an unhandled IndexError for non-object columns.
        text_columns = [
            col
            for col in df.columns
            if df[col].dtype == "object"
            or (len(df) > 0 and isinstance(df[col].iloc[0], list))
        ]
        st.session_state.selected_wordcloud_columns = st.multiselect(
            "Word Clouds (max 2)",
            options=text_columns,
            format_func=lambda x: get_column_type_description(df, x),
            max_selections=2,
            help="Select text columns to generate word clouds",
        )
    # Add some spacing before the action button.
    st.markdown("---")
    if st.button("Generate Dataset Card", type="primary", use_container_width=True):
        # Require at least one visualization selection before generating.
        if not (
            st.session_state.selected_dist_columns
            or st.session_state.selected_wordcloud_columns
        ):
            st.warning(
                "Please select at least one visualization before generating the card."
            )
            return
        generate_and_display_card()
def generate_and_display_card():
    """Build the dataset card (plots + markdown), render it, offer a download.

    Requires ``st.session_state.openai_analysis``, ``.df``, ``.dataset_name``
    and the visualization column selections to be populated.
    """
    if not st.session_state.openai_analysis:
        st.error(
            "Dataset analysis not available. Please try loading the dataset again."
        )
        return
    with st.status("Generating dataset card...", expanded=True) as status:
        try:
            # Distribution plots for the selected columns (base64 images).
            status.write("π Creating distribution plots...")
            distribution_plots = {}
            for col in st.session_state.selected_dist_columns:
                print(f"Generating distribution plot for {col}")
                distribution_plots[col] = create_distribution_plot(
                    st.session_state.df, col
                )
                print(f"Successfully created plot for {col}")
            # Word clouds for the selected text columns.
            status.write("π€ Generating word clouds...")
            wordcloud_plots = {}
            for col in st.session_state.selected_wordcloud_columns:
                print(f"Generating word cloud for {col}")
                wordcloud_plots[col] = create_wordcloud(st.session_state.df, col)
                print(f"Successfully created word cloud for {col}")
            # Compose the README markdown from the analysis and plots.
            status.write("π Composing dataset card...")
            dataset_info = {"dataset_name": st.session_state.dataset_name}
            readme_content = generate_dataset_card(
                dataset_info=dataset_info,
                distribution_plots=distribution_plots,
                wordcloud_plots=wordcloud_plots,
                openai_analysis=st.session_state.openai_analysis,
                df=st.session_state.df,
            )
            # BUG FIX: the label string literal was split across two source
            # lines (a syntax error); re-joined onto one line.
            status.update(label="β Dataset card generated!", state="complete")
            # Display the markdown (plots are embedded as HTML images).
            st.markdown(readme_content, unsafe_allow_html=True)
            st.download_button(
                label="β¬οΈ Download Dataset Card",
                data=readme_content,
                file_name="README.md",
                mime="text/markdown",
                use_container_width=True,
            )
        except Exception as e:
            print(f"Error in generate_and_display_card: {str(e)}")
            st.error(f"Error generating dataset card: {str(e)}")
            # Bare raise preserves the original traceback (vs. `raise e`).
            raise
def get_column_type_description(data, column):
    """Return a user-friendly label "column (kind)" for a DataFrame column.

    Kind is "list" when the first value is a Python list, "numeric" for
    int64/float64 dtypes, "text/categorical" otherwise, and "unknown" when
    the column cannot be inspected.

    Args:
        data: The pandas DataFrame to inspect.
        column: Name of the column to describe.
    """
    try:
        series = data[column]
        # Guard iloc[0]: an empty column now reports its dtype instead of
        # falling through to "(unknown)" via IndexError as before.
        if len(series) > 0 and isinstance(series.iloc[0], list):
            return f"{column} (list)"
        if series.dtype in ["int64", "float64"]:
            return f"{column} (numeric)"
        return f"{column} (text/categorical)"
    # BUG FIX: narrowed the bare `except:` (which also swallowed
    # SystemExit/KeyboardInterrupt) to the failures inspection can raise.
    except (KeyError, IndexError, AttributeError, TypeError):
        return f"{column} (unknown)"
def get_api_keys():
    """Return (hf_token, openai_key) from Streamlit secrets, or (None, None).

    Reads st.secrets["api_keys"]["huggingface"] and ["openai"]; any failure
    (no secrets file, missing section/keys) yields the (None, None) fallback.
    """
    try:
        hf_token = st.secrets["api_keys"]["huggingface"]
        openai_key = st.secrets["api_keys"]["openai"]
        return hf_token, openai_key
    # BUG FIX: narrowed the bare `except:` — it also swallowed
    # SystemExit/KeyboardInterrupt.
    except Exception:
        return None, None
def get_secrets():
    """Get API keys from secrets.toml if it exists.

    Returns a (huggingface_token, openai_key) pair, using "" for any value
    that is missing or unreadable.
    """
    try:
        section = st.secrets.get("api_keys", {})
        return section.get("huggingface", ""), section.get("openai", "")
    except Exception as err:
        print(f"No secrets file found or error reading secrets: {err}")
        return "", ""
def main():
    """Streamlit entry point: auth sidebar, dataset loading, card generation."""
    st.title("π Dataset Card Generator")
    st.markdown(
        """
    Generate beautiful documentation for your HuggingFace datasets with automated analysis,
    visualizations, and formatted dataset cards.
    """
    )
    # Pre-fill credentials from secrets when available.
    default_hf_token, default_openai_key = get_api_keys()
    with st.sidebar:
        st.header("π Authentication")
        # OpenAI API key (required for analysis).
        openai_key = st.text_input(
            "OpenAI API Key",
            value=default_openai_key,
            # NOTE(review): a key sourced from secrets is shown in clear text
            # ("default" type); always using "password" may be safer — confirm.
            type="password" if not default_openai_key else "default",
            help="Required: Your OpenAI API key for dataset analysis",
        )
        # HuggingFace token (only needed for private datasets).
        hf_token = st.text_input(
            "HuggingFace Token (optional)",
            value=default_hf_token,
            type="password" if not default_hf_token else "default",
            help="Optional: Only required for private datasets",
        )
        # Guard clause: nothing else works without the OpenAI key.
        if not openai_key:
            st.info("π Please enter your OpenAI API key to get started.")
            return
        try:
            # Only attempt HF login if a token was provided.
            if hf_token:
                login(hf_token)
                # BUG FIX: success-message string literals were split across
                # two source lines (syntax errors); re-joined onto one line.
                st.success("β HuggingFace authentication successful!")
            st.session_state.openai_key = openai_key
            st.success("β OpenAI API key set!")
        except Exception as e:
            st.error(f"β Authentication error: {str(e)}")
            return
    # Main content area. (The original re-checked `openai_key` here, but that
    # branch was unreachable — the sidebar already returned when it was empty.)
    dataset_name = st.text_input(
        "Enter HuggingFace Dataset Name",
        placeholder="username/dataset",
        help="Enter the full path to your HuggingFace dataset (e.g., 'username/dataset')",
    )
    if dataset_name:
        if st.button("Load Dataset", type="primary"):
            load_and_analyze_dataset(dataset_name)
        if st.session_state.df is not None:
            display_dataset_analysis()


if __name__ == "__main__":
    main()
|