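"""Streamlit app: Dataset Card Generator.

Loads a HuggingFace dataset, runs an OpenAI-powered analysis on a small
sample, renders optional distribution plots and word clouds, and composes a
downloadable README-style dataset card.
"""
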
import json

import pandas as pd
import streamlit as st
from datasets import load_dataset
from huggingface_hub import login
from openai import OpenAI

# Import our utility functions
from utils.analysis import analyze_dataset_with_openai, generate_dataset_card
from utils.visualization import create_distribution_plot, create_wordcloud

# Initialize session state variables
if "openai_analysis" not in st.session_state:
    st.session_state.openai_analysis = None
if "df" not in st.session_state:
    st.session_state.df = None
if "dataset_name" not in st.session_state:
    st.session_state.dataset_name = None
if "selected_dist_columns" not in st.session_state:
    st.session_state.selected_dist_columns = []
if "selected_wordcloud_columns" not in st.session_state:
    st.session_state.selected_wordcloud_columns = []

st.set_page_config(
    page_title="Dataset Card Generator",
    page_icon="πŸ“Š",
    layout="wide",
)


def initialize_openai_client(api_key):
    """Initialize OpenAI client with API key."""
    try:
        # Basic initialization without any proxy settings
        return OpenAI(api_key=api_key)
    except Exception as e:
        print(f"Error initializing OpenAI client: {e}")
        # Retry with an explicit User-Agent header in case the bare init failed
        return OpenAI(
            api_key=api_key,
            default_headers={"User-Agent": "Dataset-Card-Generator"},
        )


def load_and_analyze_dataset(dataset_name):
    """Load dataset and perform initial analysis."""
    progress_container = st.empty()

    with progress_container.container():
        with st.status("Loading dataset...", expanded=True) as status:
            try:
                # Load dataset
                status.write("πŸ“₯ Loading dataset from HuggingFace...")
                dataset = load_dataset(dataset_name, split="train")
                df = pd.DataFrame(dataset)
                st.session_state.df = df
                st.session_state.dataset_name = dataset_name

                # Initialize OpenAI analysis
                try:
                    status.write("πŸ€– Analyzing dataset ...")
                    client = initialize_openai_client(st.session_state.openai_key)
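                    # Slicing a datasets.Dataset yields a dict of
                    # column name -> list of values, not a list of rows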
                    sample_data = dataset[:5]
                    # default=str keeps these debug dumps from crashing on
                    # non-JSON-serializable values (dates, arrays, etc.)
                    print("Sample data:", json.dumps(sample_data, indent=2, default=str))

                    analysis = analyze_dataset_with_openai(client, sample_data)
                    print("Analysis result:", json.dumps(analysis, indent=2, default=str))

                    st.session_state.openai_analysis = analysis
                except Exception as e:
                    print(f"Analysis error: {str(e)}")
                    status.update(label=f"❌ Error: {str(e)}", state="error")
                    # Return early so the success update below cannot mask the error
                    return

                status.update(
                    label="βœ… Dataset loaded and analyzed successfully!",
                    state="complete",
                )

            except Exception as e:
                status.update(label=f"❌ Error: {str(e)}", state="error")
                st.error(f"Failed to load dataset: {str(e)}")
                return


def display_dataset_analysis():
    """Display dataset analysis and visualization options."""
    if st.session_state.df is None:
        return

    st.header("Dataset Analysis")

    # Dataset preview
    with st.expander("πŸ“Š Dataset Preview", expanded=True):
        st.dataframe(st.session_state.df.head(), use_container_width=True)

    # Column selection for visualizations
    st.subheader("Select Visualization Fields")

    col1, col2 = st.columns(2)

    with col1:
        # Distribution plot selection
        st.session_state.selected_dist_columns = st.multiselect(
            "Distribution Plots (max 2)",
            options=st.session_state.df.columns.tolist(),
            format_func=lambda x: get_column_type_description(st.session_state.df, x),
            max_selections=2,
            help="Select columns to show value distributions. List columns will show frequency of individual items.",
        )

    with col2:
        # Word cloud selection
        # Object-dtype columns cover both plain strings and list-valued
        # fields, and the check is safe on empty DataFrames
        text_columns = [
            col
            for col in st.session_state.df.columns
            if st.session_state.df[col].dtype == "object"
        ]

        st.session_state.selected_wordcloud_columns = st.multiselect(
            "Word Clouds (max 2)",
            options=text_columns,
            format_func=lambda x: get_column_type_description(st.session_state.df, x),
            max_selections=2,
            help="Select text columns to generate word clouds",
        )

    # Add some spacing
    st.markdown("---")

    # Generate card button
    if st.button("Generate Dataset Card", type="primary", use_container_width=True):
        if not (
            st.session_state.selected_dist_columns
            or st.session_state.selected_wordcloud_columns
        ):
            st.warning(
                "Please select at least one visualization before generating the card."
            )
            return

        generate_and_display_card()


def generate_and_display_card():
    """Generate and display the dataset card with visualizations."""
    if not st.session_state.openai_analysis:
        st.error(
            "Dataset analysis not available. Please try loading the dataset again."
        )
        return

    with st.status("Generating dataset card...", expanded=True) as status:
        try:
            # Create visualizations
            status.write("πŸ“Š Creating distribution plots...")
            distribution_plots = {}
            for col in st.session_state.selected_dist_columns:
                print(f"Generating distribution plot for {col}")
                img_base64 = create_distribution_plot(st.session_state.df, col)
                distribution_plots[col] = img_base64
                print(f"Successfully created plot for {col}")

            status.write("πŸ”€ Generating word clouds...")
            wordcloud_plots = {}
            for col in st.session_state.selected_wordcloud_columns:
                print(f"Generating word cloud for {col}")
                img_base64 = create_wordcloud(st.session_state.df, col)
                wordcloud_plots[col] = img_base64
                print(f"Successfully created word cloud for {col}")

            # Generate dataset card content
            status.write("πŸ“ Composing dataset card...")
            dataset_info = {"dataset_name": st.session_state.dataset_name}

            readme_content = generate_dataset_card(
                dataset_info=dataset_info,
                distribution_plots=distribution_plots,
                wordcloud_plots=wordcloud_plots,
                openai_analysis=st.session_state.openai_analysis,
                df=st.session_state.df,  # Added DataFrame parameter
            )

            # Display results
            status.update(label="βœ… Dataset card generated!", state="complete")

            # Display the markdown with images
            st.markdown(readme_content, unsafe_allow_html=True)

            # Add download button
            st.download_button(
                label="⬇️ Download Dataset Card",
                data=readme_content,
                file_name="README.md",
                mime="text/markdown",
                use_container_width=True,
            )

        except Exception as e:
            print(f"Error in generate_and_display_card: {str(e)}")
            st.error(f"Error generating dataset card: {str(e)}")


def get_column_type_description(data, column):
    """Get a user-friendly description of the column type."""
    try:
        if len(data) and isinstance(data[column].iloc[0], list):
            return f"{column} (list)"
        elif data[column].dtype in ["int64", "float64"]:
            return f"{column} (numeric)"
        else:
            return f"{column} (text/categorical)"
    except Exception:
        return f"{column} (unknown)"


def get_api_keys():
    """Get API keys from .streamlit/secrets.toml if it exists."""
    try:
        hf_token = st.secrets.get("api_keys", {}).get("huggingface", "")
        openai_key = st.secrets.get("api_keys", {}).get("openai", "")
        return hf_token, openai_key
    except Exception as e:
        print(f"No secrets file found or error reading secrets: {e}")
        return "", ""


def main():
    st.title("πŸ“Š Dataset Card Generator")
    st.markdown(
        """
    Generate beautiful documentation for your HuggingFace datasets with automated analysis, 
    visualizations, and formatted dataset cards.
    """
    )

    # Get secrets if available
    default_hf_token, default_openai_key = get_api_keys()

    # Authentication section in sidebar
    with st.sidebar:
        st.header("πŸ”‘ Authentication")

        # OpenAI API key (required)
        openai_key = st.text_input(
            "OpenAI API Key",
            value=default_openai_key,
            type="password" if not default_openai_key else "default",
            help="Required: Your OpenAI API key for dataset analysis",
        )

        # HuggingFace token (optional)
        hf_token = st.text_input(
            "HuggingFace Token (optional)",
            value=default_hf_token,
            type="password" if not default_hf_token else "default",
            help="Optional: Only required for private datasets",
        )

        if openai_key:
            try:
                # Only attempt HF login if token is provided
                if hf_token:
                    login(hf_token)
                    st.success("βœ… HuggingFace authentication successful!")

                st.session_state.openai_key = openai_key
                st.success("βœ… OpenAI API key set!")
            except Exception as e:
                st.error(f"❌ Authentication error: {str(e)}")
                return
        else:
            st.info("πŸ‘† Please enter your OpenAI API key to get started.")
            return

    # Main content area
    if not openai_key:
        return

    dataset_name = st.text_input(
        "Enter HuggingFace Dataset Name",
        placeholder="username/dataset",
        help="Enter the full path to your HuggingFace dataset (e.g., 'username/dataset')",
    )

    if dataset_name:
        if st.button("Load Dataset", type="primary"):
            load_and_analyze_dataset(dataset_name)

    if st.session_state.df is not None:
        display_dataset_analysis()


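# Run locally with: streamlit run app.py   (filename assumed; adjust to match)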
if __name__ == "__main__":
    main()