"""Gradio app that clusters user-session data with KMeans and reports ad metrics.

The module fits a preprocessing + KMeans pipeline on a small built-in sample,
lets the user replace that sample with an uploaded CSV, predicts the cluster
for a single hand-entered session, and summarises average ad metrics.
"""

import logging

import gradio as gr
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Configure logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

# Initial hardcoded sample data
data = pd.DataFrame({
    'User ID': [1, 2, 3, 4, 5],
    'Session Duration': [300, 450, 200, 600, 350],
    'Pages Visited': [5, 8, 3, 12, 7],
    'Ads Clicked': [2, 1, 0, 3, 2],
    'User Interests': ['technology', 'sports', 'technology', 'arts', 'sports'],
    'Engagement Score': [0.8, 0.5, 0.3, 0.9, 0.7],
    'Device Type': ['mobile', 'desktop', 'mobile', 'tablet', 'desktop'],
    'Time of Day': ['morning', 'afternoon', 'evening', 'morning', 'afternoon'],
    'Time Spent per Page': [30, 25, 45, 20, 50],
    'Click Through Rate': [0.1, 0.2, 0.05, 0.3, 0.15],
    'Conversion Rate': [0.05, 0.1, 0, 0.2, 0.1],
    'Frequency of Visits': [10, 20, 5, 15, 10],
    'Bounce Rate': [0.2, 0.1, 0.5, 0.05, 0.3],
})
logging.info("Sample data prepared.")

# Define expected columns including 'User ID'
expected_columns = {
    'User ID': int,
    'Session Duration': int,
    'Pages Visited': int,
    'Ads Clicked': int,
    'User Interests': str,
    'Engagement Score': float,
    'Device Type': str,
    'Time of Day': str,
    'Time Spent per Page': int,
    'Click Through Rate': float,
    'Conversion Rate': float,
    'Frequency of Visits': int,
    'Bounce Rate': float,
}

# Single source of truth for the feature layout. The preprocessor emits the
# scaled numeric block first and the one-hot block second, and predict_cluster
# relies on that order when it splits a centroid back into the two parts.
NUM_FEATURES = [
    'Session Duration', 'Pages Visited', 'Ads Clicked', 'Engagement Score',
    'Time Spent per Page', 'Click Through Rate', 'Conversion Rate',
    'Frequency of Visits', 'Bounce Rate',
]
CAT_FEATURES = ['User Interests', 'Device Type', 'Time of Day']


def _dtype_matches(series, expected):
    """Return True when *series* holds values compatible with *expected*.

    The check is by dtype family rather than exact dtype:

    * ``str``   - object or pandas string dtype;
    * ``int``   - any integer dtype;
    * ``float`` - any float dtype, or an integer dtype (a CSV float column
      whose values are all whole numbers is parsed as integers).

    Any other expected type falls back to an exact NumPy dtype comparison.
    """
    types = pd.api.types
    if expected is str:
        return types.is_object_dtype(series) or types.is_string_dtype(series)
    if expected is int:
        return types.is_integer_dtype(series)
    if expected is float:
        return types.is_float_dtype(series) or types.is_integer_dtype(series)
    return series.dtype == np.dtype(expected)


def validate_data(user_data):
    """Check that *user_data* has every expected column with a usable dtype.

    Args:
        user_data: DataFrame to validate against ``expected_columns``.

    Returns:
        A ``(is_valid, message)`` tuple; ``message`` describes the first
        problem found, or confirms that the data is valid.
    """
    if not all(col in user_data.columns for col in expected_columns):
        logging.error("Missing columns in the uploaded data.")
        return False, "Missing columns in the uploaded data."

    for col, dtype in expected_columns.items():
        actual = user_data[col].dtype
        if not _dtype_matches(user_data[col], dtype):
            logging.error(f"Incorrect data type for column {col}. Expected {dtype}, got {actual}.")
            return False, f"Incorrect data type for column {col}. Expected {dtype}, got {actual}."

    logging.info("Data is valid.")
    return True, "Data is valid."


def load_user_data(file):
    """Load an uploaded CSV, validate it, and retrain the clustering pipeline.

    The new pipeline is fitted on a clone, and the module-level ``data`` and
    ``pipeline`` are replaced only after the fit succeeds. A failed upload
    therefore leaves the previous data and model untouched.

    Args:
        file: Value delivered by the ``gr.File`` component; ``None`` when the
            upload is cleared. It is passed to ``pd.read_csv`` unchanged.

    Returns:
        A status message for the UI: success text, the validation error, or
        the text of any exception raised while reading or fitting.
    """
    global data, pipeline

    if file is None:
        logging.info("Upload cleared; keeping the current data and model.")
        return "No file uploaded."

    try:
        user_data = pd.read_csv(file)
        is_valid, message = validate_data(user_data)
        if not is_valid:
            return message
        # Fit an unfitted copy so an error part-way through fitting cannot
        # leave the live pipeline with a refitted preprocessor and stale
        # cluster centres.
        candidate = clone(pipeline)
        candidate.fit(user_data)
    except Exception as e:
        logging.exception("Failed to load user data.")
        return str(e)

    data = user_data
    pipeline = candidate
    return "Data uploaded, validated, and model retrained successfully. You can now make predictions by selecting the 'Cluster Prediction' tab above"


# Updated preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), NUM_FEATURES),
        ('cat', OneHotEncoder(handle_unknown='ignore'), CAT_FEATURES),
    ])
logging.info("Preprocessor setup complete.")

# Clustering
kmeans = KMeans(n_clusters=3, random_state=42)
logging.info("KMeans clustering configured.")

# Define the pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('cluster', kmeans),
])
logging.info("Pipeline created.")

# Fit the pipeline to the data
pipeline.fit(data)


def generate_insights(cluster_characteristics):
    """Turn a cluster's average metrics into advice text.

    Args:
        cluster_characteristics: Mapping that contains at least the keys
            'Engagement Score', 'Conversion Rate', 'Click Through Rate',
            'Bounce Rate', 'Frequency of Visits' and 'Time Spent per Page'.

    Returns:
        The matching insight sentences joined by single spaces; an empty
        string when no threshold is crossed.
    """
    # Example insights based on hypothetical thresholds
    insights = []
    if cluster_characteristics['Engagement Score'] > 0.7 and cluster_characteristics['Conversion Rate'] < 0.1:
        insights.append("High engagement but low conversion: Consider optimizing the checkout process or providing targeted offers.")
    if cluster_characteristics['Click Through Rate'] > 0.2:
        insights.append("High click-through rate: Users are interacting well with ads. Increase ad relevance to boost conversions.")
    if cluster_characteristics['Bounce Rate'] > 0.3:
        insights.append("High bounce rate: Review landing page design and content relevance to improve user retention.")
    if cluster_characteristics['Frequency of Visits'] > 15:
        insights.append("Frequent visits: Users are returning often, consider loyalty programs or personalized content to maintain engagement.")
    if cluster_characteristics['Time Spent per Page'] < 20:
        insights.append("Low time spent per page: Content may not be engaging or relevant enough. Consider content optimization.")
    if cluster_characteristics['Conversion Rate'] > 0.15:
        insights.append("High conversion rate: Effective ad targeting. Explore scaling up ad spend on similar user segments.")
    return " ".join(insights)


def predict_cluster(session_duration, pages_visited, ads_clicked, engagement_score, user_interests, device_type,
                    time_of_day, time_spent_per_page, click_through_rate, conversion_rate, frequency_of_visits,
                    bounce_rate):
    """Predict the cluster for one session and describe that cluster.

    The arguments are the raw values of the twelve model features. The
    predicted cluster's centroid is mapped back to the original feature scale
    (numeric part) and to category labels (one-hot part), and insights are
    generated from those centroid values.

    Returns:
        A three-line string with the predicted cluster index, the decoded
        centroid characteristics, and the actionable insights.
    """
    logging.info("Starting cluster prediction.")
    input_df = pd.DataFrame({
        'Session Duration': [session_duration],
        'Pages Visited': [pages_visited],
        'Ads Clicked': [ads_clicked],
        'Engagement Score': [engagement_score],
        'User Interests': [user_interests],
        'Device Type': [device_type],
        'Time of Day': [time_of_day],
        'Time Spent per Page': [time_spent_per_page],
        'Click Through Rate': [click_through_rate],
        'Conversion Rate': [conversion_rate],
        'Frequency of Visits': [frequency_of_visits],
        'Bounce Rate': [bounce_rate],
    })
    logging.debug(f"Input DataFrame: {input_df}")

    # Use one pipeline object for the whole call, even if load_user_data
    # replaces the module-level pipeline in the meantime.
    model = pipeline
    cluster = model.predict(input_df)[0]
    logging.info(f"Predicted cluster: {cluster}")

    centroid = model.named_steps['cluster'].cluster_centers_[cluster]
    fitted_transformers = model.named_steps['preprocessor'].named_transformers_

    # Decode features for insights: the first len(NUM_FEATURES) entries are
    # the scaled numeric features, the remainder is the one-hot block.
    split_at = len(NUM_FEATURES)
    original_num_values = fitted_transformers['num'].inverse_transform([centroid[:split_at]])[0]
    original_cat_values = fitted_transformers['cat'].inverse_transform([centroid[split_at:]])[0]

    # Combine numerical and categorical features into a dictionary. Plain
    # floats keep the displayed text free of NumPy scalar wrappers.
    cluster_characteristics = {
        name: float(value) for name, value in zip(NUM_FEATURES, original_num_values)
    }
    cluster_characteristics.update(zip(CAT_FEATURES, original_cat_values))

    # Generate actionable insights
    insights = generate_insights(cluster_characteristics)
    logging.info("Cluster prediction completed.")
    return f"Predicted Cluster: {cluster}\nCharacteristics: {cluster_characteristics}\nActionable Insights: {insights}"


def ad_performance_analytics():
    """Return a three-line report of mean CTR, conversion rate and bounce rate.

    The means are taken over the current module-level ``data`` frame and
    formatted as percentages with two decimals.
    """
    logging.info("Calculating ad performance analytics.")
    avg_ctr = data['Click Through Rate'].mean()
    avg_conversion_rate = data['Conversion Rate'].mean()
    avg_bounce_rate = data['Bounce Rate'].mean()
    logging.debug(f"Average CTR: {avg_ctr}, Average Conversion Rate: {avg_conversion_rate}, Average Bounce Rate: {avg_bounce_rate}")

    # Prepare the analytics report
    report = f"Average Click Through Rate: {avg_ctr:.2%}\n"
    report += f"Average Conversion Rate: {avg_conversion_rate:.2%}\n"
    report += f"Average Bounce Rate: {avg_bounce_rate:.2%}"
    logging.info("Ad performance analytics calculation completed.")
    return report


# Help texts are assembled at column 0 so that list items are never indented.
# The column list is generated from expected_columns so the on-screen
# instructions cannot drift from what validate_data enforces.
_COLUMN_LIST = "\n".join(
    f"- {name} ({kind.__name__})" for name, kind in expected_columns.items()
)
UPLOAD_HELP = (
    "**Upload your data file in CSV format. Ensure it contains the following "
    "columns with appropriate data types:**\n\n"
    f"{_COLUMN_LIST}\n\n"
    "**Note:** You can upload your own data for analysis, or continue using the "
    "existing sample data for predictions by selecting the "
    "**'Cluster Prediction'** tab above."
)
ANALYTICS_HELP = (
    "**This form provides a summary of key performance metrics for ads.**\n\n"
    "- **Average Click-Through Rate (CTR):** Measures the percentage of ad views "
    "that result in clicks. Higher values indicate more effective ad engagement.\n"
    "- **Average Conversion Rate:** Indicates the percentage of clicks that "
    "convert into actions, such as purchases or sign-ups. This metric helps "
    "assess the effectiveness of ad targeting and the overall conversion "
    "potential.\n"
    "- **Average Bounce Rate:** Reflects the percentage of single-page visits. "
    "Lower bounce rates suggest that the landing pages are relevant to the "
    "visitors' interests.\n\n"
    "Understanding these metrics can help optimize ad strategies and improve "
    "overall campaign performance."
)

with gr.Blocks() as demo:
    with gr.Tab("Upload Data"):
        gr.Markdown(UPLOAD_HELP)
        file_input = gr.File(label="Upload your CSV data file")
        upload_message = gr.Textbox()
        file_input.change(load_user_data, inputs=file_input, outputs=upload_message)

    with gr.Tab("Cluster Prediction"):
        with gr.Row():
            gr.Markdown("**This form allows you to input user session data to predict which cluster the user belongs to and provides actionable insights based on their behavior.**")
        # Every input starts from a preset value so the form can be submitted as-is.
        session_duration = gr.Number(label="Session Duration", value=300)
        pages_visited = gr.Number(label="Pages Visited", value=5)
        ads_clicked = gr.Number(label="Ads Clicked", value=2)
        engagement_score = gr.Slider(0, 1, label="Engagement Score", value=0.5)
        user_interests = gr.Dropdown(['technology', 'sports', 'arts'], label="User Interests", value='technology')
        device_type = gr.Radio(['mobile', 'desktop', 'tablet'], label="Device Type", value='mobile')
        time_of_day = gr.Radio(['morning', 'afternoon', 'evening'], label="Time of Day", value='morning')
        time_spent_per_page = gr.Number(label="Time Spent per Page", value=30)
        click_through_rate = gr.Slider(0, 1, step=0.01, label="Click Through Rate", value=0.1)
        conversion_rate = gr.Slider(0, 1, step=0.01, label="Conversion Rate", value=0.05)
        frequency_of_visits = gr.Number(label="Frequency of Visits", value=10)
        bounce_rate = gr.Slider(0, 1, step=0.01, label="Bounce Rate", value=0.2)

        predict_button = gr.Button("Predict")
        output_textbox = gr.Textbox(label="Prediction Output", lines=4)
        # The inputs list must follow predict_cluster's parameter order.
        predict_button.click(
            predict_cluster,
            inputs=[
                session_duration, pages_visited, ads_clicked, engagement_score,
                user_interests, device_type, time_of_day, time_spent_per_page,
                click_through_rate, conversion_rate, frequency_of_visits,
                bounce_rate,
            ],
            outputs=output_textbox,
        )
        logging.info("Gradio predict button configured.")

    with gr.Tab("Ad Performance Analytics"):
        gr.Markdown(ANALYTICS_HELP)
        analytics_button = gr.Button("Analyze Ad Performance")
        analytics_output = gr.Textbox(label="Analytics Output", lines=3)
        analytics_button.click(
            ad_performance_analytics,
            outputs=analytics_output,
        )
        logging.info("Gradio analytics button configured.")

demo.launch()
# NOTE(review): launch() appears to block when run as a plain script, in which
# case this message is only logged once the server stops - confirm.
logging.info("Gradio interface launched.")