import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import logging
import gradio as gr

# Root logger: DEBUG and above, each record rendered as "time - LEVEL - message".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
)

# Built-in five-session sample, listed column by column in display order.
# This is the frame the rest of the module reads through the name ``data``.
_sample_columns = [
    ('User ID', [1, 2, 3, 4, 5]),
    ('Session Duration', [300, 450, 200, 600, 350]),
    ('Pages Visited', [5, 8, 3, 12, 7]),
    ('Ads Clicked', [2, 1, 0, 3, 2]),
    ('User Interests', ['technology', 'sports', 'technology', 'arts', 'sports']),
    ('Engagement Score', [0.8, 0.5, 0.3, 0.9, 0.7]),
    ('Device Type', ['mobile', 'desktop', 'mobile', 'tablet', 'desktop']),
    ('Time of Day', ['morning', 'afternoon', 'evening', 'morning', 'afternoon']),
    ('Time Spent per Page', [30, 25, 45, 20, 50]),
    ('Click Through Rate', [0.1, 0.2, 0.05, 0.3, 0.15]),
    ('Conversion Rate', [0.05, 0.1, 0, 0.2, 0.1]),
    ('Frequency of Visits', [10, 20, 5, 15, 10]),
    ('Bounce Rate', [0.2, 0.1, 0.5, 0.05, 0.3]),
]
data = pd.DataFrame(dict(_sample_columns))

logging.info("Sample data prepared.")

# Schema every dataset must satisfy: column name -> expected Python type.
# 'User ID' is part of the schema as well.
expected_columns = {
    'User ID': int,
    'Session Duration': int,
    'Pages Visited': int,
    'Ads Clicked': int,
    'User Interests': str,
    'Engagement Score': float,
    'Device Type': str,
    'Time of Day': str,
    'Time Spent per Page': int,
    'Click Through Rate': float,
    'Conversion Rate': float,
    'Frequency of Visits': int,
    'Bounce Rate': float
}


def _matches_expected_type(series, expected_type):
    """Return True when the dtype of *series* is acceptable for *expected_type*."""
    type_checks = pd.api.types
    if expected_type is str:
        # Text columns show up as ``object`` or as a dedicated pandas string
        # dtype depending on the pandas version / options in use.
        return type_checks.is_object_dtype(series) or type_checks.is_string_dtype(series)
    if expected_type is int:
        # Any integer width is fine; comparing against np.dtype(int) would
        # tie the check to the platform's default integer size.
        return type_checks.is_integer_dtype(series)
    if expected_type is float:
        # A float column whose values are all whole numbers is parsed as an
        # integer column, which is still valid numeric input for a float field.
        return type_checks.is_float_dtype(series) or type_checks.is_integer_dtype(series)
    return series.dtype == np.dtype(expected_type)


def validate_data(user_data):
    """Check that *user_data* has every expected column with a suitable dtype.

    Args:
        user_data: DataFrame to check against ``expected_columns``.

    Returns:
        A ``(is_valid, message)`` tuple. ``is_valid`` is False when a column
        is missing or when the first offending column has an unsuitable
        dtype; ``message`` describes the outcome and is meant for display.
    """
    missing = [col for col in expected_columns if col not in user_data.columns]
    if missing:
        logging.error("Missing columns in the uploaded data: %s", ", ".join(missing))
        return False, "Missing columns in the uploaded data."
    for col, dtype in expected_columns.items():
        column = user_data[col]
        if not _matches_expected_type(column, dtype):
            message = f"Incorrect data type for column {col}. Expected {dtype}, got {column.dtype}."
            logging.error(message)
            return False, message
    logging.info("Data is valid.")
    return True, "Data is valid."

def load_user_data(file):
    """Load an uploaded CSV, validate it and retrain the clustering pipeline.

    The new dataset and the retrained model replace the module-level ``data``
    and ``pipeline`` only after reading, validation and fitting have all
    succeeded, so a failed upload leaves the previous state untouched.

    Args:
        file: Path or file-like object accepted by ``pd.read_csv``, or
            ``None`` when there is no file.

    Returns:
        A status message for display: the success text, the validation
        error, or the text of any exception raised while reading or fitting.
    """
    global data, pipeline
    if file is None:
        # NOTE(review): presumably what the upload widget passes when its
        # file is cleared - confirm against the Gradio version in use.
        return "No file provided. Please upload a CSV file."
    try:
        user_data = pd.read_csv(file)
        is_valid, message = validate_data(user_data)
        if not is_valid:
            return message
        # Fit an unfitted copy so that a failure part-way through fitting
        # cannot leave the live pipeline half retrained.
        from sklearn.base import clone
        candidate = clone(pipeline)
        candidate.fit(user_data)
    except Exception as e:
        # Boundary handler: the UI only gets the message, the log gets the traceback.
        logging.exception("Failed to load user data.")
        return str(e)
    # Swap data and model together so predictions and analytics stay in sync.
    data = user_data
    pipeline = candidate
    return "Data uploaded, validated, and model retrained successfully. You can now make predictions by selecting the 'Cluster Prediction' tab above"

# Preprocessing: the numeric columns go through a StandardScaler, the
# categorical ones through a OneHotEncoder that ignores unseen categories.
_numeric_columns = [
    'Session Duration', 'Pages Visited', 'Ads Clicked', 'Engagement Score',
    'Time Spent per Page', 'Click Through Rate', 'Conversion Rate',
    'Frequency of Visits', 'Bounce Rate',
]
_categorical_columns = ['User Interests', 'Device Type', 'Time of Day']

_numeric_step = ('num', StandardScaler(), _numeric_columns)
_categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), _categorical_columns)
preprocessor = ColumnTransformer(transformers=[_numeric_step, _categorical_step])

logging.info("Preprocessor setup complete.")

# Clustering step: three clusters, seeded for repeatable results.
kmeans = KMeans(random_state=42, n_clusters=3)
logging.info("KMeans clustering configured.")

# Preprocessing followed by clustering, addressable by step name.
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('cluster', kmeans)])

logging.info("Pipeline created.")

# Train on the data currently bound to ``data`` at import time.
pipeline.fit(data)

def generate_insights(cluster_characteristics):
    """Turn a cluster's characteristic values into advisory text.

    Each rule compares one or two metrics against a fixed, illustrative
    threshold. The messages of the rules that fire are joined with single
    spaces, in rule order; an empty string is returned when none fire.

    Args:
        cluster_characteristics: Mapping that provides 'Engagement Score',
            'Conversion Rate', 'Click Through Rate', 'Bounce Rate',
            'Frequency of Visits' and 'Time Spent per Page'.
    """
    metric = cluster_characteristics
    # (condition, advice) pairs; conditions are evaluated lazily, top to bottom.
    rules = (
        (
            lambda: metric['Engagement Score'] > 0.7 and metric['Conversion Rate'] < 0.1,
            "High engagement but low conversion: Consider optimizing the checkout process or providing targeted offers.",
        ),
        (
            lambda: metric['Click Through Rate'] > 0.2,
            "High click-through rate: Users are interacting well with ads. Increase ad relevance to boost conversions.",
        ),
        (
            lambda: metric['Bounce Rate'] > 0.3,
            "High bounce rate: Review landing page design and content relevance to improve user retention.",
        ),
        (
            lambda: metric['Frequency of Visits'] > 15,
            "Frequent visits: Users are returning often, consider loyalty programs or personalized content to maintain engagement.",
        ),
        (
            lambda: metric['Time Spent per Page'] < 20,
            "Low time spent per page: Content may not be engaging or relevant enough. Consider content optimization.",
        ),
        (
            lambda: metric['Conversion Rate'] > 0.15,
            "High conversion rate: Effective ad targeting. Explore scaling up ad spend on similar user segments.",
        ),
    )
    return " ".join(advice for fires, advice in rules if fires())

def predict_cluster(
    session_duration, pages_visited, ads_clicked, engagement_score,
    user_interests, device_type, time_of_day, time_spent_per_page,
    click_through_rate, conversion_rate, frequency_of_visits, bounce_rate,
):
    """Predict the cluster of one user session and describe that cluster.

    The session is run through the module-level ``pipeline``. The centroid of
    the predicted cluster is mapped back to the original feature space and
    passed to ``generate_insights``.

    Args:
        session_duration, pages_visited, ads_clicked, engagement_score,
        time_spent_per_page, click_through_rate, conversion_rate,
        frequency_of_visits, bounce_rate: Numeric session features.
        user_interests, device_type, time_of_day: Categorical session features.

    Returns:
        A three-line report (predicted cluster, centroid characteristics,
        actionable insights), or a message naming the inputs that were left
        empty when any argument is ``None``.
    """
    logging.info("Starting cluster prediction.")
    num_features = ['Session Duration', 'Pages Visited', 'Ads Clicked', 'Engagement Score', 'Time Spent per Page', 'Click Through Rate', 'Conversion Rate', 'Frequency of Visits', 'Bounce Rate']
    cat_features = ['User Interests', 'Device Type', 'Time of Day']
    inputs = {
        'Session Duration': session_duration,
        'Pages Visited': pages_visited,
        'Ads Clicked': ads_clicked,
        'Engagement Score': engagement_score,
        'User Interests': user_interests,
        'Device Type': device_type,
        'Time of Day': time_of_day,
        'Time Spent per Page': time_spent_per_page,
        'Click Through Rate': click_through_rate,
        'Conversion Rate': conversion_rate,
        'Frequency of Visits': frequency_of_visits,
        'Bounce Rate': bounce_rate,
    }

    # An empty form field would otherwise surface as an error from inside
    # the pipeline; report it in the output text instead.
    missing = [name for name, value in inputs.items() if value is None]
    if missing:
        message = f"Missing input value(s): {', '.join(missing)}"
        logging.error(message)
        return message

    input_df = pd.DataFrame({name: [value] for name, value in inputs.items()})
    logging.debug(f"Input DataFrame: {input_df}")
    cluster = pipeline.predict(input_df)[0]
    logging.info(f"Predicted cluster: {cluster}")
    centroid = pipeline.named_steps['cluster'].cluster_centers_[cluster]

    # Decode features for insights. The centroid is laid out as the scaled
    # numeric block followed by the one-hot block, mirroring the order in
    # which the 'num' and 'cat' transformers are declared.
    transformers = pipeline.named_steps['preprocessor'].named_transformers_
    split = len(num_features)
    original_num_values = transformers['num'].inverse_transform([centroid[:split]])[0]
    original_cat_values = transformers['cat'].inverse_transform([centroid[split:]])[0]

    # Combine numerical and categorical features into a dictionary. Values are
    # converted to plain Python objects so the report does not show NumPy
    # scalar wrappers such as ``np.float64(...)``.
    cluster_characteristics = {
        name: float(value) for name, value in zip(num_features, original_num_values)
    }
    cluster_characteristics.update(
        (name, None if value is None else str(value))
        for name, value in zip(cat_features, original_cat_values)
    )

    # Generate actionable insights
    insights = generate_insights(cluster_characteristics)

    logging.info("Cluster prediction completed.")
    return f"Predicted Cluster: {cluster}\nCharacteristics: {cluster_characteristics}\nActionable Insights: {insights}"

def ad_performance_analytics():
    """Report the mean click-through, conversion and bounce rates.

    Reads the module-level ``data`` frame and returns a three-line string in
    which each average is formatted as a percentage with two decimals.
    """
    logging.info("Calculating ad performance analytics.")
    ctr_mean, conversion_mean, bounce_mean = (
        data[column].mean()
        for column in ('Click Through Rate', 'Conversion Rate', 'Bounce Rate')
    )
    logging.debug(f"Average CTR: {ctr_mean}, Average Conversion Rate: {conversion_mean}, Average Bounce Rate: {bounce_mean}")

    # One "label: percentage" line per metric, in a fixed order.
    labelled_means = (
        ("Average Click Through Rate", ctr_mean),
        ("Average Conversion Rate", conversion_mean),
        ("Average Bounce Rate", bounce_mean),
    )
    summary = "\n".join(f"{label}: {mean:.2%}" for label, mean in labelled_means)

    logging.info("Ad performance analytics calculation completed.")
    return summary

# Three-tab Gradio interface: CSV upload, single-session cluster prediction,
# and an aggregate ad-metrics report.
with gr.Blocks() as demo:
    # Tab 1: upload a CSV and show load_user_data's status message.
    with gr.Tab("Upload Data"):
        gr.Markdown("""
        **Upload your data file in CSV format. Ensure it contains the following columns with appropriate data types:**
        - User ID (int)
        - Session Duration (int)
        - Pages Visited (int)
        - Ads Clicked (int)
        - User Interests (str)
        - Engagement Score (float)
        - Device Type (str)
        - Time of Day (str)
        - Time Spent per Page (int)
        - Click Through Rate (float)
        - Conversion Rate (float)
        - Frequency of Visits (int)
        - Bounce Rate (float)

        **Note:** You can upload your own data for analysis, or continue using the existing sample data for predictions by selecting the **'Cluster Prediction'** tab above.
        """)
        file_input = gr.File(label="Upload your CSV data file")
        upload_message = gr.Textbox()
        # load_user_data runs on the file component's change event; its
        # return value is written to the textbox.
        file_input.change(load_user_data, inputs=file_input, outputs=upload_message)

    # Tab 2: one input widget per predict_cluster argument.
    with gr.Tab("Cluster Prediction"):
        with gr.Row():
            gr.Markdown("**This form allows you to input user session data to predict which cluster the user belongs to and provides actionable insights based on their behavior.**")
            # Initial values match the first row of the built-in sample data,
            # except Engagement Score (0.5 here, 0.8 in the sample).
            session_duration = gr.Number(label="Session Duration", value=300)
            pages_visited = gr.Number(label="Pages Visited", value=5)
            ads_clicked = gr.Number(label="Ads Clicked", value=2)
            engagement_score = gr.Slider(0, 1, label="Engagement Score", value=0.5)
            # The choices below are the categories present in the built-in sample data.
            user_interests = gr.Dropdown(['technology', 'sports', 'arts'], label="User Interests", value='technology')
            device_type = gr.Radio(['mobile', 'desktop', 'tablet'], label="Device Type", value='mobile')
            time_of_day = gr.Radio(['morning', 'afternoon', 'evening'], label="Time of Day", value='morning')
            time_spent_per_page = gr.Number(label="Time Spent per Page", value=30)
            click_through_rate = gr.Slider(0, 1, step=0.01, label="Click Through Rate", value=0.1)
            conversion_rate = gr.Slider(0, 1, step=0.01, label="Conversion Rate", value=0.05)
            frequency_of_visits = gr.Number(label="Frequency of Visits", value=10)
            bounce_rate = gr.Slider(0, 1, step=0.01, label="Bounce Rate", value=0.2)
        predict_button = gr.Button("Predict")
        output_textbox = gr.Textbox(label="Prediction Output", lines=4)
        # The inputs are listed in the same order as predict_cluster's parameters.
        predict_button.click(
            predict_cluster,
            inputs=[
                session_duration, pages_visited, ads_clicked, engagement_score, user_interests, device_type,
                time_of_day, time_spent_per_page, click_through_rate, conversion_rate, frequency_of_visits, bounce_rate
            ],
            outputs=output_textbox
        )
        logging.info("Gradio predict button configured.")

    # Tab 3: button-triggered report; ad_performance_analytics takes no inputs.
    with gr.Tab("Ad Performance Analytics"):
        gr.Markdown("""
        **This form provides a summary of key performance metrics for ads.**
        
        - **Average Click-Through Rate (CTR):** Measures the percentage of ad views that result in clicks. Higher values indicate more effective ad engagement.
        - **Average Conversion Rate:** Indicates the percentage of clicks that convert into actions, such as purchases or sign-ups. This metric helps assess the effectiveness of ad targeting and the overall conversion potential.
        - **Average Bounce Rate:** Reflects the percentage of single-page visits. Lower bounce rates suggest that the landing pages are relevant to the visitors' interests.
        
        Understanding these metrics can help optimize ad strategies and improve overall campaign performance.
        """)
        analytics_button = gr.Button("Analyze Ad Performance")
        analytics_output = gr.Textbox(label="Analytics Output", lines=3)
        analytics_button.click(
            ad_performance_analytics,
            outputs=analytics_output
        )
        logging.info("Gradio analytics button configured.")

# launch() blocks the main thread by default while the server is running, so a
# log line placed after it would only appear once the server has stopped.
# Announce the start-up before handing control to Gradio.
logging.info("Launching Gradio interface.")
demo.launch()