import streamlit as st
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Page title displayed at the top of the Streamlit app.
st.title(
    "📊 Analyse des Évaluations des Clients avec ANOVA"
)

# CSV-only upload widget; the analysis further down runs only once
# `uploaded_file` is not None.
uploaded_file = st.file_uploader(
    "📂 Téléchargez le fichier 'supermarket_sales.csv'",
    type=["csv"],
)

if uploaded_file is not None:
    # ------------------------------------------------------------------
    # Data loading and cleaning
    # ------------------------------------------------------------------
    data = pd.read_csv(uploaded_file)

    # Rename the column containing a space so it can be referenced directly
    # in the patsy formulas used below.
    data = data.rename(columns={'Product line': 'Product_line'})

    # Show a readable message instead of a raw KeyError traceback when the
    # uploaded file does not have the expected columns.
    required_columns = ['Product_line', 'Payment', 'Rating']
    missing_columns = [col for col in required_columns if col not in data.columns]
    if missing_columns:
        st.error(
            "Erreur : colonnes manquantes dans le fichier CSV : "
            f"{', '.join(missing_columns)}"
        )
        st.stop()

    # Coerce ratings to numbers BEFORE dropping missing rows: coercion turns
    # non-numeric entries into NaN, and those rows must be removed as well,
    # otherwise they reach Tukey HSD and K-Means (which rejects NaN input).
    rating_numeric = pd.to_numeric(data['Rating'], errors='coerce')
    invalid_rating_count = int((rating_numeric.isna() & data['Rating'].notna()).sum())

    # `.copy()` makes `data` an independent frame so the column assignments
    # below do not trigger SettingWithCopyWarning.
    data = data[required_columns].assign(Rating=rating_numeric).dropna().copy()

    if invalid_rating_count:
        st.warning(
            f"⚠️ {invalid_rating_count} ligne(s) ignorée(s) : des valeurs non numériques "
            "ont été détectées dans 'Rating'. Vérifiez votre fichier CSV."
        )

    # Levene and Tukey HSD both need at least two product-line groups.
    if data.empty or data['Product_line'].nunique() < 2:
        st.error(
            "Erreur : données insuffisantes après nettoyage (au moins deux valeurs "
            "distinctes de 'Product line' sont requises)."
        )
        st.stop()

    # Convert the factors to categoricals after cleaning so that every
    # category is actually present in the data.
    data['Product_line'] = data['Product_line'].astype('category')
    data['Payment'] = data['Payment'].astype('category')

    # Data preview
    st.subheader("📊 Aperçu des Données")
    st.write(data.head())

    # ------------------------------------------------------------------
    # Assumption checks
    # ------------------------------------------------------------------
    st.subheader("🧪 Vérification des Hypothèses")

    # Two-way model with interaction; its residuals feed the normality test
    # and the fitted model is reused for the ANOVA table below.
    model = smf.ols('Rating ~ C(Product_line) * C(Payment)', data=data).fit()
    residuals = model.resid

    # Shapiro-Wilk is run on at most 5000 residuals (fixed seed so the
    # reported p-value is reproducible across reruns).
    if len(residuals) > 5000:
        residuals_sample = residuals.sample(5000, random_state=42)
    else:
        residuals_sample = residuals

    shapiro_test = stats.shapiro(residuals_sample)
    st.write(f"✅ Test de Shapiro-Wilk (Normalité) : **p-value = {shapiro_test.pvalue:.4f}**")

    # Levene's test for equal variances, computed across product lines only.
    group_list = [
        ratings.values
        for _, ratings in data.groupby('Product_line', observed=True)['Rating']
    ]
    levene_test = stats.levene(*group_list)
    st.write(f"✅ Test de Levene (Homogénéité des variances) : **p-value = {levene_test.pvalue:.4f}**")

    # ------------------------------------------------------------------
    # Two-way ANOVA (type II sums of squares)
    # ------------------------------------------------------------------
    st.subheader("📊 ANOVA à Deux Facteurs")
    anova_table = sm.stats.anova_lm(model, typ=2)
    st.write(anova_table)

    # ------------------------------------------------------------------
    # Post-hoc comparisons (Tukey HSD) between product lines
    # ------------------------------------------------------------------
    st.subheader("📌 Comparaisons Post-Hoc (Tukey HSD)")

    # No NaN guard is needed here: invalid ratings were removed (and
    # reported) during cleaning.
    tukey = pairwise_tukeyhsd(data['Rating'], data['Product_line'])
    st.write(tukey.summary())

    # ------------------------------------------------------------------
    # Visualisation
    # ------------------------------------------------------------------
    st.subheader("📊 Visualisation des Résultats")

    # Boxplot of ratings per product line, split by payment method.
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.boxplot(x='Product_line', y='Rating', hue='Payment', data=data, ax=ax)
    # Rotate via the Axes object rather than pyplot's global "current axes".
    ax.tick_params(axis='x', rotation=45)
    st.pyplot(fig)
    # Streamlit reruns the script on every interaction; close the figure so
    # figures do not accumulate in memory.
    plt.close(fig)

    # Heatmap of mean rating per (product line, payment) combination.
    # Combinations with no data are left as NaN (seaborn masks them) rather
    # than being shown as a fabricated mean of 0 that would also stretch the
    # colour scale.
    mean_ratings = (
        data.groupby(['Product_line', 'Payment'], observed=True)['Rating']
        .mean()
        .unstack()
    )
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.heatmap(mean_ratings, annot=True, cmap='coolwarm', ax=ax)
    st.pyplot(fig)
    plt.close(fig)

    # ------------------------------------------------------------------
    # Multiple linear regression (main effects only)
    # ------------------------------------------------------------------
    st.subheader("📈 Régression Linéaire Multiple")
    lm_model = smf.ols('Rating ~ C(Product_line) + C(Payment)', data=data).fit()
    st.write(lm_model.summary())

    # ------------------------------------------------------------------
    # K-Means clustering on (rating, encoded product line)
    # ------------------------------------------------------------------
    st.subheader("🎯 Clustering des Clients (K-Means)")

    # NOTE(review): the product line is label-encoded to integers, so K-Means
    # treats the (arbitrary) code order as a numeric distance — confirm this
    # is intended.
    encoder = LabelEncoder()
    data['Product_line_encoded'] = encoder.fit_transform(data['Product_line'])
    kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
    data['Cluster'] = kmeans.fit_predict(data[['Rating', 'Product_line_encoded']])

    # Scatter plot of the clusters, with the integer codes relabelled using
    # the original product-line names.
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.scatterplot(
        x='Product_line_encoded',
        y='Rating',
        hue=data['Cluster'].astype(str),
        palette='viridis',
        data=data,
        ax=ax,
    )
    ax.set_xticks(np.arange(len(encoder.classes_)))
    ax.set_xticklabels(encoder.classes_, rotation=45)
    st.pyplot(fig)
    plt.close(fig)