import warnings
from pathlib import Path
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import plotly.io as pio
from scipy.stats import ttest_ind
import plotly
import json
from src.Predictive_Maintenance.logger import logging

ARTIFACTS_DIR = "artifacts/eda/"

warnings.filterwarnings("ignore")

df = pd.read_csv("notebooks/data/data.csv")

def setup(df: pd.DataFrame) -> pd.DataFrame:
    def type_of_failure(row_name):
        if df.loc[row_name, 'TWF'] == 1:
            df.loc[row_name, 'type_of_failure'] = 'TWF'
        elif df.loc[row_name, 'HDF'] == 1:
            df.loc[row_name, 'type_of_failure'] = 'HDF'
        elif df.loc[row_name, 'PWF'] == 1:
            df.loc[row_name, 'type_of_failure'] = 'PWF'
        elif df.loc[row_name, 'OSF'] == 1:
            df.loc[row_name, 'type_of_failure'] = 'OSF'
        elif df.loc[row_name, 'RNF'] == 1:
            df.loc[row_name, 'type_of_failure'] = 'RNF'

    df.apply(lambda row: type_of_failure(row.name), axis=1)
    df['type_of_failure'].replace(np.NaN, 'no failure', inplace=True)
    df.drop(['TWF', 'HDF', 'PWF', 'OSF', 'RNF'], axis=1, inplace=True)
    logging.info("Created type_of_failure column")
    return df

def save_plot(fig, filename):
    filepath = Path(ARTIFACTS_DIR, filename)
    fig.write_image(filepath)
    return str(filepath)

def question_one(df):
    category_counts = df['type_of_failure'].value_counts()
    total_samples = len(df)
    category_percentages = (category_counts / total_samples) * 100
    categories = list(category_percentages.index)
    percentage_labels = list(category_percentages)
    percentage_labels = [f'{num:.2f}%' for num in percentage_labels]
    fig = px.histogram(df, x='type_of_failure', category_orders={'type_of_failure': categories})
    fig.update_traces(text=percentage_labels, textposition='auto')
    plot_path = save_plot(fig, 'question_one.png')
    logging.info("EDA Question 1 complete")
    return plot_path

def question_two(df):
    category_counts = df['Type'].value_counts()
    total_samples = len(df)
    category_percentages = (category_counts / total_samples) * 100
    categories = list(category_percentages.index)
    percentage_labels = list(category_percentages)
    percentage_labels = [f'{num:.2f}%' for num in percentage_labels]
    fig = px.histogram(df, x='Type', category_orders={'Type': categories})
    fig.update_traces(text=percentage_labels, textposition='auto')
    plot_path = save_plot(fig, 'question_two.png')
    logging.info("EDA Question 2 complete")
    return plot_path

def question_three(df):
    num_cols = ['Air temperature [K]', 'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]']
    fig1 = make_subplots(rows=5, cols=1, subplot_titles=num_cols, vertical_spacing=0.04)

    for i, col in enumerate(num_cols):
        box_plot = go.Box(x=df[col], name=col)
        fig1.add_trace(box_plot, row=i+1, col=1)

    fig1.update_layout(
        title="Distribution of Numerical Features",
        height=1200,
        width=900,
        title_text="Box plots"
    )

    plot_path1 = save_plot(fig1, 'question_three_boxplots.png')

    outlier_cols = ['Torque [Nm]', 'Rotational speed [rpm]']
    fig2 = make_subplots(rows=1, cols=2, subplot_titles=outlier_cols, vertical_spacing=0.03)

    for i, col in enumerate(outlier_cols):
        box_plot = go.Histogram(x=df[col], name=col)
        fig2.add_trace(box_plot, row=1, col=i+1)

    fig2.update_layout(
        title='Distribution of Torque and Rotational speed',
        yaxis_title='Frequency',
        title_text="Histograms",
        width=900
    )

    plot_path2 = save_plot(fig2, 'question_three_histograms.png')
    logging.info("EDA Question 3 complete")
    return {'boxplots': plot_path1, 'histograms': plot_path2}

def question_four(df):
    corr_matrix = df[['Air temperature [K]', 'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]', 'Machine failure']].corr()
    fig = px.imshow(corr_matrix, zmin=-1, zmax=1, text_auto=True)
    fig.update_layout(
        title='Correlation Matrix',
        height=600,
        width=800
    )
    
    plot_path = save_plot(fig, 'question_four_correlation_matrix.png')

    test_cols = ['Air temperature [K]','Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]','Tool wear [min]']
    values = []
    for col in test_cols:
        failed = df[df['Machine failure'] == 1][col]
        non_failed = df[df['Machine failure'] == 0][col]
        t, p = ttest_ind(failed, non_failed)
        values.append([t, p])

    values = pd.DataFrame(values, columns=['test-statistic', 'p-value'], index=test_cols)
    alpha = 0.05
    values['Hypothesis'] = values['p-value'].apply(lambda p: 'Reject null hypothesis' if p < alpha else 'Accept null hypothesis')
    value_path = Path(ARTIFACTS_DIR, 'question_four_ttest.json')
    values.to_json(value_path, orient='split')
    logging.info("EDA Question 4 complete")
    return {'correlation_matrix': plot_path, 'ttest': str(value_path)}

def question_five(df):
    num_cols = ['Air temperature [K]','Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]','Tool wear [min]']
    fig = make_subplots(rows=5, cols=1, subplot_titles=num_cols, vertical_spacing=0.03, horizontal_spacing=0.01)
    for i, col in enumerate(num_cols):
        violin_trace = go.Violin(x=df['Type'], y=df[col], name=col, box_visible=True, meanline_visible=True)
        fig.add_trace(violin_trace, row=i+1, col=1)
    fig.update_layout(height=2000, width=800, title_text="Subplots")
    plot_path = save_plot(fig, 'question_five_violin_plots.png')
    logging.info("EDA Question 5 complete")
    return plot_path

def question_six(df):
    num_cols = ['Air temperature [K]', 'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]']

    fig = make_subplots(rows=5, cols=5, shared_xaxes=True, shared_yaxes=True, vertical_spacing=0.02, horizontal_spacing=0.02)

    for i, col1 in enumerate(num_cols):
        for j, col2 in enumerate(num_cols):
            if i == j:
                fig.add_trace(go.Histogram(x=df[col1], name=col1, showlegend=False), row=i+1, col=j+1)
            else:
                fig.add_trace(go.Scatter(x=df[col2], y=df[col1], mode='markers', name=f'{col1} vs {col2}', showlegend=False, marker=dict(size=3)), row=i+1, col=j+1)

    fig.update_layout(
        title="Pair Plot of Continuous Variables",
        height=1200,
        width=1200,
        title_x=0.5
    )

    plot_path = save_plot(fig, 'question_six_pairplot.png')
    logging.info("EDA Question 6 complete")
    return plot_path

# def question_seven(df):
#     df['Date'] = pd.to_datetime(df['Date'])
#     df['Year'] = df['Date'].dt.year

#     failure_counts = df[df['Machine failure'] == 1].groupby('Year').size()

#     fig = px.line(failure_counts, x=failure_counts.index, y=failure_counts.values, title="Machine Failure Trend Over Time")
#     fig.update_xaxes(title="Year")
#     fig.update_yaxes(title="Failure Count")
#     plot_path = save_plot(fig, 'question_seven_trend.png')
#     logging.info("EDA Question 7 complete")
#     return plot_path

def question_eight(df):
    num_cols = ['Air temperature [K]', 'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]']
    fig = make_subplots(rows=len(num_cols), cols=1, subplot_titles=num_cols, vertical_spacing=0.03)

    for i, col in enumerate(num_cols):
        for product_type in df['Type'].unique():
            data = df[df['Type'] == product_type][col]
            fig.add_trace(go.Box(y=data, name=product_type), row=i+1, col=1)

    fig.update_layout(height=1200, width=800, title="Distribution of Continuous Variables by Product Type")
    plot_path = save_plot(fig, 'question_eight_boxplots.png')
    logging.info("EDA Question 8 complete")
    return plot_path

def question_nine(df):
    fig = px.scatter(df, x='Air temperature [K]', y='Machine failure', title="Machine Failure vs. Air Temperature")
    fig.update_yaxes(title="Machine Failure (1=Yes, 0=No)")
    plot_path = save_plot(fig, 'question_nine_scatter.png')
    logging.info("EDA Question 9 complete")
    return plot_path

def get_eda_obj():
    with open(Path(ARTIFACTS_DIR, 'eda.json'), 'r') as f:
        eda_json = json.load(f)
    return eda_json

def run_eda(df):
    eda_results = {}
    df = setup(df)
    eda_results['question_one'] = question_one(df)
    eda_results['question_two'] = question_two(df)
    eda_results['question_three'] = question_three(df)
    eda_results['question_four'] = question_four(df)
    eda_results['question_five'] = question_five(df)
    eda_results['question_six'] = question_six(df)
    # eda_results['question_seven'] = question_seven(df)
    eda_results['question_eight'] = question_eight(df)
    eda_results['question_nine'] = question_nine(df)
    with open(Path(ARTIFACTS_DIR, 'eda.json'), 'w') as f:
        json.dump(eda_results, f)
    logging.info("EDA completed and results saved")

if __name__ == "__main__":
    df = pd.read_csv("notebooks/data/data.csv")
    run_eda(df)
    logging.info("EDA script executed successfully")
    
    import webbrowser
    from pathlib import Path

    ARTIFACTS_DIR = "artifacts/eda"

    def open_plot(filename):
        filepath = Path(ARTIFACTS_DIR, filename)
        webbrowser.open(f'file://{filepath.absolute()}')

    # Example: Open the plot for question one
    open_plot('question_one.png')