# Import the libraries
import os
import json
import uuid
from pathlib import Path

import joblib
import pandas as pd
import gradio as gr
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from huggingface_hub import CommitScheduler

# Read the Hugging Face token from environment variables
HF_TOKEN = os.getenv("HF_TOKEN")

# Run the training script placed in the same directory as app.py.
# The training script trains and persists a random forest model
# with the filename 'random_forest_pipeline_best.pkl'.
import train

# Load the freshly trained model from disk
saved_model_path = "random_forest_pipeline_best.pkl"
model_pipeline = joblib.load(saved_model_path)

# Prepare the logging functionality
log_file = Path("logs/") / f"data_{uuid.uuid4()}.json"
log_folder = log_file.parent
log_folder.mkdir(parents=True, exist_ok=True)

scheduler = CommitScheduler(
    repo_id="insurance-charge-mlops-logs",  # provide a name for the repo_id
    repo_type="dataset",
    folder_path=log_folder,
    path_in_repo="data",
    every=2,  # push the log folder to the Hub every 2 minutes
    token=HF_TOKEN  # pass the token directly
)


# Define the predict function, which takes the features, converts them to a
# DataFrame and makes a prediction using the saved model pipeline
def predict(age, bmi, children, sex, smoker, region):
    # Prepare the input data as a DataFrame
    input_data = pd.DataFrame({
        'age': [age],
        'bmi': [bmi],
        'children': [children],
        'sex': [sex],
        'smoker': [smoker],
        'region': [region]
    })

    # Make a prediction using the loaded model pipeline
    prediction = model_pipeline.predict(input_data)

    # Once the prediction is made, log both the inputs and the output to the log file.
    # Hold the commit scheduler's lock while writing, so the file is not modified
    # while the scheduler is pushing the folder to the Hub.
    with scheduler.lock:
        with log_file.open("a") as f:
            f.write(json.dumps(
                {
                    'age': age,
                    'bmi': bmi,
                    'children': children,
                    'sex': sex,
                    'smoker': smoker,
                    'region': region,
                    # cast to a plain float so json.dumps can serialize the numpy value
                    'prediction': float(prediction[0])
                }
            ))
            f.write("\n")

    return float(prediction[0])


# Set up UI components for input and output
age_input = gr.Number(label="Age")
bmi_input = gr.Number(label="BMI")
children_input = gr.Number(label="Children")
sex_input = gr.Radio(choices=['male', 'female'], label="Sex")
smoker_input = gr.Radio(choices=['yes', 'no'], label="Smoker")
region_input = gr.Dropdown(choices=['northeast', 'northwest', 'southeast', 'southwest'], label="Region")

# Create the Gradio interface with the title "HealthyLife Insurance Charge Prediction"
demo = gr.Interface(
    fn=predict,
    inputs=[age_input, bmi_input, children_input, sex_input, smoker_input, region_input],
    outputs="number",
    title="HealthyLife Insurance Charge Prediction"
)

# Enable the request queue and launch the app
demo.queue()
demo.launch(share=False)
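
# Each call to predict() appends one JSON object per line to logs/data_<uuid>.json,
# and the CommitScheduler pushes that folder to the dataset repo on its schedule.
# An illustrative log record (hypothetical values) would look like:
# {"age": 30, "bmi": 27.5, "children": 1, "sex": "male", "smoker": "no", "region": "northeast", "prediction": 4500.0}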