import logging
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

logger = logging.getLogger(__name__)

# On-disk pickle artifact names, shared by save and load paths.
MODEL_PATH = 'model_pipeline.pkl'
ENCODER_PATH = 'label_encoder.pkl'


class WellnessRecommendationModel:
    """Random-forest recommender for wellness activities.

    On construction, a previously pickled pipeline is loaded from disk when
    available; otherwise a new one is trained from ``dataset_user_2.csv``
    and persisted. All failures are logged rather than raised so that
    constructing the object never hard-fails.
    """

    def __init__(self):
        """Initialize the wellness recommendation model."""
        self.model_pipeline = None
        self.label_encoder = None
        # Fallback class list; replaced by the label encoder's classes as
        # soon as a model is trained or loaded.
        self.target_classes = ['Pemandian Air Panas', 'Pijat Terapi',
                               'Retret Kesehatan', 'Retret Yoga', 'Spa',
                               'Terapi Herbal']
        self.feature_names = ['Umur', 'Jenis Kelamin', 'Kota', 'Frekuensi',
                              'Anggaran/Kunjungan (IDR)', 'Kemauan Bepergian',
                              'Metode Pemesanan', 'Tujuan Utama', 'Yang Dicari']
        self.is_trained = False
        self._load_model()
        if not self.is_trained:
            self._train_model()

    def _create_preprocessor(self):
        """Build the ColumnTransformer that feeds the classifier.

        Numeric columns are mean-imputed; categorical columns are
        mode-imputed and one-hot encoded with ``handle_unknown='ignore'``
        so categories unseen at training time do not raise at predict time.
        """
        categorical_features = ['Jenis Kelamin', 'Kota', 'Frekuensi',
                                'Kemauan Bepergian', 'Metode Pemesanan',
                                'Tujuan Utama', 'Yang Dicari']
        numerical_features = ['Umur', 'Anggaran/Kunjungan (IDR)']

        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean'))
        ])
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numerical_features),
                ('cat', categorical_transformer, categorical_features)
            ]
        )
        return preprocessor

    def _train_model(self):
        """Train the pipeline from the CSV dataset and persist it.

        The dataset is looked for one directory up first, then in the
        current directory. Errors are logged (with traceback) instead of
        raised.
        """
        try:
            dataset_path = '../dataset_user_2.csv'
            if not os.path.exists(dataset_path):
                dataset_path = 'dataset_user_2.csv'
            if not os.path.exists(dataset_path):
                logger.error("Dataset not found. Please ensure dataset_user_2.csv is available.")
                return

            df = pd.read_csv(dataset_path)
            logger.info("Dataset loaded with %d records", len(df))

            TARGET_COLUMN = 'Aktivitas Favorit'
            # Drop identifier columns with no predictive signal;
            # errors='ignore' tolerates their absence.
            X = df.drop(columns=[TARGET_COLUMN, 'ID Pengguna', 'Nama'],
                        errors='ignore')
            y = df[TARGET_COLUMN]

            self.label_encoder = LabelEncoder()
            y_encoded = self.label_encoder.fit_transform(y)
            self.target_classes = list(self.label_encoder.classes_)

            # Stratify so the held-out split preserves class balance.
            X_train, X_test, y_train, y_test = train_test_split(
                X, y_encoded, test_size=0.2, random_state=42,
                stratify=y_encoded
            )

            preprocessor = self._create_preprocessor()
            self.model_pipeline = Pipeline(steps=[
                ('preprocessor', preprocessor),
                ('classifier', RandomForestClassifier(
                    n_estimators=200,
                    max_depth=30,
                    min_samples_split=10,
                    min_samples_leaf=2,
                    max_features='log2',
                    random_state=42,
                    class_weight='balanced'
                ))
            ])

            logger.info("Training model...")
            self.model_pipeline.fit(X_train, y_train)

            train_score = self.model_pipeline.score(X_train, y_train)
            test_score = self.model_pipeline.score(X_test, y_test)
            logger.info("Training accuracy: %.4f", train_score)
            logger.info("Test accuracy: %.4f", test_score)

            self.is_trained = True
            self._save_model()
        except Exception as e:
            # logger.exception keeps the traceback in the log record.
            logger.exception("Error training model: %s", str(e))

    def _save_model(self):
        """Save the trained model and label encoder."""
        try:
            with open(MODEL_PATH, 'wb') as f:
                pickle.dump(self.model_pipeline, f)
            with open(ENCODER_PATH, 'wb') as f:
                pickle.dump(self.label_encoder, f)
            logger.info("Model saved successfully")
        except Exception as e:
            logger.error("Error saving model: %s", str(e))

    def _load_model(self):
        """Load pre-trained model if available.

        SECURITY NOTE: ``pickle.load`` executes arbitrary code from the
        file — only load artifacts this application itself produced.
        """
        try:
            if os.path.exists(MODEL_PATH) and os.path.exists(ENCODER_PATH):
                with open(MODEL_PATH, 'rb') as f:
                    self.model_pipeline = pickle.load(f)
                with open(ENCODER_PATH, 'rb') as f:
                    self.label_encoder = pickle.load(f)
                self.target_classes = list(self.label_encoder.classes_)
                self.is_trained = True
                logger.info("Model loaded successfully")
        except Exception as e:
            logger.error("Error loading model: %s", str(e))

    def predict(self, X):
        """Return ``(label, confidence)`` for the first row of X.

        Args:
            X: feature DataFrame with the columns in ``feature_names``.

        Raises:
            ValueError: if no trained model is available.
        """
        if not self.is_trained:
            raise ValueError("Model is not trained yet")
        prediction_encoded = self.model_pipeline.predict(X)
        probabilities = self.model_pipeline.predict_proba(X)
        # Confidence is the top-class probability of the first row.
        max_prob_idx = np.argmax(probabilities, axis=1)[0]
        confidence = probabilities[0][max_prob_idx]
        prediction = self.label_encoder.inverse_transform(prediction_encoded)[0]
        return prediction, confidence

    def predict_proba(self, X):
        """Return ``{class name: probability}`` for the first row of X.

        Raises:
            ValueError: if no trained model is available.
        """
        if not self.is_trained:
            raise ValueError("Model is not trained yet")
        probabilities = self.model_pipeline.predict_proba(X)[0]
        # target_classes mirrors label_encoder.classes_, which matches the
        # classifier's probability column order.
        prob_dict = {}
        for i, class_name in enumerate(self.target_classes):
            prob_dict[class_name] = float(probabilities[i])
        return prob_dict

    def get_target_classes(self):
        """Get list of target classes."""
        return self.target_classes

    def get_feature_names(self):
        """Get list of feature names."""
        return self.feature_names

    def is_loaded(self):
        """Check if model is loaded and trained."""
        return self.is_trained