reennv committed on
Commit
cc5422a
·
verified ·
1 Parent(s): 187655e

Upload 21 files

Browse files
Files changed (21) hide show
  1. Prediksi Performa Akademik/edtech/backend/data/processed/cleaned_education_data.csv +0 -0
  2. Prediksi Performa Akademik/edtech/backend/data/raw/personalized_education_data.csv +0 -0
  3. Prediksi Performa Akademik/edtech/backend/models/performance_predictor/trained_model/data_processor.pkl +3 -0
  4. Prediksi Performa Akademik/edtech/backend/models/performance_predictor/trained_model/model_metrics.json +8 -0
  5. Prediksi Performa Akademik/edtech/backend/models/performance_predictor/trained_model/performance_model.pkl +3 -0
  6. Prediksi Performa Akademik/edtech/backend/models/performance_predictor/trained_model/performance_model_20250709_221148_params.json +10 -0
  7. Prediksi Performa Akademik/edtech/backend/models/performance_predictor/trained_model/performance_model_shap_values.npy +3 -0
  8. Prediksi Performa Akademik/edtech/backend/models/performance_predictor/training_logs/actual_vs_predicted.png +0 -0
  9. Prediksi Performa Akademik/edtech/backend/models/performance_predictor/training_logs/feature_importance.csv +5 -0
  10. Prediksi Performa Akademik/edtech/backend/models/performance_predictor/training_logs/feature_importance.png +0 -0
  11. Prediksi Performa Akademik/edtech/backend/models/performance_predictor/training_logs/residual_plot.png +0 -0
  12. Prediksi Performa Akademik/edtech/backend/src/app.py +431 -0
  13. Prediksi Performa Akademik/edtech/backend/src/models/recommenders/collaborative/collab_model.joblib +3 -0
  14. Prediksi Performa Akademik/edtech/backend/src/models/recommenders/content_based/content_model.joblib +3 -0
  15. Prediksi Performa Akademik/edtech/backend/src/models/recommenders/hybrid/hybrid_model.joblib +3 -0
  16. Prediksi Performa Akademik/edtech/backend/src/performance_prediction/__init__.py +11 -0
  17. Prediksi Performa Akademik/edtech/backend/src/performance_prediction/data_processor.py +180 -0
  18. Prediksi Performa Akademik/edtech/backend/src/performance_prediction/evaluator.py +255 -0
  19. Prediksi Performa Akademik/edtech/backend/src/performance_prediction/model_trainer.py +412 -0
  20. Prediksi Performa Akademik/edtech/backend/src/performance_prediction/predictor.py +289 -0
  21. Prediksi Performa Akademik/edtech/backend/src/train_performance_predictor.py +164 -0
Prediksi Performa Akademik/edtech/backend/data/processed/cleaned_education_data.csv ADDED
The diff for this file is too large to render. See raw diff
 
Prediksi Performa Akademik/edtech/backend/data/raw/personalized_education_data.csv ADDED
The diff for this file is too large to render. See raw diff
 
Prediksi Performa Akademik/edtech/backend/models/performance_predictor/trained_model/data_processor.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9510e6be685fb7b5fdfd38517a116a0403faa75c922b2e81a9f215921ac2e0be
3
+ size 217195
Prediksi Performa Akademik/edtech/backend/models/performance_predictor/trained_model/model_metrics.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mse": 0.05486344948816889,
3
+ "rmse": 0.23422948039939143,
4
+ "mae": 0.1660625786187038,
5
+ "r2": 0.29007536468986816,
6
+ "max_error": 0.7487417459487915,
7
+ "mape": 27499842257.400738
8
+ }
Prediksi Performa Akademik/edtech/backend/models/performance_predictor/trained_model/performance_model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c0de80b55e64dc99f1e1ffcc69f7ff3799341b393000728bcf641c64ea02b27
3
+ size 50035
Prediksi Performa Akademik/edtech/backend/models/performance_predictor/trained_model/performance_model_20250709_221148_params.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "subsample": 0.8,
3
+ "reg_lambda": 10,
4
+ "reg_alpha": 1,
5
+ "min_child_weight": 1,
6
+ "max_depth": 9,
7
+ "learning_rate": 0.1,
8
+ "gamma": 0.2,
9
+ "colsample_bytree": 1.0
10
+ }
Prediksi Performa Akademik/edtech/backend/models/performance_predictor/trained_model/performance_model_shap_values.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de8239f80d8fb314e44e471031c7d93c12ed854bcb692f041b89d65ad19c136c
3
+ size 9728
Prediksi Performa Akademik/edtech/backend/models/performance_predictor/training_logs/actual_vs_predicted.png ADDED
Prediksi Performa Akademik/edtech/backend/models/performance_predictor/training_logs/feature_importance.csv ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ feature,importance
2
+ f18,22.0
3
+ f4,19.0
4
+ f12,3.0
5
+ f3,1.0
Prediksi Performa Akademik/edtech/backend/models/performance_predictor/training_logs/feature_importance.png ADDED
Prediksi Performa Akademik/edtech/backend/models/performance_predictor/training_logs/residual_plot.png ADDED
Prediksi Performa Akademik/edtech/backend/src/app.py ADDED
@@ -0,0 +1,431 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException, Depends, status
2
+ from fastapi.middleware.cors import CORSMiddleware
3
+ from pydantic import BaseModel, Field, field_validator
4
+ from typing import List, Optional
5
+ import pandas as pd
6
+ import joblib
7
+ from pathlib import Path
8
+ import numpy as np
9
+ import sys
10
+ import logging
11
+ import time
12
+ from prometheus_fastapi_instrumentator import Instrumentator
13
+ import uvicorn
14
+ import xgboost as xgb
15
+ import shap
16
+ import json
17
+ from contextlib import asynccontextmanager
18
+ from datetime import datetime
19
+ import os
20
+
21
# Logging setup: mirror every log line to stdout (container logs) and to a
# flat file on disk.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('api.log')
    ]
)
logger = logging.getLogger(__name__)

# Model artifact locations. app.py lives in backend/src/, so two `.parent`
# hops put BASE_DIR at the backend root.
BASE_DIR = Path(__file__).parent.parent
MODEL_DIR = BASE_DIR / "models" / "performance_predictor" / "trained_model"
MODEL_PATH = MODEL_DIR / "performance_model.pkl"
PREPROCESSOR_PATH = MODEL_DIR / "data_processor.pkl"
METRICS_PATH = MODEL_DIR / "model_metrics.json"

# Make sure the model directory exists before anything reads/writes it.
os.makedirs(MODEL_DIR, exist_ok=True)
42
# Lifespan handler: load model artifacts exactly once at process startup.
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Manage application startup/shutdown.

    Loads the model components and the persisted evaluation metrics into
    ``app.state``; falls back to the metrics recorded at training time if
    the metrics file is absent.

    Fix: the previous revision raised ``HTTPException`` on startup failure.
    No request exists during startup, so that exception can never be turned
    into an HTTP response -- it only aborts startup with a confusing error.
    A ``RuntimeError`` chained from the original cause is raised instead.
    The ``yield`` is also moved out of the ``try`` so runtime/shutdown
    errors are not mislabelled as startup errors.
    """
    try:
        # Model, preprocessor and ordered feature names.
        app.state.model_components = await load_components()

        if METRICS_PATH.exists():
            with open(METRICS_PATH) as f:
                app.state.model_metrics = json.load(f)
        else:
            # Values recorded at training time, used as a fallback.
            app.state.model_metrics = {
                "mse": 0.05486344948816889,
                "rmse": 0.23422948039939143,
                "mae": 0.1660625786187038,
                "r2": 0.29007536468986816,
                "max_error": 0.7487417459487915
            }
            logger.warning("File metrik model tidak ditemukan, menggunakan nilai default")
    except Exception as e:
        logger.error(f"Startup error: {str(e)}")
        raise RuntimeError("Gagal memulai aplikasi") from e

    logger.info("Aplikasi siap menerima request")
    yield
74
# FastAPI application instance; the lifespan handler above performs the
# one-time model load at startup.
app = FastAPI(
    title="EdTech Performance Prediction API",
    description="API untuk memprediksi performa akademik siswa menggunakan model XGBoost",
    version="2.0.0",
    docs_url="/docs",
    redoc_url="/redoc",
    lifespan=lifespan
)

# CORS: only the local frontend origins are allowed (port 3024 on
# localhost and on the host-only adapter address).
app.add_middleware(
    CORSMiddleware,
    allow_origins=["http://localhost:3024", "http://192.168.56.1:3024"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Prometheus metrics, exposed at the default /metrics endpoint.
Instrumentator().instrument(app).expose(app)
96
# Pydantic request model: one row of model features.
class FeatureInput(BaseModel):
    """Feature vector for a single student/session prediction.

    Range constraints are enforced by the ``Field`` declarations; the
    validator below re-checks the proportion-valued fields as a belt-and-
    braces measure.

    Fix: ``efisiensi_belajar`` was previously included in the 0..1
    proportion validator even though its ``Field`` only declares ``ge=0``
    (it is an efficiency *index*, not a proportion). The contradiction made
    the API reject inputs its own schema advertised as valid, so it has
    been removed from the validator list.
    """
    grade: float = Field(..., gt=0, le=12, description="Kelas siswa (1-12)")
    tech_savvy: int = Field(..., ge=1, le=5, description="Kemampuan teknologi (skala 1-5)")
    duration_minutes: float = Field(..., gt=0, description="Durasi belajar dalam menit")
    engagement_score: float = Field(..., ge=0, le=1, description="Skor engagement (0-1)")
    completion_rate: float = Field(..., ge=0, le=1, description="Tingkat penyelesaian materi (0-1)")
    material_rating: float = Field(..., ge=1, le=5, description="Rating materi (skala 1-5)")
    interaction_duration: float = Field(..., gt=0, description="Durasi interaksi dengan materi")
    material_engagement_score: float = Field(..., ge=0, le=1, description="Skor engagement dengan materi")
    feature_engagement: float = Field(..., ge=0, le=1, description="Engagement dengan fitur platform")
    jam_belajar: float = Field(..., ge=0, le=24, description="Jam belajar (0-24)")
    hari_dalam_minggu: float = Field(..., ge=0, le=6, description="Hari dalam minggu (0-6)")
    akhir_pekan: float = Field(..., ge=0, le=1, description="Indikator akhir pekan (0/1)")
    efisiensi_belajar: float = Field(..., ge=0, description="Indeks efisiensi belajar")
    rasio_penyelesaian: float = Field(..., ge=0, le=1, description="Rasio penyelesaian tugas")
    interaksi_total: float = Field(..., ge=0, description="Total interaksi dengan platform")
    preferensi_materi: float = Field(..., ge=0, le=1, description="Preferensi jenis materi")
    jumlah_pengakses: float = Field(..., ge=0, description="Jumlah pengakses materi")
    engagement_rata2: float = Field(..., ge=0, le=1, description="Rata-rata engagement")
    performance_label_encoded: int = Field(..., ge=0, description="Label performa (encoded)")
    learning_speed_encoded: int = Field(..., ge=0, description="Kecepatan belajar (encoded)")
    student_feedback_encoded: int = Field(..., ge=0, description="Feedback siswa (encoded)")
    achievement_status_encoded: int = Field(..., ge=0, description="Status pencapaian (encoded)")

    @field_validator('engagement_score', 'completion_rate', 'material_engagement_score',
                     'feature_engagement', 'rasio_penyelesaian',
                     'preferensi_materi', 'engagement_rata2')
    @classmethod
    def check_proportion(cls, v):
        # Redundant with the ge=0/le=1 Field constraints, but gives a
        # uniform error message for all proportion fields.
        if not 0 <= v <= 1:
            raise ValueError("Nilai harus antara 0 dan 1")
        return v
130
class PredictionInput(BaseModel):
    """Request body for /predict: a single feature vector."""
    features: FeatureInput

class BatchPredictionInput(BaseModel):
    """Request body for /predict/batch: a list of feature vectors."""
    samples: List[FeatureInput]

class FeatureContribution(BaseModel):
    """One feature's SHAP contribution to a prediction."""
    feature: str
    value: float
    contribution: float

class PredictionResponse(BaseModel):
    """Response body for /predict."""
    prediction: float = Field(..., description="Nilai prediksi skor kuis")
    confidence_interval: List[float] = Field(..., description="Interval kepercayaan prediksi")
    feature_contributions: Optional[List[FeatureContribution]] = Field(
        None,
        description="Kontribusi masing-masing fitur terhadap prediksi"
    )
    execution_time_ms: float = Field(..., description="Waktu eksekusi dalam milidetik")
    model_version: str = Field(..., description="Versi model yang digunakan")

class BatchPredictionResponse(BaseModel):
    """Response body for /predict/batch; lists are index-aligned with the input samples."""
    predictions: List[float]
    confidence_intervals: List[List[float]]
    feature_contributions: Optional[List[List[FeatureContribution]]]
    execution_time_ms: float
    model_version: str
    total_samples: int
    avg_time_per_sample_ms: float

class HealthCheckResponse(BaseModel):
    """Response body for /health."""
    status: str
    model_version: str
    model_metrics: dict
    uptime_seconds: float

class ModelInfoResponse(BaseModel):
    """Response body for /model/info."""
    features: List[str]
    model_type: str
    training_date: Optional[str]
    performance_metrics: dict
172
# Loads model artifacts from disk; invoked once by the lifespan handler.
async def load_components():
    """Load the trained model and preprocessor from disk.

    Returns a dict with:
        model: trained regressor (raw ``xgb.Booster`` or sklearn-style).
        preprocessor: fitted transformer saved alongside the model.
        feature_names: ordered feature columns the model expects.
        load_time: loading duration in seconds.
        loaded_at: wall-clock timestamp when loading finished.

    Fix: ``loaded_at`` is new. Only the *duration* was stored before, yet
    /health needs a start *timestamp* to compute uptime; adding the key is
    backward-compatible (existing keys unchanged).

    Raises HTTPException(500) when an artifact is missing or unreadable.
    """
    try:
        start_time = time.time()

        # Fail fast with an explicit message if an artifact is missing.
        if not MODEL_PATH.exists():
            raise FileNotFoundError(f"File model tidak ditemukan di {MODEL_PATH}")
        if not PREPROCESSOR_PATH.exists():
            raise FileNotFoundError(f"File preprocessor tidak ditemukan di {PREPROCESSOR_PATH}")

        # Trained model.
        model = joblib.load(MODEL_PATH)
        logger.info(f"Model berhasil dimuat dari {MODEL_PATH}")

        # Preprocessor bundle: transformer + ordered feature names.
        processor_data = joblib.load(PREPROCESSOR_PATH)
        preprocessor = processor_data['preprocessor']
        feature_names = processor_data['feature_names']
        logger.info(f"Preprocessor berhasil dimuat dari {PREPROCESSOR_PATH}")

        load_time = time.time() - start_time
        logger.info(f"Komponen model berhasil dimuat dalam {load_time:.2f} detik")

        return {
            "model": model,
            "preprocessor": preprocessor,
            "feature_names": feature_names,
            "load_time": load_time,
            # Timestamp, not duration -- consumed by /health for uptime.
            "loaded_at": time.time()
        }
    except FileNotFoundError as e:
        logger.error(f"File tidak ditemukan: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"File model/preprocessor tidak ditemukan: {str(e)}"
        )
    except Exception as e:
        logger.error(f"Gagal memuat model/preprocessor: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Gagal memuat komponen model: {str(e)}"
        )
216
# Main endpoints
@app.get("/", include_in_schema=False)
async def root():
    """Root endpoint: welcome message plus a map of the available routes."""
    available_endpoints = {
        "docs": "/docs",
        "health": "/health",
        "model_info": "/model/info",
        "predict": "/predict",
        "batch_predict": "/predict/batch",
    }
    return {
        "message": "Selamat datang di EdTech Performance Prediction API",
        "version": app.version,
        "endpoints": available_endpoints,
    }
232
@app.get("/health", response_model=HealthCheckResponse)
async def health_check():
    """Health check: service status, model metrics, and uptime.

    Fix: the previous implementation computed
    ``time.time() - load_time`` where ``load_time`` is a sub-second
    loading *duration*, so ``uptime_seconds`` was roughly the current Unix
    epoch time. We now subtract the ``loaded_at`` startup timestamp; if it
    is absent (older artifact dict) we fall back to "just now" (uptime 0).
    """
    started = app.state.model_components.get("loaded_at", time.time())
    return {
        "status": "healthy",
        "model_version": app.version,
        "model_metrics": app.state.model_metrics,
        "uptime_seconds": time.time() - started
    }
242
@app.get("/model/info", response_model=ModelInfoResponse)
async def model_info():
    """Model metadata: expected features, model type, training date, metrics."""
    # The model file's mtime serves as a proxy for the training date.
    trained_at = datetime.fromtimestamp(MODEL_PATH.stat().st_mtime)
    components = app.state.model_components
    return {
        "features": components["feature_names"],
        "model_type": "XGBoost Regressor",
        "training_date": trained_at.isoformat(),
        "performance_metrics": app.state.model_metrics
    }
252
@app.post("/predict", response_model=PredictionResponse)
async def predict_performance(
    input_data: PredictionInput
):
    """Single-sample prediction.

    Pipeline: validate/order features -> preprocess -> predict ->
    95% confidence interval from the stored MSE -> optional SHAP
    feature contributions.

    Fixes vs. previous revision:
    - ``model_dump()`` replaces the deprecated pydantic-v1 ``dict()``
      (this file already uses the v2 ``field_validator`` API).
    - Unexpected server-side failures now return 500 instead of 400;
      422 remains the response for invalid input.
    """
    start_time = time.time()

    try:
        components = app.state.model_components
        model = components["model"]
        preprocessor = components["preprocessor"]
        feature_names = components["feature_names"]

        # One-row DataFrame from the validated input.
        input_dict = input_data.features.model_dump()
        input_df = pd.DataFrame([input_dict])

        # NOTE(review): feature_names comes from the saved preprocessor;
        # if it includes engineered columns absent from FeatureInput this
        # raises for every request -- confirm artifact/schema agreement.
        missing_cols = set(feature_names) - set(input_df.columns)
        if missing_cols:
            raise ValueError(f"Kolom berikut tidak ditemukan: {missing_cols}")

        # Order columns exactly as the model expects.
        input_df = input_df[feature_names]

        processed_input = preprocessor.transform(input_df)

        # A raw xgb.Booster needs a DMatrix; sklearn-style models do not.
        if isinstance(model, xgb.Booster):
            dmatrix = xgb.DMatrix(processed_input)
            prediction = model.predict(dmatrix)[0]
        else:
            prediction = model.predict(processed_input)[0]

        # 95% interval from stored test MSE, clamped to the [0, 1] score range.
        std_dev = np.sqrt(app.state.model_metrics.get('mse', 0.05486344948816889))
        confidence = [max(0, prediction - 1.96*std_dev), min(1, prediction + 1.96*std_dev)]

        # Best-effort SHAP explanation; failures are logged, never fatal.
        feature_contributions = None
        if hasattr(model, 'feature_names_in_'):
            try:
                explainer = shap.Explainer(model)
                shap_values = explainer(processed_input)

                feature_contributions = []
                for i, feature in enumerate(feature_names):
                    feature_contributions.append({
                        "feature": feature,
                        "value": input_df.iloc[0][feature],
                        "contribution": float(shap_values[0].values[i])
                    })
                # Largest absolute contribution first.
                feature_contributions.sort(key=lambda x: abs(x["contribution"]), reverse=True)
            except Exception as e:
                logger.warning(f"Tidak dapat menghitung SHAP values: {str(e)}")

        exec_time = (time.time() - start_time) * 1000  # milliseconds

        return {
            "prediction": float(prediction),
            "confidence_interval": confidence,
            "feature_contributions": feature_contributions,
            "execution_time_ms": exec_time,
            "model_version": app.version
        }

    except ValueError as e:
        logger.error(f"Input validation error: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
            detail=f"Input tidak valid: {str(e)}"
        )
    except Exception as e:
        logger.error(f"Error dalam prediksi: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Error dalam prediksi: {str(e)}"
        )
334
@app.post("/predict/batch", response_model=BatchPredictionResponse)
async def batch_predict_performance(
    input_data: BatchPredictionInput
):
    """Batch prediction over a list of samples.

    Fixes vs. previous revision:
    - an empty ``samples`` list is rejected up front (it previously
      crashed with ZeroDivisionError when averaging time per sample);
    - ``model_dump()`` replaces the deprecated pydantic-v1 ``dict()``;
    - unexpected server-side failures return 500, not 400.
    """
    start_time = time.time()

    try:
        # An empty batch would divide by zero below; reject it as invalid.
        if not input_data.samples:
            raise ValueError("Daftar samples kosong")

        components = app.state.model_components
        model = components["model"]
        preprocessor = components["preprocessor"]
        feature_names = components["feature_names"]

        # One DataFrame row per sample.
        samples = [sample.model_dump() for sample in input_data.samples]
        input_df = pd.DataFrame(samples)

        # NOTE(review): see /predict -- feature_names must match the
        # FeatureInput schema for this check to pass.
        missing_cols = set(feature_names) - set(input_df.columns)
        if missing_cols:
            raise ValueError(f"Kolom berikut tidak ditemukan: {missing_cols}")

        # Order columns exactly as the model expects.
        input_df = input_df[feature_names]

        processed_input = preprocessor.transform(input_df)

        # A raw xgb.Booster needs a DMatrix; sklearn-style models do not.
        if isinstance(model, xgb.Booster):
            dmatrix = xgb.DMatrix(processed_input)
            predictions = model.predict(dmatrix)
        else:
            predictions = model.predict(processed_input)

        # Per-sample 95% intervals from stored test MSE, clamped to [0, 1].
        std_dev = np.sqrt(app.state.model_metrics.get('mse', 0.05486344948816889))
        conf_intervals = [
            [max(0, p - 1.96*std_dev), min(1, p + 1.96*std_dev)]
            for p in predictions
        ]

        # Best-effort SHAP explanations; failures are logged, never fatal.
        feature_contributions_list = None
        if hasattr(model, 'feature_names_in_'):
            try:
                explainer = shap.Explainer(model)
                shap_values = explainer(processed_input)

                feature_contributions_list = []
                for i in range(len(predictions)):
                    contributions = []
                    for j, feature in enumerate(feature_names):
                        contributions.append({
                            "feature": feature,
                            "value": input_df.iloc[i][feature],
                            "contribution": float(shap_values[i].values[j])
                        })
                    # Largest absolute contribution first.
                    contributions.sort(key=lambda x: abs(x["contribution"]), reverse=True)
                    feature_contributions_list.append(contributions)
            except Exception as e:
                logger.warning(f"Tidak dapat menghitung SHAP values untuk batch: {str(e)}")

        exec_time = (time.time() - start_time) * 1000  # milliseconds
        avg_time_per_sample = exec_time / len(predictions)

        return {
            "predictions": [float(p) for p in predictions],
            "confidence_intervals": conf_intervals,
            "feature_contributions": feature_contributions_list,
            "execution_time_ms": exec_time,
            "model_version": app.version,
            "total_samples": len(predictions),
            "avg_time_per_sample_ms": avg_time_per_sample
        }

    except ValueError as e:
        logger.error(f"Input validation error: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
            detail=f"Input tidak valid: {str(e)}"
        )
    except Exception as e:
        logger.error(f"Error dalam batch prediction: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Error dalam batch prediction: {str(e)}"
        )
425
if __name__ == "__main__":
    # Dev entry point. Host/port can now be overridden via environment
    # variables; the historical defaults (host-only adapter address,
    # port 8024) are preserved when the variables are unset.
    uvicorn.run(
        "app:app",
        host=os.getenv("APP_HOST", "192.168.56.1"),
        port=int(os.getenv("APP_PORT", "8024")),
        reload=True
    )
Prediksi Performa Akademik/edtech/backend/src/models/recommenders/collaborative/collab_model.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4aef73c6272415cb11002c1ff5c96f65587498acaa7c86ad4f7167d1d73fe48
3
+ size 6080
Prediksi Performa Akademik/edtech/backend/src/models/recommenders/content_based/content_model.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63d1a2f5acb72fa4e6c3825586d578da46d850c31d82883ef50f618789722977
3
+ size 5211833
Prediksi Performa Akademik/edtech/backend/src/models/recommenders/hybrid/hybrid_model.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d173427052471e467df306ab61013e0599cfb0a80ff3805e464f9b7a25166933
3
+ size 32
Prediksi Performa Akademik/edtech/backend/src/performance_prediction/__init__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
# Public API of the performance_prediction package: re-export the four
# pipeline stages so callers can import them from the package root.
from .data_processor import PerformanceDataProcessor
from .model_trainer import PerformanceModelTrainer
from .evaluator import PerformanceEvaluator
from .predictor import PerformancePredictor

__all__ = [
    'PerformanceDataProcessor',
    'PerformanceModelTrainer',
    'PerformanceEvaluator',
    'PerformancePredictor'
]
Prediksi Performa Akademik/edtech/backend/src/performance_prediction/data_processor.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # backend/src/performance_prediction/data_processor.py
2
+
3
+ import pandas as pd
4
+ import numpy as np
5
+ from sklearn.model_selection import train_test_split
6
+ from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder
7
+ from sklearn.impute import SimpleImputer
8
+ from sklearn.pipeline import Pipeline
9
+ from sklearn.compose import ColumnTransformer
10
+ import joblib
11
+ import os
12
+ from datetime import datetime
13
+ from pathlib import Path
14
+ import logging
15
+ import json
16
+
17
+ class PerformanceDataProcessor:
18
+ def __init__(self, data_path, config_path=None):
19
+ self.data_path = data_path
20
+ self.config_path = config_path
21
+ self.features = None
22
+ self.target = None
23
+ self.preprocessor = None
24
+ self.logger = self._setup_logger()
25
+
26
+ def _setup_logger(self):
27
+ logger = logging.getLogger(__name__)
28
+ logger.setLevel(logging.INFO)
29
+ handler = logging.StreamHandler()
30
+ formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
31
+ handler.setFormatter(formatter)
32
+ logger.addHandler(handler)
33
+ return logger
34
+
35
+ def load_data(self):
36
+ """Memuat data dengan penanganan error yang lebih baik"""
37
+ try:
38
+ data = pd.read_csv(self.data_path)
39
+
40
+ # Log informasi dasar data
41
+ self.logger.info(f"Data berhasil dimuat. Shape: {data.shape}")
42
+ self.logger.info(f"Kolom yang tersedia: {list(data.columns)}")
43
+ self.logger.info(f"Contoh data:\n{data.head(2)}")
44
+
45
+ return data
46
+ except Exception as e:
47
+ self.logger.error(f"Gagal memuat data: {str(e)}")
48
+ raise
49
+
50
+ def prepare_features_target(self, data, target_col='quiz_score'):
51
+ """
52
+ Menyiapkan fitur dan target dengan penanganan data yang lebih komprehensif
53
+ """
54
+ try:
55
+ # Load feature configuration if available
56
+ if self.config_path:
57
+ with open(self.config_path) as f:
58
+ config = json.load(f)
59
+ relevant_features = config.get('features', [])
60
+ else:
61
+ # Default features
62
+ relevant_features = [
63
+ 'grade', 'tech_savvy', 'duration_minutes', 'engagement_score',
64
+ 'completion_rate', 'material_rating', 'interaction_duration',
65
+ 'material_engagement_score', 'feature_engagement', 'jam_belajar',
66
+ 'hari_dalam_minggu', 'akhir_pekan', 'efisiensi_belajar',
67
+ 'rasio_penyelesaian', 'interaksi_total', 'preferensi_materi',
68
+ 'jumlah_pengakses', 'engagement_rata2', 'performance_label_encoded',
69
+ 'learning_speed_encoded', 'student_feedback_encoded',
70
+ 'achievement_status_encoded'
71
+ ]
72
+
73
+ # Tambahkan fitur interaksi baru
74
+ data['efisiensi_engagement'] = data['engagement_score'] / (data['duration_minutes'] + 1e-6)
75
+ data['learning_consistency'] = data['completion_rate'] * data['material_rating']
76
+ relevant_features.extend(['efisiensi_engagement', 'learning_consistency'])
77
+
78
+ # Pastikan kolom target ada
79
+ if target_col not in data.columns:
80
+ raise ValueError(f"Kolom target '{target_col}' tidak ditemukan")
81
+
82
+ # Handle missing values
83
+ data[relevant_features] = data[relevant_features].fillna(data[relevant_features].median())
84
+
85
+ self.features = data[relevant_features]
86
+ self.target = data[target_col]
87
+
88
+ # Setup preprocessing pipeline
89
+ numeric_features = self.features.select_dtypes(include=['int64', 'float64']).columns
90
+ categorical_features = self.features.select_dtypes(include=['object', 'category']).columns
91
+
92
+ numeric_transformer = Pipeline(steps=[
93
+ ('imputer', SimpleImputer(strategy='median')),
94
+ ('scaler', RobustScaler()) # Lebih robust terhadap outlier
95
+ ])
96
+
97
+ categorical_transformer = Pipeline(steps=[
98
+ ('imputer', SimpleImputer(strategy='most_frequent')),
99
+ ('onehot', OneHotEncoder(handle_unknown='ignore'))
100
+ ])
101
+
102
+ self.preprocessor = ColumnTransformer(
103
+ transformers=[
104
+ ('num', numeric_transformer, numeric_features),
105
+ ('cat', categorical_transformer, categorical_features)
106
+ ])
107
+
108
+ return self.features, self.target
109
+
110
+ except Exception as e:
111
+ self.logger.error(f"Error dalam menyiapkan fitur: {str(e)}")
112
+ raise
113
+
114
+ def split_data(self, test_size=0.2, val_size=0.2, random_state=42):
115
+ """Membagi data menjadi train, validation, dan test set"""
116
+ try:
117
+ if self.features is None or self.target is None:
118
+ raise ValueError("Fitur atau target belum disiapkan")
119
+
120
+ # Bagi data menjadi train+val dan test
121
+ X_train_val, X_test, y_train_val, y_test = train_test_split(
122
+ self.features, self.target,
123
+ test_size=test_size,
124
+ random_state=random_state
125
+ )
126
+
127
+ # Bagi train_val menjadi train dan validation
128
+ val_size_adjusted = val_size / (1 - test_size) # Adjust untuk ukuran asli dataset
129
+ X_train, X_val, y_train, y_val = train_test_split(
130
+ X_train_val, y_train_val,
131
+ test_size=val_size_adjusted,
132
+ random_state=random_state
133
+ )
134
+
135
+ # Preprocess data
136
+ X_train = self.preprocessor.fit_transform(X_train)
137
+ X_val = self.preprocessor.transform(X_val)
138
+ X_test = self.preprocessor.transform(X_test)
139
+
140
+ # Validasi data
141
+ self._validate_data(X_train, y_train)
142
+ self._validate_data(X_val, y_val)
143
+ self._validate_data(X_test, y_test)
144
+
145
+ return X_train, X_val, X_test, y_train, y_val, y_test
146
+
147
+ except Exception as e:
148
+ self.logger.error(f"Error dalam membagi data: {str(e)}")
149
+ raise
150
+
151
+ def _validate_data(self, X, y):
152
+ """Validasi kualitas data"""
153
+ if isinstance(X, np.ndarray):
154
+ if np.any(np.isnan(X)) or np.any(np.isinf(X)):
155
+ raise ValueError("Data mengandung NaN atau infinity")
156
+ if len(X) != len(y):
157
+ raise ValueError("Jumlah sampel X dan y tidak sama")
158
+ if len(y) == 0:
159
+ raise ValueError("Data target kosong")
160
+
161
+ def save_processor(self, save_dir):
162
+ """Menyimpan processor dan preprocessing pipeline"""
163
+ try:
164
+ os.makedirs(save_dir, exist_ok=True)
165
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
166
+ save_path = Path(save_dir) / f"data_processor_{timestamp}.pkl"
167
+
168
+ # Simpan seluruh objek processor
169
+ joblib.dump({
170
+ 'processor': self,
171
+ 'preprocessor': self.preprocessor,
172
+ 'feature_names': list(self.features.columns) if self.features is not None else None
173
+ }, save_path)
174
+
175
+ self.logger.info(f"Processor disimpan di: {save_path}")
176
+ return str(save_path)
177
+
178
+ except Exception as e:
179
+ self.logger.error(f"Gagal menyimpan processor: {str(e)}")
180
+ raise
Prediksi Performa Akademik/edtech/backend/src/performance_prediction/evaluator.py ADDED
@@ -0,0 +1,255 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # backend/src/performance_prediction/evaluator.py
2
+
3
+ import numpy as np
4
+ import matplotlib.pyplot as plt
5
+ import seaborn as sns
6
+ from sklearn.metrics import (
7
+ mean_squared_error,
8
+ mean_absolute_error,
9
+ r2_score,
10
+ explained_variance_score,
11
+ max_error,
12
+ mean_absolute_percentage_error
13
+ )
14
+ import pandas as pd
15
+ import logging
16
+ from typing import Dict, Tuple, Optional
17
+ from pathlib import Path
18
+ import json
19
+ import shap
20
+
21
class PerformanceEvaluator:
    """Evaluate regression predictions and produce diagnostic artifacts.

    Standard regression metrics are computed at construction time; when a
    trained model and its feature matrix are supplied, SHAP values are
    computed as well for model interpretability.
    """

    def __init__(self, y_true: np.ndarray, y_pred: np.ndarray, model=None, X_test=None):
        """
        Parameters:
            y_true (np.ndarray): ground-truth target values
            y_pred (np.ndarray): predicted target values
            model (optional): trained estimator used for interpretation
            X_test (optional): feature matrix used for interpretation
        """
        self.y_true = y_true
        self.y_pred = y_pred
        self.model = model
        self.X_test = X_test
        self.shap_values = None
        self.logger = self._setup_logger()
        self.metrics = self.calculate_metrics()

    def _setup_logger(self):
        """Return the module logger, attaching a stream handler only once."""
        logger = logging.getLogger(__name__)
        logger.setLevel(logging.INFO)
        # Fix: without this guard every evaluator instance appended another
        # handler to the shared module logger, duplicating every log line.
        if not logger.handlers:
            handler = logging.StreamHandler()
            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            handler.setFormatter(formatter)
            logger.addHandler(handler)
        return logger

    def calculate_metrics(self) -> Dict[str, float]:
        """Compute the full set of regression metrics.

        Returns:
            Dict mapping metric name to value. ``'mape'`` is set to
            ``np.inf`` when sklearn refuses to compute it (zeros in y_true).
        """
        errors = self.y_true - self.y_pred
        metrics = {
            'mse': mean_squared_error(self.y_true, self.y_pred),
            'rmse': np.sqrt(mean_squared_error(self.y_true, self.y_pred)),
            'mae': mean_absolute_error(self.y_true, self.y_pred),
            'r2': r2_score(self.y_true, self.y_pred),
            'explained_variance': explained_variance_score(self.y_true, self.y_pred),
            'max_error': max_error(self.y_true, self.y_pred),
            'mean_error': np.mean(errors),
            'std_error': np.std(errors)
        }

        # MAPE is undefined when y_true contains zeros.
        try:
            metrics['mape'] = mean_absolute_percentage_error(self.y_true, self.y_pred) * 100
        except ValueError:
            metrics['mape'] = np.inf
            self.logger.warning("Terdapat nilai 0 pada y_true, MAPE tidak dapat dihitung")

        # Interpretability metric, only when model and data are available.
        if self.model is not None and self.X_test is not None:
            try:
                self._calculate_shap_values()
                metrics['mean_abs_shap'] = np.mean(np.abs(self.shap_values))
            except Exception as e:
                self.logger.warning(f"Tidak dapat menghitung SHAP values: {str(e)}")

        return metrics

    def _calculate_shap_values(self, sample_size: int = 100):
        """Compute SHAP values on (a sample of) X_test.

        Parameters:
            sample_size: maximum number of rows used, for efficiency.

        Raises:
            ValueError: if no model or feature matrix was supplied.
        """
        if self.model is None or self.X_test is None:
            raise ValueError("Model dan X_test diperlukan untuk menghitung SHAP values")

        # Subsample large test sets to keep SHAP affordable.
        if len(self.X_test) > sample_size:
            sample_idx = np.random.choice(len(self.X_test), sample_size, replace=False)
            X_sample = self.X_test[sample_idx]
        else:
            X_sample = self.X_test

        # Fix: the original if/else on hasattr(model, 'predict_proba') had
        # two byte-identical branches; shap.Explainer dispatches on its own.
        explainer = shap.Explainer(self.model)
        self.shap_values = explainer(X_sample).values

    def get_performance_report(self) -> str:
        """Return the computed metrics as a formatted, human-readable string."""
        report = "\n=== MODEL PERFORMANCE REPORT ===\n"
        for name, value in self.metrics.items():
            report += f"{name.upper():<20}: {value:.4f}\n"
        return report

    def plot_residuals(self, save_path: Optional[str] = None) -> Optional[plt.Figure]:
        """Plot residuals vs predictions with mean line and 95% band.

        Parameters:
            save_path (optional): if given, the figure is written there and
                closed; otherwise the pyplot module is returned for display.
        """
        residuals = self.y_true - self.y_pred

        plt.figure(figsize=(12, 8))
        sns.scatterplot(x=self.y_pred, y=residuals, alpha=0.6)

        # Zero-residual reference line.
        plt.axhline(y=0, color='r', linestyle='--')

        # Mean residual highlights systematic bias.
        mean_residual = np.mean(residuals)
        plt.axhline(y=mean_residual, color='b', linestyle='-',
                    label=f'Mean Residual: {mean_residual:.2f}')

        # Approximate 95% band assuming normally distributed residuals.
        std_residual = np.std(residuals)
        plt.axhline(y=mean_residual + 1.96*std_residual, color='g', linestyle=':',
                    label='95% Confidence Interval')
        plt.axhline(y=mean_residual - 1.96*std_residual, color='g', linestyle=':')

        plt.xlabel('Predicted Values')
        plt.ylabel('Residuals')
        plt.title('Residual Analysis')
        plt.legend()

        if save_path:
            plt.savefig(save_path, bbox_inches='tight')
            plt.close()
            self.logger.info(f"Residual plot disimpan di: {save_path}")
        else:
            return plt

    def plot_actual_vs_predicted(self, save_path: Optional[str] = None) -> Optional[plt.Figure]:
        """Scatter actual vs predicted values with ideal and fitted lines."""
        plt.figure(figsize=(12, 8))

        sns.scatterplot(x=self.y_true, y=self.y_pred, alpha=0.6)

        # Diagonal = perfect prediction.
        min_val = min(self.y_true.min(), self.y_pred.min())
        max_val = max(self.y_true.max(), self.y_pred.max())
        plt.plot([min_val, max_val], [min_val, max_val], 'r--', label='Ideal Prediction')

        # Least-squares fit of predictions on actuals; slope near 1 is good.
        coef = np.polyfit(self.y_true, self.y_pred, 1)
        poly1d_fn = np.poly1d(coef)
        plt.plot(self.y_true, poly1d_fn(self.y_true), 'b-',
                 label=f'Regression Line (slope={coef[0]:.2f})')

        plt.xlabel('Actual Values')
        plt.ylabel('Predicted Values')
        plt.title('Actual vs Predicted Values')
        plt.legend()

        if save_path:
            plt.savefig(save_path, bbox_inches='tight')
            plt.close()
            self.logger.info(f"Actual vs Predicted plot disimpan di: {save_path}")
        else:
            return plt

    def plot_error_distribution(self, save_path: Optional[str] = None) -> Optional[plt.Figure]:
        """Histogram + KDE of prediction errors, annotated with mean/std."""
        errors = self.y_true - self.y_pred

        plt.figure(figsize=(12, 8))

        sns.histplot(errors, kde=True, bins=30)

        mean_error = np.mean(errors)
        std_error = np.std(errors)

        plt.axvline(mean_error, color='r', linestyle='-',
                    label=f'Mean Error: {mean_error:.2f}')
        plt.axvline(mean_error + std_error, color='g', linestyle='--',
                    label=f'±1 Std Dev: {std_error:.2f}')
        plt.axvline(mean_error - std_error, color='g', linestyle='--')

        plt.xlabel('Prediction Error')
        plt.ylabel('Frequency')
        plt.title('Prediction Error Distribution')
        plt.legend()

        if save_path:
            plt.savefig(save_path, bbox_inches='tight')
            plt.close()
            self.logger.info(f"Error distribution plot disimpan di: {save_path}")
        else:
            return plt

    def plot_shap_summary(self, feature_names: list = None, save_path: Optional[str] = None) -> Optional[plt.Figure]:
        """SHAP summary plot; no-op (returns None) if SHAP was not computed."""
        if self.shap_values is None:
            self.logger.warning("SHAP values belum dihitung")
            return None

        plt.figure(figsize=(14, 8))
        shap.summary_plot(self.shap_values, self.X_test, feature_names=feature_names, show=False)
        plt.title('SHAP Feature Importance')
        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, bbox_inches='tight')
            plt.close()
            self.logger.info(f"SHAP summary plot disimpan di: {save_path}")
        else:
            return plt

    def save_evaluation_results(self, save_dir: str):
        """Persist metrics (JSON) and all diagnostic plots to save_dir.

        Parameters:
            save_dir: target directory (created if missing)
        """
        save_path = Path(save_dir)
        save_path.mkdir(parents=True, exist_ok=True)

        # Fix: metric values are numpy scalars, which json.dump cannot
        # serialize (TypeError); cast them to plain Python floats first.
        serializable_metrics = {k: float(v) for k, v in self.metrics.items()}
        with open(save_path / 'evaluation_metrics.json', 'w') as f:
            json.dump(serializable_metrics, f, indent=4)

        self.plot_residuals(save_path / 'residual_plot.png')
        self.plot_actual_vs_predicted(save_path / 'actual_vs_predicted.png')
        self.plot_error_distribution(save_path / 'error_distribution.png')

        if self.shap_values is not None:
            self.plot_shap_summary(save_path=save_path / 'shap_summary.png')

        self.logger.info(f"Hasil evaluasi disimpan di: {save_path}")
Prediksi Performa Akademik/edtech/backend/src/performance_prediction/model_trainer.py ADDED
@@ -0,0 +1,412 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # backend/src/performance_prediction/model_trainer.py
2
+
3
+ import xgboost as xgb
4
+ import optuna
5
+ from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
6
+ from sklearn.model_selection import cross_val_score, KFold
7
+ import numpy as np
8
+ import joblib
9
+ import os
10
+ from datetime import datetime
11
+ from pathlib import Path
12
+ import pandas as pd
13
+ import matplotlib.pyplot as plt
14
+ import json
15
+ import logging
16
+ from functools import partial
17
+ import shap
18
+ import random
19
+
20
class PerformanceModelTrainer:
    """Train, tune, evaluate and persist an XGBoost regression model."""

    def __init__(self):
        self.model = None                # trained xgb.Booster or XGBRegressor
        self.feature_importance = None   # dict: feature name -> importance
        self.shap_values = None          # SHAP values of a training sample
        self.best_params = None          # best hyperparameters found so far
        self.cv_results = None           # cross-validation results, if any
        self.logger = self._setup_logger()
        self.study = None                # optuna study, if one was run

    def _setup_logger(self):
        """Return the module logger, attaching a stream handler only once."""
        logger = logging.getLogger(__name__)
        logger.setLevel(logging.INFO)
        # Fix: without this guard every trainer instance appended another
        # handler to the shared module logger, duplicating every log line.
        if not logger.handlers:
            handler = logging.StreamHandler()
            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            handler.setFormatter(formatter)
            logger.addHandler(handler)
        return logger

    def objective(self, trial, X, y):
        """Optuna objective: mean negative MSE over 3-fold CV.

        Returns float('-inf') on any failure so the study can continue.
        """
        try:
            params = {
                'objective': 'reg:squarederror',
                'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
                'max_depth': trial.suggest_int('max_depth', 3, 12),
                'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
                'subsample': trial.suggest_float('subsample', 0.5, 1.0),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
                'gamma': trial.suggest_float('gamma', 0, 1.0),
                'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
                'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
                'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
                'random_state': 42,
                'n_jobs': 1
            }

            model = xgb.XGBRegressor(**params)

            # 3 splits keeps each trial cheap.
            kf = KFold(n_splits=3, shuffle=True, random_state=42)

            try:
                scores = cross_val_score(
                    model, X, y,
                    cv=kf,
                    scoring='neg_mean_squared_error',
                    n_jobs=1,
                    error_score='raise'
                )
                return np.mean(scores)
            except Exception as e:
                self.logger.warning(f"Trial gagal: {str(e)}")
                return float('-inf')  # worst possible score on failure

        except Exception as e:
            self.logger.error(f"Error dalam objective function: {str(e)}")
            return float('-inf')

    def hyperparameter_tuning(self, X_train, y_train, n_trials=30):
        """Random-search tuner (simple fallback when Optuna is problematic).

        Samples n_trials random combinations from a small grid, keeps the
        best neg-MSE CV score, stores the winner in self.best_params and
        returns it.
        """
        param_grid = {
            'n_estimators': [100, 200, 500],
            'max_depth': [3, 6, 9],
            'learning_rate': [0.01, 0.1, 0.2],
            'subsample': [0.6, 0.8, 1.0],
            'colsample_bytree': [0.6, 0.8, 1.0]
        }

        best_score = float('-inf')
        best_params = {}

        for _ in range(n_trials):
            params = {k: random.choice(v) for k, v in param_grid.items()}
            # (Removed the dead `train_params` copy the original built
            # and never used.)
            model = xgb.XGBRegressor(**params, random_state=42)
            score = cross_val_score(model, X_train, y_train,
                                    cv=3, scoring='neg_mean_squared_error').mean()

            if score > best_score:
                best_score = score
                best_params = params

        self.best_params = best_params
        return best_params

    def train_model(self, X_train, y_train, X_val=None, y_val=None, params=None):
        """Train the final model, with early stopping when a validation set
        is provided.

        Parameters:
            X_train, y_train: training data
            X_val, y_val (optional): validation data; enables early stopping
                via the low-level xgb.train API
            params (optional): hyperparameters; falls back to self.best_params

        Returns:
            The trained xgb.Booster (early-stopping path) or XGBRegressor.
        """
        try:
            self.logger.info("\n=== TRAINING FINAL MODEL ===")

            if params is None:
                params = self.best_params
            # Fix: copy before mutating so the caller's dict (or
            # self.best_params) is never modified, and a None params
            # no longer crashes the membership test below.
            params = dict(params) if params else {}

            default_params = {
                'objective': 'reg:squarederror',
                'random_state': 42,
                'verbosity': 1
            }
            final_params = {**default_params, **params}

            if X_val is not None and y_val is not None:
                self.logger.info("Menggunakan early stopping dengan validation set")

                # Fix: xgb.train controls boosting rounds via
                # num_boost_round, not n_estimators — drop it only on this
                # path so the sklearn path below keeps the tuned value.
                final_params.pop('n_estimators', None)

                dtrain = xgb.DMatrix(X_train, label=y_train)
                dval = xgb.DMatrix(X_val, label=y_val)

                evals = [(dtrain, 'train'), (dval, 'val')]
                evals_result = {}
                model = xgb.train(
                    final_params,
                    dtrain,
                    num_boost_round=1000,
                    evals=evals,
                    early_stopping_rounds=50,
                    verbose_eval=50,
                    evals_result=evals_result
                )

                # Keep the history for plot_learning_curve().
                self.evals_result = evals_result
            else:
                self.logger.info("Training tanpa early stopping")
                model = xgb.XGBRegressor(**final_params)
                model.fit(X_train, y_train)

            self.model = model

            # Interpretability artifacts for later plotting.
            self._calculate_feature_importance(X_train)
            self._calculate_shap_values(X_train)

            return model
        except Exception as e:
            self.logger.error(f"Error dalam training model: {str(e)}")
            raise

    def _calculate_feature_importance(self, X_train):
        """Populate self.feature_importance from the trained model.

        X_train is unused; it is kept for signature compatibility.
        Leaves self.feature_importance as None on any failure.
        """
        try:
            if isinstance(self.model, xgb.Booster):
                # Low-level Booster (xgb.train) exposes get_score().
                importance = self.model.get_score(importance_type='weight')
                self.feature_importance = {k: float(v) for k, v in importance.items()}
            elif hasattr(self.model, 'feature_importances_'):
                # sklearn-API model (XGBRegressor).
                self.feature_importance = dict(zip(
                    self.model.get_booster().feature_names,
                    self.model.feature_importances_
                ))
            else:
                self.logger.warning("Tipe model tidak dikenali untuk menghitung feature importance")
                self.feature_importance = None
        except Exception as e:
            self.logger.error(f"Gagal menghitung feature importance: {str(e)}")
            self.feature_importance = None

    def _calculate_shap_values(self, X_train, sample_size=100):
        """Compute SHAP values on a sample of the training data.

        Leaves self.shap_values as None on any failure.
        """
        try:
            if self.model is None:
                raise ValueError("Model belum dilatih")

            if isinstance(self.model, xgb.Booster):
                explainer = shap.TreeExplainer(self.model)
                X_sample = shap.utils.sample(X_train, sample_size)
                self.shap_values = explainer.shap_values(X_sample)
            else:
                explainer = shap.Explainer(self.model)
                self.shap_values = explainer(X_train)
        except Exception as e:
            self.logger.warning(f"Tidak dapat menghitung SHAP values: {str(e)}")
            self.shap_values = None

    def evaluate_model(self, X_test, y_test):
        """Evaluate the trained model on held-out data.

        Returns:
            Dict with 'metrics', 'predictions' and 'shap_values'.

        Raises:
            ValueError: if no model has been trained yet.
        """
        try:
            if self.model is None:
                raise ValueError("Model belum dilatih")

            # Fix: only the low-level Booster API takes a DMatrix; the
            # sklearn wrapper (no-early-stopping path) predicts on the raw
            # array and would fail on a DMatrix input.
            if isinstance(self.model, xgb.Booster):
                predictions = self.model.predict(xgb.DMatrix(X_test))
            else:
                predictions = self.model.predict(X_test)

            metrics = self._calculate_all_metrics(y_test, predictions)

            self.logger.info("\n=== HASIL EVALUASI MODEL ===")
            for name, value in metrics.items():
                self.logger.info(f"{name}: {value:.4f}")

            return {
                'metrics': metrics,
                'predictions': predictions,
                'shap_values': self.shap_values
            }

        except Exception as e:
            self.logger.error(f"Error dalam evaluasi model: {str(e)}")
            raise

    def _calculate_all_metrics(self, y_true, y_pred):
        """Return a dict of regression metrics for the given pair."""
        metrics = {
            'mse': mean_squared_error(y_true, y_pred),
            'rmse': np.sqrt(mean_squared_error(y_true, y_pred)),
            'mae': mean_absolute_error(y_true, y_pred),
            'r2': r2_score(y_true, y_pred),
            'max_error': np.max(np.abs(y_true - y_pred))
        }

        # MAPE with a small epsilon in place of zeros to avoid division
        # by zero; falls back to inf on any other failure.
        try:
            y_true_adjusted = np.where(y_true == 0, 1e-10, y_true)
            metrics['mape'] = np.mean(np.abs((y_true - y_pred) / y_true_adjusted)) * 100
        except Exception as e:
            metrics['mape'] = np.inf
            self.logger.warning(f"Tidak dapat menghitung MAPE: {str(e)}")

        return metrics

    def plot_learning_curve(self, X_train, y_train, X_val, y_val, save_path=None):
        """Plot train/validation RMSE per boosting round.

        Uses the evals_result captured during early-stopping training;
        returns None (with a warning) when no history is available.
        """
        try:
            if not hasattr(self, 'evals_result') or not self.evals_result:
                self.logger.warning("Tidak ada evals_result tersedia untuk learning curve")
                return None

            results = self.evals_result
            epochs = len(results['train']['rmse']) if 'train' in results else 0

            if epochs == 0:
                self.logger.warning("Data learning curve kosong")
                return None

            x_axis = range(0, epochs)

            fig, ax = plt.subplots(figsize=(12, 8))
            ax.plot(x_axis, results['train']['rmse'], label='Train')

            if 'val' in results:
                ax.plot(x_axis, results['val']['rmse'], label='Validation')

            ax.legend()
            plt.ylabel('RMSE')
            plt.xlabel('Epochs')
            plt.title('XGBoost Learning Curve')

            if save_path:
                plt.savefig(save_path, bbox_inches='tight')
                plt.close()
                self.logger.info(f"Learning curve disimpan di: {save_path}")
            else:
                return plt

        except Exception as e:
            self.logger.error(f"Error membuat learning curve: {str(e)}")
            raise

    def plot_feature_importance(self, feature_names=None, top_n=20, save_path=None):
        """Horizontal-bar plot of the top_n most important features.

        Returns:
            (figure-or-None, importance DataFrame); the figure is None when
            the plot was saved to disk.
        """
        try:
            if self.feature_importance is None:
                # Retry once; the X_train argument is unused by the helper.
                self._calculate_feature_importance(None)

            if self.feature_importance is None:
                raise ValueError("Feature importance belum dihitung. Model mungkin belum dilatih atau terjadi error dalam perhitungan.")

            importance_df = pd.DataFrame({
                'feature': list(self.feature_importance.keys()),
                'importance': list(self.feature_importance.values())
            }).sort_values('importance', ascending=False)

            # Optionally restrict to a caller-supplied feature subset.
            if feature_names is not None:
                importance_df = importance_df[importance_df['feature'].isin(feature_names)]

            top_features = importance_df.head(top_n)

            plt.figure(figsize=(14, 10))
            bars = plt.barh(top_features['feature'], top_features['importance'])
            plt.xlabel('Importance Score')
            plt.title('Top Feature Importance')

            # Annotate each bar with its numeric score.
            for bar in bars:
                width = bar.get_width()
                plt.text(width + 0.001, bar.get_y() + bar.get_height()/2,
                         f'{width:.4f}',
                         va='center', ha='left')

            plt.gca().invert_yaxis()
            plt.tight_layout()

            if save_path:
                plt.savefig(save_path, bbox_inches='tight')
                plt.close()
                self.logger.info(f"Feature importance plot disimpan di: {save_path}")
                return None, importance_df
            else:
                return plt, importance_df

        except Exception as e:
            self.logger.error(f"Error membuat feature importance plot: {str(e)}")
            raise

    def plot_shap_summary(self, feature_names=None, save_path=None):
        """SHAP summary plot of the values computed during training."""
        try:
            if self.shap_values is None:
                raise ValueError("SHAP values belum dihitung")

            plt.figure(figsize=(14, 10))
            shap.summary_plot(self.shap_values, feature_names=feature_names, show=False)
            plt.tight_layout()

            if save_path:
                plt.savefig(save_path, bbox_inches='tight')
                plt.close()
                self.logger.info(f"SHAP summary plot disimpan di: {save_path}")
            else:
                return plt

        except Exception as e:
            self.logger.error(f"Error membuat SHAP summary plot: {str(e)}")
            raise

    def save_model(self, save_dir, model_name=None):
        """Persist the model plus parameters, CV results, SHAP values and
        the Optuna study (each only when available).

        Returns:
            str: path of the saved model pickle.
        """
        try:
            os.makedirs(save_dir, exist_ok=True)
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

            if not model_name:
                model_name = f"performance_model_{timestamp}"

            model_path = Path(save_dir) / f"{model_name}.pkl"
            params_path = Path(save_dir) / f"{model_name}_params.json"
            cv_path = Path(save_dir) / f"{model_name}_cv_results.csv"
            shap_path = Path(save_dir) / f"{model_name}_shap_values.npy"
            study_path = Path(save_dir) / f"{model_name}_optuna_study.pkl"

            joblib.dump(self.model, model_path)

            with open(params_path, 'w') as f:
                json.dump(self.best_params, f, indent=4)

            if self.cv_results is not None:
                pd.DataFrame(self.cv_results).to_csv(cv_path, index=False)

            if self.shap_values is not None:
                np.save(shap_path, self.shap_values, allow_pickle=True)

            if self.study is not None:
                joblib.dump(self.study, study_path)

            self.logger.info("\n=== MODEL DISIMPAN ===")
            self.logger.info(f"Model: {model_path}")
            self.logger.info(f"Parameter: {params_path}")
            if self.cv_results is not None:
                self.logger.info(f"Hasil CV: {cv_path}")
            if self.shap_values is not None:
                self.logger.info(f"SHAP values: {shap_path}")
            if self.study is not None:
                self.logger.info(f"Optuna study: {study_path}")

            return str(model_path)

        except Exception as e:
            self.logger.error(f"Error menyimpan model: {str(e)}")
            raise
Prediksi Performa Akademik/edtech/backend/src/performance_prediction/predictor.py ADDED
@@ -0,0 +1,289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # backend/src/performance_prediction/predictor.py
2
+
3
+ import joblib
4
+ import pandas as pd
5
+ import numpy as np
6
+ from pathlib import Path
7
+ import logging
8
+ from typing import Union, Dict, List, Optional
9
+ import xgboost as xgb
10
+ import shap
11
+ from datetime import datetime
12
+
13
class PerformancePredictor:
    """Load a trained model (plus optional preprocessor) and serve
    predictions with optional SHAP-based feature attributions."""

    def __init__(self, model_path: str, preprocessor_path: Optional[str] = None):
        """
        Parameters:
            model_path: path to the trained model pickle
            preprocessor_path: path to the saved preprocessor bundle
                (as written by the data processor), optional
        """
        self.model_path = model_path
        self.preprocessor_path = preprocessor_path
        self.model = None
        self.preprocessor = None
        self.feature_names = None
        self.shap_explainer = None
        self.logger = self._setup_logger()
        self._load_components()

    def _setup_logger(self):
        """Return the module logger, attaching a stream handler only once."""
        logger = logging.getLogger(__name__)
        logger.setLevel(logging.INFO)
        # Fix: without this guard every predictor instance appended another
        # handler to the shared module logger, duplicating every log line.
        if not logger.handlers:
            handler = logging.StreamHandler()
            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            handler.setFormatter(formatter)
            logger.addHandler(handler)
        return logger

    def _load_components(self):
        """Load the model, optional preprocessor bundle, and SHAP explainer."""
        try:
            self.model = joblib.load(self.model_path)
            self.logger.info(f"Model berhasil dimuat dari {self.model_path}")

            if self.preprocessor_path:
                # Bundle layout matches data_processor.save_processor():
                # {'processor', 'preprocessor', 'feature_names'}.
                processor_data = joblib.load(self.preprocessor_path)
                self.preprocessor = processor_data['preprocessor']
                self.feature_names = processor_data['feature_names']
                self.logger.info(f"Preprocessor berhasil dimuat dari {self.preprocessor_path}")

            self._setup_shap_explainer()

        except Exception as e:
            self.logger.error(f"Gagal memuat komponen: {str(e)}")
            raise

    def _setup_shap_explainer(self):
        """Initialize a SHAP explainer; non-fatal on failure."""
        try:
            # Fix: the original if/else on predict_proba had two identical
            # branches; shap.Explainer dispatches correctly on its own.
            self.shap_explainer = shap.Explainer(self.model)
            self.logger.info("SHAP explainer berhasil diinisialisasi")
        except Exception as e:
            self.logger.warning(f"Tidak dapat menginisialisasi SHAP explainer: {str(e)}")
            self.shap_explainer = None

    def _prepare_input(self, input_data: Union[Dict, List[Dict]], return_dataframe: bool = False) -> Union[np.ndarray, pd.DataFrame]:
        """
        Normalize raw input into the array/DataFrame the model expects.

        Parameters:
            input_data: dict, list of dicts, DataFrame, or an
                already-numeric ndarray
            return_dataframe: if True return the (validated) DataFrame,
                otherwise the processed numeric array

        Returns:
            Processed data as ndarray (or DataFrame when requested).

        Raises:
            ValueError: on unsupported input types or missing columns.
        """
        # Fix: accept ndarray input (used by evaluate_model); an array is
        # assumed to be preprocessed already — TODO confirm with callers.
        if isinstance(input_data, np.ndarray):
            return input_data

        if isinstance(input_data, dict):
            input_df = pd.DataFrame([input_data])
        elif isinstance(input_data, list):
            input_df = pd.DataFrame(input_data)
        elif isinstance(input_data, pd.DataFrame):
            input_df = input_data.copy()
        else:
            raise ValueError("Input harus berupa dict, list of dicts, atau DataFrame")

        # Validate and order columns to match the trained model.
        if self.feature_names is not None:
            missing_cols = set(self.feature_names) - set(input_df.columns)
            if missing_cols:
                raise ValueError(f"Kolom berikut tidak ditemukan dalam input: {missing_cols}")

            input_df = input_df[self.feature_names]

        if self.preprocessor is not None:
            processed_data = self.preprocessor.transform(input_df)
        else:
            processed_data = input_df.values if not return_dataframe else input_df

        return processed_data if not return_dataframe else input_df

    def predict(self, input_data: Union[Dict, List[Dict]],
                return_contributions: bool = False) -> Dict:
        """
        Predict for the given input, optionally with feature attributions.

        Parameters:
            input_data: dict, list of dicts, DataFrame, or ndarray
            return_contributions: include SHAP feature contributions

        Returns:
            Dict with 'predictions', 'confidence_intervals',
            'execution_time_seconds', 'timestamp', and optionally
            'feature_contributions'.
        """
        start_time = datetime.now()

        try:
            processed_input = self._prepare_input(input_data)

            # Low-level Boosters need a DMatrix; sklearn-API models do not.
            if isinstance(self.model, xgb.Booster):
                dmatrix = xgb.DMatrix(processed_input)
                predictions = self.model.predict(dmatrix)
            else:
                predictions = self.model.predict(processed_input)

            # Simplified confidence intervals: quantile prediction when the
            # model supports it, otherwise a normal approximation from the
            # spread of this batch's predictions.
            if hasattr(self.model, 'predict_quantiles'):
                quantiles = self.model.predict_quantiles(processed_input, quantiles=(0.025, 0.975))
                confidence_intervals = list(zip(quantiles[0], quantiles[1]))
            else:
                std_dev = np.std(predictions)
                confidence_intervals = [(p - 1.96*std_dev, p + 1.96*std_dev) for p in predictions]

            feature_contributions = None
            if return_contributions and self.shap_explainer is not None:
                feature_contributions = self._calculate_feature_contributions(processed_input)

            exec_time = (datetime.now() - start_time).total_seconds()

            # JSON-friendly output.
            if isinstance(predictions, np.ndarray) and predictions.ndim == 1:
                predictions = predictions.tolist()

            result = {
                'predictions': predictions,
                'confidence_intervals': confidence_intervals,
                'execution_time_seconds': exec_time,
                'timestamp': start_time.isoformat()
            }

            if feature_contributions is not None:
                result['feature_contributions'] = feature_contributions

            return result

        except Exception as e:
            self.logger.error(f"Error dalam prediksi: {str(e)}")
            raise

    def _calculate_feature_contributions(self, processed_input: np.ndarray) -> List[Dict]:
        """
        Compute per-sample SHAP feature contributions.

        Parameters:
            processed_input: already-preprocessed feature matrix

        Returns:
            For each sample, a list of {feature, value, contribution,
            abs_contribution} dicts sorted by absolute contribution;
            None when no explainer is available.
        """
        if self.shap_explainer is None:
            return None

        shap_values = self.shap_explainer(processed_input)

        contributions = []
        for i in range(len(processed_input)):
            sample_contributions = []

            for j, feature_name in enumerate(self.feature_names):
                sample_contributions.append({
                    'feature': feature_name,
                    'value': processed_input[i][j] if isinstance(processed_input, np.ndarray) else processed_input.iloc[i][j],
                    'contribution': float(shap_values.values[i][j]),
                    'abs_contribution': float(np.abs(shap_values.values[i][j]))
                })

            # Most influential features first.
            sample_contributions.sort(key=lambda x: x['abs_contribution'], reverse=True)
            contributions.append(sample_contributions)

        return contributions

    def batch_predict(self, input_data: List[Dict], batch_size: int = 100,
                      return_contributions: bool = False) -> Dict:
        """
        Predict in batches for efficiency.

        Parameters:
            input_data: list of input dicts
            batch_size: number of samples per prediction call
            return_contributions: include SHAP feature contributions

        Returns:
            Dict with 'predictions', 'total_samples', timing fields, and
            optionally 'feature_contributions'.
        """
        start_time = datetime.now()
        total_samples = len(input_data)
        results = []
        all_contributions = []

        self.logger.info(f"Memulai batch prediction untuk {total_samples} sampel (batch_size={batch_size})")

        for i in range(0, total_samples, batch_size):
            batch = input_data[i:i+batch_size]
            try:
                batch_result = self.predict(batch, return_contributions)
                results.extend(batch_result['predictions'])
                # Fix: the original computed contributions per batch and
                # then discarded them; collect them when requested.
                if return_contributions and 'feature_contributions' in batch_result:
                    all_contributions.extend(batch_result['feature_contributions'])
            except Exception as e:
                self.logger.error(f"Error pada batch {i//batch_size}: {str(e)}")
                raise

        exec_time = (datetime.now() - start_time).total_seconds()
        avg_time_per_sample = exec_time / total_samples

        self.logger.info(
            f"Batch prediction selesai. Total waktu: {exec_time:.2f} detik "
            f"({avg_time_per_sample:.4f} detik/sampel)"
        )

        result = {
            'predictions': results,
            'total_samples': total_samples,
            'total_time_seconds': exec_time,
            'avg_time_per_sample': avg_time_per_sample,
            'timestamp': start_time.isoformat()
        }
        if return_contributions and all_contributions:
            result['feature_contributions'] = all_contributions
        return result

    def evaluate_model(self, X_test: np.ndarray, y_test: np.ndarray) -> Dict:
        """
        Evaluate the loaded model on a test set.

        Parameters:
            X_test: test feature matrix (already preprocessed)
            y_test: test targets

        Returns:
            Dict of evaluation metrics.
        """
        from .evaluator import PerformanceEvaluator

        evaluator = PerformanceEvaluator(y_test, self.predict(X_test)['predictions'],
                                         self.model, X_test)
        return evaluator.metrics

    def save_predictor(self, save_dir: str):
        """
        Persist this predictor object for later reuse.

        Parameters:
            save_dir: target directory (created if missing)

        Returns:
            str: path of the written pickle file.
        """
        save_path = Path(save_dir)
        save_path.mkdir(parents=True, exist_ok=True)

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        save_file = save_path / f"predictor_{timestamp}.pkl"

        joblib.dump(self, save_file)
        self.logger.info(f"Predictor disimpan di: {save_file}")

        return str(save_file)
Prediksi Performa Akademik/edtech/backend/src/train_performance_predictor.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # backend/src/train_performance_predictor.py
2
+ import numpy as np
3
+ import sys
4
+ import os
5
+ import json
6
+ import logging
7
+ from pathlib import Path
8
+ import pandas as pd
9
+ import joblib
10
+ import matplotlib.pyplot as plt
11
+ from datetime import datetime
12
+
13
+ # Setup logging
14
+ logging.basicConfig(
15
+ level=logging.INFO,
16
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
17
+ handlers=[
18
+ logging.StreamHandler(),
19
+ logging.FileHandler('training.log')
20
+ ]
21
+ )
22
+ logger = logging.getLogger(__name__)
23
+
24
+ # Setup paths
25
+ current_dir = Path(__file__).parent
26
+ sys.path.append(str(current_dir))
27
+
28
+ from performance_prediction.data_processor import PerformanceDataProcessor
29
+ from performance_prediction.model_trainer import PerformanceModelTrainer
30
+ from performance_prediction.evaluator import PerformanceEvaluator
31
+
32
def main():
    """Train, evaluate, and persist the academic performance prediction model.

    Pipeline: load and validate the cleaned dataset, split it into
    train/validation/test sets, train the model with fixed hyperparameters,
    evaluate on the test set, save metrics/plots, then persist the model and
    the data processor. Exits non-zero on any unrecoverable error.
    """
    try:
        logger.info("=== MEMULAI PELATIHAN MODEL PREDIKSI PERFORMA ===")

        # Setup paths — BASE_DIR is two levels above src/ (the project root).
        BASE_DIR = current_dir.parent.parent
        DATA_PATH = BASE_DIR / "backend/data/processed/cleaned_education_data.csv"
        # NOTE(review): DATA_PATH carries a "backend/" prefix but the model and
        # log directories below do not, while the repository stores artifacts
        # under backend/models/... — confirm whether these should also be
        # prefixed with "backend/".
        MODEL_SAVE_DIR = BASE_DIR / "models/performance_predictor/trained_model"
        LOG_DIR = BASE_DIR / "models/performance_predictor/training_logs"
        CONFIG_PATH = BASE_DIR / "config/model_config.json"

        # Create output directories if they do not exist yet (pathlib idiom,
        # consistent with the Path objects used throughout this script).
        MODEL_SAVE_DIR.mkdir(parents=True, exist_ok=True)
        LOG_DIR.mkdir(parents=True, exist_ok=True)

        # 1. Data preparation
        logger.info("\n=== MEMUAT DAN MEMPROSES DATA ===")
        processor = PerformanceDataProcessor(DATA_PATH, CONFIG_PATH)
        data = processor.load_data()

        # Abort early when the dataset is missing or empty.
        if data is None or data.empty:
            logger.error("Data kosong atau gagal dimuat")
            return

        # Prepare feature matrix and target vector (processor keeps them as state).
        features, target = processor.prepare_features_target(data)

        # Split into train, validation, and test sets.
        X_train, X_val, X_test, y_train, y_val, y_test = processor.split_data(
            test_size=0.2,
            val_size=0.2
        )

        # Use a data subset for quick smoke-testing if needed:
        # X_train, y_train = X_train[:1000], y_train[:1000]
        # X_val, y_val = X_val[:1000], y_val[:1000]

        # 2. Model training
        logger.info("\n=== MELATIH MODEL ===")
        trainer = PerformanceModelTrainer()

        # Conservative, fixed hyperparameters; replace with a tuned search
        # result for production training runs.
        best_params = {
            'max_depth': 6,
            'learning_rate': 0.1,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'reg_alpha': 0.1,
            'reg_lambda': 1.0,
            'min_child_weight': 1,
            'gamma': 0
        }

        # Train the final model with the chosen parameters.
        model = trainer.train_model(
            X_train=X_train,
            y_train=y_train,
            X_val=X_val,
            y_val=y_val,
            params=best_params
        )

        # 3. Model evaluation
        logger.info("\n=== EVALUASI MODEL ===")
        evaluation = trainer.evaluate_model(X_test, y_test)

        logger.info("\n=== DETAIL EVALUASI ===")
        logger.info(f"Contoh 5 prediksi pertama: {evaluation['predictions'][:5]}")
        logger.info(f"Contoh 5 nilai sebenarnya: {y_test[:5]}")
        logger.info(f"Perbedaan prediksi dan aktual: {np.abs(y_test[:5] - evaluation['predictions'][:5])}")

        # Persist evaluation metrics (fixed: json.dump was jammed onto the
        # same line as the `with` colon).
        metrics = evaluation['metrics']
        with open(MODEL_SAVE_DIR / "model_metrics.json", 'w') as f:
            json.dump(metrics, f, indent=4)

        # Evaluation visualisations.
        evaluator = PerformanceEvaluator(y_test, evaluation['predictions'])

        # Plot and save each diagnostic figure.
        plots = {
            "residual_plot": evaluator.plot_residuals(),
            "actual_vs_predicted": evaluator.plot_actual_vs_predicted(),
            "error_distribution": evaluator.plot_error_distribution()
        }

        for name, plot in plots.items():
            plot_path = LOG_DIR / f"{name}.png"
            plot.savefig(plot_path, bbox_inches='tight')
            plt.close()  # release the current figure to cap memory use
            logger.info(f"Plot {name} disimpan di: {plot_path}")

        # Trainer-side diagnostics.
        trainer.plot_learning_curve(
            X_train=X_train,
            y_train=y_train,
            X_val=X_val,
            y_val=y_val,
            save_path=LOG_DIR / "learning_curve.png"
        )

        feature_plot, importance_df = trainer.plot_feature_importance(
            feature_names=processor.features.columns,
            save_path=LOG_DIR / "feature_importance.png"
        )

        # Persist feature importances alongside the plot.
        importance_df.to_csv(LOG_DIR / "feature_importance.csv", index=False)

        # SHAP summary plot — best-effort only; SHAP may be unavailable or fail.
        try:
            trainer.plot_shap_summary(
                feature_names=processor.features.columns,
                save_path=LOG_DIR / "shap_summary.png"
            )
        except Exception as e:
            logger.warning(f"Tidak dapat membuat SHAP plot: {str(e)}")

        # 4. Persist model and processor
        logger.info("\n=== MENYIMPAN MODEL ===")
        model_path = trainer.save_model(MODEL_SAVE_DIR)
        processor_path = processor.save_processor(MODEL_SAVE_DIR)

        logger.info("\n=== PELATIHAN SELESAI ===")
        logger.info(f"Model disimpan di: {model_path}")
        logger.info(f"Processor disimpan di: {processor_path}")
        # Use the logger (was print) so this also lands in training.log.
        logger.info(f"Log dan visualisasi disimpan di: {LOG_DIR}")

    except Exception:
        # Log the full traceback (logger.error only kept str(e)) and exit
        # non-zero so shells/CI can detect the failed training run.
        logger.exception("Terjadi kesalahan saat melatih model")
        sys.exit(1)

if __name__ == "__main__":
    main()