Commit cc2e1db · Parent(s): d4d1ca8

Update Function

Changed files:
- .DS_Store +0 -0
- .dockerignore +14 -0
- .gitignore +8 -2
- Dockerfile +42 -0
- README.md +81 -156
- app.py +237 -35
- requirements.txt +1 -0
- scripts/classify.py +4 -259
- scripts/compute_reliability.py +242 -0
- scripts/forecast.py +1384 -12
- scripts/{summarize.py → recommendation.py} +0 -0
- scripts/summary.py +159 -0
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
.dockerignore ADDED
@@ -0,0 +1,14 @@
+__pycache__/
+venv/
+.venv/
+.git/
+.gitignore
+outputs/
+data/
+.env
+*.pyc
+*.pkl
+*.joblib
+*.csv
+__MACOSX
+*.log
.gitignore CHANGED
@@ -1,7 +1,13 @@
+README.md
 .env
-
+data/*
+outputs/*
 __pycache__/
 scripts/__pycache__
 .venv/
 .python-version
-
+.ipynb_checkpoints
+.DS_Store
+.vscode/
+.idea/
+*.log
Dockerfile ADDED
@@ -0,0 +1,42 @@
+# Start from official slim Python 3.12 image
+FROM python:3.12.9-slim
+
+# Set working directory
+WORKDIR /app
+
+# Install system dependencies required by some packages (geopandas, prophet/cmdstanpy, tensorflow)
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    git \
+    curl \
+    gcc \
+    g++ \
+    libgeos-dev \
+    libproj-dev \
+    proj-data \
+    proj-bin \
+    libgdal-dev \
+    pkg-config \
+    wget \
+    unzip \
+    ca-certificates \
+    && rm -rf /var/lib/apt/lists/*
+
+# Ensure pip is up to date
+RUN python -m pip install --upgrade pip setuptools wheel
+
+# Copy requirements and install Python dependencies
+COPY requirements.txt /app/requirements.txt
+RUN pip install --no-cache-dir -r /app/requirements.txt
+
+# Copy application source
+COPY . /app
+
+# Create outputs directory
+RUN mkdir -p /app/outputs /app/data
+
+# Expose ports commonly used by Gradio and Uvicorn
+EXPOSE 7860 8000
+
+# Default command: run the app with python (Gradio will launch)
+CMD ["python", "app.py"]
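A note on running this image locally: the CMD starts Gradio via `demo.launch()`, which binds to 127.0.0.1 by default, so the exposed port 7860 may not be reachable from the host. Below is a minimal sketch of the standard Gradio options for binding to all interfaces; the tiny `echo` interface is only a placeholder and is not part of this repository.

```python
import gradio as gr

def echo(text: str) -> str:
    # placeholder function; the real app builds a multi-tab gr.Blocks UI
    return text

demo = gr.Interface(fn=echo, inputs='text', outputs='text')

if __name__ == '__main__':
    # Bind to all interfaces so the container's exposed port 7860 is reachable.
    # Equivalently, set GRADIO_SERVER_NAME=0.0.0.0 (and GRADIO_SERVER_PORT=7860)
    # in the Dockerfile instead of changing the code.
    demo.launch(server_name='0.0.0.0', server_port=7860)
```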
README.md CHANGED
@@ -1,190 +1,115 @@
 # OMS Analyze — Prototype
+> Created by PEACE, Powered by AI, Version 0.0.1
 
+Prototype application platform for analyzing power-outage data (OMS - Outage Management System), using AI and Machine Learning to summarize events, detect anomalies, forecast, and classify causes.
 
-The app is built with Gradio for use through a web browser and supports uploading CSV files for analysis
+The app is built with Gradio for use through a web browser and supports uploading CSV files for analysis; it also runs as a Hugging Face Space.
 
-##
+## Installation and usage
 
-###
+### Running with Docker (recommended)
+Docker must be installed first [(download Rancher Desktop)](https://github.com/rancher-sandbox/rancher-desktop/releases/download/v1.20.0/Rancher.Desktop.Setup.1.20.0.msi)
+```
+docker build -t ai-oms-analyze:latest .
+docker run -d --rm -p 7860:7860 -p 8000:8000 --env-file .env -v $(pwd)/outputs:/app/outputs ai-oms-analyze:latest
+```
 
-### Environment setup with pyenv (recommended)
+### Usage (macOS)
 
 ```
+brew install pyenv
+cd /AI-OMS-Analyze
 pyenv install 3.12.9
 pyenv local 3.12.9
 /Users/$(whoami)/.pyenv/versions/3.12.9/bin/python -m venv .venv
 source .venv/bin/activate
 python -m pip install --upgrade pip setuptools wheel
 python -m pip install -r requirements.txt
+python app.py
 ```
 
-- I commented out `prophet` in `requirements.txt` because installing it tries to compile CmdStan (a C++ build), which takes a long time and usually needs extra build tooling (Xcode command line tools). To install `prophet`, uncomment it and prepare the build tools.
-- I updated `fsspec` and `openai` to versions that exist on PyPI so that pip can find the packages.
-- To use the Hugging Face Router (for the LLM in Summarization and Classification): set `HF_TOKEN` in the `.env` file (see `.env.example`)
-
-### Windows (using pyenv-win)
-
-If you are on Windows, I recommend installing `pyenv-win` to manage Python versions (or use a Python installed via Chocolatey / Microsoft Store and create a venv as usual)
-
-1) Install pyenv-win (PowerShell - Run as Administrator recommended)
-
-```
-# via PowerShell (Admin)
-Invoke-WebRequest -UseBasicParsing -Uri "https://pyenv.run" -OutFile "pyenv-win-install.ps1"
-Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser -Force
-.\pyenv-win-install.ps1
-
-# or install with Scoop (if scoop is available)
-scoop install pyenv
-
-The app is organized into the following main tabs (UI); a summary of how to use each and the output files the system writes to `outputs/`:
-
-### 1. Upload & Preview
-- Purpose: upload a CSV file and inspect a sample of the data
-- How to use:
-1. Click "Upload CSV" and choose a file (examples: `data/data_1.csv`, `data/data_3.csv`)
-2. View the first 10 rows and check the column types
-- Output: preview only; running the analysis functions writes results to `outputs/`
-
-### 2. Summarization
-- Purpose: generate a natural-language summary for each event or dataset
-- How to use:
-1. Upload a CSV file
-2. Specify the rows to summarize (blank = all; or give indexes such as `0,1,2`)
-3. To use the LLM, enable "Use Hugging Face Router" and set `HF_TOKEN` in `.env`
-4. Choose the level of detail (brief / medium / detailed) and click "Generate Summaries"
-- Output: `outputs/summaries_from_ui.csv` (UI run) or `outputs/summaries.csv` (batch export)
-
-### 3. Anomaly Detection
-- Purpose: detect abnormal events with ML (IsolationForest, LOF)
-- How to use:
-1. Upload a CSV file
-2. Choose the algorithm: `both` (IsolationForest + LOF), `iso`, or `lof`
-3. Adjust contamination (0.01–0.2, default 0.05)
-4. Click "Run Anomaly Detection"
-- Output: `outputs/anomalies_from_ui.csv` (UI run) or `outputs/anomalies.csv` (batch). Suspects are saved to `outputs/ntl_suspects.csv` if any are detected
-
-### 4. Forecasting
-- Purpose: forecast future event counts or downtime
-- How to use:
-1. Upload a CSV file
-2. Choose the metric: `count` or `downtime_minutes`
-3. Set the horizon (7–90 days, default 14)
-4. Click "Run Forecast"
-- Output: `outputs/forecast_count_from_ui.csv`, `outputs/forecast_downtime_minutes_from_ui.csv` (UI run) or batch outputs `outputs/forecast_count.csv`, `outputs/forecast_downtime_minutes.csv`
-(for higher accuracy, install `prophet==1.1.7`)
-
-### 5. Classification
-- Purpose: classify the cause of events (root-cause classification)
-- How to use:
-1. Upload a CSV file
-2. (Optional) enable "Run weak-labeling using HF" to have the LLM create weak labels (`HF_TOKEN` must be set)
-3. (Optional) select "Run GridSearch" for tuning
-4. Click "Train Classifier"
-- Output: models and results are saved to `outputs/`, example files:
-- `outputs/rf_cause_pipeline.joblib`
-- `outputs/predictions_cause.csv`
-- with multiple models: `outputs/predictions_gb_CauseType.csv`, `outputs/predictions_mlp_CauseType.csv`, `outputs/predictions_rf_CauseType.csv`
-
-python -m pip install --upgrade pip setuptools wheel
-python -m pip install -r requirements.txt
-```
-
-Notes for Windows:
-- Some packages (e.g. `geopandas`, `fiona`, `pyproj`, `shapely`) have native dependencies (GDAL, PROJ); using Miniforge/Conda (conda-forge) is easier
-- `prophet` usually needs CmdStan/cmdstanpy installed beforehand and can be tricky on Windows; consider WSL2 or conda for this step
+### Usage (Windows)
+
+```bash
+Set-ExecutionPolicy Bypass -Scope Process -Force; [System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072; iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1'))
+choco install pyenv-win
+Set-ExecutionPolicy -Scope CurrentUser -ExecutionPolicy Unrestricted
+cd /AI-OMS-Analyze
+pyenv install 3.12.9
+pyenv local 3.12.9
+pip install -r requirements.txt
 python app.py
 ```
-The app runs at `http://127.0.0.1:7860` (or change the port with `--server.port PORT`)
 
-- `scripts/`: Python modules for each feature
-- `outputs/`: output files from the analyses
-- `app.py`: the main Gradio app
-
-## User guide
-
-The app is organized into the following tabs:
+## Menu of features
+The app is organized into the following tabs (matching the UI in `app.py`):
 
 ### 1. Upload & Preview
+- **Usecase Scenario**: upload a CSV file to inspect the original data and clean it (remove duplicates, handle missing values)
 - **How to use**:
-1. Click "Upload CSV
-2.
+1. Click "Upload CSV (data.csv)" and choose a file
+2. Adjust options such as Remove Duplicates and Missing Values Handling
+3. Click "Apply Cleansing" to run the cleansing
+4. Compare the data samples in the "Original Data" and "Cleansed Data" tabs
+5. Download the result from the "Download Cleansed CSV" button
+- **Output**: `outputs/cleansed_data.csv` (downloaded via the UI)
+
+### 2. Recommendation
+- **Usecase Scenario**: generate text summaries for selected events (for example, outage or maintenance event summaries) and export a CSV of the summaries
 - **How to use**:
-1.
-2.
-3.
-4.
-5.
-
-### 3. Anomaly Detection
-- **Purpose**: detect abnormal events using Machine Learning
+1. Click "Upload CSV (data.csv)"
+2. Enter the desired rows in "Rows (comma-separated indexes)", or leave blank to summarize all rows
+3. Choose whether to use Generative AI (Use Generative AI)
+4. Choose the summary level (Summary Type) and click "Generate Summaries"
+5. View the results in the table and download `outputs/summaries_from_ui.csv`
+- **Features**: supports GenAI (the model selector appears when Use Generative AI is enabled)
+
+### 3. Summary
+- **Usecase Scenario**: generate an overall summary of the whole dataset, including basic statistics, and compute reliability indices (SAIFI, SAIDI, CAIDI)
 - **How to use**:
-1.
-2.
-3.
-4. Click "
-- **Output**: CSV with anomaly-score columns and textual explanations
-
-### 4. Forecasting
-- **Purpose**: forecast future event counts or downtime
+1. Click "Upload CSV for Overall Summary"
+2. Choose whether to use Generative AI to elaborate on the summary
+3. Set the total number of customers used for the reliability calculation
+4. Click "Generate Overall Summary" to get the AI summary, basic statistics, and reliability metrics
+
+### 4. Anomaly Detection
+- **Usecase Scenario**: detect abnormal events using several algorithms (Isolation Forest, LOF, Autoencoder)
 - **How to use**:
-1.
-2. Choose
-3.
-4.
-5. View the results in the table (latest actuals + forecast)
-6. Download the `forecast_{metric}_from_ui.csv` file
-- **Features**: uses Prophet if it can be installed, otherwise a naive forecast
-- **Output**: CSV with date and predicted-value columns
+1. Click "Upload CSV for Anomaly"
+2. Choose an algorithm and adjust the contamination value
+3. Click "Run Anomaly Detection"
+4. View the results in the table and download `outputs/anomalies_from_ui.csv`
 
 ### 5. Classification
+- **Usecase Scenario**: train a model to classify event causes (choose a Target Column such as CauseType or SubCauseType)
 - **How to use**:
-1.
-2. Choose
-3.
-4. Click "Train Classifier"
-5.
+1. Click "Upload CSV for Classification"
+2. Choose the Target Column and the model type (rf/gb/mlp)
+3. Adjust the Hyperparameters in the Accordion (if needed) or enable Weak-labeling to call HF
+4. Click "Train Classifier" and wait for the report
+5. Download the model and predictions files from the buttons that appear
+
+### 6. Label Suggestion
+- **Usecase Scenario**: suggest possible cause labels for unlabeled events, based on similarity to labeled examples
+- **How to use**:
+1. Click "Upload CSV (defaults to data/data_3.csv)" or keep the default file
+2. Choose the maximum number of suggestions (Top K suggestions)
+3. Click "Run Label Suggestion" and download `outputs/label_suggestions.csv`
+
+### 7. Forecasting
+- **Usecase Scenario**: forecast future event counts or downtime, choosing a model (Prophet, LSTM, Bi-LSTM, GRU, Naive)
+- **How to use**:
+1. Click "Upload CSV for Forecasting"
+2. Choose the metric (count or downtime_minutes) and the model
+3. Adjust the periods/horizon and (if needed) enable Multivariate for the DL models
+4. Click "Run Forecasting" to see Historical Data, Forecast Results, and the Time Series Plot
+5. Download the forecast file created in `outputs/` (file name pattern `forecast_{metric}_{model}_...csv`)
 
 ## Notes
-- This is a prototype
-- Sample data: `data/data.csv` has columns such as EventNumber, OutageDateTime, CauseType, etc.
-- If you run into problems: check the console for errors and make sure the Python environment is correct
+- This is a prototype and is not yet production-ready
+- Recommended Machine Learning reading [here](https://guopai.github.io/)
 
 ## Further development
-- SHAP explanations for models
-- Human-in-the-loop for weak labels
-- Alerting and real-time processing
+- TBA
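The Upload & Preview tab described above delegates the cleaning to `scripts/data_cleansing.cleanse_data`, which is not part of this diff, so its exact options are unknown. Below is a rough pandas sketch of the two operations the tab exposes (duplicate removal and missing-value handling); the `missing` strategies are illustrative assumptions, not the actual UI choices.

```python
import pandas as pd

def cleanse_sketch(df: pd.DataFrame, remove_duplicates: bool = True, missing: str = 'drop') -> pd.DataFrame:
    """Illustrative cleansing only; not the project's cleanse_data implementation."""
    out = df.copy()
    if remove_duplicates:
        out = out.drop_duplicates()
    if missing == 'drop':
        out = out.dropna()
    elif missing == 'fill':
        # fill numeric columns with their median, everything else with an empty string
        num_cols = out.select_dtypes(include='number').columns
        out[num_cols] = out[num_cols].fillna(out[num_cols].median())
        out = out.fillna('')
    return out

if __name__ == '__main__':
    df = pd.read_csv('data/data_1.csv')  # sample file name taken from the old README
    cleanse_sketch(df).to_csv('outputs/cleansed_data.csv', index=False)
```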
app.py CHANGED
@@ -1,7 +1,7 @@
 import gradio as gr
 import pandas as pd
 from pathlib import Path
-from scripts.
+from scripts.recommendation import summarize_events
 from scripts.data_cleansing import cleanse_data
 from dotenv import load_dotenv
 import os
@@ -36,7 +36,7 @@ with gr.Blocks() as demo:
 with gr.Tabs():
 # Upload & Preview tab
 with gr.TabItem('Upload & Preview'):
-gr.Markdown("
+gr.Markdown("**Usecase Scenario — Upload & Preview**: อัปโหลดไฟล์ CSV เพื่อตรวจสอบข้อมูลต้นฉบับ ทำความสะอาดข้อมูล (ลบข้อมูลซ้ำ, จัดการค่าที่หายไป) เปรียบเทียบตัวอย่างก่อน/หลัง และดาวน์โหลดไฟล์ที่ทำความสะอาดแล้ว")
 csv_up = gr.File(label='Upload CSV (data.csv)')
 with gr.Row():
 remove_dup = gr.Checkbox(label='Remove Duplicates', value=False)
@@ -74,9 +74,9 @@
 csv_up.change(fn=initial_preview, inputs=csv_up, outputs=[original_preview, cleansed_preview, clean_status])
 apply_clean.click(fn=apply_cleansing, inputs=[csv_up, remove_dup, missing_handling], outputs=[cleansed_preview, clean_status, download_cleansed])
 
-#
-with gr.TabItem('
-gr.Markdown("
+# Recommendation tab
+with gr.TabItem('Recommendation'):
+gr.Markdown("**Usecase Scenario — Recommendation**: สร้างสรุปเหตุการณ์ (เช่น สรุปเหตุการณ์ไฟฟ้าล้ม) สำหรับแถวที่เลือก ปรับระดับรายละเอียด และเลือกใช้ Generative AI เพื่อเพิ่มความชัดเจน และดาวน์โหลดไฟล์สรุป")
 csv_in = gr.File(label='Upload CSV (data.csv)')
 with gr.Row():
 rows = gr.Textbox(label='Rows (comma-separated indexes) or empty = all', placeholder='e.g. 0,1,2')
@@ -121,8 +121,75 @@
 use_hf.change(fn=update_model_visibility, inputs=use_hf, outputs=model_selector)
 
 run_btn.click(fn=run_summarize, inputs=[csv_in, rows, use_hf, verbosity], outputs=[out, status, download])
+
+# Summary tab
+with gr.TabItem('Summary'):
+gr.Markdown("**Usecase Scenario — Summary**: สร้างสรุปภาพรวมของชุดข้อมูลทั้งหมด รวมสถิติพื้นฐาน และคำนวณดัชนีความน่าเชื่อถือ (เช่น SAIFI, SAIDI, CAIDI) พร้อมตัวเลือกใช้ Generative AI ในการขยายความ")
+csv_in_sum = gr.File(label='Upload CSV for Overall Summary')
+with gr.Row():
+use_hf_sum = gr.Checkbox(label='Use Generative AI for Summary', value=False)
+total_customers = gr.Number(label='Total Customers (for reliability calculation)', value=500000, precision=0)
+run_sum = gr.Button('Generate Overall Summary')
+with gr.Row():
+model_selector_sum = gr.Dropdown(
+choices=[
+'meta-llama/Llama-3.1-8B-Instruct:novita',
+'meta-llama/Llama-4-Scout-17B-16E-Instruct:novita',
+'Qwen/Qwen3-VL-235B-A22B-Instruct:novita',
+'deepseek-ai/DeepSeek-R1:novita'
+],
+value='meta-llama/Llama-3.1-8B-Instruct:novita',
+label='GenAI Model',
+interactive=True,
+visible=False
+)
+
+with gr.Tabs():
+with gr.TabItem('AI Summary'):
+ai_summary_out = gr.Textbox(label='AI Generated Summary', lines=10)
+with gr.TabItem('Basic Statistics'):
+basic_stats_out = gr.JSON(label='Basic Statistics')
+with gr.TabItem('Reliability Indices'):
+reliability_out = gr.Dataframe(label='Reliability Metrics')
+
+sum_status = gr.Textbox(label='Summary Status', interactive=False)
+
+def run_overall_summary(file, use_hf_flag, total_cust, model):
+if file is None:
+return {}, {}, pd.DataFrame(), 'No file provided'
+try:
+from scripts.summary import summarize_overall
+df = pd.read_csv(file.name, dtype=str)
+
+result = summarize_overall(df, use_hf=use_hf_flag, model=model, total_customers=total_cust)
+
+# Prepare outputs
+ai_summary = result.get('ai_summary', 'ไม่สามารถสร้างสรุปด้วย AI ได้')
+basic_stats = {
+'total_events': result.get('total_events'),
+'date_range': result.get('date_range'),
+'event_types': result.get('event_types'),
+'total_affected_customers': result.get('total_affected_customers')
+}
+
+# Reliability metrics as DataFrame
+reliability_df = result.get('reliability_df', pd.DataFrame())
+
+status = f"Summary generated for {len(df)} events. AI used: {use_hf_flag}"
+return ai_summary, basic_stats, reliability_df, status
+
+except Exception as e:
+return f"Error: {str(e)}", {}, pd.DataFrame(), f'Summary failed: {e}'
+
+def update_model_visibility_sum(use_hf_flag):
+return gr.update(visible=use_hf_flag, interactive=use_hf_flag)
+
+use_hf_sum.change(fn=update_model_visibility_sum, inputs=use_hf_sum, outputs=model_selector_sum)
+
+run_sum.click(fn=run_overall_summary, inputs=[csv_in_sum, use_hf_sum, total_customers, model_selector_sum], outputs=[ai_summary_out, basic_stats_out, reliability_out, sum_status])
+
 with gr.TabItem('Anomaly Detection'):
-gr.Markdown("
+gr.Markdown("**Usecase Scenario — Anomaly Detection**: ตรวจจับเหตุการณ์ที่มีพฤติกรรมผิดปกติในชุดข้อมูล (เช่น เหตุการณ์ที่มีค่าสูง/ต่ำผิดปกติ) โดยใช้หลาย algorithm ปรับระดับ contamination และส่งออกผลลัพธ์พร้อมธงความผิดปกติ")
 csv_in_anom = gr.File(label='Upload CSV for Anomaly')
 with gr.Row():
 alg = gr.Radio(choices=['both','iso','lof','autoencoder'], value='both', label='Algorithm')
@@ -146,36 +213,9 @@
 
 run_anom.click(fn=run_anomaly_ui, inputs=[csv_in_anom, alg, contamination], outputs=[anom_out, anom_status, anom_download])
 
-# Forecasting tab
-with gr.TabItem('Forecasting'):
-gr.Markdown("**พยากรณ์**: พยากรณ์จำนวนเหตุการณ์หรือเวลาหยุดทำงานในอนาคตโดยใช้การวิเคราะห์อนุกรมเวลา (Prophet) เลือกเมตริกและช่วงพยากรณ์ ดาวน์โหลด CSV พยากรณ์")
-csv_in_fc = gr.File(label='Upload CSV for Forecast')
-with gr.Row():
-metric_fc = gr.Radio(choices=['count','downtime_minutes'], value='count', label='Metric')
-horizon = gr.Slider(minimum=7, maximum=90, value=14, step=1, label='Horizon (days)')
-run_fc = gr.Button('Run Forecast')
-fc_out = gr.Dataframe()
-fc_status = gr.Textbox(label='Forecast Status', interactive=False)
-fc_download = gr.File(label='Download forecast CSV')
-
-def run_forecast_ui(file, metric, horizon_days):
-if file is None:
-return pd.DataFrame(), 'No file provided', None
-from scripts.forecast import prepare_timeseries, run_forecast
-df = pd.read_csv(file.name, dtype=str)
-ts, fcst = run_forecast(df, metric=metric, periods=int(horizon_days))
-out_file = ROOT / 'outputs' / f'forecast_{metric}_from_ui.csv'
-out_file.parent.mkdir(exist_ok=True)
-fcst.to_csv(out_file, index=False, encoding='utf-8-sig')
-status = f"Forecast produced: {len(fcst)} rows (horizon {horizon_days} days)."
-display_df = pd.concat([ts.tail(30).rename(columns={'y':'actual'}).set_index('ds'), fcst.set_index('ds')], axis=1).reset_index()
-return display_df, status, str(out_file)
-
-run_fc.click(fn=run_forecast_ui, inputs=[csv_in_fc, metric_fc, horizon], outputs=[fc_out, fc_status, fc_download])
-
 # Classification tab
 with gr.TabItem('Classification'):
-gr.Markdown("
+gr.Markdown("**Usecase Scenario — Classification**: ฝึกและทดสอบโมเดลเพื่อจำแนกสาเหตุของเหตุการณ์ กำหนดคอลัมน์เป้าหมาย ปรับ hyperparameters, เปิดใช้งาน weak-labeling และดาวน์โหลดโมเดล/ผลการทำนาย")
 csv_in_cls = gr.File(label='Upload CSV for Classification')
 with gr.Row():
 label_col = gr.Dropdown(choices=['CauseType','SubCauseType'], value='CauseType', label='Target Column')
@@ -294,7 +334,7 @@
 
 # Label Suggestion tab
 with gr.TabItem('Label Suggestion'):
-gr.Markdown("
+gr.Markdown("**Usecase Scenario — Label Suggestion**: ให้คำแนะนำป้ายกำกับสาเหตุที่เป็นไปได้สำหรับเหตุการณ์ที่ไม่มีฉลาก โดยเทียบความคล้ายกับตัวอย่างที่มีฉลาก ปรับจำนวนคำแนะนำสูงสุด และส่งออกเป็นไฟล์ CSV")
 csv_in_ls = gr.File(label='Upload CSV (defaults to data/data_3.csv)')
 with gr.Row():
 top_k = gr.Slider(minimum=1, maximum=5, value=1, step=1, label='Top K suggestions')
@@ -321,5 +361,167 @@
 
 run_ls.click(fn=run_label_suggestion, inputs=[csv_in_ls, top_k], outputs=[ls_out, ls_status, ls_download])
 
+# Forecasting tab
+with gr.TabItem('Forecasting'):
+gr.Markdown("**Usecase Scenario — Forecasting**: พยากรณ์จำนวนเหตุการณ์หรือเวลาหยุดทำงานในอนาคตโดยเลือกโมเดล (Prophet, LSTM, Bi-LSTM, GRU, Naive) ปรับพารามิเตอร์ และส่งออกผลการพยากรณ์")
+gr.Markdown("*Multivariate forecasting (ใช้หลายฟีเจอร์) รองรับเฉพาะโมเดล LSTM, Bi-LSTM, GRU เท่านั้น*")
+csv_in_fc = gr.File(label='Upload CSV for Forecasting')
+with gr.Row():
+metric_fc = gr.Radio(choices=['count','downtime_minutes'], value='count', label='Metric to Forecast')
+model_type_fc = gr.Radio(choices=['prophet','lstm','bilstm','gru','naive'], value='lstm', label='Forecasting Model', elem_id='forecast_model_radio')
+periods_fc = gr.Slider(minimum=1, maximum=30, value=7, step=1, label='Forecast Periods (days)')
+multivariate_fc = gr.Checkbox(value=False, label='Use Multivariate (Multiple Features)', interactive=False)
+run_fc = gr.Button('Run Forecasting')
+
+# Add state to track current model
+current_model_state = gr.State(value='lstm')
+
+def update_multivariate_visibility(model_choice):
+# Multivariate is only supported for LSTM, Bi-LSTM, GRU
+supported_models = ['lstm', 'bilstm', 'gru']
+is_supported = model_choice in supported_models
+return gr.update(interactive=is_supported, value=False)
+
+def update_model_state(model_choice):
+return model_choice
+
+# Hyperparameter controls for forecasting
+with gr.Accordion("Hyperparameters (Advanced)", open=False):
+gr.Markdown("Adjust hyperparameters for the selected forecasting model. Defaults are set for good performance.")
+
+# Prophet hyperparameters
+prophet_changepoint_prior = gr.Slider(minimum=0.001, maximum=0.5, value=0.05, step=0.001, label="Prophet: changepoint_prior_scale", visible=False)
+prophet_seasonality_prior = gr.Slider(minimum=0.01, maximum=10.0, value=10.0, step=0.1, label="Prophet: seasonality_prior_scale", visible=False)
+prophet_seasonality_mode = gr.Radio(choices=['additive', 'multiplicative'], value='additive', label="Prophet: seasonality_mode", visible=False)
+
+# Deep learning hyperparameters (LSTM, Bi-LSTM, GRU)
+dl_seq_length = gr.Slider(minimum=3, maximum=30, value=7, step=1, label="DL: sequence_length (lag/input length)", visible=True)
+dl_epochs = gr.Slider(minimum=10, maximum=200, value=100, step=10, label="DL: epochs", visible=True)
+dl_batch_size = gr.Slider(minimum=4, maximum=64, value=16, step=4, label="DL: batch_size", visible=True)
+dl_learning_rate = gr.Slider(minimum=0.0001, maximum=0.01, value=0.001, step=0.0001, label="DL: learning_rate", visible=True)
+dl_units = gr.Slider(minimum=32, maximum=256, value=100, step=16, label="DL: units (neurons)", visible=True)
+dl_dropout = gr.Slider(minimum=0.0, maximum=0.5, value=0.2, step=0.05, label="DL: dropout_rate", visible=True)
+
+# Naive has no hyperparameters
+
+def update_forecast_hyperparams_visibility(model_choice):
+prophet_visible = model_choice == 'prophet'
+dl_visible = model_choice in ['lstm', 'bilstm', 'gru']
+return [
+gr.update(visible=prophet_visible), # prophet_changepoint_prior
+gr.update(visible=prophet_visible), # prophet_seasonality_prior
+gr.update(visible=prophet_visible), # prophet_seasonality_mode
+gr.update(visible=dl_visible), # dl_seq_length
+gr.update(visible=dl_visible), # dl_epochs
+gr.update(visible=dl_visible), # dl_batch_size
+gr.update(visible=dl_visible), # dl_learning_rate
+gr.update(visible=dl_visible), # dl_units
+gr.update(visible=dl_visible), # dl_dropout
+]
+
+with gr.Tabs():
+with gr.TabItem('Historical Data'):
+hist_out = gr.Dataframe(label='Historical Time Series Data')
+with gr.TabItem('Forecast Results'):
+fcst_out = gr.Dataframe(label='Forecast Results')
+with gr.TabItem('Time Series Plot'):
+plot_out = gr.Plot(label='Historical + Forecast Plot')
+fc_status = gr.Textbox(label='Forecast Status', interactive=False)
+fc_download = gr.File(label='Download forecast CSV')
+
+def run_forecast_ui(file, metric, model_type, periods, multivariate, current_model, prophet_cp, prophet_sp, prophet_sm, dl_sl, dl_e, dl_bs, dl_lr, dl_u, dl_d):
+# Use current_model if available, otherwise use model_type
+actual_model = current_model if current_model else model_type
+if file is None:
+return pd.DataFrame(), pd.DataFrame(), None, 'No file provided', None
+try:
+from scripts.forecast import run_forecast
+import matplotlib.pyplot as plt
+df = pd.read_csv(file.name, dtype=str)
+
+# Build hyperparams dict based on model type
+hyperparams = {}
+if actual_model == 'prophet':
+hyperparams = {
+'changepoint_prior_scale': prophet_cp,
+'seasonality_prior_scale': prophet_sp,
+'seasonality_mode': prophet_sm
+}
+elif actual_model in ['lstm', 'bilstm', 'gru']:
+hyperparams = {
+'seq_length': int(dl_sl),
+'epochs': int(dl_e),
+'batch_size': int(dl_bs),
+'learning_rate': dl_lr,
+'units': int(dl_u),
+'dropout_rate': dl_d
+}
+
+ts, fcst = run_forecast(df, metric=metric, periods=periods, model_type=actual_model, multivariate=multivariate, hyperparams=hyperparams)
+
+# Create time series plot
+fig, ax = plt.subplots(figsize=(14, 7))
+
+# Plot historical data
+if len(ts) > 0 and 'y' in ts.columns:
+ax.plot(ts['ds'], ts['y'], 'b-', label='Historical Data', linewidth=2, marker='o', markersize=4)
+
+# Plot forecast data
+if len(fcst) > 0 and 'yhat' in fcst.columns:
+ax.plot(fcst['ds'], fcst['yhat'], 'r--', label='Forecast', linewidth=3, marker='s', markersize=5)
+if 'yhat_lower' in fcst.columns and 'yhat_upper' in fcst.columns:
+ax.fill_between(fcst['ds'], fcst['yhat_lower'], fcst['yhat_upper'],
+color='red', alpha=0.3, label='Confidence Interval')
+
+# Add vertical line to separate historical from forecast
+if len(ts) > 0 and len(fcst) > 0:
+last_hist_date = ts['ds'].max()
+ax.axvline(x=last_hist_date, color='gray', linestyle='--', alpha=0.7, label='Forecast Start')
+
+ax.set_title(f'Time Series Forecast: {model_type.upper()} ({metric.replace("_", " ").title()})',
+fontsize=16, fontweight='bold', pad=20)
+ax.set_xlabel('Date', fontsize=14)
+ax.set_ylabel(metric.replace('_', ' ').title(), fontsize=14)
+ax.legend(loc='upper left', fontsize=12)
+ax.grid(True, alpha=0.3)
+
+# Format x-axis dates
+import matplotlib.dates as mdates
+ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
+ax.xaxis.set_major_locator(mdates.DayLocator(interval=max(1, len(ts) // 10)))
+plt.xticks(rotation=45, ha='right')
+
+plt.tight_layout()
+
+# Save forecast results
+mode = 'multivariate' if multivariate else 'univariate'
+if multivariate and model_type not in ['lstm', 'bilstm', 'gru']:
+mode += ' (fallback: model does not support multivariate)'
+out_file = ROOT / 'outputs' / f'forecast_{metric}_{model_type}_{mode.replace(" ", "_")}.csv'
+out_file.parent.mkdir(exist_ok=True)
+fcst.to_csv(out_file, index=False)
+
+status = f"Forecasting completed using {model_type.upper()} ({mode}). Historical data: {len(ts)} days, Forecast: {len(fcst)} days."
+if multivariate and model_type not in ['lstm', 'bilstm', 'gru']:
+status += " Note: Model does not support multivariate - used univariate instead."
+return ts, fcst, fig, status, str(out_file)
+except Exception as e:
+import matplotlib.pyplot as plt
+fig, ax = plt.subplots(figsize=(14, 7))
+ax.text(0.5, 0.5, f'Forecasting Error:\n{str(e)}',
+transform=ax.transAxes, ha='center', va='center',
+fontsize=14, bbox=dict(boxstyle="round,pad=0.3", facecolor="lightcoral"))
+ax.set_title('Time Series Forecast - Error Occurred', fontsize=16, fontweight='bold')
+ax.set_xlim(0, 1)
+ax.set_ylim(0, 1)
+plt.axis('off')
+return pd.DataFrame(), pd.DataFrame(), fig, f'Forecasting failed: {e}', None
+
+model_type_fc.change(fn=update_multivariate_visibility, inputs=[model_type_fc], outputs=[multivariate_fc])
+model_type_fc.change(fn=update_model_state, inputs=[model_type_fc], outputs=[current_model_state])
+model_type_fc.change(fn=update_forecast_hyperparams_visibility, inputs=[model_type_fc], outputs=[prophet_changepoint_prior, prophet_seasonality_prior, prophet_seasonality_mode, dl_seq_length, dl_epochs, dl_batch_size, dl_learning_rate, dl_units, dl_dropout])
+
+run_fc.click(fn=run_forecast_ui, inputs=[csv_in_fc, metric_fc, model_type_fc, periods_fc, multivariate_fc, current_model_state, prophet_changepoint_prior, prophet_seasonality_prior, prophet_seasonality_mode, dl_seq_length, dl_epochs, dl_batch_size, dl_learning_rate, dl_units, dl_dropout], outputs=[hist_out, fcst_out, plot_out, fc_status, fc_download])
+
 if __name__ == '__main__':
 demo.launch()
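The new Forecasting tab above calls `run_forecast(df, metric=..., periods=..., model_type=..., multivariate=..., hyperparams=...)` and expects a pair `(ts, fcst)`, where `ts` carries `ds`/`y` columns and `fcst` carries `ds`/`yhat` (optionally `yhat_lower`/`yhat_upper`), as the plotting code shows. `scripts/forecast.py` itself is not reproduced in this view, so the following is only a naive stand-in with that interface, handy for checking the UI wiring; it is not the project's implementation and only covers the `count` metric.

```python
import pandas as pd

def run_forecast_stub(df: pd.DataFrame, metric: str = 'count', periods: int = 7,
                      model_type: str = 'naive', multivariate: bool = False,
                      hyperparams: dict | None = None):
    """Naive stand-in matching the (ts, fcst) contract used by run_forecast_ui."""
    # Assumes an OutageDateTime column in day-first format, as noted in compute_reliability.py.
    dates = pd.to_datetime(df['OutageDateTime'], dayfirst=True, errors='coerce').dropna()
    daily = dates.dt.floor('D').value_counts().sort_index()
    ts = pd.DataFrame({'ds': daily.index, 'y': daily.values})
    if ts.empty:
        return ts, pd.DataFrame(columns=['ds', 'yhat'])
    last_value = float(ts['y'].iloc[-1])
    future = pd.date_range(ts['ds'].max() + pd.Timedelta(days=1), periods=periods, freq='D')
    # Repeat the last observed value; a real model would replace this line.
    fcst = pd.DataFrame({'ds': future, 'yhat': [last_value] * periods})
    return ts, fcst
```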
requirements.txt CHANGED
@@ -22,3 +22,4 @@ httpx==0.28.1
 orjson==3.11.3
 cmdstanpy==1.2.5
 stanio==0.5.1
+cloudpickle==3.1.1
scripts/classify.py CHANGED
@@ -6,14 +6,14 @@ import numpy as np
 from typing import Optional
 
 # sklearn imports
-from sklearn.model_selection import train_test_split, StratifiedKFold
+from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
 from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
 from sklearn.neural_network import MLPClassifier
 from sklearn.pipeline import Pipeline
 from sklearn.compose import ColumnTransformer
 from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
 from sklearn.impute import SimpleImputer
-from sklearn.metrics import classification_report
+from sklearn.metrics import classification_report, confusion_matrix
 import joblib
 
 # Optional HF weak-labeling
@@ -174,6 +174,8 @@ def train_classifier(df: pd.DataFrame, label_col: str = 'CauseType', test_size:
 
 # save model
 model_file = Path('outputs') / f'classifier_{model_type}_{label_col}.joblib'
+model_file.parent.mkdir(exist_ok=True)
+joblib.dump({'pipeline': pipeline, 'label_encoder': le}, model_file)
 
 # predictions on train set for download
 y_pred_train = pipeline.predict(X)
@@ -187,260 +189,3 @@
 'model_file': str(model_file),
 'predictions_file': str(preds_file)
 }
-df = parse_and_features(df)
-
-is_multi = len(label_cols) > 1
-
-# optionally weak-label rows missing label (only for single target)
-if not is_multi and label_cols[0] not in df.columns:
-df[label_cols[0]] = None
-
-if not is_multi and df[label_cols[0]].isna().sum() > 0 and HF_TOKEN:
-# attempt weak labeling for missing entries using Detail or FaultDetail
-for idx, row in df[df[label_cols[0]].isna()].iterrows():
-text = None
-for f in ['Detail','FaultDetail','SiteDetail']:
-if f in df.columns and pd.notna(row.get(f)):
-text = row.get(f)
-break
-if text:
-try:
-lbl = weak_label_with_hf(text)
-if lbl:
-df.at[idx, label_cols[0]] = lbl
-except Exception:
-pass
-
-# filter rare classes and drop na (for each label_col)
-for col in label_cols:
-if col not in df.columns:
-df[col] = None
-if df[col].notna().any():
-vc = df[col].value_counts()
-rare = vc[vc < min_count_to_keep].index
-if len(rare) > 0:
-df[col] = df[col].apply(lambda x: 'Other' if x in rare else x)
-df = df.dropna(subset=[col])
-
-# features
-feature_cols = ['duration_min','Load(MW)_num','Capacity(kVA)_num','AffectedCustomer_num','hour','weekday','device_freq','OpDeviceType','Owner','Weather','EventType']
-X = df[feature_cols]
-
-# target
-if is_multi:
-y = df[label_cols]
-# encode each target
-les = [LabelEncoder() for _ in label_cols]
-y_encoded = np.column_stack([le.fit_transform(y[col]) for le, col in zip(les, label_cols)])
-else:
-y = df[label_cols[0]].astype(str)
-le = LabelEncoder()
-y_encoded = le.fit_transform(y)
-les = [le]
-
-# split
-X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=test_size, random_state=random_state, stratify=y_encoded if not is_multi else None)
-
-# model
-if model_type == 'rf':
-clf = RandomForestClassifier(random_state=random_state)
-elif model_type == 'gb':
-clf = GradientBoostingClassifier(random_state=random_state)
-elif model_type == 'mlp':
-clf = MLPClassifier(random_state=random_state, max_iter=500)
-else:
-raise ValueError(f"Unknown model_type: {model_type}")
-
-# preprocessor
-preprocessor = ColumnTransformer(
-transformers=[
-('num', Pipeline([('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())]), ['duration_min','Load(MW)_num','Capacity(kVA)_num','AffectedCustomer_num','hour','weekday','device_freq']),
-('cat', Pipeline([('imputer', SimpleImputer(strategy='most_frequent')), ('encoder', OneHotEncoder(handle_unknown='ignore'))]), ['OpDeviceType','Owner','Weather','EventType'])
-]
-)
-
-pipeline = Pipeline([('preprocessor', preprocessor), ('classifier', clf)])
-
-if do_gridsearch:
-param_grid = {
-'classifier__n_estimators': [50, 100, 200] if hasattr(clf, 'n_estimators') else [1],
-'classifier__max_depth': [None, 10, 20] if hasattr(clf, 'max_depth') else [1],
-}
-cv = 3 if not is_multi else KFold(n_splits=3, shuffle=True, random_state=random_state)
-scoring = 'accuracy' if not is_multi else None
-grid = GridSearchCV(pipeline, param_grid, cv=cv, scoring=scoring, n_jobs=-1)
-grid.fit(X_train, y_train)
-pipeline = grid.best_estimator_
-
-pipeline.fit(X_train, y_train)
-
-# predict
-y_pred = pipeline.predict(X_test)
-
-# report
-if is_multi:
-reports = []
-for i, col in enumerate(label_cols):
-y_test_i = y_test[:, i]
-y_pred_i = y_pred[:, i]
-y_test_inv = les[i].inverse_transform(y_test_i)
-y_pred_inv = les[i].inverse_transform(y_pred_i.astype(int))
-rep = classification_report(y_test_inv, y_pred_inv, zero_division=0)
-reports.append(f"Report for {col}:\n{rep}")
-report = '\n\n'.join(reports)
-else:
-y_test_inv = les[0].inverse_transform(y_test)
-y_pred_inv = les[0].inverse_transform(y_pred)
-report = classification_report(y_test_inv, y_pred_inv, zero_division=0)
-
-# save model
-model_file = Path('outputs') / f'classifier_{model_type}_{"_".join(label_cols)}.joblib'
-model_file.parent.mkdir(exist_ok=True)
-joblib.dump({'pipeline': pipeline, 'label_encoders': les}, model_file)
-
-# predictions on train set for download
-y_pred_train = pipeline.predict(X)
-if is_multi:
-pred_df = df.copy()
-for i, col in enumerate(label_cols):
-pred_df[f'Predicted_{col}'] = les[i].inverse_transform(y_pred_train[:, i].astype(int))
-else:
-pred_df = df.copy()
-pred_df[f'Predicted_{label_cols[0]}'] = les[0].inverse_transform(y_pred_train)
-
-preds_file = Path('outputs') / f'predictions_{model_type}_{"_".join(label_cols)}.csv'
-pred_df.to_csv(preds_file, index=False)
-
-return {
-'report': report,
-'model_file': str(model_file),
-'predictions_file': str(preds_file)
-}
-df = parse_and_features(df)
-
-# optionally weak-label rows missing label
-if label_col not in df.columns:
-df[label_col] = None
-
-if df[label_col].isna().sum() > 0 and HF_TOKEN:
-# attempt weak labeling for missing entries using Detail or FaultDetail
-for idx, row in df[df[label_col].isna()].iterrows():
-text = None
-for f in ['Detail','FaultDetail','SiteDetail']:
-if f in df.columns and pd.notna(row.get(f)):
-text = row.get(f)
-break
-if text:
-lbl = weak_label_with_hf(text)
-if lbl:
-df.at[idx, label_col] = lbl
-
-# combine rare classes into 'Other' if needed
-if df[label_col].notna().any():
-vc = df[label_col].value_counts()
-rare = vc[vc < min_count_to_keep].index.tolist()
-if rare:
-df[label_col] = df[label_col].apply(lambda x: 'Other' if x in rare else x)
-
-df = df.dropna(subset=[label_col])
-if df.empty:
-raise ValueError('No labeled data available for training')
-
-# define features
-feature_cols = ['duration_min','Load(MW)_num','Capacity(kVA)_num','AffectedCustomer_num','hour','weekday','device_freq','OpDeviceType','Owner','Weather','EventType']
-X = df[feature_cols]
-y = df[label_col].astype(str)
-
-# encode labels to integers
-le = LabelEncoder()
-y_encoded = le.fit_transform(y)
-
-# simple train/test split
-X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=test_size, random_state=random_state, stratify=y_encoded)
-
-# preprocessing
-numeric_feats = ['duration_min','Load(MW)_num','Capacity(kVA)_num','AffectedCustomer_num','hour','weekday','device_freq']
-cat_feats = ['OpDeviceType','Owner','Weather','EventType']
-
-numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())])
-# sklearn versions differ on parameter name for sparse output
-try:
-cat_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
-except TypeError:
-cat_transformer = OneHotEncoder(handle_unknown='ignore', sparse=False)
-
-preprocessor = ColumnTransformer(transformers=[
-('num', numeric_transformer, numeric_feats),
-('cat', cat_transformer, cat_feats)
-], remainder='drop')
-
-# choose classifier
-model_type = (model_type or 'rf').lower()
-if model_type == 'rf':
-clf_est = RandomForestClassifier(class_weight='balanced', random_state=random_state)
-clf_name = 'rf'
-elif model_type == 'gb':
-clf_est = GradientBoostingClassifier(random_state=random_state)
-clf_name = 'gb'
-elif model_type == 'mlp':
-clf_est = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=random_state)
-clf_name = 'mlp'
-else:
-raise ValueError(f'Unknown model_type: {model_type}')
-
-clf = Pipeline(steps=[('pre', preprocessor), ('clf', clf_est)])
-
-if do_gridsearch:
-if clf_name == 'rf':
-param_grid = {
-'clf__n_estimators': [100,200],
-'clf__max_depth': [None, 10, 20],
-'clf__min_samples_split': [2,5]
-}
-elif clf_name == 'lgb':
-param_grid = {
-'clf__n_estimators': [100,200],
-'clf__num_leaves': [31,63]
-}
-elif clf_name == 'gb':
-param_grid = {
-'clf__n_estimators': [100,200],
-'clf__max_depth': [3,6]
-}
-elif clf_name == 'mlp':
-param_grid = {
-'clf__hidden_layer_sizes': [(50,),(100,)],
-'clf__alpha': [0.0001, 0.001]
-}
-else:
-param_grid = {}
-cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state)
-gs = GridSearchCV(clf, param_grid, cv=cv, scoring='f1_weighted', n_jobs=1)
-gs.fit(X_train, y_train)
-best = gs.best_estimator_
-best_params = gs.best_params_
-model_to_save = best
-else:
-clf.fit(X_train, y_train)
-best_params = None
-model_to_save = clf
-
-y_pred = model_to_save.predict(X_test)
-unique_labels = np.unique(np.concatenate([y_test, y_pred]))
-target_names = [le.classes_[i] for i in unique_labels]
-report = classification_report(y_test, y_pred, target_names=target_names, zero_division=0)
-cm = confusion_matrix(y_test, y_pred)
-
-# save model pipeline
-out_dir = Path.cwd() / 'outputs'
-out_dir.mkdir(exist_ok=True)
-model_file = out_dir / f'{clf_name}_cause_pipeline.joblib'
-joblib.dump({'pipeline': model_to_save, 'label_encoder': le}, model_file)
-
-# save predictions
-pred_df = X_test.copy()
-pred_df['y_true'] = le.inverse_transform(y_test)
-pred_df['y_pred'] = le.inverse_transform(y_pred)
-pred_df.to_csv(out_dir / 'predictions_cause.csv', index=False, encoding='utf-8-sig')
-
-return {'model_file': str(model_file), 'report': report, 'confusion_matrix': cm, 'predictions_file': str(out_dir / 'predictions_cause.csv'), 'best_params': best_params}
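The two added lines (new 177-178) persist the trained model as a joblib bundle of the form `{'pipeline': ..., 'label_encoder': ...}`. Below is a hedged sketch of loading such a bundle and predicting on new rows afterwards; the file path and the feature columns are example values copied from this file, `data/new_events.csv` is a hypothetical input, and the rows are assumed to already carry the engineered features produced by `parse_and_features`.

```python
import joblib
import pandas as pd

# Example path following the f'classifier_{model_type}_{label_col}.joblib' pattern above.
bundle = joblib.load('outputs/classifier_rf_CauseType.joblib')
pipeline = bundle['pipeline']
label_encoder = bundle['label_encoder']

# Feature columns copied from feature_cols in this script.
feature_cols = ['duration_min', 'Load(MW)_num', 'Capacity(kVA)_num', 'AffectedCustomer_num',
                'hour', 'weekday', 'device_freq', 'OpDeviceType', 'Owner', 'Weather', 'EventType']

new_events = pd.read_csv('data/new_events.csv')  # hypothetical file that already contains feature_cols
pred_ids = pipeline.predict(new_events[feature_cols])
new_events['Predicted_CauseType'] = label_encoder.inverse_transform(pred_ids)
print(new_events['Predicted_CauseType'].value_counts())
```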
scripts/compute_reliability.py
ADDED
|
@@ -0,0 +1,242 @@
|
"""Compute reliability indices (SAIFI, SAIDI, CAIDI, MAIFI) from outage event CSV.

Usage (programmatic):
    from scripts.compute_reliability import compute_reliability
    summary = compute_reliability('data/data_1.csv', total_customers=500000)

Usage (CLI):
    python scripts/compute_reliability.py --input data/data_1.csv --total-customers 500000

Assumptions & mapping (from inspected CSV):
- Outage start: `OutageDateTime`
- Outage end: prefer `CloseEventDateTime`, else `LastRestoDateTime`, else `FirstRestoDateTime`
- Customers affected: prefer `AffectedCustomer` column; else sum `AffectedCustomer1..5`; else `AllStepCusXTime` or `AllStepCusXTime1..5` fallback.
- Planned outages: rows with `EventType` containing 'แผน' (e.g., 'แผนดับไฟ') are considered planned and can be excluded by default.
- Date format is day-first like '10-01-2025 10:28:00'.

Outputs saved to `outputs/reliability_summary.csv` and breakdown CSVs.
"""

from __future__ import annotations
import argparse
from typing import Optional, Dict
import pandas as pd
import numpy as np
from pathlib import Path

DATE_COLS = ['OutageDateTime', 'FirstRestoDateTime', 'LastRestoDateTime', 'CreateEventDateTime', 'CloseEventDateTime']


def parse_dates(df: pd.DataFrame) -> pd.DataFrame:
    for c in DATE_COLS:
        if c in df.columns:
            # many dates are in format dd-mm-YYYY HH:MM:SS
            df[c] = pd.to_datetime(df[c], dayfirst=True, errors='coerce')
    return df


def coalesce_end_time(row: pd.Series) -> pd.Timestamp | None:
    for c in ('CloseEventDateTime', 'LastRestoDateTime', 'FirstRestoDateTime', 'CreateEventDateTime'):
        if c in row and pd.notna(row[c]):
            return row[c]
    return pd.NaT


def estimate_customers(row: pd.Series) -> float:
    # Prefer AffectedCustomer if present and numeric
    def to_num(x):
        try:
            if pd.isna(x) or x == '':
                return np.nan
            return float(x)
        except Exception:
            return np.nan

    cols = row.index
    # Try AffectedCustomer
    if 'AffectedCustomer' in cols:
        v = to_num(row['AffectedCustomer'])
        if not np.isnan(v):
            return v
    # Sum AffectedCustomer1..5
    acs = []
    for i in range(1, 6):
        k = f'AffectedCustomer{i}'
        if k in cols:
            acs.append(to_num(row[k]))
    acs = [x for x in acs if not np.isnan(x)]
    if acs:
        return float(sum(acs))
    # Try AllStepCusXTime or AllStepCusXTime1..5
    if 'AllStepCusXTime' in cols:
        v = to_num(row['AllStepCusXTime'])
        if not np.isnan(v):
            return v
    asts = []
    for i in range(1, 6):
        k = f'AllStepCusXTime{i}'
        if k in cols:
            asts.append(to_num(row[k]))
    asts = [x for x in asts if not np.isnan(x)]
    if asts:
        return float(sum(asts))
    # As last resort, try numeric columns near end: Capacity(kVA) or Load(MW) are not customer counts
    return np.nan


def flag_planned(event_type: Optional[str]) -> bool:
    if pd.isna(event_type):
        return False
    s = str(event_type)
    # In this dataset planned outages use Thai word 'แผน'
    if 'แผน' in s:
        return True
    # else treat as unplanned
    return False


def compute_reliability(
    input_csv: str | Path,
    total_customers: Optional[float] = None,
    customers_map: Optional[Dict[str, float]] = None,
    exclude_planned: bool = True,
    momentary_threshold_min: float = 1.0,
    groupby_cols: list[str] | None = None,
    out_dir: str | Path | None = 'outputs',
) -> Dict[str, pd.DataFrame]:
    """Reads CSV and computes reliability indices.

    Returns a dict of DataFrames: overall, by_group.
    """
    input_csv = Path(input_csv)
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    df = pd.read_csv(input_csv, dtype=str)
    # parse dates
    df = parse_dates(df)

    # coalesce end time
    df['OutageStart'] = df.get('OutageDateTime')
    df['OutageEnd'] = df.apply(coalesce_end_time, axis=1)
    # compute duration in minutes
    df['DurationMin'] = (pd.to_datetime(df['OutageEnd']) - pd.to_datetime(df['OutageStart'])).dt.total_seconds() / 60.0

    # customers affected
    df['CustomersAffected'] = df.apply(estimate_customers, axis=1)

    # flag planned
    df['IsPlanned'] = df['EventType'].apply(flag_planned) if 'EventType' in df.columns else False

    if exclude_planned:
        df_work = df[~df['IsPlanned']].copy()
    else:
        df_work = df.copy()

    # Fill missing durations (negative or NaN) with 0
    df_work['DurationMin'] = df_work['DurationMin'].fillna(0)
    df_work.loc[df_work['DurationMin'] < 0, 'DurationMin'] = 0

    # ensure numeric customers
    df_work['CustomersAffected'] = pd.to_numeric(df_work['CustomersAffected'], errors='coerce').fillna(0)

    # Choose grouping
    if groupby_cols is None:
        groupby_cols = []

    # helper to compute indices given total customers
    def compute_from_df(dfall: pd.DataFrame, cust_total: float) -> Dict[str, float]:
        total_interruptions = dfall['CustomersAffected'].sum()
        total_customer_minutes = (dfall['CustomersAffected'] * dfall['DurationMin']).sum()
        # momentary interruptions: durations less than threshold
        momentary_interruptions = dfall.loc[dfall['DurationMin'] < momentary_threshold_min, 'CustomersAffected'].sum()
        saifi = total_interruptions / cust_total if cust_total and cust_total > 0 else np.nan
        saidi = total_customer_minutes / cust_total if cust_total and cust_total > 0 else np.nan
        caidi = (saidi / saifi) if (saifi and saifi > 0) else np.nan
        maifi = momentary_interruptions / cust_total if cust_total and cust_total > 0 else np.nan
        return {
            'TotalInterruptions': total_interruptions,
            'TotalCustomerMinutes': total_customer_minutes,
            'MomentaryInterruptions': momentary_interruptions,
            'SAIFI': saifi,
            'SAIDI': saidi,
            'CAIDI': caidi,
            'MAIFI': maifi,
        }

    results = {}

    if customers_map is not None:
        # customers_map expects keys matching grouping (e.g., Feeder or AffectedAreaID). We'll compute per key
        # Overall must supply a 'TOTAL' or we sum map values
        total_customers_map_sum = sum(customers_map.values())
        overall = compute_from_df(df_work, total_customers_map_sum if total_customers is None else total_customers)
        results['overall'] = pd.DataFrame([overall])

        # per-group
        if groupby_cols:
            group = df_work.groupby(groupby_cols).agg({'CustomersAffected': 'sum', 'DurationMin': 'mean'})
        else:
            # if no group col provided, try Feeder then AffectedAreaID
            if 'Feeder' in df_work.columns:
                groupby_cols = ['Feeder']
            elif 'AffectedAreaID' in df_work.columns:
                groupby_cols = ['AffectedAreaID']
            else:
                groupby_cols = []

        if groupby_cols:
            rows = []
            for key, sub in df_work.groupby(groupby_cols):
                # key can be tuple
                keyname = key if isinstance(key, str) else '_'.join(map(str, key))
                cust = customers_map.get(keyname, np.nan)
                metrics = compute_from_df(sub, cust if not np.isnan(cust) else np.nan)
                metrics.update({'Group': keyname})
                rows.append(metrics)
            results['by_group'] = pd.DataFrame(rows)
        else:
            results['by_group'] = pd.DataFrame()
    else:
        # customers_map not provided: require total_customers
        if total_customers is None:
            raise ValueError('Either total_customers or customers_map must be provided to compute per-customer indices')
        overall = compute_from_df(df_work, float(total_customers))
        results['overall'] = pd.DataFrame([overall])
        # per-group breakdowns (SAIFI-like per 1000 customers will use proportion of total customers by share)
        if groupby_cols:
            rows = []
            # If we don't have customers per group, we will compute interruption counts and durations but can't compute per-customer normalized indices without providing customers_map.
            for key, sub in df_work.groupby(groupby_cols):
                keyname = key if isinstance(key, str) else '_'.join(map(str, key))
                rows.append({
                    'Group': keyname,
                    'TotalInterruptions': sub['CustomersAffected'].sum(),
                    'TotalCustomerMinutes': (sub['CustomersAffected'] * sub['DurationMin']).sum(),
                    'Events': len(sub),
                })
            results['by_group'] = pd.DataFrame(rows)
        else:
            results['by_group'] = pd.DataFrame()

    # Save CSVs
    results['raw'] = df_work
    results['raw'].to_csv(out_dir / 'events_cleaned.csv', index=False)
    results['overall'].to_csv(out_dir / 'reliability_overall.csv', index=False)
    if 'by_group' in results:
        results['by_group'].to_csv(out_dir / 'reliability_by_group.csv', index=False)

    return results


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', '-i', required=True, help='Input CSV file')
    parser.add_argument('--total-customers', type=float, help='Total customers served in the system (required if no customers map)')
    parser.add_argument('--exclude-planned', action='store_true', help='Exclude planned outages (default True)')
    parser.add_argument('--momentary-threshold-min', type=float, default=1.0, help='Threshold in minutes for momentary interruption')
    parser.add_argument('--groupby', nargs='*', default=['Feeder'], help='Columns to group by for breakdown (default: Feeder)')
    args = parser.parse_args()

    res = compute_reliability(args.input, total_customers=args.total_customers, exclude_planned=args.exclude_planned, momentary_threshold_min=args.momentary_threshold_min, groupby_cols=args.groupby)
    print('Wrote outputs to outputs/ (events_cleaned.csv, reliability_overall.csv, reliability_by_group.csv)')
|
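A toy walk-through of the arithmetic that `compute_from_df` applies, with invented numbers: SAIFI and MAIFI divide customer interruptions by customers served, SAIDI divides customer-minutes by customers served, and CAIDI is SAIDI/SAIFI.

```python
# Two unplanned events, 120,000 customers served (all values made up for illustration)
customers_affected = [1200, 300]   # CustomersAffected per event
duration_min = [45.0, 0.5]         # DurationMin per event; 0.5 < 1.0 min counts as momentary
total_customers = 120_000

total_interruptions = sum(customers_affected)                                          # 1500
total_customer_minutes = sum(c * d for c, d in zip(customers_affected, duration_min))  # 54150.0
momentary = sum(c for c, d in zip(customers_affected, duration_min) if d < 1.0)        # 300

saifi = total_interruptions / total_customers     # 0.0125 interruptions per customer
saidi = total_customer_minutes / total_customers  # ~0.451 minutes per customer
caidi = saidi / saifi                             # ~36.1 minutes per interruption
maifi = momentary / total_customers               # 0.0025
```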
scripts/forecast.py
CHANGED
|
@@ -9,8 +9,160 @@ try:
|
|
| 9  |   except Exception:
| 10 |       PROPHET_AVAILABLE = False
| 11 |
|
| 12 |
| 13 |   def prepare_timeseries(df: pd.DataFrame, date_col: str = 'OutageDateTime', metric: str = 'count') -> pd.DataFrame:
| 14 |       # date_col is in format DD-MM-YYYY HH:MM:SS
| 15 |       df = df.copy()
| 16 |       df['dt'] = pd.to_datetime(df[date_col], format='%d-%m-%Y %H:%M:%S', errors='coerce')

@@ -29,10 +181,23 @@ def prepare_timeseries(df: pd.DataFrame, date_col: str = 'OutageDateTime', metric: str = 'count') -> pd.DataFrame:

| 29 |       return ts
| 30 |
| 31 |
| 32 | - def forecast_prophet(ts: pd.DataFrame, periods: int = 7, freq: str = 'D') -> pd.DataFrame:
| 33 |       if not PROPHET_AVAILABLE:
| 34 |           raise RuntimeError('Prophet not available')
| 35 | -
| 36 |       m.fit(ts)
| 37 |       future = m.make_future_dataframe(periods=periods, freq=freq)
| 38 |       fcst = m.predict(future)

@@ -47,13 +212,1220 @@ def forecast_naive(ts: pd.DataFrame, periods: int = 7) -> pd.DataFrame:

| 47 |       return pd.DataFrame({'ds': future_dates, 'yhat': [last_mean]*periods, 'yhat_lower':[np.nan]*periods, 'yhat_upper':[np.nan]*periods})
| 48 |
| 49 |
| 50 | - def
| 51 | -
| 52 | -
| 53 | -
| 54 | -
| 55 | -
| 56 | -
| 57 | -
| 58 | -
| 59 | -
| 9 |
except Exception:
|
| 10 |
PROPHET_AVAILABLE = False
|
| 11 |
|
| 12 |
+
try:
|
| 13 |
+
import tensorflow as tf
|
| 14 |
+
from tensorflow.keras.models import Sequential
|
| 15 |
+
from tensorflow.keras.layers import LSTM, Bidirectional, GRU, Dense, Dropout
|
| 16 |
+
from sklearn.preprocessing import MinMaxScaler
|
| 17 |
+
TF_AVAILABLE = True
|
| 18 |
+
except Exception:
|
| 19 |
+
TF_AVAILABLE = False
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def prepare_multivariate_timeseries(df: pd.DataFrame, date_col: str = 'OutageDateTime') -> pd.DataFrame:
|
| 23 |
+
"""Prepare multivariate time series with multiple features"""
|
| 24 |
+
df = df.copy()
|
| 25 |
+
df['dt'] = pd.to_datetime(df[date_col], format='%d-%m-%Y %H:%M:%S', errors='coerce')
|
| 26 |
+
df = df.dropna(subset=['dt'])
|
| 27 |
+
df['day'] = df['dt'].dt.floor('D')
|
| 28 |
+
|
| 29 |
+
# Aggregate daily data
|
| 30 |
+
daily_data = df.groupby('day').agg({
|
| 31 |
+
'EventNumber': 'count', # daily count
|
| 32 |
+
'Load(MW)': lambda x: pd.to_numeric(x, errors='coerce').mean(),
|
| 33 |
+
'Capacity(kVA)': lambda x: pd.to_numeric(x, errors='coerce').mean(),
|
| 34 |
+
'AffectedCustomer': lambda x: pd.to_numeric(x, errors='coerce').sum(),
|
| 35 |
+
'OpDeviceType': lambda x: x.mode().iloc[0] if len(x) > 0 else 'Unknown',
|
| 36 |
+
'Owner': lambda x: x.mode().iloc[0] if len(x) > 0 else 'Unknown',
|
| 37 |
+
'Weather': lambda x: x.mode().iloc[0] if len(x) > 0 else 'Unknown',
|
| 38 |
+
'EventType': lambda x: x.mode().iloc[0] if len(x) > 0 else 'Unknown'
|
| 39 |
+
}).reset_index()
|
| 40 |
+
|
| 41 |
+
# Rename columns
|
| 42 |
+
daily_data = daily_data.rename(columns={
|
| 43 |
+
'day': 'ds',
|
| 44 |
+
'EventNumber': 'daily_count',
|
| 45 |
+
'Load(MW)': 'avg_load_mw',
|
| 46 |
+
'Capacity(kVA)': 'avg_capacity_kva',
|
| 47 |
+
'AffectedCustomer': 'total_affected_customers'
|
| 48 |
+
})
|
| 49 |
+
|
| 50 |
+
# Calculate duration if available
|
| 51 |
+
if 'LastRestoDateTime' in df.columns:
|
| 52 |
+
df['last_dt'] = pd.to_datetime(df.get('LastRestoDateTime'), format='%d-%m-%Y %H:%M:%S', errors='coerce')
|
| 53 |
+
df['duration_min'] = (df['last_dt'] - df['dt']).dt.total_seconds() / 60.0
|
| 54 |
+
duration_agg = df.groupby('day')['duration_min'].sum().reset_index()
|
| 55 |
+
duration_agg = duration_agg.rename(columns={'day': 'ds', 'duration_min': 'total_downtime_min'})
|
| 56 |
+
daily_data = daily_data.merge(duration_agg, on='ds', how='left')
|
| 57 |
+
daily_data['total_downtime_min'] = daily_data['total_downtime_min'].fillna(0)
|
| 58 |
+
else:
|
| 59 |
+
daily_data['total_downtime_min'] = 0
|
| 60 |
+
|
| 61 |
+
# Add time features
|
| 62 |
+
daily_data['ds'] = pd.to_datetime(daily_data['ds'])
|
| 63 |
+
daily_data['day_of_week'] = daily_data['ds'].dt.dayofweek
|
| 64 |
+
daily_data['month'] = daily_data['ds'].dt.month
|
| 65 |
+
daily_data['is_weekend'] = daily_data['day_of_week'].isin([5, 6]).astype(int)
|
| 66 |
+
|
| 67 |
+
# Fill missing numeric values
|
| 68 |
+
numeric_cols = ['avg_load_mw', 'avg_capacity_kva', 'total_affected_customers', 'total_downtime_min']
|
| 69 |
+
for col in numeric_cols:
|
| 70 |
+
daily_data[col] = pd.to_numeric(daily_data[col], errors='coerce').fillna(daily_data[col].mean())
|
| 71 |
+
|
| 72 |
+
# Encode categorical variables
|
| 73 |
+
categorical_cols = ['OpDeviceType', 'Owner', 'Weather', 'EventType']
|
| 74 |
+
for col in categorical_cols:
|
| 75 |
+
daily_data[col] = daily_data[col].fillna('Unknown')
|
| 76 |
+
# Simple frequency encoding
|
| 77 |
+
freq_map = daily_data[col].value_counts().to_dict()
|
| 78 |
+
daily_data[f'{col}_freq'] = daily_data[col].map(freq_map)
|
| 79 |
+
|
| 80 |
+
return daily_data
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def prepare_multivariate_timeseries(df: pd.DataFrame, date_col: str = 'OutageDateTime', target_metric: str = 'count') -> pd.DataFrame:
|
| 84 |
+
"""
|
| 85 |
+
Prepare multivariate time series data with multiple features per day.
|
| 86 |
+
|
| 87 |
+
Args:
|
| 88 |
+
df: Input dataframe
|
| 89 |
+
date_col: Date column name
|
| 90 |
+
target_metric: Target metric ('count' or 'downtime_minutes')
|
| 91 |
+
|
| 92 |
+
Returns:
|
| 93 |
+
DataFrame with daily aggregated features
|
| 94 |
+
"""
|
| 95 |
+
df = df.copy()
|
| 96 |
+
|
| 97 |
+
# Convert data types properly
|
| 98 |
+
df['dt'] = pd.to_datetime(df[date_col], format='%d-%m-%Y %H:%M:%S', errors='coerce')
|
| 99 |
+
df = df.dropna(subset=['dt'])
|
| 100 |
+
df['day'] = df['dt'].dt.floor('D')
|
| 101 |
+
|
| 102 |
+
# Convert numeric columns
|
| 103 |
+
numeric_cols = ['Load(MW)', 'Capacity(kVA)', 'AffectedCustomer', 'FirstStepDuration', 'LastStepDuration']
|
| 104 |
+
for col in numeric_cols:
|
| 105 |
+
if col in df.columns:
|
| 106 |
+
df[col] = pd.to_numeric(df[col], errors='coerce')
|
| 107 |
+
|
| 108 |
+
# Target variable
|
| 109 |
+
if target_metric == 'count':
|
| 110 |
+
daily_data = df.groupby('day').size().rename('daily_count').reset_index()
|
| 111 |
+
elif target_metric == 'downtime_minutes':
|
| 112 |
+
df['last_dt'] = pd.to_datetime(df.get('LastRestoDateTime'), format='%d-%m-%Y %H:%M:%S', errors='coerce')
|
| 113 |
+
df['duration_min'] = (df['last_dt'] - df['dt']).dt.total_seconds() / 60.0
|
| 114 |
+
daily_data = df.groupby('day')['duration_min'].sum().rename('total_downtime_min').reset_index()
|
| 115 |
+
else:
|
| 116 |
+
raise ValueError('Unsupported target_metric')
|
| 117 |
+
|
| 118 |
+
# Additional features - aggregate per day
|
| 119 |
+
# Numeric features
|
| 120 |
+
numeric_agg = df.groupby('day').agg({
|
| 121 |
+
'Load(MW)': 'mean',
|
| 122 |
+
'Capacity(kVA)': 'mean',
|
| 123 |
+
'AffectedCustomer': 'sum',
|
| 124 |
+
'FirstStepDuration': 'mean',
|
| 125 |
+
'LastStepDuration': 'mean'
|
| 126 |
+
}).reset_index()
|
| 127 |
+
|
| 128 |
+
# Time features
|
| 129 |
+
time_features = df.groupby('day').agg({
|
| 130 |
+
'dt': ['count', lambda x: x.dt.hour.mean(), lambda x: x.dt.weekday.mean()]
|
| 131 |
+
}).reset_index()
|
| 132 |
+
time_features.columns = ['day', 'event_count', 'avg_hour', 'avg_weekday']
|
| 133 |
+
|
| 134 |
+
# Categorical features - take most common per day
|
| 135 |
+
categorical_features = df.groupby('day').agg({
|
| 136 |
+
'OpDeviceType': lambda x: x.mode().iloc[0] if len(x.mode()) > 0 else 'Unknown',
|
| 137 |
+
'Owner': lambda x: x.mode().iloc[0] if len(x.mode()) > 0 else 'Unknown',
|
| 138 |
+
'Weather': lambda x: x.mode().iloc[0] if len(x.mode()) > 0 else 'Unknown',
|
| 139 |
+
'EventType': lambda x: x.mode().iloc[0] if len(x.mode()) > 0 else 'Unknown'
|
| 140 |
+
}).reset_index()
|
| 141 |
+
|
| 142 |
+
# Merge all features
|
| 143 |
+
daily_data = daily_data.merge(numeric_agg, on='day', how='left')
|
| 144 |
+
daily_data = daily_data.merge(time_features, on='day', how='left')
|
| 145 |
+
daily_data = daily_data.merge(categorical_features, on='day', how='left')
|
| 146 |
+
|
| 147 |
+
# Fill missing values
|
| 148 |
+
daily_data = daily_data.fillna({
|
| 149 |
+
'Load(MW)': daily_data['Load(MW)'].mean(),
|
| 150 |
+
'Capacity(kVA)': daily_data['Capacity(kVA)'].mean(),
|
| 151 |
+
'AffectedCustomer': 0,
|
| 152 |
+
'FirstStepDuration': daily_data['FirstStepDuration'].mean(),
|
| 153 |
+
'LastStepDuration': daily_data['LastStepDuration'].mean(),
|
| 154 |
+
'avg_hour': 12,
|
| 155 |
+
'avg_weekday': 3
|
| 156 |
+
})
|
| 157 |
+
|
| 158 |
+
# Rename day column to ds for consistency
|
| 159 |
+
daily_data = daily_data.rename(columns={'day': 'ds'})
|
| 160 |
+
|
| 161 |
+
return daily_data
|
| 162 |
+
|
| 163 |
|
| 164 |
def prepare_timeseries(df: pd.DataFrame, date_col: str = 'OutageDateTime', metric: str = 'count') -> pd.DataFrame:
|
| 165 |
+
"""Prepare univariate time series data (original function for backward compatibility)"""
|
| 166 |
# date_col is in format DD-MM-YYYY HH:MM:SS
|
| 167 |
df = df.copy()
|
| 168 |
df['dt'] = pd.to_datetime(df[date_col], format='%d-%m-%Y %H:%M:%S', errors='coerce')
|
|
|
|
| 181 |
return ts
|
| 182 |
|
| 183 |
|
| 184 |
+
def forecast_prophet(ts: pd.DataFrame, periods: int = 7, freq: str = 'D', hyperparams: dict = None) -> pd.DataFrame:
|
| 185 |
if not PROPHET_AVAILABLE:
|
| 186 |
raise RuntimeError('Prophet not available')
|
| 187 |
+
|
| 188 |
+
# Set default hyperparameters
|
| 189 |
+
if hyperparams is None:
|
| 190 |
+
hyperparams = {}
|
| 191 |
+
|
| 192 |
+
changepoint_prior_scale = hyperparams.get('changepoint_prior_scale', 0.05)
|
| 193 |
+
seasonality_prior_scale = hyperparams.get('seasonality_prior_scale', 10.0)
|
| 194 |
+
seasonality_mode = hyperparams.get('seasonality_mode', 'additive')
|
| 195 |
+
|
| 196 |
+
m = Prophet(
|
| 197 |
+
changepoint_prior_scale=changepoint_prior_scale,
|
| 198 |
+
seasonality_prior_scale=seasonality_prior_scale,
|
| 199 |
+
seasonality_mode=seasonality_mode
|
| 200 |
+
)
|
| 201 |
m.fit(ts)
|
| 202 |
future = m.make_future_dataframe(periods=periods, freq=freq)
|
| 203 |
fcst = m.predict(future)
|
|
|
|
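As a usage sketch for the hunk above: `forecast_prophet` now accepts an optional `hyperparams` dict whose keys match the `.get(...)` calls shown (`changepoint_prior_scale`, `seasonality_prior_scale`, `seasonality_mode`). The `df` variable and the values below are assumptions for illustration, and Prophet must be installed (`PROPHET_AVAILABLE`).

```python
from scripts.forecast import prepare_timeseries, forecast_prophet

ts = prepare_timeseries(df, metric='count')   # df: outage events loaded from the uploaded CSV (assumed)
fcst = forecast_prophet(
    ts,
    periods=14,
    hyperparams={
        'changepoint_prior_scale': 0.1,
        'seasonality_prior_scale': 5.0,
        'seasonality_mode': 'multiplicative',
    },
)
```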
| 212 |
return pd.DataFrame({'ds': future_dates, 'yhat': [last_mean]*periods, 'yhat_lower':[np.nan]*periods, 'yhat_upper':[np.nan]*periods})
|
| 213 |
|
| 214 |
|
| 215 |
+
def create_sequences(data, seq_length):
|
| 216 |
+
"""Create sequences for time series forecasting"""
|
| 217 |
+
X, y = [], []
|
| 218 |
+
for i in range(len(data) - seq_length):
|
| 219 |
+
X.append(data[i:(i + seq_length)])
|
| 220 |
+
y.append(data[i + seq_length])
|
| 221 |
+
return np.array(X), np.array(y)
|
| 222 |
+
|
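A tiny worked example of `create_sequences` as defined above, on made-up numbers, to show the window/target split it produces:

```python
import numpy as np
from scripts.forecast import create_sequences   # the helper defined above

data = np.array([10, 12, 9, 14, 11, 13])
X, y = create_sequences(data, seq_length=3)
# X -> [[10 12  9], [12  9 14], [ 9 14 11]]   (three sliding windows)
# y -> [14 11 13]                             (the value that follows each window)
```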
| 223 |
+
|
| 224 |
+
def forecast_lstm(ts: pd.DataFrame, periods: int = 7, seq_length: int = 7, hyperparams: dict = None) -> pd.DataFrame:
|
| 225 |
+
"""Forecast using LSTM model"""
|
| 226 |
+
if not TF_AVAILABLE:
|
| 227 |
+
raise RuntimeError('TensorFlow not available')
|
| 228 |
+
|
| 229 |
+
# Set default hyperparameters
|
| 230 |
+
if hyperparams is None:
|
| 231 |
+
hyperparams = {}
|
| 232 |
+
|
| 233 |
+
seq_length = hyperparams.get('seq_length', seq_length) # Use hyperparams seq_length if provided
|
| 234 |
+
epochs = hyperparams.get('epochs', 100)
|
| 235 |
+
batch_size = hyperparams.get('batch_size', 16)
|
| 236 |
+
learning_rate = hyperparams.get('learning_rate', 0.001)
|
| 237 |
+
units = hyperparams.get('units', 100)
|
| 238 |
+
dropout_rate = hyperparams.get('dropout_rate', 0.2)
|
| 239 |
+
|
| 240 |
+
# Prepare data
|
| 241 |
+
data = ts['y'].values.reshape(-1, 1)
|
| 242 |
+
scaler = MinMaxScaler(feature_range=(0, 1))
|
| 243 |
+
scaled_data = scaler.fit_transform(data)
|
| 244 |
+
|
| 245 |
+
# Create sequences
|
| 246 |
+
X, y = create_sequences(scaled_data, seq_length)
|
| 247 |
+
|
| 248 |
+
if len(X) < 10: # Not enough data
|
| 249 |
+
return forecast_naive(ts, periods)
|
| 250 |
+
|
| 251 |
+
# Split data
|
| 252 |
+
train_size = int(len(X) * 0.8)
|
| 253 |
+
X_train, X_test = X[:train_size], X[train_size:]
|
| 254 |
+
y_train, y_test = y[:train_size], y[train_size:]
|
| 255 |
+
|
| 256 |
+
# Reshape for LSTM [samples, time steps, features]
|
| 257 |
+
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
|
| 258 |
+
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))
|
| 259 |
+
|
| 260 |
+
# Build LSTM model
|
| 261 |
+
model = Sequential([
|
| 262 |
+
LSTM(units, activation='relu', return_sequences=True, input_shape=(seq_length, 1)),
|
| 263 |
+
Dropout(dropout_rate),
|
| 264 |
+
LSTM(units//2, activation='relu'),
|
| 265 |
+
Dropout(dropout_rate),
|
| 266 |
+
Dense(1)
|
| 267 |
+
])
|
| 268 |
+
|
| 269 |
+
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
|
| 270 |
+
model.compile(optimizer=optimizer, loss='mse')
|
| 271 |
+
|
| 272 |
+
# Train model
|
| 273 |
+
model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0, validation_data=(X_test, y_test))
|
| 274 |
+
|
| 275 |
+
# Make predictions
|
| 276 |
+
predictions = []
|
| 277 |
+
current_sequence = scaled_data[-seq_length:].reshape(1, seq_length, 1)
|
| 278 |
+
|
| 279 |
+
for _ in range(periods):
|
| 280 |
+
pred = model.predict(current_sequence, verbose=0)
|
| 281 |
+
predictions.append(pred[0][0])
|
| 282 |
+
# Update sequence for next prediction
|
| 283 |
+
current_sequence = np.roll(current_sequence, -1, axis=1)
|
| 284 |
+
current_sequence[0, -1, 0] = pred[0][0]
|
| 285 |
+
|
| 286 |
+
# Inverse transform predictions
|
| 287 |
+
predictions = scaler.inverse_transform(np.array(predictions).reshape(-1, 1)).flatten()
|
| 288 |
+
|
| 289 |
+
# Create forecast dataframe
|
| 290 |
+
last_date = ts['ds'].max()
|
| 291 |
+
future_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=periods, freq='D')
|
| 292 |
+
|
| 293 |
+
return pd.DataFrame({
|
| 294 |
+
'ds': future_dates,
|
| 295 |
+
'yhat': predictions,
|
| 296 |
+
'yhat_lower': predictions * 0.8, # Simple confidence intervals
|
| 297 |
+
'yhat_upper': predictions * 1.2
|
| 298 |
+
})
|
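A minimal usage sketch for `forecast_lstm` as defined above. It assumes TensorFlow is installed (`TF_AVAILABLE`) and that `prepare_timeseries` yields the `ds`/`y` frame the function indexes; the hyperparameter values below are arbitrary examples of the keys read via `hyperparams.get(...)`.

```python
from scripts.forecast import prepare_timeseries, forecast_lstm

ts = prepare_timeseries(df, metric='count')   # df: outage events DataFrame (assumed)
fcst = forecast_lstm(
    ts,
    periods=7,
    hyperparams={'seq_length': 14, 'epochs': 50, 'units': 64, 'dropout_rate': 0.3},
)
print(fcst[['ds', 'yhat']])   # note: falls back to forecast_naive when history is too short
```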
| 299 |
+
|
| 300 |
+
|
| 301 |
+
def forecast_bilstm(ts: pd.DataFrame, periods: int = 7, seq_length: int = 7, hyperparams: dict = None) -> pd.DataFrame:
|
| 302 |
+
"""Forecast using Bi-LSTM model"""
|
| 303 |
+
if not TF_AVAILABLE:
|
| 304 |
+
raise RuntimeError('TensorFlow not available')
|
| 305 |
+
|
| 306 |
+
# Set default hyperparameters
|
| 307 |
+
if hyperparams is None:
|
| 308 |
+
hyperparams = {}
|
| 309 |
+
seq_length = hyperparams.get('seq_length', seq_length) # Use hyperparams seq_length if provided
|
| 310 |
+
epochs = hyperparams.get('epochs', 50)
|
| 311 |
+
batch_size = hyperparams.get('batch_size', 16)
|
| 312 |
+
learning_rate = hyperparams.get('learning_rate', 0.001)
|
| 313 |
+
units = hyperparams.get('units', 50)
|
| 314 |
+
dropout_rate = hyperparams.get('dropout_rate', 0.2)
|
| 315 |
+
|
| 316 |
+
# Prepare data
|
| 317 |
+
data = ts['y'].values.reshape(-1, 1)
|
| 318 |
+
scaler = MinMaxScaler(feature_range=(0, 1))
|
| 319 |
+
scaled_data = scaler.fit_transform(data)
|
| 320 |
+
|
| 321 |
+
# Create sequences
|
| 322 |
+
X, y = create_sequences(scaled_data, seq_length)
|
| 323 |
+
|
| 324 |
+
if len(X) < 10: # Not enough data
|
| 325 |
+
return forecast_naive(ts, periods)
|
| 326 |
+
|
| 327 |
+
# Split data
|
| 328 |
+
train_size = int(len(X) * 0.8)
|
| 329 |
+
X_train, X_test = X[:train_size], X[train_size:]
|
| 330 |
+
y_train, y_test = y[:train_size], y[train_size:]
|
| 331 |
+
|
| 332 |
+
# Reshape for Bi-LSTM [samples, time steps, features]
|
| 333 |
+
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
|
| 334 |
+
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))
|
| 335 |
+
|
| 336 |
+
# Build Bi-LSTM model
|
| 337 |
+
model = Sequential([
|
| 338 |
+
Bidirectional(LSTM(units, activation='relu'), input_shape=(seq_length, 1)),
|
| 339 |
+
Dropout(dropout_rate),
|
| 340 |
+
Dense(1)
|
| 341 |
+
])
|
| 342 |
+
|
| 343 |
+
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
|
| 344 |
+
model.compile(optimizer=optimizer, loss='mse')
|
| 345 |
+
|
| 346 |
+
# Train model
|
| 347 |
+
model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0, validation_data=(X_test, y_test))
|
| 348 |
+
|
| 349 |
+
# Make predictions
|
| 350 |
+
predictions = []
|
| 351 |
+
current_sequence = scaled_data[-seq_length:].reshape(1, seq_length, 1)
|
| 352 |
+
|
| 353 |
+
for _ in range(periods):
|
| 354 |
+
pred = model.predict(current_sequence, verbose=0)
|
| 355 |
+
predictions.append(pred[0][0])
|
| 356 |
+
# Update sequence for next prediction
|
| 357 |
+
current_sequence = np.roll(current_sequence, -1, axis=1)
|
| 358 |
+
current_sequence[0, -1, 0] = pred[0][0]
|
| 359 |
+
|
| 360 |
+
# Inverse transform predictions
|
| 361 |
+
predictions = scaler.inverse_transform(np.array(predictions).reshape(-1, 1)).flatten()
|
| 362 |
+
|
| 363 |
+
# Create forecast dataframe
|
| 364 |
+
last_date = ts['ds'].max()
|
| 365 |
+
future_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=periods, freq='D')
|
| 366 |
+
|
| 367 |
+
return pd.DataFrame({
|
| 368 |
+
'ds': future_dates,
|
| 369 |
+
'yhat': predictions,
|
| 370 |
+
'yhat_lower': predictions * 0.8,
|
| 371 |
+
'yhat_upper': predictions * 1.2
|
| 372 |
+
})
|
| 373 |
+
|
| 374 |
+
|
| 375 |
+
def forecast_gru(ts: pd.DataFrame, periods: int = 7, seq_length: int = 7, hyperparams: dict = None) -> pd.DataFrame:
|
| 376 |
+
"""Forecast using GRU model"""
|
| 377 |
+
if not TF_AVAILABLE:
|
| 378 |
+
raise RuntimeError('TensorFlow not available')
|
| 379 |
+
|
| 380 |
+
# Set default hyperparameters
|
| 381 |
+
if hyperparams is None:
|
| 382 |
+
hyperparams = {}
|
| 383 |
+
seq_length = hyperparams.get('seq_length', seq_length) # Use hyperparams seq_length if provided
|
| 384 |
+
epochs = hyperparams.get('epochs', 50)
|
| 385 |
+
batch_size = hyperparams.get('batch_size', 16)
|
| 386 |
+
learning_rate = hyperparams.get('learning_rate', 0.001)
|
| 387 |
+
units = hyperparams.get('units', 50)
|
| 388 |
+
dropout_rate = hyperparams.get('dropout_rate', 0.2)
|
| 389 |
+
|
| 390 |
+
# Prepare data
|
| 391 |
+
data = ts['y'].values.reshape(-1, 1)
|
| 392 |
+
scaler = MinMaxScaler(feature_range=(0, 1))
|
| 393 |
+
scaled_data = scaler.fit_transform(data)
|
| 394 |
+
|
| 395 |
+
# Create sequences
|
| 396 |
+
X, y = create_sequences(scaled_data, seq_length)
|
| 397 |
+
|
| 398 |
+
if len(X) < 10: # Not enough data
|
| 399 |
+
return forecast_naive(ts, periods)
|
| 400 |
+
|
| 401 |
+
# Split data
|
| 402 |
+
train_size = int(len(X) * 0.8)
|
| 403 |
+
X_train, X_test = X[:train_size], X[train_size:]
|
| 404 |
+
y_train, y_test = y[:train_size], y[train_size:]
|
| 405 |
+
|
| 406 |
+
# Reshape for GRU [samples, time steps, features]
|
| 407 |
+
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
|
| 408 |
+
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))
|
| 409 |
+
|
| 410 |
+
# Build GRU model
|
| 411 |
+
model = Sequential([
|
| 412 |
+
GRU(units, activation='relu', input_shape=(seq_length, 1)),
|
| 413 |
+
Dropout(dropout_rate),
|
| 414 |
+
Dense(1)
|
| 415 |
+
])
|
| 416 |
+
|
| 417 |
+
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
|
| 418 |
+
model.compile(optimizer=optimizer, loss='mse')
|
| 419 |
+
|
| 420 |
+
# Train model
|
| 421 |
+
model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0, validation_data=(X_test, y_test))
|
| 422 |
+
|
| 423 |
+
# Make predictions
|
| 424 |
+
predictions = []
|
| 425 |
+
current_sequence = scaled_data[-seq_length:].reshape(1, seq_length, 1)
|
| 426 |
+
|
| 427 |
+
for _ in range(periods):
|
| 428 |
+
pred = model.predict(current_sequence, verbose=0)
|
| 429 |
+
predictions.append(pred[0][0])
|
| 430 |
+
# Update sequence for next prediction
|
| 431 |
+
current_sequence = np.roll(current_sequence, -1, axis=1)
|
| 432 |
+
current_sequence[0, -1, 0] = pred[0][0]
|
| 433 |
+
|
| 434 |
+
# Inverse transform predictions
|
| 435 |
+
predictions = scaler.inverse_transform(np.array(predictions).reshape(-1, 1)).flatten()
|
| 436 |
+
|
| 437 |
+
# Create forecast dataframe
|
| 438 |
+
last_date = ts['ds'].max()
|
| 439 |
+
future_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=periods, freq='D')
|
| 440 |
+
|
| 441 |
+
return pd.DataFrame({
|
| 442 |
+
'ds': future_dates,
|
| 443 |
+
'yhat': predictions,
|
| 444 |
+
'yhat_lower': predictions * 0.8,
|
| 445 |
+
'yhat_upper': predictions * 1.2
|
| 446 |
+
})
|
| 447 |
+
"""Forecast using multivariate LSTM model"""
|
| 448 |
+
if not TF_AVAILABLE:
|
| 449 |
+
raise RuntimeError('TensorFlow not available')
|
| 450 |
+
|
| 451 |
+
# Select features for multivariate forecasting
|
| 452 |
+
feature_cols = [col for col in ts.columns if col not in ['ds', 'OpDeviceType', 'Owner', 'Weather', 'EventType']]
|
| 453 |
+
if target_col not in feature_cols:
|
| 454 |
+
raise ValueError(f"Target column '{target_col}' not found in features")
|
| 455 |
+
|
| 456 |
+
# Prepare data
|
| 457 |
+
data = ts[feature_cols].values
|
| 458 |
+
scaler = MinMaxScaler(feature_range=(0, 1))
|
| 459 |
+
scaled_data = scaler.fit_transform(data)
|
| 460 |
+
|
| 461 |
+
# Create sequences
|
| 462 |
+
X, y = create_sequences(scaled_data, seq_length)
|
| 463 |
+
|
| 464 |
+
if len(X) < 10: # Not enough data
|
| 465 |
+
return forecast_naive(ts[['ds', target_col]].rename(columns={target_col: 'y'}), periods)
|
| 466 |
+
|
| 467 |
+
# Split data
|
| 468 |
+
train_size = int(len(X) * 0.8)
|
| 469 |
+
X_train, X_test = X[:train_size], X[train_size:]
|
| 470 |
+
y_train, y_test = y[:train_size], y[train_size:]
|
| 471 |
+
|
| 472 |
+
# Reshape for LSTM [samples, time steps, features]
|
| 473 |
+
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], len(feature_cols)))
|
| 474 |
+
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], len(feature_cols)))
|
| 475 |
+
|
| 476 |
+
# Build multivariate LSTM model
|
| 477 |
+
model = Sequential([
|
| 478 |
+
LSTM(100, activation='relu', return_sequences=True, input_shape=(seq_length, len(feature_cols))),
|
| 479 |
+
Dropout(0.2),
|
| 480 |
+
LSTM(50, activation='relu'),
|
| 481 |
+
Dropout(0.2),
|
| 482 |
+
Dense(1)
|
| 483 |
+
])
|
| 484 |
+
|
| 485 |
+
model.compile(optimizer='adam', loss='mse')
|
| 486 |
+
|
| 487 |
+
# Train model
|
| 488 |
+
model.fit(X_train, y_train, epochs=100, batch_size=16, verbose=0, validation_data=(X_test, y_test))
|
| 489 |
+
|
| 490 |
+
# Make predictions
|
| 491 |
+
predictions = []
|
| 492 |
+
current_sequence = scaled_data[-seq_length:].reshape(1, seq_length, len(feature_cols))
|
| 493 |
+
|
| 494 |
+
for _ in range(periods):
|
| 495 |
+
pred = model.predict(current_sequence, verbose=0)
|
| 496 |
+
predictions.append(pred[0][0])
|
| 497 |
+
|
| 498 |
+
# Update sequence for next prediction (use predicted value for target, keep other features)
|
| 499 |
+
new_row = current_sequence[0, -1, :].copy()
|
| 500 |
+
new_row[feature_cols.index(target_col)] = pred[0][0] # Update target with prediction
|
| 501 |
+
|
| 502 |
+
current_sequence = np.roll(current_sequence, -1, axis=1)
|
| 503 |
+
current_sequence[0, -1, :] = new_row
|
| 504 |
+
|
| 505 |
+
# Inverse transform predictions (only for target column)
|
| 506 |
+
target_scaler = MinMaxScaler(feature_range=(0, 1))
|
| 507 |
+
target_scaler.fit(data[:, feature_cols.index(target_col)].reshape(-1, 1))
|
| 508 |
+
predictions = target_scaler.inverse_transform(np.array(predictions).reshape(-1, 1)).flatten()
|
| 509 |
+
|
| 510 |
+
# Create forecast dataframe
|
| 511 |
+
last_date = ts['ds'].max()
|
| 512 |
+
future_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=periods, freq='D')
|
| 513 |
+
|
| 514 |
+
return pd.DataFrame({
|
| 515 |
+
'ds': future_dates,
|
| 516 |
+
'yhat': predictions,
|
| 517 |
+
'yhat_lower': predictions * 0.8,
|
| 518 |
+
'yhat_upper': predictions * 1.2
|
| 519 |
+
})
|
| 520 |
+
"""Forecast using LSTM model"""
|
| 521 |
+
if not TF_AVAILABLE:
|
| 522 |
+
raise RuntimeError('TensorFlow not available')
|
| 523 |
+
|
| 524 |
+
# Prepare data
|
| 525 |
+
data = ts['y'].values.reshape(-1, 1)
|
| 526 |
+
scaler = MinMaxScaler(feature_range=(0, 1))
|
| 527 |
+
scaled_data = scaler.fit_transform(data)
|
| 528 |
+
|
| 529 |
+
# Create sequences
|
| 530 |
+
X, y = create_sequences(scaled_data, seq_length)
|
| 531 |
+
|
| 532 |
+
if len(X) < 10: # Not enough data
|
| 533 |
+
return forecast_naive(ts, periods)
|
| 534 |
+
|
| 535 |
+
# Split data
|
| 536 |
+
train_size = int(len(X) * 0.8)
|
| 537 |
+
X_train, X_test = X[:train_size], X[train_size:]
|
| 538 |
+
y_train, y_test = y[:train_size], y[train_size:]
|
| 539 |
+
|
| 540 |
+
# Reshape for LSTM [samples, time steps, features]
|
| 541 |
+
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
|
| 542 |
+
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))
|
| 543 |
+
|
| 544 |
+
# Build LSTM model
|
| 545 |
+
model = Sequential([
|
| 546 |
+
LSTM(50, activation='relu', input_shape=(seq_length, 1)),
|
| 547 |
+
Dropout(0.2),
|
| 548 |
+
Dense(1)
|
| 549 |
+
])
|
| 550 |
+
|
| 551 |
+
model.compile(optimizer='adam', loss='mse')
|
| 552 |
+
|
| 553 |
+
# Train model
|
| 554 |
+
model.fit(X_train, y_train, epochs=50, batch_size=16, verbose=0, validation_data=(X_test, y_test))
|
| 555 |
+
|
| 556 |
+
# Make predictions
|
| 557 |
+
predictions = []
|
| 558 |
+
current_sequence = scaled_data[-seq_length:].reshape(1, seq_length, 1)
|
| 559 |
+
|
| 560 |
+
for _ in range(periods):
|
| 561 |
+
pred = model.predict(current_sequence, verbose=0)
|
| 562 |
+
predictions.append(pred[0][0])
|
| 563 |
+
# Update sequence for next prediction
|
| 564 |
+
current_sequence = np.roll(current_sequence, -1, axis=1)
|
| 565 |
+
current_sequence[0, -1, 0] = pred[0][0]
|
| 566 |
+
|
| 567 |
+
# Inverse transform predictions
|
| 568 |
+
predictions = scaler.inverse_transform(np.array(predictions).reshape(-1, 1)).flatten()
|
| 569 |
+
|
| 570 |
+
# Create forecast dataframe
|
| 571 |
+
last_date = ts['ds'].max()
|
| 572 |
+
future_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=periods, freq='D')
|
| 573 |
+
|
| 574 |
+
return pd.DataFrame({
|
| 575 |
+
'ds': future_dates,
|
| 576 |
+
'yhat': predictions,
|
| 577 |
+
'yhat_lower': predictions * 0.8, # Simple confidence intervals
|
| 578 |
+
'yhat_upper': predictions * 1.2
|
| 579 |
+
})
|
| 580 |
+
|
| 581 |
+
|
| 582 |
+
def forecast_bilstm_multivariate(ts: pd.DataFrame, periods: int = 7, seq_length: int = 7, target_col: str = 'daily_count', hyperparams: dict = None) -> pd.DataFrame:
|
| 583 |
+
"""Forecast using multivariate Bi-LSTM model"""
|
| 584 |
+
if not TF_AVAILABLE:
|
| 585 |
+
raise RuntimeError('TensorFlow not available')
|
| 586 |
+
|
| 587 |
+
# Set default hyperparameters
|
| 588 |
+
if hyperparams is None:
|
| 589 |
+
hyperparams = {}
|
| 590 |
+
epochs = hyperparams.get('epochs', 100)
|
| 591 |
+
batch_size = hyperparams.get('batch_size', 16)
|
| 592 |
+
learning_rate = hyperparams.get('learning_rate', 0.001)
|
| 593 |
+
units = hyperparams.get('units', 100)
|
| 594 |
+
dropout_rate = hyperparams.get('dropout_rate', 0.2)
|
| 595 |
+
|
| 596 |
+
# Select features for multivariate forecasting
|
| 597 |
+
feature_cols = [col for col in ts.columns if col not in ['ds', 'OpDeviceType', 'Owner', 'Weather', 'EventType']]
|
| 598 |
+
if target_col not in feature_cols:
|
| 599 |
+
raise ValueError(f"Target column '{target_col}' not found in features")
|
| 600 |
+
|
| 601 |
+
# Prepare data
|
| 602 |
+
data = ts[feature_cols].values
|
| 603 |
+
scaler = MinMaxScaler(feature_range=(0, 1))
|
| 604 |
+
scaled_data = scaler.fit_transform(data)
|
| 605 |
+
|
| 606 |
+
# Create sequences
|
| 607 |
+
X, y = create_sequences(scaled_data, seq_length)
|
| 608 |
+
|
| 609 |
+
if len(X) < 10: # Not enough data
|
| 610 |
+
return forecast_naive(ts[['ds', target_col]].rename(columns={target_col: 'y'}), periods)
|
| 611 |
+
|
| 612 |
+
# Split data
|
| 613 |
+
train_size = int(len(X) * 0.8)
|
| 614 |
+
X_train, X_test = X[:train_size], X[train_size:]
|
| 615 |
+
y_train, y_test = y[:train_size], y[train_size:]
|
| 616 |
+
|
| 617 |
+
# Reshape for Bi-LSTM [samples, time steps, features]
|
| 618 |
+
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], len(feature_cols)))
|
| 619 |
+
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], len(feature_cols)))
|
| 620 |
+
|
| 621 |
+
# Build multivariate Bi-LSTM model
|
| 622 |
+
model = Sequential([
|
| 623 |
+
Bidirectional(LSTM(units, activation='relu', return_sequences=True), input_shape=(seq_length, len(feature_cols))),
|
| 624 |
+
Dropout(dropout_rate),
|
| 625 |
+
Bidirectional(LSTM(units//2, activation='relu')),
|
| 626 |
+
Dropout(dropout_rate),
|
| 627 |
+
Dense(1)
|
| 628 |
+
])
|
| 629 |
+
|
| 630 |
+
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
|
| 631 |
+
model.compile(optimizer=optimizer, loss='mse')
|
| 632 |
+
|
| 633 |
+
# Train model
|
| 634 |
+
model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0, validation_data=(X_test, y_test))
|
| 635 |
+
|
| 636 |
+
# Make predictions
|
| 637 |
+
predictions = []
|
| 638 |
+
current_sequence = scaled_data[-seq_length:].reshape(1, seq_length, len(feature_cols))
|
| 639 |
+
|
| 640 |
+
for _ in range(periods):
|
| 641 |
+
pred = model.predict(current_sequence, verbose=0)
|
| 642 |
+
predictions.append(pred[0][0])
|
| 643 |
+
|
| 644 |
+
# Update sequence for next prediction
|
| 645 |
+
new_row = current_sequence[0, -1, :].copy()
|
| 646 |
+
new_row[feature_cols.index(target_col)] = pred[0][0]
|
| 647 |
+
|
| 648 |
+
current_sequence = np.roll(current_sequence, -1, axis=1)
|
| 649 |
+
current_sequence[0, -1, :] = new_row
|
| 650 |
+
|
| 651 |
+
# Inverse transform predictions
|
| 652 |
+
target_scaler = MinMaxScaler(feature_range=(0, 1))
|
| 653 |
+
target_scaler.fit(data[:, feature_cols.index(target_col)].reshape(-1, 1))
|
| 654 |
+
predictions = target_scaler.inverse_transform(np.array(predictions).reshape(-1, 1)).flatten()
|
| 655 |
+
|
| 656 |
+
# Create forecast dataframe
|
| 657 |
+
last_date = ts['ds'].max()
|
| 658 |
+
future_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=periods, freq='D')
|
| 659 |
+
|
| 660 |
+
return pd.DataFrame({
|
| 661 |
+
'ds': future_dates,
|
| 662 |
+
'yhat': predictions,
|
| 663 |
+
'yhat_lower': predictions * 0.8,
|
| 664 |
+
'yhat_upper': predictions * 1.2
|
| 665 |
+
})
|
| 666 |
+
"""Forecast using Bi-LSTM model"""
|
| 667 |
+
if not TF_AVAILABLE:
|
| 668 |
+
raise RuntimeError('TensorFlow not available')
|
| 669 |
+
|
| 670 |
+
# Prepare data
|
| 671 |
+
data = ts['y'].values.reshape(-1, 1)
|
| 672 |
+
scaler = MinMaxScaler(feature_range=(0, 1))
|
| 673 |
+
scaled_data = scaler.fit_transform(data)
|
| 674 |
+
|
| 675 |
+
# Create sequences
|
| 676 |
+
X, y = create_sequences(scaled_data, seq_length)
|
| 677 |
+
|
| 678 |
+
if len(X) < 10: # Not enough data
|
| 679 |
+
return forecast_naive(ts, periods)
|
| 680 |
+
|
| 681 |
+
# Split data
|
| 682 |
+
train_size = int(len(X) * 0.8)
|
| 683 |
+
X_train, X_test = X[:train_size], X[train_size:]
|
| 684 |
+
y_train, y_test = y[:train_size], y[train_size:]
|
| 685 |
+
|
| 686 |
+
# Reshape for Bi-LSTM [samples, time steps, features]
|
| 687 |
+
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
|
| 688 |
+
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))
|
| 689 |
+
|
| 690 |
+
# Build Bi-LSTM model
|
| 691 |
+
model = Sequential([
|
| 692 |
+
Bidirectional(LSTM(50, activation='relu'), input_shape=(seq_length, 1)),
|
| 693 |
+
Dropout(0.2),
|
| 694 |
+
Dense(1)
|
| 695 |
+
])
|
| 696 |
+
|
| 697 |
+
model.compile(optimizer='adam', loss='mse')
|
| 698 |
+
|
| 699 |
+
# Train model
|
| 700 |
+
model.fit(X_train, y_train, epochs=50, batch_size=16, verbose=0, validation_data=(X_test, y_test))
|
| 701 |
+
|
| 702 |
+
# Make predictions
|
| 703 |
+
predictions = []
|
| 704 |
+
current_sequence = scaled_data[-seq_length:].reshape(1, seq_length, 1)
|
| 705 |
+
|
| 706 |
+
for _ in range(periods):
|
| 707 |
+
pred = model.predict(current_sequence, verbose=0)
|
| 708 |
+
predictions.append(pred[0][0])
|
| 709 |
+
# Update sequence for next prediction
|
| 710 |
+
current_sequence = np.roll(current_sequence, -1, axis=1)
|
| 711 |
+
current_sequence[0, -1, 0] = pred[0][0]
|
| 712 |
+
|
| 713 |
+
# Inverse transform predictions
|
| 714 |
+
predictions = scaler.inverse_transform(np.array(predictions).reshape(-1, 1)).flatten()
|
| 715 |
+
|
| 716 |
+
# Create forecast dataframe
|
| 717 |
+
last_date = ts['ds'].max()
|
| 718 |
+
future_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=periods, freq='D')
|
| 719 |
+
|
| 720 |
+
return pd.DataFrame({
|
| 721 |
+
'ds': future_dates,
|
| 722 |
+
'yhat': predictions,
|
| 723 |
+
'yhat_lower': predictions * 0.8,
|
| 724 |
+
'yhat_upper': predictions * 1.2
|
| 725 |
+
})
|
| 726 |
+
|
| 727 |
+
|
| 728 |
+
def forecast_multivariate_lstm(ts: pd.DataFrame, target_col: str = 'target_count', periods: int = 7, seq_length: int = 7) -> pd.DataFrame:
    """Forecast using multivariate LSTM model"""
    if not TF_AVAILABLE:
        raise RuntimeError('TensorFlow not available')

    # Prepare data - exclude date column and target
    feature_cols = [col for col in ts.columns if col not in ['ds', target_col]]
    target_data = ts[target_col].values.reshape(-1, 1)

    # Handle categorical features - simple label encoding for demo
    ts_encoded = ts.copy()
    for col in feature_cols:
        if ts[col].dtype == 'object':
            # Simple label encoding
            unique_vals = ts[col].unique()
            val_to_int = {val: i for i, val in enumerate(unique_vals)}
            ts_encoded[col] = ts[col].map(val_to_int)

    feature_data = ts_encoded[feature_cols].values

    # Scale features and target separately
    feature_scaler = MinMaxScaler(feature_range=(0, 1))
    target_scaler = MinMaxScaler(feature_range=(0, 1))

    scaled_features = feature_scaler.fit_transform(feature_data)
    scaled_target = target_scaler.fit_transform(target_data)

    # Combine features and target for sequences
    combined_data = np.column_stack([scaled_features, scaled_target])

    # Create sequences
    X, y = create_sequences(combined_data, seq_length)

    if len(X) < 10:  # Not enough data
        # Fallback to univariate naive
        univariate_ts = ts[['ds', target_col]].rename(columns={target_col: 'y'})
        return forecast_naive(univariate_ts, periods)

    # Split data
    train_size = int(len(X) * 0.8)
    X_train, X_test = X[:train_size], X[train_size:]
    y_train, y_test = y[:train_size], y[train_size:]

    # X shape: [samples, time_steps, features]
    n_features = combined_data.shape[1]

    # Build multivariate LSTM model
    model = Sequential([
        LSTM(64, activation='relu', input_shape=(seq_length, n_features), return_sequences=True),
        Dropout(0.2),
        LSTM(32, activation='relu'),
        Dropout(0.2),
        Dense(16, activation='relu'),
        Dense(1)
    ])

    model.compile(optimizer='adam', loss='mse')

    # Train model
    model.fit(X_train, y_train, epochs=50, batch_size=16, verbose=0, validation_data=(X_test, y_test))

    # Make predictions
    predictions = []
    current_sequence = combined_data[-seq_length:].reshape(1, seq_length, n_features)

    for _ in range(periods):
        pred = model.predict(current_sequence, verbose=0)
        predictions.append(pred[0][0])

        # For next prediction, we need to estimate future features
        # For simplicity, use the last known feature values
        next_features = current_sequence[0, -1, :-1]  # All features except target
        next_sequence = np.column_stack([next_features, pred[0][0]])  # Add predicted target

        # Update sequence
        current_sequence = np.roll(current_sequence, -1, axis=1)
        current_sequence[0, -1, :] = next_sequence

    # Inverse transform predictions
    predictions = target_scaler.inverse_transform(np.array(predictions).reshape(-1, 1)).flatten()

    # Create forecast dataframe
    last_date = ts['ds'].max()
    future_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=periods, freq='D')

    return pd.DataFrame({
        'ds': future_dates,
        'yhat': predictions,
        'yhat_lower': predictions * 0.8,
        'yhat_upper': predictions * 1.2
    })

def forecast_multivariate_gru(ts: pd.DataFrame, target_col: str = 'target_count', periods: int = 7, seq_length: int = 7) -> pd.DataFrame:
    """Forecast using multivariate GRU model"""
    if not TF_AVAILABLE:
        raise RuntimeError('TensorFlow not available')

    # Similar to multivariate LSTM but using GRU layers
    feature_cols = [col for col in ts.columns if col not in ['ds', target_col]]
    target_data = ts[target_col].values.reshape(-1, 1)

    # Handle categorical features
    ts_encoded = ts.copy()
    for col in feature_cols:
        if ts[col].dtype == 'object':
            unique_vals = ts[col].unique()
            val_to_int = {val: i for i, val in enumerate(unique_vals)}
            ts_encoded[col] = ts[col].map(val_to_int)

    feature_data = ts_encoded[feature_cols].values

    # Scale data
    feature_scaler = MinMaxScaler(feature_range=(0, 1))
    target_scaler = MinMaxScaler(feature_range=(0, 1))

    scaled_features = feature_scaler.fit_transform(feature_data)
    scaled_target = target_scaler.fit_transform(target_data)

    combined_data = np.column_stack([scaled_features, scaled_target])

    # Create sequences
    X, y = create_sequences(combined_data, seq_length)

    if len(X) < 10:
        univariate_ts = ts[['ds', target_col]].rename(columns={target_col: 'y'})
        return forecast_naive(univariate_ts, periods)

    # Split data
    train_size = int(len(X) * 0.8)
    X_train, X_test = X[:train_size], X[train_size:]
    y_train, y_test = y[:train_size], y[train_size:]

    n_features = combined_data.shape[1]

    # Build multivariate GRU model
    model = Sequential([
        GRU(64, activation='relu', input_shape=(seq_length, n_features), return_sequences=True),
        Dropout(0.2),
        GRU(32, activation='relu'),
        Dropout(0.2),
        Dense(16, activation='relu'),
        Dense(1)
    ])

    model.compile(optimizer='adam', loss='mse')

    # Train model
    model.fit(X_train, y_train, epochs=50, batch_size=16, verbose=0, validation_data=(X_test, y_test))

    # Make predictions (same logic as LSTM)
    predictions = []
    current_sequence = combined_data[-seq_length:].reshape(1, seq_length, n_features)

    for _ in range(periods):
        pred = model.predict(current_sequence, verbose=0)
        predictions.append(pred[0][0])

        next_features = current_sequence[0, -1, :-1]
        next_sequence = np.column_stack([next_features, pred[0][0]])

        current_sequence = np.roll(current_sequence, -1, axis=1)
        current_sequence[0, -1, :] = next_sequence

    # Inverse transform predictions
    predictions = target_scaler.inverse_transform(np.array(predictions).reshape(-1, 1)).flatten()

    # Create forecast dataframe
    last_date = ts['ds'].max()
    future_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=periods, freq='D')

    return pd.DataFrame({
        'ds': future_dates,
        'yhat': predictions,
        'yhat_lower': predictions * 0.8,
        'yhat_upper': predictions * 1.2
    })

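Both multivariate forecasters above produce a multi-step forecast recursively: they roll the input window one step to the left for each predicted day and overwrite the last row. A small standalone numpy example of that update (values are made up):

```
import numpy as np

# Window of 3 time steps x 2 features (toy values)
window = np.array([[[1.0, 10.0],
                    [2.0, 20.0],
                    [3.0, 30.0]]])          # shape (1, 3, 2)

# Shift every row one step back along the time axis ...
window = np.roll(window, -1, axis=1)
# ... then overwrite the now-stale last row with the newest values
window[0, -1, :] = [4.0, 40.0]

print(window[0])
# [[ 2. 20.]
#  [ 3. 30.]
#  [ 4. 40.]]
```
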
def run_forecast(df: pd.DataFrame, metric: str = 'count', periods: int = 7, model_type: str = 'prophet', multivariate: bool = False, target_col: str = 'daily_count', hyperparams: dict = None):
    """
    Run forecasting with specified model type.

    Args:
        df: Input dataframe
        metric: 'count' or 'downtime_minutes' (for univariate)
        periods: Number of periods to forecast
        model_type: 'prophet', 'lstm', 'bilstm', 'gru', or 'naive'
        multivariate: Whether to use multivariate forecasting
        target_col: Target column for multivariate forecasting ('daily_count' or 'total_downtime_min')
        hyperparams: Dictionary of hyperparameters for the model
    """
    if multivariate:
        ts = prepare_multivariate_timeseries(df, target_metric=metric)
        # Map metric to target column
        if metric == 'count':
            target_col = 'daily_count'
        elif metric == 'downtime_minutes':
            target_col = 'total_downtime_min'
        else:
            target_col = 'daily_count'

        if model_type == 'lstm':
            if TF_AVAILABLE and len(ts) >= 14:
                try:
                    fcst = forecast_lstm_multivariate(ts, periods=periods, target_col=target_col, hyperparams=hyperparams)
                    return ts, fcst
                except Exception as e:
                    warnings.warn(f'Multivariate LSTM failed: {e}, falling back to univariate')
            # Fallback to univariate
            univariate_ts = prepare_timeseries(df, metric=metric)
            fcst = forecast_naive(univariate_ts, periods=periods)
            return univariate_ts, fcst

        elif model_type == 'bilstm':
            if TF_AVAILABLE and len(ts) >= 14:
                try:
                    fcst = forecast_bilstm_multivariate(ts, periods=periods, target_col=target_col, hyperparams=hyperparams)
                    return ts, fcst
                except Exception as e:
                    warnings.warn(f'Multivariate Bi-LSTM failed: {e}, falling back to univariate')
            # Fallback to univariate
            univariate_ts = prepare_timeseries(df, metric=metric)
            fcst = forecast_naive(univariate_ts, periods=periods)
            return univariate_ts, fcst

        elif model_type == 'gru':
            if TF_AVAILABLE and len(ts) >= 14:
                try:
                    fcst = forecast_gru_multivariate(ts, periods=periods, target_col=target_col, hyperparams=hyperparams)
                    return ts, fcst
                except Exception as e:
                    warnings.warn(f'Multivariate GRU failed: {e}, falling back to univariate')
            # Fallback to univariate
            univariate_ts = prepare_timeseries(df, metric=metric)
            fcst = forecast_naive(univariate_ts, periods=periods)
            return univariate_ts, fcst

        else:
            # For prophet and other models, fall back to univariate
            if multivariate:
                warnings.warn(f'Model {model_type} does not support multivariate forecasting. Using univariate {model_type} instead.')
            univariate_ts = prepare_timeseries(df, metric=metric)
            if model_type == 'prophet':
                if PROPHET_AVAILABLE and len(univariate_ts) >= 14:
                    try:
                        fcst = forecast_prophet(univariate_ts, periods=periods, hyperparams=hyperparams)
                        return univariate_ts, fcst
                    except Exception:
                        warnings.warn('Prophet failed, falling back to naive')
                        fcst = forecast_naive(univariate_ts, periods=periods)
                else:
                    fcst = forecast_naive(univariate_ts, periods=periods)
            return univariate_ts, fcst

    else:
        # Use univariate approach (original logic)
        ts = prepare_timeseries(df, metric=metric)

        if model_type == 'prophet':
            if PROPHET_AVAILABLE and len(ts) >= 14:
                try:
                    fcst = forecast_prophet(ts, periods=periods, hyperparams=hyperparams)
                    return ts, fcst
                except Exception:
                    warnings.warn('Prophet failed, falling back to naive')
            fcst = forecast_naive(ts, periods=periods)

        elif model_type == 'lstm':
            if TF_AVAILABLE and len(ts) >= 14:
                try:
                    fcst = forecast_lstm(ts, periods=periods, hyperparams=hyperparams)
                    return ts, fcst
                except Exception as e:
                    warnings.warn(f'LSTM failed: {e}, falling back to naive')
            fcst = forecast_naive(ts, periods=periods)

        elif model_type == 'bilstm':
            if TF_AVAILABLE and len(ts) >= 14:
                try:
                    fcst = forecast_bilstm(ts, periods=periods, hyperparams=hyperparams)
                    return ts, fcst
                except Exception as e:
                    warnings.warn(f'Bi-LSTM failed: {e}, falling back to naive')
            fcst = forecast_naive(ts, periods=periods)

        elif model_type == 'gru':
            if TF_AVAILABLE and len(ts) >= 14:
                try:
                    fcst = forecast_gru(ts, periods=periods, hyperparams=hyperparams)
                    return ts, fcst
                except Exception as e:
                    warnings.warn(f'GRU failed: {e}, falling back to naive')
            fcst = forecast_naive(ts, periods=periods)

        else:  # naive or unknown model_type
            fcst = forecast_naive(ts, periods=periods)

        return ts, fcst

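run_forecast is the single entry point the Gradio app is expected to call. A minimal usage sketch (the CSV path and its columns are illustrative; prepare_timeseries / prepare_multivariate_timeseries derive the daily series from the uploaded OMS export):

```
import pandas as pd
# from scripts.forecast import run_forecast  # when used outside this module

df = pd.read_csv('data/oms_events.csv')  # hypothetical uploaded export

# Univariate daily event counts with Prophet (falls back to naive if needed)
ts, fcst = run_forecast(df, metric='count', periods=7, model_type='prophet')

# Multivariate LSTM on daily downtime, with a few tuned hyperparameters
ts_m, fcst_m = run_forecast(
    df,
    metric='downtime_minutes',
    periods=14,
    model_type='lstm',
    multivariate=True,
    hyperparams={'epochs': 50, 'units': 64},
)

print(fcst[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].head())
```
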
def forecast_gru_multivariate(ts: pd.DataFrame, periods: int = 7, seq_length: int = 7, target_col: str = 'daily_count', hyperparams: dict = None) -> pd.DataFrame:
    """Forecast using multivariate GRU model"""
    if not TF_AVAILABLE:
        raise RuntimeError('TensorFlow not available')

    # Set default hyperparameters
    if hyperparams is None:
        hyperparams = {}
    epochs = hyperparams.get('epochs', 100)
    batch_size = hyperparams.get('batch_size', 16)
    learning_rate = hyperparams.get('learning_rate', 0.001)
    units = hyperparams.get('units', 100)
    dropout_rate = hyperparams.get('dropout_rate', 0.2)

    # Select features for multivariate forecasting
    feature_cols = [col for col in ts.columns if col not in ['ds', 'OpDeviceType', 'Owner', 'Weather', 'EventType']]
    if target_col not in feature_cols:
        raise ValueError(f"Target column '{target_col}' not found in features")

    # Prepare data
    data = ts[feature_cols].values
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_data = scaler.fit_transform(data)

    # Create sequences
    X, y = create_sequences(scaled_data, seq_length)

    if len(X) < 10:  # Not enough data
        return forecast_naive(ts[['ds', target_col]].rename(columns={target_col: 'y'}), periods)

    # Split data
    train_size = int(len(X) * 0.8)
    X_train, X_test = X[:train_size], X[train_size:]
    y_train, y_test = y[:train_size], y[train_size:]

    # Reshape for GRU [samples, time steps, features]
    X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], len(feature_cols)))
    X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], len(feature_cols)))

    # Build multivariate GRU model
    model = Sequential([
        GRU(units, activation='relu', return_sequences=True, input_shape=(seq_length, len(feature_cols))),
        Dropout(dropout_rate),
        GRU(units//2, activation='relu'),
        Dropout(dropout_rate),
        Dense(1)
    ])

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='mse')

    # Train model
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0, validation_data=(X_test, y_test))

    # Make predictions
    predictions = []
    current_sequence = scaled_data[-seq_length:].reshape(1, seq_length, len(feature_cols))

    for _ in range(periods):
        pred = model.predict(current_sequence, verbose=0)
        predictions.append(pred[0][0])

        # Update sequence for next prediction
        new_row = current_sequence[0, -1, :].copy()
        new_row[feature_cols.index(target_col)] = pred[0][0]

        current_sequence = np.roll(current_sequence, -1, axis=1)
        current_sequence[0, -1, :] = new_row

    # Inverse transform predictions
    target_scaler = MinMaxScaler(feature_range=(0, 1))
    target_scaler.fit(data[:, feature_cols.index(target_col)].reshape(-1, 1))
    predictions = target_scaler.inverse_transform(np.array(predictions).reshape(-1, 1)).flatten()

    # Create forecast dataframe
    last_date = ts['ds'].max()
    future_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=periods, freq='D')

    return pd.DataFrame({
        'ds': future_dates,
        'yhat': predictions,
        'yhat_lower': predictions * 0.8,
        'yhat_upper': predictions * 1.2
    })

"""Forecast using GRU model (univariate)"""
|
| 1114 |
+
if not TF_AVAILABLE:
|
| 1115 |
+
raise RuntimeError('TensorFlow not available')
|
| 1116 |
+
|
| 1117 |
+
# Prepare data
|
| 1118 |
+
data = ts['y'].values.reshape(-1, 1)
|
| 1119 |
+
scaler = MinMaxScaler(feature_range=(0, 1))
|
| 1120 |
+
scaled_data = scaler.fit_transform(data)
|
| 1121 |
+
|
| 1122 |
+
# Create sequences
|
| 1123 |
+
X, y = create_sequences(scaled_data, seq_length)
|
| 1124 |
+
|
| 1125 |
+
if len(X) < 10: # Not enough data
|
| 1126 |
+
return forecast_naive(ts, periods)
|
| 1127 |
+
|
| 1128 |
+
# Split data
|
| 1129 |
+
train_size = int(len(X) * 0.8)
|
| 1130 |
+
X_train, X_test = X[:train_size], X[train_size:]
|
| 1131 |
+
y_train, y_test = y[:train_size], y[train_size:]
|
| 1132 |
+
|
| 1133 |
+
# Reshape for GRU [samples, time steps, features]
|
| 1134 |
+
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
|
| 1135 |
+
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))
|
| 1136 |
+
|
| 1137 |
+
# Build GRU model
|
| 1138 |
+
model = Sequential([
|
| 1139 |
+
GRU(50, activation='relu', input_shape=(seq_length, 1)),
|
| 1140 |
+
Dropout(0.2),
|
| 1141 |
+
Dense(1)
|
| 1142 |
+
])
|
| 1143 |
+
|
| 1144 |
+
model.compile(optimizer='adam', loss='mse')
|
| 1145 |
+
|
| 1146 |
+
# Train model
|
| 1147 |
+
model.fit(X_train, y_train, epochs=50, batch_size=16, verbose=0, validation_data=(X_test, y_test))
|
| 1148 |
+
|
| 1149 |
+
# Make predictions
|
| 1150 |
+
predictions = []
|
| 1151 |
+
current_sequence = scaled_data[-seq_length:].reshape(1, seq_length, 1)
|
| 1152 |
+
|
| 1153 |
+
for _ in range(periods):
|
| 1154 |
+
pred = model.predict(current_sequence, verbose=0)
|
| 1155 |
+
predictions.append(pred[0][0])
|
| 1156 |
+
# Update sequence for next prediction
|
| 1157 |
+
current_sequence = np.roll(current_sequence, -1, axis=1)
|
| 1158 |
+
current_sequence[0, -1, 0] = pred[0][0]
|
| 1159 |
+
|
| 1160 |
+
# Inverse transform predictions
|
| 1161 |
+
predictions = scaler.inverse_transform(np.array(predictions).reshape(-1, 1)).flatten()
|
| 1162 |
+
|
| 1163 |
+
# Create forecast dataframe
|
| 1164 |
+
last_date = ts['ds'].max()
|
| 1165 |
+
future_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=periods, freq='D')
|
| 1166 |
+
|
| 1167 |
+
return pd.DataFrame({
|
| 1168 |
+
'ds': future_dates,
|
| 1169 |
+
'yhat': predictions,
|
| 1170 |
+
'yhat_lower': predictions * 0.8,
|
| 1171 |
+
'yhat_upper': predictions * 1.2
|
| 1172 |
+
})
|
| 1173 |
+
|
| 1174 |
+
|
| 1175 |
+
def forecast_lstm_multivariate(ts: pd.DataFrame, periods: int = 7, seq_length: int = 7, target_col: str = 'daily_count', hyperparams: dict = None) -> pd.DataFrame:
    """Forecast using multivariate LSTM model"""
    if not TF_AVAILABLE:
        raise RuntimeError('TensorFlow not available')

    # Set default hyperparameters
    if hyperparams is None:
        hyperparams = {}
    seq_length = hyperparams.get('seq_length', seq_length)  # Use hyperparams seq_length if provided
    epochs = hyperparams.get('epochs', 100)
    batch_size = hyperparams.get('batch_size', 16)
    learning_rate = hyperparams.get('learning_rate', 0.001)
    units = hyperparams.get('units', 100)
    dropout_rate = hyperparams.get('dropout_rate', 0.2)

    # Select features for multivariate forecasting
    feature_cols = [col for col in ts.columns if col not in ['ds', 'OpDeviceType', 'Owner', 'Weather', 'EventType']]
    if target_col not in feature_cols:
        raise ValueError(f"Target column '{target_col}' not found in features")

    # Prepare data
    data = ts[feature_cols].values
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_data = scaler.fit_transform(data)

    # Create sequences
    X, y = create_sequences(scaled_data, seq_length)

    if len(X) < 10:  # Not enough data
        return forecast_naive(ts[['ds', target_col]].rename(columns={target_col: 'y'}), periods)

    # Split data
    train_size = int(len(X) * 0.8)
    X_train, X_test = X[:train_size], X[train_size:]
    y_train, y_test = y[:train_size], y[train_size:]

    # Reshape for LSTM [samples, time steps, features]
    X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], len(feature_cols)))
    X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], len(feature_cols)))

    # Build multivariate LSTM model
    model = Sequential([
        LSTM(units, activation='relu', return_sequences=True, input_shape=(seq_length, len(feature_cols))),
        Dropout(dropout_rate),
        LSTM(units//2, activation='relu'),
        Dropout(dropout_rate),
        Dense(1)
    ])

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='mse')

    # Train model
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0, validation_data=(X_test, y_test))

    # Make predictions
    predictions = []
    current_sequence = scaled_data[-seq_length:].reshape(1, seq_length, len(feature_cols))

    for _ in range(periods):
        pred = model.predict(current_sequence, verbose=0)
        predictions.append(pred[0][0])

        # Update sequence for next prediction (use predicted value for target, keep other features)
        new_row = current_sequence[0, -1, :].copy()
        new_row[feature_cols.index(target_col)] = pred[0][0]  # Update target with prediction

        current_sequence = np.roll(current_sequence, -1, axis=1)
        current_sequence[0, -1, :] = new_row

    # Inverse transform predictions (only for target column)
    target_scaler = MinMaxScaler(feature_range=(0, 1))
    target_scaler.fit(data[:, feature_cols.index(target_col)].reshape(-1, 1))
    predictions = target_scaler.inverse_transform(np.array(predictions).reshape(-1, 1)).flatten()

    # Create forecast dataframe
    last_date = ts['ds'].max()
    future_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=periods, freq='D')

    return pd.DataFrame({
        'ds': future_dates,
        'yhat': predictions,
        'yhat_lower': predictions * 0.8,
        'yhat_upper': predictions * 1.2
    })

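forecast_lstm_multivariate (and its GRU/Bi-LSTM siblings) invert the scaling with a second MinMaxScaler fitted on the target column alone, because the network only predicts that one column. Since MinMaxScaler scales each column independently, this gives the same mapping as the joint scaler. A small self-contained check of that round trip (toy data):

```
import numpy as np
from sklearn.preprocessing import MinMaxScaler

data = np.array([[1.0, 100.0],
                 [2.0, 200.0],
                 [3.0, 400.0]])          # column 1 plays the role of the target

joint = MinMaxScaler().fit(data)                               # scales each column independently
target_only = MinMaxScaler().fit(data[:, 1].reshape(-1, 1))    # target column alone

# A model output of 0.5 in scaled space maps back to the same real value either way
print(joint.inverse_transform([[0.0, 0.5]])[0, 1])    # 250.0
print(target_only.inverse_transform([[0.5]])[0, 0])   # 250.0
```
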
def forecast_bilstm_multivariate(ts: pd.DataFrame, periods: int = 7, seq_length: int = 7, target_col: str = 'daily_count', hyperparams: dict = None) -> pd.DataFrame:
    """Forecast using multivariate Bi-LSTM model"""
    if not TF_AVAILABLE:
        raise RuntimeError('TensorFlow not available')

    # Set default hyperparameters
    if hyperparams is None:
        hyperparams = {}
    epochs = hyperparams.get('epochs', 100)
    batch_size = hyperparams.get('batch_size', 16)
    learning_rate = hyperparams.get('learning_rate', 0.001)
    units = hyperparams.get('units', 100)
    dropout_rate = hyperparams.get('dropout_rate', 0.2)

    # Select features for multivariate forecasting
    feature_cols = [col for col in ts.columns if col not in ['ds', 'OpDeviceType', 'Owner', 'Weather', 'EventType']]
    if target_col not in feature_cols:
        raise ValueError(f"Target column '{target_col}' not found in features")

    # Prepare data
    data = ts[feature_cols].values
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_data = scaler.fit_transform(data)

    # Create sequences
    X, y = create_sequences(scaled_data, seq_length)

    if len(X) < 10:  # Not enough data
        return forecast_naive(ts[['ds', target_col]].rename(columns={target_col: 'y'}), periods)

    # Split data
    train_size = int(len(X) * 0.8)
    X_train, X_test = X[:train_size], X[train_size:]
    y_train, y_test = y[:train_size], y[train_size:]

    # Reshape for Bi-LSTM [samples, time steps, features]
    X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], len(feature_cols)))
    X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], len(feature_cols)))

    # Build multivariate Bi-LSTM model
    model = Sequential([
        Bidirectional(LSTM(units, activation='relu', return_sequences=True), input_shape=(seq_length, len(feature_cols))),
        Dropout(dropout_rate),
        Bidirectional(LSTM(units//2, activation='relu')),
        Dropout(dropout_rate),
        Dense(1)
    ])

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='mse')

    # Train model
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0, validation_data=(X_test, y_test))

    # Make predictions
    predictions = []
    current_sequence = scaled_data[-seq_length:].reshape(1, seq_length, len(feature_cols))

    for _ in range(periods):
        pred = model.predict(current_sequence, verbose=0)
        predictions.append(pred[0][0])

        # Update sequence for next prediction
        new_row = current_sequence[0, -1, :].copy()
        new_row[feature_cols.index(target_col)] = pred[0][0]

        current_sequence = np.roll(current_sequence, -1, axis=1)
        current_sequence[0, -1, :] = new_row

    # Inverse transform predictions
    target_scaler = MinMaxScaler(feature_range=(0, 1))
    target_scaler.fit(data[:, feature_cols.index(target_col)].reshape(-1, 1))
    predictions = target_scaler.inverse_transform(np.array(predictions).reshape(-1, 1)).flatten()

    # Create forecast dataframe
    last_date = ts['ds'].max()
    future_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=periods, freq='D')

    return pd.DataFrame({
        'ds': future_dates,
        'yhat': predictions,
        'yhat_lower': predictions * 0.8,
        'yhat_upper': predictions * 1.2
    })

# NOTE: duplicate definition - this re-declares forecast_gru_multivariate with the same body
# and therefore overrides the version defined earlier in this file.
def forecast_gru_multivariate(ts: pd.DataFrame, periods: int = 7, seq_length: int = 7, target_col: str = 'daily_count', hyperparams: dict = None) -> pd.DataFrame:
    """Forecast using multivariate GRU model"""
    if not TF_AVAILABLE:
        raise RuntimeError('TensorFlow not available')

    # Set default hyperparameters
    if hyperparams is None:
        hyperparams = {}
    epochs = hyperparams.get('epochs', 100)
    batch_size = hyperparams.get('batch_size', 16)
    learning_rate = hyperparams.get('learning_rate', 0.001)
    units = hyperparams.get('units', 100)
    dropout_rate = hyperparams.get('dropout_rate', 0.2)

    # Select features for multivariate forecasting
    feature_cols = [col for col in ts.columns if col not in ['ds', 'OpDeviceType', 'Owner', 'Weather', 'EventType']]
    if target_col not in feature_cols:
        raise ValueError(f"Target column '{target_col}' not found in features")

    # Prepare data
    data = ts[feature_cols].values
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_data = scaler.fit_transform(data)

    # Create sequences
    X, y = create_sequences(scaled_data, seq_length)

    if len(X) < 10:  # Not enough data
        return forecast_naive(ts[['ds', target_col]].rename(columns={target_col: 'y'}), periods)

    # Split data
    train_size = int(len(X) * 0.8)
    X_train, X_test = X[:train_size], X[train_size:]
    y_train, y_test = y[:train_size], y[train_size:]

    # Reshape for GRU [samples, time steps, features]
    X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], len(feature_cols)))
    X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], len(feature_cols)))

    # Build multivariate GRU model
    model = Sequential([
        GRU(units, activation='relu', return_sequences=True, input_shape=(seq_length, len(feature_cols))),
        Dropout(dropout_rate),
        GRU(units//2, activation='relu'),
        Dropout(dropout_rate),
        Dense(1)
    ])

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='mse')

    # Train model
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0, validation_data=(X_test, y_test))

    # Make predictions
    predictions = []
    current_sequence = scaled_data[-seq_length:].reshape(1, seq_length, len(feature_cols))

    for _ in range(periods):
        pred = model.predict(current_sequence, verbose=0)
        predictions.append(pred[0][0])

        # Update sequence for next prediction
        new_row = current_sequence[0, -1, :].copy()
        new_row[feature_cols.index(target_col)] = pred[0][0]

        current_sequence = np.roll(current_sequence, -1, axis=1)
        current_sequence[0, -1, :] = new_row

    # Inverse transform predictions
    target_scaler = MinMaxScaler(feature_range=(0, 1))
    target_scaler.fit(data[:, feature_cols.index(target_col)].reshape(-1, 1))
    predictions = target_scaler.inverse_transform(np.array(predictions).reshape(-1, 1)).flatten()

    # Create forecast dataframe
    last_date = ts['ds'].max()
    future_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=periods, freq='D')

    return pd.DataFrame({
        'ds': future_dates,
        'yhat': predictions,
        'yhat_lower': predictions * 0.8,
        'yhat_upper': predictions * 1.2
    })
scripts/{summarize.py → recommendation.py}
RENAMED
File without changes
scripts/summary.py
ADDED
@@ -0,0 +1,159 @@
import os
import pandas as pd
from typing import Dict
from pathlib import Path

# Prefer HF router via OpenAI-compatible client. Use env `HF_TOKEN`.
# HF_TOKEN loaded lazily to allow dotenv loading after import
def get_hf_token():
    return os.environ.get('HF_TOKEN')

def openai_summary(text: str, verbosity: str = 'brief', model: str = 'meta-llama/Llama-3.1-8B-Instruct:novita') -> str:
    HF_TOKEN = get_hf_token()
    if not HF_TOKEN:
        return None
    try:
        # Import here to avoid requiring OpenAI client unless HF_TOKEN set
        from openai import OpenAI
        client = OpenAI(base_url="https://router.huggingface.co/v1", api_key=HF_TOKEN)
        if verbosity == 'analyze':
            instruction = 'วิเคราะห์สาเหตุไฟฟ้าจากข้อมูลนี้ สรุปไม่เกิน 3-4 บรรทัด (ไทย) ระบุสาเหตุทางเทคนิค ผลกระทบต่อลูกค้าและระบบ และช่วงเวลา:'
        elif verbosity == 'recommend':
            instruction = 'วิเคราะห์สาเหตุไฟฟ้าจากข้อมูลนี้ พร้อมแนะนำการแก้ไข สรุปไม่เกิน 3-4 บรรทัด (ไทย) ระบุสาเหตุทางเทคนิค ผลกระทบต่อลูกค้าและระบบ ช่วงเวลาและข้อเสนอแนะในการป้องกัน:'
        prompt = f"{instruction}\n\n{text}\n\nสรุป:"
        completion = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=1000,
        )
        # Extract text from response
        choice = completion.choices[0]
        msg = choice.message
        content = msg.content
        return content.strip() if content else None
    except Exception:
        return None

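openai_summary goes through the Hugging Face router with the OpenAI-compatible client, so it only needs HF_TOKEN in the environment; note that verbosity values other than 'analyze' and 'recommend' leave `instruction` unset, so the call ends up returning None via the except handler. A minimal sketch of calling it directly (the outage text is made up):

```
import os
# from scripts.summary import openai_summary  # when used outside this module

os.environ.setdefault('HF_TOKEN', '<your token>')  # normally loaded from .env

text = "12 outage events in June, mostly storm-related equipment failures on feeders F01-F03"
result = openai_summary(text, verbosity='analyze')
print(result or "no summary (HF_TOKEN missing or request failed)")
```
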
def summarize_overall(df: pd.DataFrame, use_hf: bool = False, model: str = 'meta-llama/Llama-3.1-8B-Instruct:novita', total_customers: float = None) -> Dict:
    """Summarize overall outage data with GenAI and reliability metrics."""
    # Basic statistics
    total_events = len(df)
    date_cols = ['OutageDateTime', 'FirstRestoDateTime', 'LastRestoDateTime', 'CreateEventDateTime', 'CloseEventDateTime']

    # Parse dates
    df_copy = df.copy()
    for col in date_cols:
        if col in df_copy.columns:
            df_copy[col] = pd.to_datetime(df_copy[col], dayfirst=True, errors='coerce')

    # Calculate basic metrics
    if 'OutageDateTime' in df_copy.columns:
        date_range = f"{df_copy['OutageDateTime'].min()} ถึง {df_copy['OutageDateTime'].max()}" if pd.notna(df_copy['OutageDateTime'].min()) else "ไม่ระบุ"
    else:
        date_range = "ไม่ระบุ"

    # Event types
    event_types = df_copy.get('EventType', pd.Series()).value_counts().head(5).to_dict()

    # Affected customers
    total_affected = 0
    if 'AffectedCustomer' in df_copy.columns:
        total_affected = pd.to_numeric(df_copy['AffectedCustomer'], errors='coerce').sum()

    # Create summary text for GenAI
    summary_text = f"""
ข้อมูลไฟฟ้าล้มทั้งหมด:
- จำนวนเหตุการณ์ทั้งหมด: {total_events}
- ช่วงเวลาที่เกิดเหตุการณ์: {date_range}
- ประเภทเหตุการณ์หลัก: {', '.join([f'{k}: {v}' for k, v in event_types.items()])}
- จำนวนลูกค้าที่ได้รับผลกระทบทั้งหมด: {int(total_affected) if not pd.isna(total_affected) else 'ไม่ระบุ'}
"""

    # Reliability metrics DataFrame
    reliability_df = pd.DataFrame()
    reliability_summary = ""

    if total_customers and total_customers > 0:
        try:
            from scripts.compute_reliability import compute_reliability
            import tempfile
            import os

            # Save df to temp CSV for compute_reliability
            with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f:
                df_copy.to_csv(f.name, index=False)
                temp_path = f.name

            try:
                reliability_results = compute_reliability(temp_path, total_customers=total_customers, exclude_planned=True)
                overall_metrics = reliability_results.get('overall', pd.DataFrame())
                if not overall_metrics.empty:
                    row = overall_metrics.iloc[0]

                    # Create reliability DataFrame with proper metric names
                    reliability_data = [
                        {
                            'Metric': 'SAIFI',
                            'Full Name': 'System Average Interruption Frequency Index',
                            'Value': f"{row.get('SAIFI', 'N/A'):.4f}",
                            'Unit': 'ครั้ง/ลูกค้า',
                            'Description': 'ความถี่เฉลี่ยของการขัดข้องต่อลูกค้า'
                        },
                        {
                            'Metric': 'SAIDI',
                            'Full Name': 'System Average Interruption Duration Index',
                            'Value': f"{row.get('SAIDI', 'N/A'):.2f}",
                            'Unit': 'นาที/ลูกค้า',
                            'Description': 'ระยะเวลาขัดข้องเฉลี่ยต่อลูกค้า'
                        },
                        {
                            'Metric': 'CAIDI',
                            'Full Name': 'Customer Average Interruption Duration Index',
                            'Value': f"{row.get('CAIDI', 'N/A'):.2f}",
                            'Unit': 'นาที/ครั้ง',
                            'Description': 'ระยะเวลาขัดข้องเฉลี่ยต่อครั้ง'
                        },
                        {
                            'Metric': 'MAIFI',
                            'Full Name': 'Momentary Average Interruption Frequency Index',
                            'Value': f"{row.get('MAIFI', 'N/A'):.4f}",
                            'Unit': 'ครั้ง/ลูกค้า',
                            'Description': 'ความถี่เฉลี่ยของการขัดข้องชั่วคราวต่อลูกค้า'
                        }
                    ]
                    reliability_df = pd.DataFrame(reliability_data)

                    reliability_summary = f"""
ดัชนีความน่าเชื่อถือ:
- SAIFI (System Average Interruption Frequency Index): {row.get('SAIFI', 'N/A'):.4f} ครั้ง/ลูกค้า
- SAIDI (System Average Interruption Duration Index): {row.get('SAIDI', 'N/A'):.2f} นาที/ลูกค้า
- CAIDI (Customer Average Interruption Duration Index): {row.get('CAIDI', 'N/A'):.2f} นาที/ครั้ง
- MAIFI (Momentary Average Interruption Frequency Index): {row.get('MAIFI', 'N/A'):.4f} ครั้ง/ลูกค้า
"""
                    summary_text += reliability_summary
            finally:
                os.unlink(temp_path)
        except Exception as e:
            reliability_summary = f"ไม่สามารถคำนวณดัชนีความน่าเชื่อถือได้: {str(e)}"

    # Use GenAI for overall summary
    ai_summary = None
    if use_hf and get_hf_token():
        try:
            instruction = "สรุปภาพรวมข้อมูลไฟฟ้าล้มจากข้อมูลนี้ สรุปเป็นย่อหน้าเดียว (ไทย) ระบุจำนวนเหตุการณ์ สาเหตุหลัก ผลกระทบ และข้อเสนอแนะในการปรับปรุงระบบไฟฟ้า:"
            prompt = f"{instruction}\n\n{summary_text}\n\nสรุปภาพรวม:"
            ai_summary = openai_summary(prompt, verbosity='recommend', model=model)
        except Exception as e:
            ai_summary = f"ไม่สามารถสร้างสรุปด้วย AI ได้: {str(e)}"

    return {
        'total_events': total_events,
        'date_range': date_range,
        'event_types': event_types,
        'total_affected_customers': int(total_affected) if not pd.isna(total_affected) else None,
        'basic_summary': summary_text.strip(),
        'reliability_summary': reliability_summary.strip() if reliability_summary else None,
        'reliability_df': reliability_df,
        'ai_summary': ai_summary,
    }
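
A minimal sketch of how summarize_overall is presumably wired up from app.py (the file name and customer count are illustrative):

```
import pandas as pd
# from scripts.summary import summarize_overall  # when used outside this module

df = pd.read_csv('data/oms_events.csv')  # hypothetical uploaded export

result = summarize_overall(df, use_hf=True, total_customers=250000)
print(result['basic_summary'])
print(result['reliability_df'])          # SAIFI / SAIDI / CAIDI / MAIFI table
print(result['ai_summary'] or 'AI summary unavailable (no HF_TOKEN)')
```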