Spaces:
Running
on
Zero
Running
on
Zero
Commit
·
4aa23ee
1
Parent(s):
ef08a8e
Deploy Vocal Articulation Assessment v2.0
Browse files- .env.example +10 -0
- .gitignore +74 -0
- Dockerfile +33 -0
- README.md +151 -6
- api/__init__.py +4 -0
- api/routes.py +346 -0
- app.py +319 -0
- app/__init__.py +4 -0
- app/interface.py +351 -0
- config/__init__.py +4 -0
- config/settings.py +30 -0
- core/__init__.py +9 -0
- core/constants.py +94 -0
- core/scoring_engine.py +638 -0
- requirements.txt +38 -0
- start.sh +26 -0
.env.example
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Model configuration
|
| 2 |
+
WHISPER_MODEL=openai/whisper-small
|
| 3 |
+
|
| 4 |
+
# Server configuration
|
| 5 |
+
HOST=0.0.0.0
|
| 6 |
+
PORT=7860
|
| 7 |
+
|
| 8 |
+
# Gradio configuration
|
| 9 |
+
GRADIO_SERVER_NAME=0.0.0.0
|
| 10 |
+
GRADIO_SERVER_PORT=7860
|
.gitignore
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# .gitignore untuk Vocal Articulation Project
|
| 2 |
+
|
| 3 |
+
# Python
|
| 4 |
+
__pycache__/
|
| 5 |
+
*.py[cod]
|
| 6 |
+
*$py.class
|
| 7 |
+
*.so
|
| 8 |
+
.Python
|
| 9 |
+
build/
|
| 10 |
+
develop-eggs/
|
| 11 |
+
dist/
|
| 12 |
+
downloads/
|
| 13 |
+
eggs/
|
| 14 |
+
.eggs/
|
| 15 |
+
lib/
|
| 16 |
+
lib64/
|
| 17 |
+
parts/
|
| 18 |
+
sdist/
|
| 19 |
+
var/
|
| 20 |
+
wheels/
|
| 21 |
+
*.egg-info/
|
| 22 |
+
.installed.cfg
|
| 23 |
+
*.egg
|
| 24 |
+
|
| 25 |
+
# Virtual Environment
|
| 26 |
+
venv/
|
| 27 |
+
env/
|
| 28 |
+
ENV/
|
| 29 |
+
.venv
|
| 30 |
+
|
| 31 |
+
# IDE
|
| 32 |
+
.vscode/
|
| 33 |
+
.idea/
|
| 34 |
+
*.swp
|
| 35 |
+
*.swo
|
| 36 |
+
*~
|
| 37 |
+
|
| 38 |
+
# Jupyter Notebook
|
| 39 |
+
.ipynb_checkpoints
|
| 40 |
+
|
| 41 |
+
# Model files (jika besar)
|
| 42 |
+
# model_vokal/*.bin
|
| 43 |
+
# model_vokal/*.safetensors
|
| 44 |
+
|
| 45 |
+
# Audio files
|
| 46 |
+
*.wav
|
| 47 |
+
*.mp3
|
| 48 |
+
*.m4a
|
| 49 |
+
*.flac
|
| 50 |
+
*.ogg
|
| 51 |
+
!examples/*.wav
|
| 52 |
+
|
| 53 |
+
# Temporary files
|
| 54 |
+
*.tmp
|
| 55 |
+
*.temp
|
| 56 |
+
tmp/
|
| 57 |
+
temp/
|
| 58 |
+
|
| 59 |
+
# Logs
|
| 60 |
+
*.log
|
| 61 |
+
logs/
|
| 62 |
+
|
| 63 |
+
# OS
|
| 64 |
+
.DS_Store
|
| 65 |
+
Thumbs.db
|
| 66 |
+
|
| 67 |
+
# Testing
|
| 68 |
+
.pytest_cache/
|
| 69 |
+
.coverage
|
| 70 |
+
htmlcov/
|
| 71 |
+
|
| 72 |
+
# Environment variables
|
| 73 |
+
.env
|
| 74 |
+
.env.local
|
Dockerfile
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# =======================================
|
| 2 |
+
# DOCKERFILE - For Space Docker SDK
|
| 3 |
+
# =======================================
|
| 4 |
+
|
| 5 |
+
FROM python:3.10-slim
|
| 6 |
+
|
| 7 |
+
WORKDIR /app
|
| 8 |
+
|
| 9 |
+
# Install system dependencies
|
| 10 |
+
RUN apt-get update && apt-get install -y \
|
| 11 |
+
libsndfile1 \
|
| 12 |
+
ffmpeg \
|
| 13 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 14 |
+
|
| 15 |
+
# Copy requirements
|
| 16 |
+
COPY requirements.txt .
|
| 17 |
+
|
| 18 |
+
# Install Python dependencies
|
| 19 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 20 |
+
|
| 21 |
+
# Copy application code
|
| 22 |
+
COPY . .
|
| 23 |
+
|
| 24 |
+
# Expose port
|
| 25 |
+
EXPOSE 7860
|
| 26 |
+
|
| 27 |
+
# Set environment variables
|
| 28 |
+
ENV PYTHONUNBUFFERED=1
|
| 29 |
+
ENV GRADIO_SERVER_NAME="0.0.0.0"
|
| 30 |
+
ENV GRADIO_SERVER_PORT=7860
|
| 31 |
+
|
| 32 |
+
# Run application
|
| 33 |
+
CMD ["python", "app.py"]
|
README.md
CHANGED
|
@@ -1,13 +1,158 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version:
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
license: mit
|
| 11 |
---
|
| 12 |
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Vocal Articulation Assessment
|
| 3 |
+
emoji: 🎤
|
| 4 |
+
colorFrom: purple
|
| 5 |
+
colorTo: pink
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 4.0.0
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
license: mit
|
| 11 |
---
|
| 12 |
|
| 13 |
+
# 🎤 Sistem Penilaian Vokal Indonesia
|
| 14 |
+
|
| 15 |
+
Sistem penilaian artikulasi vokal bahasa Indonesia menggunakan deep learning dan audio signal processing.
|
| 16 |
+
|
| 17 |
+
## 🌟 Fitur
|
| 18 |
+
|
| 19 |
+
### Multi-Metric Assessment
|
| 20 |
+
|
| 21 |
+
1. **Clarity Score (40%)**: Kejelasan pengucapan berdasarkan model confidence
|
| 22 |
+
2. **Energy Score (25%)**: Kualitas volume dan energi suara
|
| 23 |
+
3. **Duration Score (15%)**: Kesesuaian durasi pengucapan
|
| 24 |
+
4. **Pitch Score (20%)**: Stabilitas pitch/nada suara
|
| 25 |
+
|
| 26 |
+
### Vokal yang Didukung
|
| 27 |
+
|
| 28 |
+
- **A** - Vokal terbuka depan
|
| 29 |
+
- **I** - Vokal tertutup depan
|
| 30 |
+
- **U** - Vokal tertutup belakang
|
| 31 |
+
- **E** - Vokal tengah depan
|
| 32 |
+
- **O** - Vokal tengah belakang
|
| 33 |
+
|
| 34 |
+
## 🚀 Cara Menggunakan
|
| 35 |
+
|
| 36 |
+
### Di HuggingFace Spaces
|
| 37 |
+
|
| 38 |
+
1. Upload atau record audio Anda
|
| 39 |
+
2. Pilih target vokal (A, I, U, E, O)
|
| 40 |
+
3. (Optional) Set expected duration
|
| 41 |
+
4. Klik "Nilai Pengucapan"
|
| 42 |
+
5. Lihat hasil penilaian dengan grade dan feedback
|
| 43 |
+
|
| 44 |
+
### Local Development
|
| 45 |
+
|
| 46 |
+
```bash
|
| 47 |
+
# Install dependencies
|
| 48 |
+
pip install -r requirements.txt
|
| 49 |
+
|
| 50 |
+
# Run Gradio App
|
| 51 |
+
python app.py
|
| 52 |
+
|
| 53 |
+
# Or run FastAPI server
|
| 54 |
+
python api.py
|
| 55 |
+
```
|
| 56 |
+
|
| 57 |
+
## 📊 Sistem Penilaian
|
| 58 |
+
|
| 59 |
+
| Grade | Score Range | Keterangan |
|
| 60 |
+
| ----- | ----------- | -------------------------------------------------- |
|
| 61 |
+
| A | 90-100 | Sempurna - pengucapan sangat jelas dan akurat |
|
| 62 |
+
| B | 80-89 | Bagus - pengucapan cukup jelas dengan minor errors |
|
| 63 |
+
| C | 70-79 | Cukup - ada beberapa kesalahan |
|
| 64 |
+
| D | 60-69 | Kurang - banyak kesalahan |
|
| 65 |
+
| E | <60 | Perlu latihan lebih banyak |
|
| 66 |
+
|
| 67 |
+
## 🔧 Teknologi
|
| 68 |
+
|
| 69 |
+
- **Model**: HuBERT/Wav2Vec2 fine-tuned untuk klasifikasi vokal Indonesia
|
| 70 |
+
- **Backend**: FastAPI
|
| 71 |
+
- **Frontend**: Gradio
|
| 72 |
+
- **Audio Processing**: librosa, torchaudio
|
| 73 |
+
- **Deployment**: HuggingFace Spaces with ZeroGPU
|
| 74 |
+
|
| 75 |
+
## 📁 Struktur Project
|
| 76 |
+
|
| 77 |
+
```
|
| 78 |
+
.
|
| 79 |
+
├── app.py # Gradio interface (HF Spaces)
|
| 80 |
+
├── api.py # FastAPI server
|
| 81 |
+
├── scoring_system.py # Core scoring logic
|
| 82 |
+
├── latihan_dasar.py # Advanced articulation system
|
| 83 |
+
├── model_vokal/ # Model checkpoint
|
| 84 |
+
│ ├── config.json
|
| 85 |
+
│ ├── model.safetensors
|
| 86 |
+
│ └── preprocessor_config.json
|
| 87 |
+
├── requirements.txt # Dependencies
|
| 88 |
+
└── README.md # Documentation
|
| 89 |
+
```
|
| 90 |
+
|
| 91 |
+
## 🎯 Roadmap
|
| 92 |
+
|
| 93 |
+
### Level 1: Pengenalan Vokal ✅
|
| 94 |
+
|
| 95 |
+
- A, I, U, E, O (Current)
|
| 96 |
+
|
| 97 |
+
### Level 2-5: Expansi (Coming Soon)
|
| 98 |
+
|
| 99 |
+
- Level 2: Konsonan Dasar (BA, PA, DA, TA, dll)
|
| 100 |
+
- Level 3: Kombinasi Suku Kata (BA-BE-BI-BO-BU, dll)
|
| 101 |
+
- Level 4: Kata Sulit (PSIKOLOGI, STRATEGI, dll)
|
| 102 |
+
- Level 5: Kalimat Kompleks
|
| 103 |
+
|
| 104 |
+
## 📝 API Documentation
|
| 105 |
+
|
| 106 |
+
### FastAPI Endpoints
|
| 107 |
+
|
| 108 |
+
```bash
|
| 109 |
+
# Health check
|
| 110 |
+
GET /health
|
| 111 |
+
|
| 112 |
+
# Get supported labels
|
| 113 |
+
GET /labels
|
| 114 |
+
|
| 115 |
+
# Score single audio
|
| 116 |
+
POST /score
|
| 117 |
+
- audio: file (required)
|
| 118 |
+
- target_label: string (optional)
|
| 119 |
+
- expected_duration: float (optional)
|
| 120 |
+
|
| 121 |
+
# Batch scoring
|
| 122 |
+
POST /batch_score
|
| 123 |
+
- audios: files (required)
|
| 124 |
+
- target_labels: string (optional, comma-separated)
|
| 125 |
+
```
|
| 126 |
+
|
| 127 |
+
### Example cURL
|
| 128 |
+
|
| 129 |
+
```bash
|
| 130 |
+
curl -X POST "http://localhost:8000/score" \
|
| 131 |
+
-F "audio=@test.wav" \
|
| 132 |
+
-F "target_label=a" \
|
| 133 |
+
-F "expected_duration=0.8"
|
| 134 |
+
```
|
| 135 |
+
|
| 136 |
+
## 🤝 Contributing
|
| 137 |
+
|
| 138 |
+
Contributions are welcome! Terutama untuk:
|
| 139 |
+
|
| 140 |
+
- Menambah dataset vokal
|
| 141 |
+
- Implementasi Level 2-5
|
| 142 |
+
- Optimasi model
|
| 143 |
+
- UI/UX improvements
|
| 144 |
+
|
| 145 |
+
## 📄 License
|
| 146 |
+
|
| 147 |
+
MIT License
|
| 148 |
+
|
| 149 |
+
## 👥 Author
|
| 150 |
+
|
| 151 |
+
Dibuat untuk Latihan Dasar Artikulasi Vokal Indonesia
|
| 152 |
+
|
| 153 |
+
## 🙏 Acknowledgments
|
| 154 |
+
|
| 155 |
+
- Model base: HuBERT/Wav2Vec2
|
| 156 |
+
- Audio processing: librosa
|
| 157 |
+
- Framework: FastAPI & Gradio
|
| 158 |
+
- Deployment: HuggingFace Spaces
|
api/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# API module
|
| 2 |
+
from .routes import app
|
| 3 |
+
|
| 4 |
+
__all__ = ['app']
|
api/routes.py
ADDED
|
@@ -0,0 +1,346 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# =======================================
|
| 2 |
+
# FASTAPI BACKEND - VOCAL ARTICULATION API V2
|
| 3 |
+
# Updated untuk Whisper ASR + Multi-Level Support
|
| 4 |
+
# =======================================
|
| 5 |
+
|
| 6 |
+
from fastapi import FastAPI, File, UploadFile, Form, HTTPException
|
| 7 |
+
from fastapi.responses import JSONResponse
|
| 8 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 9 |
+
from pydantic import BaseModel
|
| 10 |
+
from typing import Optional, List
|
| 11 |
+
import tempfile
|
| 12 |
+
import os
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
|
| 15 |
+
from core.scoring_engine import AdvancedVocalScoringSystem, ScoreResult
|
| 16 |
+
from core.constants import ARTICULATION_LEVELS
|
| 17 |
+
|
| 18 |
+
# =======================================
|
| 19 |
+
# FASTAPI APP INITIALIZATION
|
| 20 |
+
# =======================================
|
| 21 |
+
|
| 22 |
+
app = FastAPI(
|
| 23 |
+
title="Vocal Articulation Assessment API v2",
|
| 24 |
+
description="API untuk penilaian artikulasi vokal Indonesia - Multi-level dengan Whisper ASR",
|
| 25 |
+
version="2.0.0"
|
| 26 |
+
)
|
| 27 |
+
|
| 28 |
+
# CORS middleware
|
| 29 |
+
app.add_middleware(
|
| 30 |
+
CORSMiddleware,
|
| 31 |
+
allow_origins=["*"],
|
| 32 |
+
allow_credentials=True,
|
| 33 |
+
allow_methods=["*"],
|
| 34 |
+
allow_headers=["*"],
|
| 35 |
+
)
|
| 36 |
+
|
| 37 |
+
# =======================================
|
| 38 |
+
# PYDANTIC MODELS
|
| 39 |
+
# =======================================
|
| 40 |
+
|
| 41 |
+
class ScoreResponse(BaseModel):
|
| 42 |
+
"""Response model untuk scoring"""
|
| 43 |
+
success: bool
|
| 44 |
+
overall_score: float
|
| 45 |
+
grade: str
|
| 46 |
+
|
| 47 |
+
# Component scores
|
| 48 |
+
clarity_score: float
|
| 49 |
+
energy_score: float
|
| 50 |
+
speech_rate_score: float
|
| 51 |
+
pitch_consistency_score: float
|
| 52 |
+
snr_score: float
|
| 53 |
+
articulation_score: float
|
| 54 |
+
|
| 55 |
+
# ASR results
|
| 56 |
+
transcription: str
|
| 57 |
+
target: str
|
| 58 |
+
similarity: float
|
| 59 |
+
wer: float
|
| 60 |
+
|
| 61 |
+
# Feedback
|
| 62 |
+
feedback: str
|
| 63 |
+
suggestions: List[str]
|
| 64 |
+
|
| 65 |
+
# Audio features
|
| 66 |
+
audio_features: dict
|
| 67 |
+
level: int
|
| 68 |
+
|
| 69 |
+
class HealthResponse(BaseModel):
|
| 70 |
+
"""Response untuk health check"""
|
| 71 |
+
status: str
|
| 72 |
+
model_loaded: bool
|
| 73 |
+
device: str
|
| 74 |
+
whisper_model: str
|
| 75 |
+
|
| 76 |
+
class LevelsResponse(BaseModel):
|
| 77 |
+
"""Response untuk supported levels"""
|
| 78 |
+
levels: dict
|
| 79 |
+
total_levels: int
|
| 80 |
+
|
| 81 |
+
# =======================================
|
| 82 |
+
# GLOBAL VARIABLES
|
| 83 |
+
# =======================================
|
| 84 |
+
|
| 85 |
+
scorer: Optional[AdvancedVocalScoringSystem] = None
|
| 86 |
+
|
| 87 |
+
# =======================================
|
| 88 |
+
# STARTUP & SHUTDOWN
|
| 89 |
+
# =======================================
|
| 90 |
+
|
| 91 |
+
@app.on_event("startup")
|
| 92 |
+
async def startup_event():
|
| 93 |
+
"""Load model saat startup"""
|
| 94 |
+
global scorer
|
| 95 |
+
|
| 96 |
+
print("🚀 Starting Vocal Articulation API v2...")
|
| 97 |
+
|
| 98 |
+
# Whisper model dari environment atau default
|
| 99 |
+
whisper_model = os.getenv("WHISPER_MODEL", "openai/whisper-small")
|
| 100 |
+
|
| 101 |
+
try:
|
| 102 |
+
scorer = AdvancedVocalScoringSystem(whisper_model=whisper_model)
|
| 103 |
+
print("✅ Whisper model loaded successfully!")
|
| 104 |
+
except Exception as e:
|
| 105 |
+
print(f"❌ Error loading model: {e}")
|
| 106 |
+
raise
|
| 107 |
+
|
| 108 |
+
@app.on_event("shutdown")
|
| 109 |
+
async def shutdown_event():
|
| 110 |
+
"""Cleanup saat shutdown"""
|
| 111 |
+
print("👋 Shutting down Vocal Articulation API v2...")
|
| 112 |
+
|
| 113 |
+
# =======================================
|
| 114 |
+
# API ENDPOINTS
|
| 115 |
+
# =======================================
|
| 116 |
+
|
| 117 |
+
@app.get("/", response_model=dict)
|
| 118 |
+
async def root():
|
| 119 |
+
"""Root endpoint"""
|
| 120 |
+
return {
|
| 121 |
+
"message": "Vocal Articulation Assessment API v2",
|
| 122 |
+
"version": "2.0.0",
|
| 123 |
+
"features": [
|
| 124 |
+
"Whisper ASR-based clarity scoring",
|
| 125 |
+
"Multi-level support (Level 1-5)",
|
| 126 |
+
"6 scoring metrics",
|
| 127 |
+
"Comprehensive audio analysis"
|
| 128 |
+
],
|
| 129 |
+
"endpoints": {
|
| 130 |
+
"health": "/health",
|
| 131 |
+
"levels": "/levels",
|
| 132 |
+
"score": "/score",
|
| 133 |
+
"batch_score": "/batch_score",
|
| 134 |
+
"docs": "/docs"
|
| 135 |
+
}
|
| 136 |
+
}
|
| 137 |
+
|
| 138 |
+
@app.get("/health", response_model=HealthResponse)
|
| 139 |
+
async def health_check():
|
| 140 |
+
"""Health check endpoint"""
|
| 141 |
+
return HealthResponse(
|
| 142 |
+
status="healthy" if scorer is not None else "unhealthy",
|
| 143 |
+
model_loaded=scorer is not None,
|
| 144 |
+
device=scorer.device if scorer else "unknown",
|
| 145 |
+
whisper_model="openai/whisper-small" if scorer else "not loaded"
|
| 146 |
+
)
|
| 147 |
+
|
| 148 |
+
@app.get("/levels", response_model=LevelsResponse)
|
| 149 |
+
async def get_levels():
|
| 150 |
+
"""Get all articulation levels and their targets"""
|
| 151 |
+
return LevelsResponse(
|
| 152 |
+
levels=ARTICULATION_LEVELS,
|
| 153 |
+
total_levels=len(ARTICULATION_LEVELS)
|
| 154 |
+
)
|
| 155 |
+
|
| 156 |
+
@app.post("/score", response_model=ScoreResponse)
|
| 157 |
+
async def score_audio(
|
| 158 |
+
audio: UploadFile = File(..., description="Audio file (WAV, MP3, M4A, etc.)"),
|
| 159 |
+
target_text: str = Form(..., description="Target text yang seharusnya diucapkan"),
|
| 160 |
+
level: int = Form(1, description="Level artikulasi (1-5)")
|
| 161 |
+
):
|
| 162 |
+
"""
|
| 163 |
+
Score audio file untuk penilaian artikulasi vokal
|
| 164 |
+
|
| 165 |
+
Args:
|
| 166 |
+
audio: File audio yang akan dinilai
|
| 167 |
+
target_text: Text target yang seharusnya diucapkan
|
| 168 |
+
level: Level artikulasi (1=Vokal, 2=Konsonan, 3=Suku Kata, 4=Kata, 5=Kalimat)
|
| 169 |
+
|
| 170 |
+
Returns:
|
| 171 |
+
ScoreResponse dengan hasil penilaian lengkap
|
| 172 |
+
"""
|
| 173 |
+
if scorer is None:
|
| 174 |
+
raise HTTPException(status_code=503, detail="Model not loaded")
|
| 175 |
+
|
| 176 |
+
# Validate level
|
| 177 |
+
if level not in ARTICULATION_LEVELS:
|
| 178 |
+
raise HTTPException(
|
| 179 |
+
status_code=400,
|
| 180 |
+
detail=f"Invalid level. Must be 1-5. Available levels: {list(ARTICULATION_LEVELS.keys())}"
|
| 181 |
+
)
|
| 182 |
+
|
| 183 |
+
# Validate target text
|
| 184 |
+
if not target_text or not target_text.strip():
|
| 185 |
+
raise HTTPException(
|
| 186 |
+
status_code=400,
|
| 187 |
+
detail="target_text cannot be empty"
|
| 188 |
+
)
|
| 189 |
+
|
| 190 |
+
# Save uploaded file to temporary location
|
| 191 |
+
try:
|
| 192 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=Path(audio.filename).suffix) as tmp_file:
|
| 193 |
+
content = await audio.read()
|
| 194 |
+
tmp_file.write(content)
|
| 195 |
+
tmp_path = tmp_file.name
|
| 196 |
+
|
| 197 |
+
# Score audio
|
| 198 |
+
result = scorer.score_audio(
|
| 199 |
+
audio_path=tmp_path,
|
| 200 |
+
target_text=target_text,
|
| 201 |
+
level=level
|
| 202 |
+
)
|
| 203 |
+
|
| 204 |
+
# Clean up temp file
|
| 205 |
+
os.unlink(tmp_path)
|
| 206 |
+
|
| 207 |
+
# Return response
|
| 208 |
+
return ScoreResponse(
|
| 209 |
+
success=True,
|
| 210 |
+
overall_score=result.overall_score,
|
| 211 |
+
grade=result.grade,
|
| 212 |
+
clarity_score=result.clarity_score,
|
| 213 |
+
energy_score=result.energy_score,
|
| 214 |
+
speech_rate_score=result.speech_rate_score,
|
| 215 |
+
pitch_consistency_score=result.pitch_consistency_score,
|
| 216 |
+
snr_score=result.snr_score,
|
| 217 |
+
articulation_score=result.articulation_score,
|
| 218 |
+
transcription=result.transcription,
|
| 219 |
+
target=result.target,
|
| 220 |
+
similarity=result.similarity,
|
| 221 |
+
wer=result.wer,
|
| 222 |
+
feedback=result.feedback,
|
| 223 |
+
suggestions=result.suggestions,
|
| 224 |
+
audio_features=result.audio_features,
|
| 225 |
+
level=result.level
|
| 226 |
+
)
|
| 227 |
+
|
| 228 |
+
except Exception as e:
|
| 229 |
+
# Clean up temp file if exists
|
| 230 |
+
if 'tmp_path' in locals() and os.path.exists(tmp_path):
|
| 231 |
+
os.unlink(tmp_path)
|
| 232 |
+
|
| 233 |
+
raise HTTPException(status_code=500, detail=f"Error processing audio: {str(e)}")
|
| 234 |
+
|
| 235 |
+
@app.post("/batch_score")
|
| 236 |
+
async def batch_score_audio(
|
| 237 |
+
audios: List[UploadFile] = File(..., description="Multiple audio files"),
|
| 238 |
+
target_texts: str = Form(..., description="Comma-separated target texts"),
|
| 239 |
+
levels: str = Form("1", description="Comma-separated levels (default: 1 for all)")
|
| 240 |
+
):
|
| 241 |
+
"""
|
| 242 |
+
Score multiple audio files dalam satu request
|
| 243 |
+
|
| 244 |
+
Args:
|
| 245 |
+
audios: List of audio files
|
| 246 |
+
target_texts: Comma-separated target texts
|
| 247 |
+
levels: Comma-separated levels (optional, default 1 for all)
|
| 248 |
+
|
| 249 |
+
Returns:
|
| 250 |
+
List of score results
|
| 251 |
+
"""
|
| 252 |
+
if scorer is None:
|
| 253 |
+
raise HTTPException(status_code=503, detail="Model not loaded")
|
| 254 |
+
|
| 255 |
+
# Parse target texts
|
| 256 |
+
targets = [t.strip() for t in target_texts.split(",")]
|
| 257 |
+
if len(targets) != len(audios):
|
| 258 |
+
raise HTTPException(
|
| 259 |
+
status_code=400,
|
| 260 |
+
detail="Number of target_texts must match number of audio files"
|
| 261 |
+
)
|
| 262 |
+
|
| 263 |
+
# Parse levels
|
| 264 |
+
level_list = [int(l.strip()) for l in levels.split(",")]
|
| 265 |
+
if len(level_list) == 1:
|
| 266 |
+
level_list = level_list * len(audios)
|
| 267 |
+
elif len(level_list) != len(audios):
|
| 268 |
+
raise HTTPException(
|
| 269 |
+
status_code=400,
|
| 270 |
+
detail="Number of levels must be 1 or match number of audio files"
|
| 271 |
+
)
|
| 272 |
+
|
| 273 |
+
results = []
|
| 274 |
+
|
| 275 |
+
for idx, (audio, target, level) in enumerate(zip(audios, targets, level_list)):
|
| 276 |
+
try:
|
| 277 |
+
# Save to temp file
|
| 278 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=Path(audio.filename).suffix) as tmp_file:
|
| 279 |
+
content = await audio.read()
|
| 280 |
+
tmp_file.write(content)
|
| 281 |
+
tmp_path = tmp_file.name
|
| 282 |
+
|
| 283 |
+
# Score
|
| 284 |
+
result = scorer.score_audio(
|
| 285 |
+
audio_path=tmp_path,
|
| 286 |
+
target_text=target,
|
| 287 |
+
level=level
|
| 288 |
+
)
|
| 289 |
+
|
| 290 |
+
# Clean up
|
| 291 |
+
os.unlink(tmp_path)
|
| 292 |
+
|
| 293 |
+
results.append({
|
| 294 |
+
"filename": audio.filename,
|
| 295 |
+
"success": True,
|
| 296 |
+
"overall_score": result.overall_score,
|
| 297 |
+
"grade": result.grade,
|
| 298 |
+
"clarity_score": result.clarity_score,
|
| 299 |
+
"energy_score": result.energy_score,
|
| 300 |
+
"speech_rate_score": result.speech_rate_score,
|
| 301 |
+
"pitch_consistency_score": result.pitch_consistency_score,
|
| 302 |
+
"snr_score": result.snr_score,
|
| 303 |
+
"articulation_score": result.articulation_score,
|
| 304 |
+
"transcription": result.transcription,
|
| 305 |
+
"target": result.target,
|
| 306 |
+
"similarity": result.similarity,
|
| 307 |
+
"wer": result.wer,
|
| 308 |
+
"feedback": result.feedback,
|
| 309 |
+
"suggestions": result.suggestions,
|
| 310 |
+
"audio_features": result.audio_features,
|
| 311 |
+
"level": result.level
|
| 312 |
+
})
|
| 313 |
+
|
| 314 |
+
except Exception as e:
|
| 315 |
+
if 'tmp_path' in locals() and os.path.exists(tmp_path):
|
| 316 |
+
os.unlink(tmp_path)
|
| 317 |
+
|
| 318 |
+
results.append({
|
| 319 |
+
"filename": audio.filename,
|
| 320 |
+
"success": False,
|
| 321 |
+
"error": str(e)
|
| 322 |
+
})
|
| 323 |
+
|
| 324 |
+
return {"results": results, "total": len(results)}
|
| 325 |
+
|
| 326 |
+
# =======================================
|
| 327 |
+
# RUN SERVER
|
| 328 |
+
# =======================================
|
| 329 |
+
|
| 330 |
+
if __name__ == "__main__":
|
| 331 |
+
import uvicorn
|
| 332 |
+
|
| 333 |
+
# Configuration
|
| 334 |
+
host = os.getenv("HOST", "0.0.0.0")
|
| 335 |
+
port = int(os.getenv("PORT", 8000))
|
| 336 |
+
|
| 337 |
+
print(f"🚀 Starting server on {host}:{port}")
|
| 338 |
+
print("📖 API Documentation: http://localhost:8000/docs")
|
| 339 |
+
|
| 340 |
+
uvicorn.run(
|
| 341 |
+
"api_v2:app",
|
| 342 |
+
host=host,
|
| 343 |
+
port=port,
|
| 344 |
+
reload=True,
|
| 345 |
+
log_level="info"
|
| 346 |
+
)
|
app.py
ADDED
|
@@ -0,0 +1,319 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# =======================================
|
| 2 |
+
# GRADIO INTERFACE - HUGGINGFACE SPACES
|
| 3 |
+
# UI untuk Vocal Articulation Assessment
|
| 4 |
+
# Support ZeroGPU untuk HuggingFace Spaces
|
| 5 |
+
# =======================================
|
| 6 |
+
|
| 7 |
+
import gradio as gr
|
| 8 |
+
import torch
|
| 9 |
+
import os
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from typing import Dict, Tuple
|
| 12 |
+
|
| 13 |
+
from scoring_system import VocalScoringSystem
|
| 14 |
+
|
| 15 |
+
# =======================================
|
| 16 |
+
# ZEROGPU DECORATOR (untuk HuggingFace Spaces)
|
| 17 |
+
# =======================================
|
| 18 |
+
|
| 19 |
+
try:
|
| 20 |
+
import spaces
|
| 21 |
+
ZEROGPU_AVAILABLE = True
|
| 22 |
+
print("✅ ZeroGPU available")
|
| 23 |
+
except ImportError:
|
| 24 |
+
ZEROGPU_AVAILABLE = False
|
| 25 |
+
print("⚠️ ZeroGPU not available (running locally)")
|
| 26 |
+
# Create dummy decorator
|
| 27 |
+
class spaces:
|
| 28 |
+
@staticmethod
|
| 29 |
+
def GPU(func):
|
| 30 |
+
return func
|
| 31 |
+
|
| 32 |
+
# =======================================
|
| 33 |
+
# GLOBAL VARIABLES
|
| 34 |
+
# =======================================
|
| 35 |
+
|
| 36 |
+
scorer = None
|
| 37 |
+
|
| 38 |
+
# =======================================
|
| 39 |
+
# INITIALIZATION
|
| 40 |
+
# =======================================
|
| 41 |
+
|
| 42 |
+
def initialize_model():
|
| 43 |
+
"""Initialize scoring system"""
|
| 44 |
+
global scorer
|
| 45 |
+
|
| 46 |
+
if scorer is None:
|
| 47 |
+
model_path = os.getenv("MODEL_PATH", "./model_vokal")
|
| 48 |
+
print(f"🔄 Loading model from {model_path}...")
|
| 49 |
+
scorer = VocalScoringSystem(model_path=model_path)
|
| 50 |
+
print("✅ Model loaded!")
|
| 51 |
+
|
| 52 |
+
return scorer
|
| 53 |
+
|
| 54 |
+
# =======================================
|
| 55 |
+
# GRADIO INFERENCE FUNCTION
|
| 56 |
+
# =======================================
|
| 57 |
+
|
| 58 |
+
@spaces.GPU(duration=60) # Reserve GPU for 60 seconds (jika di HF Spaces)
|
| 59 |
+
def score_vocal(
|
| 60 |
+
audio_file: str,
|
| 61 |
+
target_label: str,
|
| 62 |
+
expected_duration: float
|
| 63 |
+
) -> Tuple[str, str, Dict, str]:
|
| 64 |
+
"""
|
| 65 |
+
Score vocal audio dengan Gradio interface
|
| 66 |
+
|
| 67 |
+
Args:
|
| 68 |
+
audio_file: Path to uploaded audio
|
| 69 |
+
target_label: Target vocal (a, i, u, e, o)
|
| 70 |
+
expected_duration: Expected duration in seconds
|
| 71 |
+
|
| 72 |
+
Returns:
|
| 73 |
+
Tuple of (score_display, feedback, details_dict, grade_display)
|
| 74 |
+
"""
|
| 75 |
+
try:
|
| 76 |
+
# Initialize model
|
| 77 |
+
scorer = initialize_model()
|
| 78 |
+
|
| 79 |
+
# Validate input
|
| 80 |
+
if audio_file is None:
|
| 81 |
+
return "❌ Error", "Silakan upload file audio terlebih dahulu!", {}, ""
|
| 82 |
+
|
| 83 |
+
# Process target label
|
| 84 |
+
target = target_label.lower().strip() if target_label else None
|
| 85 |
+
exp_dur = expected_duration if expected_duration > 0 else None
|
| 86 |
+
|
| 87 |
+
# Score audio
|
| 88 |
+
result = scorer.score_audio(
|
| 89 |
+
audio_path=audio_file,
|
| 90 |
+
target_label=target,
|
| 91 |
+
expected_duration=exp_dur
|
| 92 |
+
)
|
| 93 |
+
|
| 94 |
+
# Format score display
|
| 95 |
+
score_display = f"""
|
| 96 |
+
## 📊 Hasil Penilaian
|
| 97 |
+
|
| 98 |
+
### Overall Score: {result.overall_score}/100
|
| 99 |
+
### Grade: {result.grade}
|
| 100 |
+
|
| 101 |
+
---
|
| 102 |
+
|
| 103 |
+
### 🎯 Prediksi
|
| 104 |
+
- **Target**: {result.target_label.upper() if result.target_label else 'Tidak ada'}
|
| 105 |
+
- **Terdeteksi**: {result.predicted_label.upper()}
|
| 106 |
+
- **Confidence**: {result.confidence}%
|
| 107 |
+
|
| 108 |
+
---
|
| 109 |
+
|
| 110 |
+
### 📈 Component Scores
|
| 111 |
+
|
| 112 |
+
| Komponen | Score | Bobot |
|
| 113 |
+
|----------|-------|-------|
|
| 114 |
+
| 🔊 **Clarity** | {result.clarity_score}/100 | 40% |
|
| 115 |
+
| ⚡ **Energy** | {result.energy_score}/100 | 25% |
|
| 116 |
+
| ⏱️ **Duration** | {result.duration_score}/100 | 15% |
|
| 117 |
+
| 🎵 **Pitch** | {result.pitch_score}/100 | 20% |
|
| 118 |
+
"""
|
| 119 |
+
|
| 120 |
+
# Format feedback
|
| 121 |
+
feedback_display = f"""
|
| 122 |
+
## 💬 Feedback
|
| 123 |
+
|
| 124 |
+
{result.feedback}
|
| 125 |
+
|
| 126 |
+
### 💡 Saran Perbaikan:
|
| 127 |
+
"""
|
| 128 |
+
if result.suggestions:
|
| 129 |
+
for i, suggestion in enumerate(result.suggestions, 1):
|
| 130 |
+
feedback_display += f"\n{i}. {suggestion}"
|
| 131 |
+
else:
|
| 132 |
+
feedback_display += "\n✅ Tidak ada saran - pengucapan sudah sangat baik!"
|
| 133 |
+
|
| 134 |
+
# Details dictionary
|
| 135 |
+
details = {
|
| 136 |
+
"Overall Score": result.overall_score,
|
| 137 |
+
"Grade": result.grade,
|
| 138 |
+
"Predicted": result.predicted_label.upper(),
|
| 139 |
+
"Confidence": f"{result.confidence}%",
|
| 140 |
+
"Clarity Score": result.clarity_score,
|
| 141 |
+
"Energy Score": result.energy_score,
|
| 142 |
+
"Duration Score": result.duration_score,
|
| 143 |
+
"Pitch Score": result.pitch_score,
|
| 144 |
+
**result.audio_features
|
| 145 |
+
}
|
| 146 |
+
|
| 147 |
+
# Grade display with emoji
|
| 148 |
+
grade_emoji = {
|
| 149 |
+
'A': '🌟',
|
| 150 |
+
'B': '👍',
|
| 151 |
+
'C': '😊',
|
| 152 |
+
'D': '🤔',
|
| 153 |
+
'E': '💪'
|
| 154 |
+
}
|
| 155 |
+
grade_display = f"{grade_emoji.get(result.grade, '📊')} Grade {result.grade}"
|
| 156 |
+
|
| 157 |
+
return score_display, feedback_display, details, grade_display
|
| 158 |
+
|
| 159 |
+
except Exception as e:
|
| 160 |
+
error_msg = f"❌ Error: {str(e)}"
|
| 161 |
+
return error_msg, error_msg, {}, "Error"
|
| 162 |
+
|
| 163 |
+
# =======================================
|
| 164 |
+
# GRADIO UI
|
| 165 |
+
# =======================================
|
| 166 |
+
|
| 167 |
+
def create_interface():
    """Build and return the (un-launched) Gradio Blocks UI for the v1 scorer.

    Wires the audio / target-vowel / duration inputs to ``score_vocal`` and
    lays out the grade banner, score breakdown, feedback and raw-detail
    panels. The caller is responsible for ``demo.launch()``.
    """

    # Custom CSS: base font plus the purple gradient used by the grade banner.
    custom_css = """
    .gradio-container {
        font-family: 'Arial', sans-serif;
    }
    .score-display {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
        padding: 20px;
        border-radius: 10px;
    }
    """

    # Create interface
    with gr.Blocks(
        title="Vocal Articulation Assessment",
        theme=gr.themes.Soft(),
        css=custom_css
    ) as demo:

        gr.Markdown("""
# 🎤 Sistem Penilaian Vokal Indonesia

Sistem ini menilai pengucapan vokal bahasa Indonesia (A, I, U, E, O) menggunakan multiple metrics:
- **Clarity**: Kejelasan pengucapan dari model confidence
- **Energy**: Kualitas volume dan energi suara
- **Duration**: Kesesuaian durasi pengucapan
- **Pitch**: Stabilitas pitch/nada suara

### 📝 Cara Penggunaan:
1. Upload atau record audio Anda
2. Pilih target vokal yang diucapkan
3. (Opsional) Set expected duration
4. Klik "🎯 Nilai Pengucapan"
        """)

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("## 🎙️ Input")

                # Audio input: file upload or live microphone recording;
                # "filepath" hands score_vocal a path, not a waveform.
                audio_input = gr.Audio(
                    label="Upload atau Record Audio",
                    type="filepath",
                    sources=["upload", "microphone"]
                )

                # Which vowel the user intended to pronounce.
                target_input = gr.Dropdown(
                    label="Target Vokal",
                    choices=["a", "i", "u", "e", "o"],
                    value="a",
                    info="Pilih vokal yang Anda ucapkan"
                )

                # Expected duration in seconds; 0 disables the duration score.
                duration_input = gr.Slider(
                    label="Expected Duration (detik)",
                    minimum=0,
                    maximum=3.0,
                    value=0.8,
                    step=0.1,
                    info="0 = auto (tidak diperhitungkan)"
                )

                # Submit button
                submit_btn = gr.Button(
                    "🎯 Nilai Pengucapan",
                    variant="primary",
                    size="lg"
                )

            with gr.Column(scale=1):
                gr.Markdown("## 📊 Hasil Penilaian")

                # Large grade banner (styled via .score-display CSS class).
                grade_output = gr.Markdown(
                    "### Belum ada penilaian",
                    elem_classes=["score-display"]
                )

                # Per-component score breakdown (markdown table).
                score_output = gr.Markdown()

                # Feedback row
                with gr.Row():
                    feedback_output = gr.Markdown()

                # Raw scores/features for debugging and inspection.
                with gr.Accordion("🔍 Detail Lengkap", open=False):
                    details_output = gr.JSON(label="Audio Features & Scores")

        # Example inputs — NOTE(review): assumes the wav files under
        # examples/ exist in the Space; verify before enabling.
        gr.Markdown("## 📚 Contoh")
        gr.Examples(
            examples=[
                ["examples/a.wav", "a", 0.8],
                ["examples/i.wav", "i", 0.8],
                ["examples/u.wav", "u", 0.8],
                ["examples/e.wav", "e", 0.8],
                ["examples/o.wav", "o", 0.8],
            ],
            inputs=[audio_input, target_input, duration_input],
            label="Klik untuk mencoba contoh"
        )

        # Connect button to the scoring function.
        submit_btn.click(
            fn=score_vocal,
            inputs=[audio_input, target_input, duration_input],
            outputs=[score_output, feedback_output, details_output, grade_output]
        )

        # Footer
        gr.Markdown("""
---
### ℹ️ Informasi

**Tentang Penilaian:**
- **Grade A** (90-100): Sempurna - pengucapan sangat jelas dan akurat
- **Grade B** (80-89): Bagus - pengucapan cukup jelas dengan minor errors
- **Grade C** (70-79): Cukup - ada beberapa kesalahan
- **Grade D** (60-69): Kurang - banyak kesalahan
- **Grade E** (<60): Perlu latihan lebih banyak

**Model**: HuBERT/Wav2Vec2 untuk klasifikasi vokal Indonesia

**Dibuat untuk**: Latihan Dasar Artikulasi Vokal Indonesia
        """)

    return demo
|
| 301 |
+
|
| 302 |
+
# =======================================
|
| 303 |
+
# MAIN
|
| 304 |
+
# =======================================
|
| 305 |
+
|
| 306 |
+
if __name__ == "__main__":
    # Load the model eagerly at startup so the first request is not slow.
    initialize_model()

    # Create and launch interface
    demo = create_interface()

    # Launch configuration — HF Spaces expects the app on 0.0.0.0:7860.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,  # set True for a public gradio.live URL
        show_error=True
    )
|
app/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# App module
|
| 2 |
+
from .interface import create_interface, initialize_model
|
| 3 |
+
|
| 4 |
+
__all__ = ['create_interface', 'initialize_model']
|
app/interface.py
ADDED
|
@@ -0,0 +1,351 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# =======================================
|
| 2 |
+
# GRADIO INTERFACE V2 - HUGGINGFACE SPACES
|
| 3 |
+
# Updated untuk Whisper ASR + Multi-Level Support
|
| 4 |
+
# =======================================
|
| 5 |
+
|
| 6 |
+
import gradio as gr
|
| 7 |
+
import os
|
| 8 |
+
from typing import Dict, Tuple
|
| 9 |
+
|
| 10 |
+
from core.scoring_engine import AdvancedVocalScoringSystem
|
| 11 |
+
from core.constants import ARTICULATION_LEVELS
|
| 12 |
+
|
| 13 |
+
# =======================================
|
| 14 |
+
# ZEROGPU DECORATOR
|
| 15 |
+
# =======================================
|
| 16 |
+
|
| 17 |
+
# Optional ZeroGPU support: on HF Spaces the `spaces` package supplies the
# real GPU decorator; locally we install a no-op stand-in.
try:
    import spaces
    ZEROGPU_AVAILABLE = True
    print("✅ ZeroGPU available")
except ImportError:
    ZEROGPU_AVAILABLE = False
    print("⚠️ ZeroGPU not available (running locally)")

    class spaces:
        """No-op shim mimicking the Hugging Face `spaces` module API."""

        @staticmethod
        def GPU(func=None, **_kwargs):
            """Identity decorator replacing `spaces.GPU`.

            Supports both bare ``@spaces.GPU`` and parameterized
            ``@spaces.GPU(duration=120)`` usage. The previous shim only
            accepted the bare form (`GPU(func)`), so the parameterized call
            used below in this module raised TypeError at import time when
            running outside HF Spaces.
            """
            if func is None or not callable(func):
                # Called as a decorator factory: return a pass-through decorator.
                return lambda f: f
            # Called directly as a decorator: return the function unchanged.
            return func
|
| 28 |
+
|
| 29 |
+
# =======================================
|
| 30 |
+
# GLOBAL VARIABLES
|
| 31 |
+
# =======================================
|
| 32 |
+
|
| 33 |
+
scorer = None
|
| 34 |
+
|
| 35 |
+
# =======================================
|
| 36 |
+
# INITIALIZATION
|
| 37 |
+
# =======================================
|
| 38 |
+
|
| 39 |
+
def initialize_model():
    """Lazily construct the module-level scoring system (singleton).

    Reads the Whisper checkpoint name from the WHISPER_MODEL environment
    variable (default: openai/whisper-small). Subsequent calls return the
    already-built instance without reloading the model.
    """
    global scorer

    if scorer is not None:
        return scorer

    model_name = os.getenv("WHISPER_MODEL", "openai/whisper-small")
    print(f"🔄 Loading Whisper model: {model_name}...")
    scorer = AdvancedVocalScoringSystem(whisper_model=model_name)
    print("✅ Model loaded!")
    return scorer
|
| 50 |
+
|
| 51 |
+
# =======================================
|
| 52 |
+
# GRADIO INFERENCE FUNCTION
|
| 53 |
+
# =======================================
|
| 54 |
+
|
| 55 |
+
@spaces.GPU(duration=120)
def score_vocal(
    audio_file: str,
    target_text: str,
    level: int
) -> Tuple[str, str, Dict, str]:
    """Score one recording against a target text for the Gradio UI.

    Args:
        audio_file: Filesystem path to the uploaded/recorded audio, or None.
        target_text: The text the speaker was supposed to pronounce.
        level: Articulation level (1-5), selects scoring weights.

    Returns:
        Tuple of (score_display_md, feedback_md, details_dict, grade_display_md).
        On validation failure or exception, all four slots carry error text.
    """
    try:
        # Lazily load the Whisper-based scorer (cached after first call).
        scorer = initialize_model()

        # Validate input — Gradio passes None when no audio was provided.
        if audio_file is None:
            return "❌ Error", "Silakan upload atau record audio terlebih dahulu!", {}, "❌"

        if not target_text or not target_text.strip():
            return "❌ Error", "Silakan masukkan target text!", {}, "❌"

        # Run ASR + audio analysis; returns a ScoreResult.
        result = scorer.score_audio(
            audio_path=audio_file,
            target_text=target_text,
            level=level
        )

        # Markdown summary: overall score, ASR comparison, component table.
        score_display = f"""
## 📊 Hasil Penilaian - Level {level}

### Overall Score: **{result.overall_score}/100**
### Grade: **{result.grade}**

---

### 🎯 ASR Transcription
- **Target**: {result.target}
- **Terdeteksi**: {result.transcription}
- **Similarity**: {result.similarity*100:.2f}%
- **WER**: {result.wer*100:.2f}%

---

### 📈 Component Scores

| Komponen | Score | Status |
|----------|-------|--------|
| 🔊 **Clarity** (ASR Accuracy) | **{result.clarity_score:.1f}/100** | {'✅' if result.clarity_score >= 80 else '⚠️' if result.clarity_score >= 60 else '❌'} |
| ⚡ **Energy** (Volume) | **{result.energy_score:.1f}/100** | {'✅' if result.energy_score >= 80 else '⚠️' if result.energy_score >= 60 else '❌'} |
| 🗣️ **Speech Rate** | **{result.speech_rate_score:.1f}/100** | {'✅' if result.speech_rate_score >= 80 else '⚠️' if result.speech_rate_score >= 60 else '❌'} |
| 🎵 **Pitch Consistency** | **{result.pitch_consistency_score:.1f}/100** | {'✅' if result.pitch_consistency_score >= 80 else '⚠️' if result.pitch_consistency_score >= 60 else '❌'} |
| 📡 **SNR** (Noise Quality) | **{result.snr_score:.1f}/100** | {'✅' if result.snr_score >= 80 else '⚠️' if result.snr_score >= 60 else '❌'} |
| 🎤 **Articulation** | **{result.articulation_score:.1f}/100** | {'✅' if result.articulation_score >= 80 else '⚠️' if result.articulation_score >= 60 else '❌'} |
"""

        # Feedback text plus an enumerated list of improvement suggestions.
        feedback_display = f"""
## 💬 Feedback

{result.feedback}

---

### 💡 Saran Perbaikan:
"""
        if result.suggestions:
            for i, suggestion in enumerate(result.suggestions, 1):
                feedback_display += f"\n{i}. {suggestion}"
        else:
            feedback_display += "\n✨ **Sempurna!** Tidak ada saran - pengucapan Anda sudah sangat baik!"

        # Structured payload for the JSON "details" accordion.
        details = {
            "📊 Overall": {
                "Score": result.overall_score,
                "Grade": result.grade,
                "Level": level
            },
            "🎯 ASR Results": {
                "Target": result.target,
                "Transcription": result.transcription,
                "Similarity": f"{result.similarity*100:.2f}%",
                "WER": f"{result.wer*100:.2f}%"
            },
            "📈 Component Scores": {
                "Clarity": result.clarity_score,
                "Energy": result.energy_score,
                "Speech Rate": result.speech_rate_score,
                "Pitch Consistency": result.pitch_consistency_score,
                "SNR": result.snr_score,
                "Articulation": result.articulation_score
            },
            "🔊 Audio Features": result.audio_features
        }

        # Large grade banner with an emoji per grade.
        grade_emoji = {
            'A': '🌟 Grade A - Sempurna!',
            'B': '👍 Grade B - Bagus!',
            'C': '😊 Grade C - Cukup Baik',
            'D': '🤔 Grade D - Perlu Latihan',
            'E': '💪 Grade E - Terus Berlatih!'
        }
        grade_display = f"# {grade_emoji.get(result.grade, '📊 Grade ' + result.grade)}\n## Score: {result.overall_score}/100"

        return score_display, feedback_display, details, grade_display

    except Exception as e:
        # Surface any failure in all four output slots rather than crashing the UI.
        error_msg = f"❌ Error: {str(e)}"
        return error_msg, error_msg, {"error": str(e)}, "❌ Error"
|
| 174 |
+
|
| 175 |
+
# =======================================
|
| 176 |
+
# GRADIO UI
|
| 177 |
+
# =======================================
|
| 178 |
+
|
| 179 |
+
def create_interface():
    """Build and return the (un-launched) Gradio Blocks UI for the v2 scorer.

    Shows the six-metric explanation, the five articulation levels (from
    ``ARTICULATION_LEVELS``), the input column (audio / target text / level)
    and the result column, and wires the submit button to ``score_vocal``.
    """

    # Custom CSS: base font plus the purple gradient used by the grade banner.
    custom_css = """
    .gradio-container {
        font-family: 'Segoe UI', Arial, sans-serif;
    }
    .grade-display {
        text-align: center;
        padding: 20px;
        border-radius: 10px;
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
    }
    """

    # Create interface
    with gr.Blocks(
        title="Vocal Articulation Assessment v2",
        theme=gr.themes.Soft(primary_hue="purple"),
        css=custom_css
    ) as demo:

        gr.Markdown("""
# 🎤 Sistem Penilaian Vokal Indonesia v2.0
### Powered by Whisper ASR + Advanced Audio Analysis

Sistem ini menilai pengucapan vokal dan artikulasi bahasa Indonesia dengan **6 metrik komprehensif**:

| Metrik | Deskripsi |
|--------|-----------|
| 🔊 **Clarity** | Kejelasan pengucapan dari ASR accuracy (Whisper) |
| ⚡ **Energy** | Kualitas volume dan energi suara |
| 🗣️ **Speech Rate** | Kecepatan bicara (suku kata per detik) |
| 🎵 **Pitch Consistency** | Stabilitas nada suara |
| 📡 **SNR** | Signal-to-Noise Ratio (kualitas rekaman) |
| 🎤 **Articulation** | Kejernihan artikulasi dari analisis spektral |

---

### 📚 5 Level Latihan Artikulasi:
        """)

        # One collapsed accordion per level, listing up to 10 example targets.
        for level_num, level_data in ARTICULATION_LEVELS.items():
            with gr.Accordion(f"Level {level_num}: {level_data['name']} - {level_data['difficulty']}", open=False):
                targets_display = ", ".join(level_data['targets'][:10])
                if len(level_data['targets']) > 10:
                    targets_display += f"... dan {len(level_data['targets']) - 10} lainnya"
                gr.Markdown(f"**Contoh target**: {targets_display}")

        gr.Markdown("---")

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("## 🎙️ Input Audio & Target")

                # Audio input: file upload or microphone; passed as a path.
                audio_input = gr.Audio(
                    label="Upload atau Record Audio",
                    type="filepath",
                    sources=["upload", "microphone"]
                )

                # Free-form target text (v2 scores arbitrary text, not just vowels).
                target_input = gr.Textbox(
                    label="Target Text",
                    placeholder="Masukkan text yang Anda ucapkan (misal: A, BA, PSIKOLOGI, dll)",
                    info="Masukkan text sesuai level yang dipilih"
                )

                # Articulation level (1-5) selects the scoring weights.
                level_input = gr.Slider(
                    label="Level Artikulasi",
                    minimum=1,
                    maximum=5,
                    value=1,
                    step=1,
                    info="1=Vokal, 2=Konsonan, 3=Suku Kata, 4=Kata, 5=Kalimat"
                )

                # Submit button
                submit_btn = gr.Button(
                    "🎯 Nilai Pengucapan",
                    variant="primary",
                    size="lg"
                )

                # Quick-fill examples; audio slot is None, so the user still
                # has to record/upload before submitting.
                gr.Markdown("### 📝 Contoh Quick Test")
                gr.Examples(
                    examples=[
                        [None, "A", 1],
                        [None, "I", 1],
                        [None, "U", 1],
                        [None, "BA", 2],
                        [None, "STRATEGI", 4],
                    ],
                    inputs=[audio_input, target_input, level_input],
                    label="Klik untuk auto-fill (masih perlu audio)"
                )

            with gr.Column(scale=1):
                gr.Markdown("## 📊 Hasil & Grade")

                # Large grade banner (styled via .grade-display CSS class).
                grade_output = gr.Markdown(
                    "### 🎯 Upload audio untuk mulai penilaian",
                    elem_classes=["grade-display"]
                )

                # Per-component score breakdown.
                score_output = gr.Markdown()

                # Feedback row
                gr.Markdown("---")
                with gr.Row():
                    feedback_output = gr.Markdown()

                # Raw metrics and extracted audio features.
                with gr.Accordion("🔍 Detail Lengkap & Audio Features", open=False):
                    details_output = gr.JSON(label="Detailed Metrics & Features")

        # Connect button to the scoring function.
        submit_btn.click(
            fn=score_vocal,
            inputs=[audio_input, target_input, level_input],
            outputs=[score_output, feedback_output, details_output, grade_output]
        )

        # Footer
        gr.Markdown("""
---

### ℹ️ Informasi Sistem

**Grading System:**
- **Grade A** (90-100): 🌟 Sempurna - pengucapan sangat jelas dan akurat
- **Grade B** (80-89): 👍 Bagus - pengucapan cukup jelas dengan minor errors
- **Grade C** (70-79): 😊 Cukup - ada beberapa kesalahan
- **Grade D** (60-69): 🤔 Kurang - perlu latihan lebih
- **Grade E** (<60): 💪 Terus berlatih!

**Model**: OpenAI Whisper (multilingual ASR) + Advanced Audio Signal Processing

**Dibuat untuk**: Latihan Dasar Artikulasi Vokal Indonesia (Level 1-5)

**Version**: 2.0.0 | **Updated**: November 2025
        """)

    return demo
|
| 331 |
+
|
| 332 |
+
# =======================================
|
| 333 |
+
# MAIN
|
| 334 |
+
# =======================================
|
| 335 |
+
|
| 336 |
+
if __name__ == "__main__":
    # Load the Whisper model eagerly so the first request is not slow.
    print("🔄 Initializing system...")
    initialize_model()
    print("✅ System ready!")

    # Create and launch interface
    demo = create_interface()

    # Launch configuration — HF Spaces expects the app on 0.0.0.0:7860.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,  # set True for a public gradio.live URL
        show_error=True
    )
|
config/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Config module
|
| 2 |
+
from .settings import get_settings
|
| 3 |
+
|
| 4 |
+
__all__ = ['get_settings']
|
config/settings.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Application settings and configuration
|
| 3 |
+
"""
|
| 4 |
+
import os
|
| 5 |
+
from functools import lru_cache
|
| 6 |
+
|
| 7 |
+
class Settings:
    """Process-wide application configuration.

    Every value is resolved from the environment exactly once, at import
    time; obtain the shared instance through ``get_settings()``.
    """

    # --- Model ---
    WHISPER_MODEL: str = os.getenv("WHISPER_MODEL", "openai/whisper-small")

    # --- Server ---
    HOST: str = os.getenv("HOST", "0.0.0.0")
    PORT: int = int(os.getenv("PORT", "7860"))

    # --- Gradio ---
    GRADIO_SERVER_NAME: str = os.getenv("GRADIO_SERVER_NAME", "0.0.0.0")
    GRADIO_SERVER_PORT: int = int(os.getenv("GRADIO_SERVER_PORT", "7860"))
    GRADIO_SHARE: bool = os.getenv("GRADIO_SHARE", "False").lower() == "true"

    # --- Application metadata ---
    APP_NAME: str = "Vocal Articulation Assessment"
    VERSION: str = "2.0.0"
    DEBUG: bool = os.getenv("DEBUG", "False").lower() == "true"


@lru_cache(maxsize=None)
def get_settings() -> Settings:
    """Return the single cached ``Settings`` instance."""
    return Settings()
|
core/__init__.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Core module for vocal articulation scoring
|
| 2 |
+
from .scoring_engine import AdvancedVocalScoringSystem, ScoreResult
|
| 3 |
+
from .constants import ARTICULATION_LEVELS
|
| 4 |
+
|
| 5 |
+
__all__ = [
|
| 6 |
+
'AdvancedVocalScoringSystem',
|
| 7 |
+
'ScoreResult',
|
| 8 |
+
'ARTICULATION_LEVELS'
|
| 9 |
+
]
|
core/constants.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# =======================================
|
| 2 |
+
# CONSTANTS - Articulation Levels
|
| 3 |
+
# =======================================
|
| 4 |
+
|
| 5 |
+
# Practice curriculum: level number -> display name, example target prompts,
# difficulty label, and the accepted speech-rate band. The rate band is
# presumably syllables per second, consumed by the speech-rate scorer —
# TODO(review): confirm the unit against the scoring engine.
ARTICULATION_LEVELS = {
    1: {  # isolated vowels
        "name": "Pengenalan Vokal",
        "targets": ["A", "I", "U", "E", "O"],
        "difficulty": "Sangat Mudah",
        "speech_rate_range": (0.5, 2.0)
    },
    2: {  # consonant + vowel pairs
        "name": "Konsonan Dasar",
        "targets": ["BA", "PA", "DA", "TA", "GA", "KA", "FA", "VA", "SA", "ZA",
                    "MA", "NA", "NGA", "NYA", "RA", "LA"],
        "difficulty": "Mudah",
        "speech_rate_range": (2.0, 4.0)
    },
    3: {  # syllable combinations, incl. clusters and closed syllables
        "name": "Kombinasi Suku Kata",
        "targets": ["BA", "BE", "BI", "BO", "BU", "TA", "TI", "TU", "TE", "TO",
                    "KA", "KI", "KU", "KE", "KO", "RA", "RI", "RU", "RE", "RO",
                    "LA", "LI", "LU", "LE", "LO", "CHA", "CHI", "CHU", "CHE", "CHO",
                    "STRA", "STRI", "STRU", "STRE", "STRO", "AK", "IK", "UK", "EK", "OK"],
        "difficulty": "Sedang",
        "speech_rate_range": (2.5, 5.0)
    },
    4: {  # hard multi-syllable words
        "name": "Kata Sulit",
        "targets": ["PSIKOLOGI", "STRATEGI", "IMPLEMENTASI", "INFRASTRUKTUR",
                    "KHARISMATIK", "TRANSKRIPSI", "OTORITER", "PROBABILITAS",
                    "KUALITAS", "SPESIFIKASI"],
        "difficulty": "Sulit",
        "speech_rate_range": (2.0, 4.5)
    },
    5: {  # tongue-twister sentences
        "name": "Kalimat Kompleks",
        "targets": [
            "ULAR LARI LURUS DI ATAS REL LURUS",
            "KUKU KAKI KAKEK KAKAKKU KAKU DAN KOTOR",
            "SATU SATE TUJUH TUSUK DUA SATE EMPAT BELAS TUSUK",
            "KEPALA DIPARUT KELAPA DIGARUK JANGAN SAMPAI TERTUKAR",
            "PSIKOLOGI MEMPELAJARI PROSES PROSES PSIKIS SECARA SPESIFIK",
            "STRATEGI IMPLEMENTASI INFRASTRUKTUR TRANSISIONAL HARUS JELAS",
            "KLAIM KLAIM KLIMAKS KLASIK KELOMPOK KITA KIAN KRITIS"
        ],
        "difficulty": "Sangat Sulit",
        "speech_rate_range": (2.5, 4.5)
    }
}
|
| 51 |
+
|
| 52 |
+
# Scoring weights per level
|
| 53 |
+
# Scoring weights per level. Invariant: each level's weights sum to 1.0.
# Clarity (ASR accuracy) dominates at every level; speech rate is ignored
# for isolated vowels (level 1) and gains weight for longer material.
LEVEL_WEIGHTS = {
    1: {  # isolated vowels — no meaningful speech rate
        'clarity': 0.45,
        'energy': 0.25,
        'speech_rate': 0.0,
        'pitch_consistency': 0.15,
        'snr': 0.10,
        'articulation': 0.05
    },
    2: {  # basic consonant+vowel pairs
        'clarity': 0.40,
        'energy': 0.20,
        'speech_rate': 0.15,
        'pitch_consistency': 0.10,
        'snr': 0.10,
        'articulation': 0.05
    },
    3: {  # syllable combinations
        'clarity': 0.40,
        'energy': 0.15,
        'speech_rate': 0.20,
        'pitch_consistency': 0.10,
        'snr': 0.10,
        'articulation': 0.05
    },
    4: {  # hard words
        'clarity': 0.45,
        'energy': 0.15,
        'speech_rate': 0.15,
        'pitch_consistency': 0.10,
        'snr': 0.10,
        'articulation': 0.05
    },
    5: {  # complex sentences
        'clarity': 0.45,
        'energy': 0.10,
        'speech_rate': 0.20,
        'pitch_consistency': 0.10,
        'snr': 0.10,
        'articulation': 0.05
    }
}
|
core/scoring_engine.py
ADDED
|
@@ -0,0 +1,638 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# =======================================
|
| 2 |
+
# ADVANCED VOCAL SCORING SYSTEM
|
| 3 |
+
# ASR-based dengan Whisper + Audio Analysis
|
| 4 |
+
# Support Level 1-5 Artikulasi
|
| 5 |
+
# =======================================
|
| 6 |
+
|
| 7 |
+
import torch
|
| 8 |
+
import torchaudio
|
| 9 |
+
import numpy as np
|
| 10 |
+
import librosa
|
| 11 |
+
from transformers import (
|
| 12 |
+
WhisperProcessor,
|
| 13 |
+
WhisperForConditionalGeneration,
|
| 14 |
+
pipeline
|
| 15 |
+
)
|
| 16 |
+
from typing import Dict, List, Tuple, Optional
|
| 17 |
+
from dataclasses import dataclass
|
| 18 |
+
import difflib
|
| 19 |
+
import re
|
| 20 |
+
|
| 21 |
+
from .constants import ARTICULATION_LEVELS, LEVEL_WEIGHTS
|
| 22 |
+
|
| 23 |
+
# =======================================
|
| 24 |
+
# SCORE RESULT DATACLASS
|
| 25 |
+
# =======================================
|
| 26 |
+
|
| 27 |
+
@dataclass
class ScoreResult:
    """Comprehensive result of scoring one recording against a target text.

    Produced by AdvancedVocalScoringSystem.score_audio; consumed by the
    Gradio interface and the API layer for display/serialization.
    """
    # Overall
    overall_score: float            # weighted aggregate, 0-100
    grade: str                      # letter grade 'A'-'E'

    # Component scores (each 0-100)
    clarity_score: float            # ASR accuracy vs. target
    energy_score: float             # volume quality
    speech_rate_score: float        # speech rate fit
    pitch_consistency_score: float  # pitch stability
    snr_score: float                # signal-to-noise ratio
    articulation_score: float       # articulation clarity

    # ASR results
    transcription: str              # what Whisper heard
    target: str                     # what should have been said
    similarity: float               # text similarity, 0-1
    wer: float                      # word error rate (0 = perfect)

    # Audio features — raw extracted measurements for the details panel
    audio_features: Dict

    # Feedback
    feedback: str                   # human-readable summary
    suggestions: List[str]          # improvement tips (empty when excellent)
    level: int                      # articulation level used (1-5)
|
| 55 |
+
|
| 56 |
+
# =======================================
|
| 57 |
+
# ADVANCED SCORING SYSTEM
|
| 58 |
+
# =======================================
|
| 59 |
+
|
| 60 |
+
class AdvancedVocalScoringSystem:
    """
    Vocal articulation scoring system combining Whisper ASR (clarity)
    with signal-level audio analysis (energy, speech rate, pitch
    stability, SNR, articulation).

    Supports articulation levels 1-5: per-level component weights come
    from LEVEL_WEIGHTS and per-level speech-rate targets from
    ARTICULATION_LEVELS.
    """

    def __init__(
        self,
        whisper_model: str = "openai/whisper-small",  # or "openai/whisper-large-v3"
        device: Optional[str] = None
    ):
        """
        Initialize the system with a Whisper ASR backend.

        Args:
            whisper_model: HuggingFace model id of the Whisper checkpoint
            device: 'cuda' or 'cpu'; auto-detected when None
        """
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")

        print(f"🔄 Loading Whisper model: {whisper_model}...")

        # Load the Whisper checkpoint once into memory
        self.processor = WhisperProcessor.from_pretrained(whisper_model)
        self.model = WhisperForConditionalGeneration.from_pretrained(whisper_model)
        self.model.to(self.device)
        self.model.eval()

        # Reuse the already-loaded model/processor in the transcription
        # pipeline. Passing the model id string here (as the previous
        # version did) loads the checkpoint a second time and doubles
        # memory usage.
        self.pipe = pipeline(
            "automatic-speech-recognition",
            model=self.model,
            tokenizer=self.processor.tokenizer,
            feature_extractor=self.processor.feature_extractor,
            device=0 if self.device == "cuda" else -1
        )

        print(f"✅ Whisper model loaded on {self.device}")

        # Scoring weights for each articulation level
        self.level_weights = LEVEL_WEIGHTS

    def score_audio(
        self,
        audio_path: str,
        target_text: str,
        level: int = 1
    ) -> "ScoreResult":
        """
        Score an audio file with comprehensive metrics.

        Args:
            audio_path: Path to the audio file
            target_text: Text the speaker was supposed to utter
            level: Articulation level (1-5)

        Returns:
            ScoreResult with all component metrics and feedback
        """
        # Load audio (torchaudio returns a (channels, samples) tensor)
        waveform, sr = torchaudio.load(audio_path)

        # Mono numpy signal for the librosa-based analyses below
        audio_np = waveform.numpy()
        if audio_np.ndim > 1:
            audio_np = audio_np[0]

        # 1. CLARITY SCORE (ASR-based)
        clarity_score, transcription, similarity, wer = self._score_clarity(
            audio_path, target_text
        )

        # 2. ENERGY SCORE
        energy_score = self._score_energy(audio_np, sr)

        # 3. SPEECH RATE SCORE
        speech_rate_score = self._score_speech_rate(
            audio_np, sr, target_text, level
        )

        # 4. PITCH CONSISTENCY SCORE
        pitch_consistency_score = self._score_pitch_consistency(audio_np, sr)

        # 5. SNR SCORE (Signal-to-Noise Ratio)
        snr_score = self._score_snr(audio_np, sr)

        # 6. ARTICULATION SCORE
        articulation_score = self._score_articulation(audio_np, sr)

        # Summary audio features for feedback/reporting
        audio_features = self._extract_audio_features(audio_np, sr, transcription)

        # Weighted overall score; the weighting depends on the level
        weights = self.level_weights.get(level, self.level_weights[1])
        overall_score = (
            clarity_score * weights['clarity'] +
            energy_score * weights['energy'] +
            speech_rate_score * weights['speech_rate'] +
            pitch_consistency_score * weights['pitch_consistency'] +
            snr_score * weights['snr'] +
            articulation_score * weights['articulation']
        )

        # Letter grade from the aggregate
        grade = self._get_grade(overall_score)

        # Human-readable feedback and suggestions
        feedback, suggestions = self._generate_feedback(
            overall_score=overall_score,
            clarity_score=clarity_score,
            energy_score=energy_score,
            speech_rate_score=speech_rate_score,
            pitch_consistency_score=pitch_consistency_score,
            snr_score=snr_score,
            articulation_score=articulation_score,
            transcription=transcription,
            target_text=target_text,
            similarity=similarity,
            level=level,
            audio_features=audio_features
        )

        return ScoreResult(
            overall_score=round(overall_score, 2),
            grade=grade,
            clarity_score=round(clarity_score, 2),
            energy_score=round(energy_score, 2),
            speech_rate_score=round(speech_rate_score, 2),
            pitch_consistency_score=round(pitch_consistency_score, 2),
            snr_score=round(snr_score, 2),
            articulation_score=round(articulation_score, 2),
            transcription=transcription,
            target=target_text.upper(),
            similarity=round(similarity, 4),
            wer=round(wer, 4),
            audio_features=audio_features,
            feedback=feedback,
            suggestions=suggestions,
            level=level
        )

    # =======================================
    # SCORING COMPONENTS
    # =======================================

    def _score_clarity(
        self,
        audio_path: str,
        target_text: str
    ) -> Tuple[float, str, float, float]:
        """
        Score clarity using Whisper ASR.

        Returns:
            (clarity_score 0-100, transcription, similarity 0-1, wer)
        """
        try:
            # Transcribe with Whisper (forced to Indonesian)
            result = self.pipe(
                audio_path,
                return_timestamps=False,
                generate_kwargs={"language": "indonesian"}
            )
            transcription = result["text"].upper().strip()

        except Exception as e:
            # ASR failure degrades gracefully to an empty transcription
            print(f"⚠️ ASR Error: {e}")
            transcription = ""

        target_text = target_text.upper().strip()

        # Character-level similarity between transcript and target
        similarity = difflib.SequenceMatcher(None, transcription, target_text).ratio()

        # Word Error Rate
        wer = self._calculate_wer(transcription, target_text)

        # Clarity blends similarity (70%) with word accuracy (30%)
        clarity_score = (similarity * 0.7 + (1 - wer) * 0.3) * 100

        return clarity_score, transcription, similarity, wer

    def _score_energy(self, audio: np.ndarray, sr: int) -> float:
        """
        Score energy/volume quality from RMS level.

        Returns:
            energy_score (0-100); 100 inside the -30..-10 dB sweet spot,
            tapering off linearly outside it.
        """
        # RMS energy in dB (epsilon guards log of silence)
        rms = np.sqrt(np.mean(audio**2))
        rms_db = 20 * np.log10(rms + 1e-10)

        # Optimal range: -30 to -10 dB
        if -30 <= rms_db <= -10:
            energy_score = 100
        elif -40 <= rms_db < -30:
            energy_score = 60 + (rms_db + 40) * 4
        elif -10 < rms_db <= -5:
            energy_score = 100 - (rms_db + 10) * 8
        elif rms_db < -40:
            energy_score = max(0, 60 + (rms_db + 40) * 4)
        else:
            energy_score = max(0, 60 - (rms_db + 5) * 5)

        return min(100, max(0, energy_score))

    def _score_speech_rate(
        self,
        audio: np.ndarray,
        sr: int,
        target_text: str,
        level: int
    ) -> float:
        """
        Score speech rate (syllables per second) against the level's
        configured optimal range.

        Returns:
            speech_rate_score (0-100)
        """
        # Clip duration in seconds
        duration = len(audio) / sr

        # Expected syllable count from the target text
        syllable_count = self._count_syllables(target_text)

        if duration <= 0 or syllable_count == 0:
            return 50  # neutral score when rate is undefined

        # Actual syllables per second
        speech_rate = syllable_count / duration

        # Per-level optimal range from the level configuration
        level_config = ARTICULATION_LEVELS.get(level, ARTICULATION_LEVELS[1])
        min_rate, max_rate = level_config.get('speech_rate_range', (2.0, 4.0))
        optimal_mid = (min_rate + max_rate) / 2

        if min_rate <= speech_rate <= max_rate:
            # Within the optimal range: small penalty for drift off-center
            deviation = abs(speech_rate - optimal_mid) / (max_rate - min_rate)
            speech_rate_score = 100 - (deviation * 20)
        else:
            # Outside the range: penalty proportional to relative deviation
            if speech_rate < min_rate:
                deviation = (min_rate - speech_rate) / min_rate
            else:
                deviation = (speech_rate - max_rate) / max_rate

            speech_rate_score = max(0, 80 - (deviation * 80))

        return min(100, max(0, speech_rate_score))

    def _score_pitch_consistency(self, audio: np.ndarray, sr: int) -> float:
        """
        Score pitch stability via the coefficient of variation of the
        dominant pitch track (80-400 Hz).

        Returns:
            pitch_score (0-100); 50 when pitch cannot be analyzed
        """
        try:
            # Frame-wise pitch candidates from librosa
            pitches, magnitudes = librosa.piptrack(
                y=audio,
                sr=sr,
                fmin=80,
                fmax=400
            )

            # Keep the strongest pitch per frame, skipping unvoiced frames
            pitch_values = []
            for t in range(pitches.shape[1]):
                index = magnitudes[:, t].argmax()
                pitch = pitches[index, t]
                if pitch > 0:
                    pitch_values.append(pitch)

            if len(pitch_values) < 5:
                return 50  # not enough voiced frames to judge

            # Coefficient of variation: lower CV = steadier pitch
            pitch_std = np.std(pitch_values)
            pitch_mean = np.mean(pitch_values)
            cv = pitch_std / pitch_mean if pitch_mean > 0 else 1

            if cv < 0.1:
                pitch_score = 95 + (0.1 - cv) * 50
            elif cv < 0.2:
                pitch_score = 80 + (0.2 - cv) * 150
            elif cv < 0.3:
                pitch_score = 60 + (0.3 - cv) * 200
            else:
                pitch_score = max(0, 60 - (cv - 0.3) * 100)

            return min(100, max(0, pitch_score))

        except Exception as e:
            # Log instead of silently swallowing, then fall back to neutral
            print(f"⚠️ Pitch analysis error: {e}")
            return 50

    def _score_snr(self, audio: np.ndarray, sr: int) -> float:
        """
        Score an estimated Signal-to-Noise Ratio.

        The first and last 10% of the clip are treated as a noise proxy
        (a heuristic — assumes the speaker pauses at the edges).

        Returns:
            snr_score (0-100); 50 when the clip is too short to estimate
        """
        try:
            noise_samples = int(len(audio) * 0.1)

            if len(audio) < noise_samples * 3:
                return 50  # audio too short for edge-based estimation

            noise = np.concatenate([audio[:noise_samples], audio[-noise_samples:]])
            signal = audio[noise_samples:-noise_samples]

            signal_power = np.mean(signal**2)
            noise_power = np.mean(noise**2)

            if noise_power == 0:
                return 100  # digitally silent edges: perfect

            snr = 10 * np.log10(signal_power / noise_power)

            # Good SNR: > 20 dB, acceptable 10-20 dB, poor < 10 dB
            if snr >= 25:
                snr_score = 100
            elif snr >= 15:
                snr_score = 80 + (snr - 15) * 2
            elif snr >= 10:
                snr_score = 60 + (snr - 10) * 4
            elif snr >= 5:
                snr_score = 40 + (snr - 5) * 4
            else:
                snr_score = max(0, snr * 8)

            return min(100, max(0, snr_score))

        except Exception as e:
            print(f"⚠️ SNR analysis error: {e}")
            return 50

    def _score_articulation(self, audio: np.ndarray, sr: int) -> float:
        """
        Score articulation clarity from spectral features.

        Heuristic targets: zero-crossing rate 0.1-0.3 and spectral
        centroid 1000-3000 Hz read as clear articulation.

        Returns:
            articulation_score (0-100)
        """
        try:
            # Zero Crossing Rate (higher = more consonant clarity)
            zcr = librosa.zero_crossings(audio).sum() / len(audio)

            # Spectral centroid (brightness)
            spectral_centroid = librosa.feature.spectral_centroid(y=audio, sr=sr)[0]
            spectral_centroid_mean = spectral_centroid.mean()
            # (the previous version also computed spectral rolloff here
            # but never used it — removed)

            # ZCR score
            if 0.1 <= zcr <= 0.3:
                zcr_score = 100
            elif zcr < 0.1:
                zcr_score = zcr * 1000
            else:
                zcr_score = max(0, 100 - (zcr - 0.3) * 200)

            # Centroid score
            if 1000 <= spectral_centroid_mean <= 3000:
                centroid_score = 100
            elif spectral_centroid_mean < 1000:
                centroid_score = (spectral_centroid_mean / 1000) * 100
            else:
                centroid_score = max(0, 100 - ((spectral_centroid_mean - 3000) / 3000) * 100)

            # Weighted blend: centroid is the stronger signal
            articulation_score = (zcr_score * 0.4 + centroid_score * 0.6)

            return min(100, max(0, articulation_score))

        except Exception as e:
            print(f"⚠️ Articulation analysis error: {e}")
            return 50

    # =======================================
    # HELPER FUNCTIONS
    # =======================================

    def _calculate_wer(self, predicted: str, target: str) -> float:
        """
        Word Error Rate: word-level Levenshtein distance divided by the
        number of target words.
        """
        pred_words = predicted.split()
        target_words = target.split()

        if not target_words:
            # Empty target: any prediction is a full error, silence is perfect
            return 1.0 if pred_words else 0.0

        # Integer DP table (no need for a float numpy matrix here)
        rows = len(pred_words) + 1
        cols = len(target_words) + 1
        d = [[0] * cols for _ in range(rows)]

        for i in range(rows):
            d[i][0] = i
        for j in range(cols):
            d[0][j] = j

        for i in range(1, rows):
            for j in range(1, cols):
                if pred_words[i-1] == target_words[j-1]:
                    d[i][j] = d[i-1][j-1]
                else:
                    d[i][j] = min(d[i-1][j], d[i][j-1], d[i-1][j-1]) + 1

        return d[-1][-1] / len(target_words)

    def _count_syllables(self, text: str) -> int:
        """
        Count syllables in Indonesian text.
        Simplified heuristic: count runs of consecutive vowels.
        """
        text = text.upper()
        vowels = "AIUEO"
        count = 0
        prev_was_vowel = False

        for char in text:
            is_vowel = char in vowels
            if is_vowel and not prev_was_vowel:
                count += 1
            prev_was_vowel = is_vowel

        # Never return 0: downstream divides by syllable count
        return max(1, count)

    def _extract_audio_features(
        self,
        audio: np.ndarray,
        sr: int,
        transcription: str
    ) -> Dict:
        """Extract a summary dict of audio features for reporting."""
        try:
            duration = len(audio) / sr
            rms = np.sqrt(np.mean(audio**2))
            rms_db = 20 * np.log10(rms + 1e-10)
            zcr = librosa.zero_crossings(audio).sum() / len(audio)

            # Spectral features
            spectral_centroid = librosa.feature.spectral_centroid(y=audio, sr=sr)[0].mean()
            spectral_rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr)[0].mean()
            spectral_bandwidth = librosa.feature.spectral_bandwidth(y=audio, sr=sr)[0].mean()

            # Tempo (librosa >= 0.10 returns an ndarray, older versions a
            # scalar — normalize to a plain float either way)
            tempo, _ = librosa.beat.beat_track(y=audio, sr=sr)
            tempo = float(np.atleast_1d(tempo)[0])

            return {
                'duration': round(duration, 3),
                'rms_db': round(rms_db, 2),
                'zero_crossing_rate': round(float(zcr), 4),
                'spectral_centroid': round(float(spectral_centroid), 2),
                'spectral_rolloff': round(float(spectral_rolloff), 2),
                'spectral_bandwidth': round(float(spectral_bandwidth), 2),
                'tempo': round(tempo, 2),
                'transcription': transcription
            }
        except Exception as e:
            # Keep the result usable even if feature extraction fails
            return {
                'duration': len(audio) / sr,
                'error': str(e)
            }

    def _get_grade(self, score: float) -> str:
        """Convert a 0-100 score to a letter grade A-E."""
        if score >= 90:
            return 'A'
        elif score >= 80:
            return 'B'
        elif score >= 70:
            return 'C'
        elif score >= 60:
            return 'D'
        else:
            return 'E'

    def _generate_feedback(
        self,
        overall_score: float,
        clarity_score: float,
        energy_score: float,
        speech_rate_score: float,
        pitch_consistency_score: float,
        snr_score: float,
        articulation_score: float,
        transcription: str,
        target_text: str,
        similarity: float,
        level: int,
        audio_features: Dict
    ) -> Tuple[str, List[str]]:
        """
        Generate user-facing feedback (Indonesian) plus a list of
        actionable suggestions based on the component scores.
        """

        feedback_parts = []
        suggestions = []

        # Overall feedback tier
        if overall_score >= 90:
            feedback_parts.append("🌟 Sempurna! Pengucapan Anda sangat baik.")
        elif overall_score >= 80:
            feedback_parts.append("👍 Bagus! Pengucapan sudah cukup jelas.")
        elif overall_score >= 70:
            feedback_parts.append("😊 Cukup baik, masih bisa ditingkatkan.")
        elif overall_score >= 60:
            feedback_parts.append("🤔 Perlu latihan lebih.")
        else:
            feedback_parts.append("💪 Terus berlatih!")

        # Transcription match summary
        if similarity < 0.8:
            feedback_parts.append(f"\n❌ Target: '{target_text}', Terdeteksi: '{transcription}'")
            suggestions.append(f"Fokus pada pengucapan '{target_text}' yang lebih jelas")
        elif similarity < 1.0:
            feedback_parts.append(f"\n⚠️ Hampir benar! Target: '{target_text}', Terdeteksi: '{transcription}'")
        else:
            feedback_parts.append(f"\n✅ Pengucapan '{target_text}' terdeteksi dengan sempurna!")

        # Component-specific suggestions
        if clarity_score < 70:
            suggestions.append("Ucapkan setiap huruf/kata dengan lebih jelas dan artikulasi yang baik")

        if energy_score < 70:
            if audio_features.get('rms_db', 0) < -35:
                suggestions.append("Volume terlalu rendah - bicaralah lebih keras")
            elif audio_features.get('rms_db', 0) > -8:
                suggestions.append("Volume terlalu tinggi - bicaralah lebih lembut")

        if speech_rate_score < 70 and level > 1:
            if audio_features.get('duration', 0) > 2.0:
                suggestions.append("Terlalu lambat - ucapkan dengan kecepatan yang lebih natural")
            else:
                suggestions.append("Terlalu cepat - ucapkan lebih pelan dan jelas")

        if pitch_consistency_score < 70:
            suggestions.append("Pertahankan nada suara yang lebih stabil")

        if snr_score < 70:
            suggestions.append("Rekam di tempat yang lebih tenang (kurangi noise latar belakang)")

        if articulation_score < 70:
            suggestions.append("Perbaiki artikulasi dengan membuka mulut lebih lebar")

        feedback = " ".join(feedback_parts)

        return feedback, suggestions
|
| 618 |
+
|
| 619 |
+
|
| 620 |
+
# =======================================
|
| 621 |
+
# USAGE EXAMPLE
|
| 622 |
+
# =======================================
|
| 623 |
+
|
| 624 |
+
# Usage example / smoke test: load the model and list the configured levels.
if __name__ == "__main__":
    print("="*70)
    print("🎯 ADVANCED VOCAL SCORING SYSTEM")
    print(" ASR-based (Whisper) + Comprehensive Audio Analysis")
    print("="*70)

    # Initialize the scorer (downloads/loads the Whisper checkpoint)
    scorer = AdvancedVocalScoringSystem(
        whisper_model="openai/whisper-small"  # or "openai/whisper-large-v3"
    )

    print("\n✅ System ready!")
    print("\nSupported levels:")
    # List every configured articulation level with its difficulty label
    for level_num, level_data in ARTICULATION_LEVELS.items():
        print(f" Level {level_num}: {level_data['name']} ({level_data['difficulty']})")
|
requirements.txt
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Vocal Articulation Assessment System - Updated
|
| 2 |
+
# Requirements untuk Whisper ASR + Audio Analysis
|
| 3 |
+
|
| 4 |
+
# Core ML Libraries
|
| 5 |
+
torch>=2.0.0
|
| 6 |
+
torchaudio>=2.0.0
|
| 7 |
+
transformers>=4.35.0
|
| 8 |
+
|
| 9 |
+
# Audio Processing
|
| 10 |
+
librosa>=0.10.0
|
| 11 |
+
soundfile>=0.12.0
|
| 12 |
+
audioread>=3.0.0
|
| 13 |
+
|
| 14 |
+
# Whisper dependencies
|
| 15 |
+
openai-whisper>=20231117 # Optional: untuk whisper standalone
|
| 16 |
+
accelerate>=0.20.0 # untuk faster inference
|
| 17 |
+
|
| 18 |
+
# Web Framework & API
|
| 19 |
+
fastapi>=0.104.0
|
| 20 |
+
uvicorn[standard]>=0.24.0
|
| 21 |
+
python-multipart>=0.0.6
|
| 22 |
+
|
| 23 |
+
# Gradio for UI
|
| 24 |
+
gradio>=4.0.0
|
| 25 |
+
|
| 26 |
+
# HuggingFace Spaces ZeroGPU (optional, only for HF Spaces)
|
| 27 |
+
# spaces>=0.1.0
|
| 28 |
+
|
| 29 |
+
# Utilities
|
| 30 |
+
numpy>=1.24.0
|
| 31 |
+
scipy>=1.11.0
|
| 32 |
+
pydantic>=2.0.0
|
| 33 |
+
python-Levenshtein>=0.21.0 # faster string similarity
|
| 34 |
+
|
| 35 |
+
# Development & Testing
|
| 36 |
+
pytest>=7.4.0
|
| 37 |
+
black>=23.0.0
|
| 38 |
+
flake8>=6.0.0
|
start.sh
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash

# =======================================
# START SCRIPT - Vocal Articulation System
# For Docker / HuggingFace Spaces deployment
# =======================================

# Abort on any command failure so a broken dependency install does not
# silently fall through to launching the app.
set -e

echo "🚀 Starting Vocal Articulation Assessment System..."

# Check Python version
python --version

# Install dependencies when no virtualenv directory is present.
# NOTE(review): this installs into the active interpreter, not into
# .venv — the directory check is only a proxy for "deps already set up";
# confirm this matches the intended deployment flow.
if [ ! -d ".venv" ]; then
    echo "📦 Installing dependencies..."
    pip install -r requirements.txt
fi

# Environment for unbuffered logs and Gradio binding inside a container
export PYTHONUNBUFFERED=1
export GRADIO_SERVER_NAME="0.0.0.0"
export GRADIO_SERVER_PORT=7860

# Start application
echo "✅ Starting Gradio interface..."
python app.py
|