# -*- coding: utf-8 -*-
"""Struktur_Berbicara_NLP.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/13UJp10f4bAJGPoYw--ASnK-U0JLhYdJl
"""
import os
os.environ['WANDB_DISABLED'] = 'true'
"""
Fine-tuning IndoBERT untuk Klasifikasi Struktur Berbicara
(Pembuka, Isi, Penutup)
Requirements:
pip install transformers torch pandas scikit-learn datasets
"""
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np
# ============ 1. LOAD AND PREPROCESS DATA ============
def load_and_prepare_data(csv_path):
    """Load data from CSV and prepare it for training."""
    df = pd.read_csv(csv_path)

    # Map string labels to integer ids
    label_map = {
        'opening': 0,
        'content': 1,
        'closing': 2
    }
    df['label_id'] = df['label'].map(label_map)

    # Split data: 80% train, 10% validation, 10% test
    train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label_id'])
    val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df['label_id'])

    print(f"Training data: {len(train_df)}")
    print(f"Validation data: {len(val_df)}")
    print(f"Test data: {len(test_df)}")
    print("\nTraining label distribution:")
    print(train_df['label'].value_counts())

    return train_df, val_df, test_df, label_map
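
# Illustrative sketch (not part of the original notebook): the loader above
# assumes a CSV with at least two columns, `text` and `label`, where `label`
# is one of 'opening', 'content', or 'closing'. The hypothetical helper below
# shows that column layout in miniature; a real CSV needs enough rows per
# class for the stratified split to work.
def _demo_dataframe():
    """Build a tiny DataFrame with the schema load_and_prepare_data expects."""
    return pd.DataFrame({
        'text': [
            'Selamat pagi hadirin yang saya hormati',
            'Berdasarkan data yang kami kumpulkan',
            'Demikian yang dapat saya sampaikan terima kasih'
        ],
        'label': ['opening', 'content', 'closing']
    })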
# ============ 2. CUSTOM DATASET CLASS ============
class SpeechStructureDataset(Dataset):
    """Custom Dataset wrapping texts and labels for the Trainer."""
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }
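
# Shape sanity check (illustrative; assumes a tokenizer object is already
# loaded, e.g. via AutoTokenizer.from_pretrained). Each dataset item is a dict
# of fixed-size tensors, which is the per-example format Trainer expects.
def _demo_dataset_item(tokenizer):
    """Return one encoded item; input_ids/attention_mask have shape (128,)."""
    ds = SpeechStructureDataset(
        texts=['Selamat pagi hadirin'],
        labels=[0],
        tokenizer=tokenizer,
        max_length=128
    )
    item = ds[0]
    assert item['input_ids'].shape == (128,)
    assert item['attention_mask'].shape == (128,)
    return item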
# ============ 3. EVALUATION METRICS ============
def compute_metrics(pred):
    """Compute evaluation metrics for the Trainer."""
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
    }
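
# Optional variant (an assumption, not in the original script): with only
# three classes and a small dataset, macro-averaged F1 can track per-class
# balance better than raw accuracy. sklearn's f1_score supports this directly;
# swap this function into the Trainer via compute_metrics if desired.
def compute_metrics_with_f1(pred):
    """Accuracy plus macro F1 over the three structure classes."""
    from sklearn.metrics import f1_score
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(labels, preds),
        'f1_macro': f1_score(labels, preds, average='macro'),
    }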
# ============ 4. MODEL TRAINING ============
def train_model(train_df, val_df, label_map, model_name='indobenchmark/indobert-base-p1'):
    """Fine-tune the IndoBERT model."""
    # Load tokenizer and model
    print(f"\nLoading model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=len(label_map)
    )

    # Build datasets
    train_dataset = SpeechStructureDataset(
        texts=train_df['text'].tolist(),
        labels=train_df['label_id'].tolist(),
        tokenizer=tokenizer
    )
    val_dataset = SpeechStructureDataset(
        texts=val_df['text'].tolist(),
        labels=val_df['label_id'].tolist(),
        tokenizer=tokenizer
    )

    # Training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=30,  # more epochs for a small dataset
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=100,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        learning_rate=2e-5,
        seed=42,
        report_to="none",  # disable all external logging
        save_total_limit=2
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics
    )

    # Training
    print("\n🚀 Starting training...")
    trainer.train()

    # Save the best model
    trainer.save_model('./best_model')
    tokenizer.save_pretrained('./best_model')
    print("\n✅ Training done! Model saved to './best_model'")

    return trainer, tokenizer, model
# ============ 5. MODEL EVALUATION ============
def evaluate_model(trainer, test_df, tokenizer, label_map):
    """Evaluate the model on the test set."""
    test_dataset = SpeechStructureDataset(
        texts=test_df['text'].tolist(),
        labels=test_df['label_id'].tolist(),
        tokenizer=tokenizer
    )

    # Predictions
    predictions = trainer.predict(test_dataset)
    pred_labels = predictions.predictions.argmax(-1)
    true_labels = test_df['label_id'].tolist()

    # Reverse mapping from id to label (kept for reference)
    id_to_label = {v: k for k, v in label_map.items()}

    # Classification report
    print("\n📊 EVALUATION RESULTS:")
    print("\nClassification Report:")
    print(classification_report(
        true_labels,
        pred_labels,
        target_names=list(label_map.keys())
    ))

    # Confusion matrix: rows = true labels, columns = predictions,
    # ordered opening (0), content (1), closing (2)
    print("\nConfusion Matrix:")
    cm = confusion_matrix(true_labels, pred_labels)
    print(cm)

    return predictions
# ============ 6. PREDICTION FUNCTION ============
def predict_text(text, model_path='./best_model'):
    """Predict the label for a new piece of text."""
    # Load model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    model.eval()

    # Tokenize
    inputs = tokenizer(
        text,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    # Predict
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
        predicted_class = torch.argmax(predictions, dim=-1).item()
        confidence = predictions[0][predicted_class].item()

    # Map the result back to a label
    label_map = {0: 'opening', 1: 'content', 2: 'closing'}
    predicted_label = label_map[predicted_class]

    return {
        'text': text,
        'predicted_label': predicted_label,
        'confidence': confidence,
        'all_probabilities': {
            'opening': predictions[0][0].item(),
            'content': predictions[0][1].item(),
            'closing': predictions[0][2].item()
        }
    }
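
# Efficiency sketch (an assumption about serving use, not in the original
# code): predict_text reloads the tokenizer and model from disk on every call.
# The hypothetical cache below loads them once per path and reuses them, which
# matters when the function backs an API endpoint.
_MODEL_CACHE = {}

def _get_model(model_path='./best_model'):
    """Load and cache a (tokenizer, model) pair for the given path."""
    if model_path not in _MODEL_CACHE:
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForSequenceClassification.from_pretrained(model_path)
        model.eval()
        _MODEL_CACHE[model_path] = (tokenizer, model)
    return _MODEL_CACHE[model_path]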
# ============ 7. MAIN EXECUTION ============
if __name__ == "__main__":
    # Path to the CSV file
    CSV_PATH = '/content/drive/MyDrive/Colab Notebooks/dataset/struktur.csv'

    print("="*60)
    print("FINE-TUNING INDOBERT - SPEECH STRUCTURE CLASSIFICATION")
    print("="*60)

    # 1. Load and prepare data
    train_df, val_df, test_df, label_map = load_and_prepare_data(CSV_PATH)

    # 2. Train the model
    trainer, tokenizer, model = train_model(train_df, val_df, label_map)

    # 3. Evaluate the model
    evaluate_model(trainer, test_df, tokenizer, label_map)

    # 4. Example predictions
    print("\n" + "="*60)
    print("EXAMPLE PREDICTIONS")
    print("="*60)

    test_texts = [
        "Selamat pagi hadirin yang saya hormati",
        "Berdasarkan data yang kami kumpulkan",
        "Demikian yang dapat saya sampaikan terima kasih"
    ]

    for text in test_texts:
        result = predict_text(text)
        print(f"\nText: {result['text']}")
        print(f"Prediction: {result['predicted_label']}")
        print(f"Confidence: {result['confidence']:.2%}")
        print(f"All class probabilities: {result['all_probabilities']}")

    print("\n✨ Done!")
"""
Analisis Struktur Public Speaking
Deteksi Opening, Content, Closing dari transkrip lengkap
dengan scoring otomatis untuk penilaian
"""
import os
os.environ['WANDB_DISABLED'] = 'true'
import pandas as pd
import torch
import re
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from typing import List, Dict, Tuple
# ============ 1. SENTENCE SPLITTER ============
def split_into_sentences(text: str) -> List[str]:
    """Split text into sentence-like fragments."""
    # Split on punctuation marks (note: commas and semicolons also split)
    sentences = re.split(r'[.!?,;\n]+', text)
    # Strip whitespace and drop empty fragments
    sentences = [s.strip() for s in sentences if s.strip()]
    return sentences
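
# Behaviour check (illustrative): because the regex also splits on commas and
# semicolons, the "sentences" are really clause-level fragments, which suits
# the per-fragment classification below.
def _demo_split():
    """Commas and periods both act as boundaries."""
    parts = split_into_sentences("Selamat pagi, hadirin. Terima kasih!")
    assert parts == ['Selamat pagi', 'hadirin', 'Terima kasih']
    return parts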
# ============ 2. BATCH PREDICTION ============
def predict_sentences(sentences: List[str], model_path='./best_model') -> List[Dict]:
    """Predict a label for each sentence in the list."""
    # Load model
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    model.eval()

    label_map = {0: 'opening', 1: 'content', 2: 'closing'}
    results = []

    for idx, sentence in enumerate(sentences):
        # Tokenize
        inputs = tokenizer(
            sentence,
            add_special_tokens=True,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Predict
        with torch.no_grad():
            outputs = model(**inputs)
            probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
            predicted_class = torch.argmax(probs, dim=-1).item()
            confidence = probs[0][predicted_class].item()

        results.append({
            'sentence_idx': idx,
            'text': sentence,
            'predicted_label': label_map[predicted_class],
            'confidence': confidence,
            'probs': {
                'opening': probs[0][0].item(),
                'content': probs[0][1].item(),
                'closing': probs[0][2].item()
            }
        })

    return results
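
# Performance sketch (an assumption, not part of the original service): the
# loop above tokenizes and classifies one sentence per forward pass. For long
# transcripts a single padded batch is usually much faster, especially on GPU.
# This helper is hypothetical and reuses an already-loaded tokenizer/model.
def predict_sentences_batched(sentences: List[str], tokenizer, model) -> torch.Tensor:
    """Encode all sentences at once; return class probabilities with shape
    (len(sentences), 3), ordered opening/content/closing."""
    model.eval()
    inputs = tokenizer(
        sentences,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors='pt'
    )
    with torch.no_grad():
        logits = model(**inputs).logits
    return torch.nn.functional.softmax(logits, dim=-1)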
# ============ 3. POST-PROCESSING & HEURISTICS ============
def apply_structure_rules(predictions: List[Dict]) -> List[Dict]:
    """
    Apply rules to clean up the detected structure:
    - Opening at the start
    - Closing at the end
    - Content in the middle
    """
    if not predictions:
        return predictions

    n = len(predictions)

    # Rule 1: the first 2 sentences lean toward opening (if confidence is high)
    for i in range(min(2, n)):
        if predictions[i]['probs']['opening'] > 0.8:  # threshold
            predictions[i]['predicted_label'] = 'opening'
            predictions[i]['adjusted'] = True

    # Rule 2: the last 2 sentences lean toward closing (if confidence is high)
    for i in range(max(0, n-2), n):
        if predictions[i]['probs']['closing'] > 0.8:  # threshold
            predictions[i]['predicted_label'] = 'closing'
            predictions[i]['adjusted'] = True

    # Rule 3: detect transitions via keywords (Indonesian opening/closing phrases)
    closing_keywords = ['demikian', 'terima kasih', 'sekian', 'akhir kata',
                        'wassalam', 'selamat pagi dan', 'sampai jumpa']
    opening_keywords = ['selamat pagi', 'selamat siang', 'assalamualaikum',
                        'hadirin', 'pertama-tama', 'izinkan saya']

    for pred in predictions:
        text_lower = pred['text'].lower()
        # Check closing keywords
        if any(kw in text_lower for kw in closing_keywords):
            pred['predicted_label'] = 'closing'
            pred['keyword_match'] = True
        # Check opening keywords
        elif any(kw in text_lower for kw in opening_keywords):
            pred['predicted_label'] = 'opening'
            pred['keyword_match'] = True

    return predictions
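
# Rule check (illustrative): a sentence containing a closing keyword such as
# "demikian" is relabelled 'closing' regardless of the model's own prediction.
def _demo_rules():
    """Keyword override beats the model label."""
    preds = [{
        'sentence_idx': 0,
        'text': 'Demikian yang dapat saya sampaikan',
        'predicted_label': 'content',
        'confidence': 0.6,
        'probs': {'opening': 0.1, 'content': 0.6, 'closing': 0.3},
    }]
    out = apply_structure_rules(preds)
    assert out[0]['predicted_label'] == 'closing'
    return out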
# ============ 4. STRUCTURE SEGMENTATION ============
def segment_speech_structure(predictions: List[Dict]) -> Dict:
    """
    Group sentences by the detected structure label.
    """
    structure = {
        'opening': [],
        'content': [],
        'closing': []
    }
    for pred in predictions:
        label = pred['predicted_label']
        structure[label].append(pred)
    return structure
# ============ 5. SCORING SYSTEM ============
def calculate_structure_score(structure: Dict) -> Dict:
    """
    Compute a score based on these criteria:
    - Score 5: opening (1), content (1), closing (1)
    - Score 4: opening (1), content (1), closing (0)
    - Score 3: opening (1), content (0), closing (1)
    - Score 2: opening (0), content (1), closing (1)
    - Score 1: opening (1), content (0), closing (0)
    - Score 0: no recognizable structure
    """
    has_opening = len(structure['opening']) > 0
    has_content = len(structure['content']) > 0
    has_closing = len(structure['closing']) > 0

    # Compute the score
    if has_opening and has_content and has_closing:
        score = 5
        description = "Perfect! Complete structure (Opening, Content, Closing)"
    elif has_opening and has_content and not has_closing:
        score = 4
        description = "Good. Opening and content present, but the closing is missing"
    elif has_opening and not has_content and has_closing:
        score = 3
        description = "Fair. Opening and closing present, but the content is unclear"
    elif not has_opening and has_content and has_closing:
        score = 2
        description = "Needs improvement. A clear opening is missing"
    elif has_opening and not has_content and not has_closing:
        score = 1
        description = "Incomplete. Only an opening was found"
    else:
        score = 0
        description = "No structure could be reliably detected"

    return {
        'score': score,
        'max_score': 5,
        'description': description,
        'has_opening': has_opening,
        'has_content': has_content,
        'has_closing': has_closing,
        'opening_count': len(structure['opening']),
        'content_count': len(structure['content']),
        'closing_count': len(structure['closing'])
    }
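
# Rubric check (illustrative): a structure with an opening and content but no
# closing scores 4 under the criteria above.
def _demo_score():
    """Opening + content, no closing -> score 4."""
    structure = {
        'opening': [{'text': 'Selamat pagi'}],
        'content': [{'text': 'Berdasarkan data'}],
        'closing': []
    }
    result = calculate_structure_score(structure)
    assert result['score'] == 4 and not result['has_closing']
    return result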
# ============ 6. MAIN ANALYSIS FUNCTION ============
def analyze_speech(transcript: str, model_path='./best_model',
                   apply_rules=True, verbose=True) -> Dict:
    """
    Main entry point for analyzing speech structure.
    Args:
        transcript: Full text of the speech
        model_path: Path to the fine-tuned model
        apply_rules: Whether to apply the heuristic rules
        verbose: Whether to print a detailed report
    Returns:
        Dict containing the full analysis results
    """
    # 1. Split into sentences
    sentences = split_into_sentences(transcript)
    if verbose:
        print(f"📝 Sentences detected: {len(sentences)}")

    # 2. Predict each sentence
    predictions = predict_sentences(sentences, model_path)

    # 3. Apply rules (optional)
    if apply_rules:
        predictions = apply_structure_rules(predictions)

    # 4. Segment structure
    structure = segment_speech_structure(predictions)

    # 5. Calculate score
    score_result = calculate_structure_score(structure)

    # 6. Generate report
    if verbose:
        print("\n" + "="*70)
        print("📊 SPEECH STRUCTURE ANALYSIS RESULTS")
        print("="*70)
        print(f"\n🎯 SCORE: {score_result['score']}/{score_result['max_score']}")
        print(f"📝 {score_result['description']}")
        print("\n✅ Detected structure:")
        print(f"  • Opening: {score_result['opening_count']} sentences")
        print(f"  • Content: {score_result['content_count']} sentences")
        print(f"  • Closing: {score_result['closing_count']} sentences")
        print("\n📄 Details per section:")
        print(f"\n{'='*70}")
        for section in ['opening', 'content', 'closing']:
            if structure[section]:
                print(f"\n🔹 {section.upper()}:")
                for item in structure[section]:
                    print(f"  [{item['sentence_idx']+1}] {item['text'][:80]}...")
                    print(f"      Confidence: {item['confidence']:.2%}")
        print(f"\n{'='*70}")

    return {
        'sentences': sentences,
        'predictions': predictions,
        'structure': structure,
        'score': score_result,
        'transcript': transcript
    }
# ============ 7. EXAMPLE USAGE ============
if __name__ == "__main__":
    # Sample speech transcript (Indonesian, since the model is trained on Indonesian text)
    sample_transcript = """
    Assalamualaikum warahmatullahi wabarakatuh. Selamat pagi hadirin yang saya hormati
    Puji syukur kita panjatkan kehadirat Tuhan Yang Maha Esa
    Pada kesempatan ini saya akan membahas tentang pentingnya pendidikan karakter
    Menurut data dari Kemendikbud tahun 2023, tingkat literasi di Indonesia masih perlu ditingkatkan
    Berdasarkan penelitian menunjukkan bahwa pendidikan karakter sangat penting untuk generasi muda
    Contohnya seperti yang terjadi di negara-negara maju, mereka mengutamakan pendidikan karakter sejak dini
    Oleh karena itu kita perlu bergerak bersama untuk meningkatkan kualitas pendidikan
    Demikian yang dapat saya sampaikan
    Terima kasih atas perhatian Bapak dan Ibu sekalian
    Wassalamualaikum warahmatullahi wabarakatuh
    """

    print("🎤 PUBLIC SPEAKING STRUCTURE ANALYSIS")
    print("="*70)

    # Run the analysis
    result = analyze_speech(
        transcript=sample_transcript,
        model_path='./best_model',
        apply_rules=True,
        verbose=True
    )

    print("\n✨ Analysis complete!")