# -*- coding: utf-8 -*-
"""Struktur_Berbicara_NLP.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/13UJp10f4bAJGPoYw--ASnK-U0JLhYdJl
"""

import os
os.environ['WANDB_DISABLED'] = 'true'

"""
Fine-tuning IndoBERT for Speech Structure Classification
(Opening, Content, Closing)

Requirements:
pip install transformers torch pandas scikit-learn datasets
"""

import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np


# ============ 1. LOAD AND PREPROCESS DATA ============
def load_and_prepare_data(csv_path):
    """Load data from CSV and prepare it for training."""
    df = pd.read_csv(csv_path)

    # Map labels to integer ids
    label_map = {
        'opening': 0,
        'content': 1,
        'closing': 2
    }
    df['label_id'] = df['label'].map(label_map)

    # Split data: 80% train, 10% validation, 10% test
    train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label_id'])
    val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df['label_id'])

    print(f"Training data: {len(train_df)}")
    print(f"Validation data: {len(val_df)}")
    print(f"Test data: {len(test_df)}")
    print(f"\nTraining label distribution:")
    print(train_df['label'].value_counts())

    return train_df, val_df, test_df, label_map
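
# The CSV is expected to have at least two columns, 'text' (one sentence per
# row) and 'label' (one of 'opening' / 'content' / 'closing'), since those
# are the columns read above. A minimal compatible file would look like this
# (illustrative rows, not real data):
#
#   text,label
#   "Selamat pagi hadirin yang saya hormati",opening
#   "Berdasarkan data yang kami kumpulkan",content
#   "Demikian yang dapat saya sampaikan",closing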


# ============ 2. CUSTOM DATASET CLASS ============
class SpeechStructureDataset(Dataset):
    """Custom Dataset for feeding the data to the Trainer."""

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }
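
# Quick sanity check for the dataset class (a sketch; assumes a tokenizer is
# already loaded via AutoTokenizer.from_pretrained):
#
#   ds = SpeechStructureDataset(["Selamat pagi hadirin"], [0], tokenizer)
#   item = ds[0]
#   assert item['input_ids'].shape == (128,)   # padded to max_length
#   assert item['labels'].item() == 0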


# ============ 3. EVALUATION METRICS ============
def compute_metrics(pred):
    """Compute metrics for the Trainer's evaluation loop."""
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
    }
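
# If per-class balance matters, macro-F1 can be reported alongside accuracy.
# A sketch of that variant (not part of the original pipeline):
#
#   from sklearn.metrics import f1_score
#
#   def compute_metrics(pred):
#       labels = pred.label_ids
#       preds = pred.predictions.argmax(-1)
#       return {
#           'accuracy': accuracy_score(labels, preds),
#           'f1_macro': f1_score(labels, preds, average='macro'),
#       }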


# ============ 4. MODEL TRAINING ============
def train_model(train_df, val_df, label_map, model_name='indobenchmark/indobert-base-p1'):
    """Fine-tune the IndoBERT model."""
    # Load tokenizer and model
    print(f"\nLoading model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=len(label_map)
    )

    # Build datasets
    train_dataset = SpeechStructureDataset(
        texts=train_df['text'].tolist(),
        labels=train_df['label_id'].tolist(),
        tokenizer=tokenizer
    )
    val_dataset = SpeechStructureDataset(
        texts=val_df['text'].tolist(),
        labels=val_df['label_id'].tolist(),
        tokenizer=tokenizer
    )

    # Training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=30,  # More epochs for a small dataset
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=100,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        learning_rate=2e-5,
        seed=42,
        report_to="none",  # Disable all external logging
        save_total_limit=2
    )
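
    # Note: 'eval_strategy' is the argument name in recent transformers
    # releases; older versions expect 'evaluation_strategy' instead. If a
    # TypeError is raised here, the installed transformers likely predates
    # the rename.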

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics
    )

    # Training
    print("\nStarting training...")
    trainer.train()

    # Save the best model
    trainer.save_model('./best_model')
    tokenizer.save_pretrained('./best_model')
    print("\nTraining finished! Model saved to './best_model'")

    return trainer, tokenizer, model


# ============ 5. MODEL EVALUATION ============
def evaluate_model(trainer, test_df, tokenizer, label_map):
    """Evaluate the model on the test set."""
    test_dataset = SpeechStructureDataset(
        texts=test_df['text'].tolist(),
        labels=test_df['label_id'].tolist(),
        tokenizer=tokenizer
    )

    # Predict
    predictions = trainer.predict(test_dataset)
    pred_labels = predictions.predictions.argmax(-1)
    true_labels = test_df['label_id'].tolist()

    # Reverse mapping for labels
    id_to_label = {v: k for k, v in label_map.items()}

    # Classification report
    print("\nEVALUATION RESULTS:")
    print("\nClassification Report:")
    print(classification_report(
        true_labels,
        pred_labels,
        target_names=list(label_map.keys())
    ))

    # Confusion matrix
    print("\nConfusion Matrix:")
    cm = confusion_matrix(true_labels, pred_labels)
    print(cm)

    return predictions
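
# The raw confusion matrix prints without row/column labels; a labeled view
# is a one-liner with pandas (a sketch reusing the objects above):
#
#   labels = list(label_map.keys())
#   print(pd.DataFrame(cm, index=labels, columns=labels))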


# ============ 6. PREDICTION FUNCTION ============
def predict_text(text, model_path='./best_model'):
    """Predict the label for a new piece of text."""
    # Load model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    model.eval()

    # Tokenize
    inputs = tokenizer(
        text,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    # Predict
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
        predicted_class = torch.argmax(predictions, dim=-1).item()
        confidence = predictions[0][predicted_class].item()

    # Map the result back to a label
    label_map = {0: 'opening', 1: 'content', 2: 'closing'}
    predicted_label = label_map[predicted_class]

    return {
        'text': text,
        'predicted_label': predicted_label,
        'confidence': confidence,
        'all_probabilities': {
            'opening': predictions[0][0].item(),
            'content': predictions[0][1].item(),
            'closing': predictions[0][2].item()
        }
    }
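
# Note that predict_text reloads the model from disk on every call, which is
# fine for one-off checks but slow in a loop. For repeated predictions, load
# once and reuse (a sketch):
#
#   tokenizer = AutoTokenizer.from_pretrained('./best_model')
#   model = AutoModelForSequenceClassification.from_pretrained('./best_model')
#   model.eval()
#   # ...then tokenize and run model(**inputs) per text, exactly as above.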


# ============ 7. MAIN EXECUTION ============
if __name__ == "__main__":
    # Path to the CSV file
    CSV_PATH = '/content/drive/MyDrive/Colab Notebooks/dataset/struktur.csv'

    print("="*60)
    print("FINE-TUNING INDOBERT - SPEECH STRUCTURE CLASSIFICATION")
    print("="*60)

    # 1. Load and prepare the data
    train_df, val_df, test_df, label_map = load_and_prepare_data(CSV_PATH)

    # 2. Train the model
    trainer, tokenizer, model = train_model(train_df, val_df, label_map)

    # 3. Evaluate the model
    evaluate_model(trainer, test_df, tokenizer, label_map)

    # 4. Example predictions
    print("\n" + "="*60)
    print("EXAMPLE PREDICTIONS")
    print("="*60)

    test_texts = [
        "Selamat pagi hadirin yang saya hormati",
        "Berdasarkan data yang kami kumpulkan",
        "Demikian yang dapat saya sampaikan terima kasih"
    ]

    for text in test_texts:
        result = predict_text(text)
        print(f"\nText: {result['text']}")
        print(f"Prediction: {result['predicted_label']}")
        print(f"Confidence: {result['confidence']:.2%}")
        print(f"All class probabilities: {result['all_probabilities']}")

    print("\nDone!")
| """ | |
| Analisis Struktur Public Speaking | |
| Deteksi Opening, Content, Closing dari transkrip lengkap | |
| dengan scoring otomatis untuk penilaian | |
| """ | |
| import os | |
| os.environ['WANDB_DISABLED'] = 'true' | |
| import pandas as pd | |
| import torch | |
| import re | |
| from transformers import AutoTokenizer, AutoModelForSequenceClassification | |
| from typing import List, Dict, Tuple | |
| # ============ 1. SENTENCE SPLITTER ============ | |
| def split_into_sentences(text: str) -> List[str]: | |
| """Split text menjadi kalimat-kalimat""" | |
| # Split berdasarkan tanda baca | |
| sentences = re.split(r'[.!?,;\n]+', text) | |
| # Bersihkan whitespace dan filter kalimat kosong | |
| sentences = [s.strip() for s in sentences if s.strip()] | |
| return sentences | |
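
# Note: the pattern also splits on commas and semicolons, so the returned
# "sentences" are really clause-level segments. For example:
#
#   split_into_sentences("Selamat pagi, hadirin. Terima kasih!")
#   # -> ['Selamat pagi', 'hadirin', 'Terima kasih']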


# ============ 2. BATCH PREDICTION ============
def predict_sentences(sentences: List[str], model_path='./best_model') -> List[Dict]:
    """Predict labels for a list of sentences."""
    # Load the model
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    model.eval()

    label_map = {0: 'opening', 1: 'content', 2: 'closing'}
    results = []

    for idx, sentence in enumerate(sentences):
        # Tokenize
        inputs = tokenizer(
            sentence,
            add_special_tokens=True,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Predict
        with torch.no_grad():
            outputs = model(**inputs)
            probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
            predicted_class = torch.argmax(probs, dim=-1).item()
            confidence = probs[0][predicted_class].item()

        results.append({
            'sentence_idx': idx,
            'text': sentence,
            'predicted_label': label_map[predicted_class],
            'confidence': confidence,
            'probs': {
                'opening': probs[0][0].item(),
                'content': probs[0][1].item(),
                'closing': probs[0][2].item()
            }
        })

    return results
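
# Despite the section name, the loop above runs one forward pass per
# sentence. For long transcripts, a single batched pass is faster (a sketch
# reusing the same tokenizer and model):
#
#   inputs = tokenizer(sentences, padding=True, truncation=True,
#                      max_length=128, return_tensors='pt')
#   with torch.no_grad():
#       probs = torch.nn.functional.softmax(model(**inputs).logits, dim=-1)
#   # probs[i] then holds the class probabilities for sentences[i].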


# ============ 3. POST-PROCESSING & HEURISTICS ============
def apply_structure_rules(predictions: List[Dict]) -> List[Dict]:
    """
    Apply rules to clean up the structure:
    - Opening at the start
    - Closing at the end
    - Content in the middle
    """
    if not predictions:
        return predictions

    n = len(predictions)

    # Rule 1: the first 2 sentences tend to be opening (if confidence is high)
    for i in range(min(2, n)):
        if predictions[i]['probs']['opening'] > 0.8:  # Threshold
            predictions[i]['predicted_label'] = 'opening'
            predictions[i]['adjusted'] = True

    # Rule 2: the last 2 sentences tend to be closing (if confidence is high)
    for i in range(max(0, n-2), n):
        if predictions[i]['probs']['closing'] > 0.8:  # Threshold
            predictions[i]['predicted_label'] = 'closing'
            predictions[i]['adjusted'] = True

    # Rule 3: detect transitions via keywords
    closing_keywords = ['demikian', 'terima kasih', 'sekian', 'akhir kata',
                        'wassalam', 'selamat pagi dan', 'sampai jumpa']
    opening_keywords = ['selamat pagi', 'selamat siang', 'assalamualaikum',
                        'hadirin', 'pertama-tama', 'izinkan saya']

    for pred in predictions:
        text_lower = pred['text'].lower()

        # Check closing keywords
        if any(kw in text_lower for kw in closing_keywords):
            pred['predicted_label'] = 'closing'
            pred['keyword_match'] = True
        # Check opening keywords
        elif any(kw in text_lower for kw in opening_keywords):
            pred['predicted_label'] = 'opening'
            pred['keyword_match'] = True

    return predictions


# ============ 4. STRUCTURE SEGMENTATION ============
def segment_speech_structure(predictions: List[Dict]) -> Dict:
    """
    Group sentences by their detected structure label.
    """
    structure = {
        'opening': [],
        'content': [],
        'closing': []
    }

    for pred in predictions:
        label = pred['predicted_label']
        structure[label].append(pred)

    return structure


# ============ 5. SCORING SYSTEM ============
def calculate_structure_score(structure: Dict) -> Dict:
    """
    Compute a score based on these criteria:
    - 5 points: opening (1), content (1), closing (1)
    - 4 points: opening (1), content (1), closing (0)
    - 3 points: opening (1), content (0), closing (1)
    - 2 points: opening (0), content (1), closing (1)
    - 1 point:  opening (1), content (0), closing (0)
    - 0 points: no complete structure detected
    """
    has_opening = len(structure['opening']) > 0
    has_content = len(structure['content']) > 0
    has_closing = len(structure['closing']) > 0

    # Compute the score
    if has_opening and has_content and has_closing:
        score = 5
        description = "Perfect! Complete structure (Opening, Content, Closing)"
    elif has_opening and has_content and not has_closing:
        score = 4
        description = "Good. Has an opening and content, but lacks a closing"
    elif has_opening and not has_content and has_closing:
        score = 3
        description = "Fair. Has an opening and a closing, but the content is unclear"
    elif not has_opening and has_content and has_closing:
        score = 2
        description = "Needs improvement. Lacks a clear opening"
    elif has_opening and not has_content and not has_closing:
        score = 1
        description = "Incomplete. Only an opening was found"
    else:
        score = 0
        description = "Structure could not be reliably detected"

    return {
        'score': score,
        'max_score': 5,
        'description': description,
        'has_opening': has_opening,
        'has_content': has_content,
        'has_closing': has_closing,
        'opening_count': len(structure['opening']),
        'content_count': len(structure['content']),
        'closing_count': len(structure['closing'])
    }
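
# Worked example: a structure dict with opening and content sentences but no
# closing scores 4/5 under the rubric above (pred1/pred2 stand in for the
# prediction dicts produced by predict_sentences):
#
#   calculate_structure_score({'opening': [pred1], 'content': [pred2],
#                              'closing': []})
#   # -> {'score': 4, 'description': 'Good. Has an opening and content, ...'}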


# ============ 6. MAIN ANALYSIS FUNCTION ============
def analyze_speech(transcript: str, model_path='./best_model',
                   apply_rules=True, verbose=True) -> Dict:
    """
    Main entry point for analyzing the structure of a speech.

    Args:
        transcript: Full text of the speech
        model_path: Path to the trained model
        apply_rules: Whether to apply the heuristic rules
        verbose: Whether to print details

    Returns:
        Dict with the full analysis results
    """
    # 1. Split into sentences
    sentences = split_into_sentences(transcript)
    if verbose:
        print(f"Sentences detected: {len(sentences)}")

    # 2. Predict each sentence
    predictions = predict_sentences(sentences, model_path)

    # 3. Apply rules (optional)
    if apply_rules:
        predictions = apply_structure_rules(predictions)

    # 4. Segment structure
    structure = segment_speech_structure(predictions)

    # 5. Calculate score
    score_result = calculate_structure_score(structure)

    # 6. Generate report
    if verbose:
        print("\n" + "="*70)
        print("SPEECH STRUCTURE ANALYSIS RESULTS")
        print("="*70)
        print(f"\nSCORE: {score_result['score']}/{score_result['max_score']}")
        print(f"{score_result['description']}")
        print(f"\nDetected structure:")
        print(f"  - Opening: {score_result['opening_count']} sentences")
        print(f"  - Content: {score_result['content_count']} sentences")
        print(f"  - Closing: {score_result['closing_count']} sentences")

        print(f"\nDetails per section:")
        print(f"\n{'='*70}")
        for section in ['opening', 'content', 'closing']:
            if structure[section]:
                print(f"\n{section.upper()}:")
                for item in structure[section]:
                    print(f"  [{item['sentence_idx']+1}] {item['text'][:80]}...")
                    print(f"      Confidence: {item['confidence']:.2%}")
        print(f"\n{'='*70}")

    return {
        'sentences': sentences,
        'predictions': predictions,
        'structure': structure,
        'score': score_result,
        'transcript': transcript
    }


# ============ 7. EXAMPLE USAGE ============
if __name__ == "__main__":
    # Example speech transcript
    sample_transcript = """
    Assalamualaikum warahmatullahi wabarakatuh. Selamat pagi hadirin yang saya hormati
    Puji syukur kita panjatkan kehadirat Tuhan Yang Maha Esa
    Pada kesempatan ini saya akan membahas tentang pentingnya pendidikan karakter
    Menurut data dari Kemendikbud tahun 2023, tingkat literasi di Indonesia masih perlu ditingkatkan
    Berdasarkan penelitian menunjukkan bahwa pendidikan karakter sangat penting untuk generasi muda
    Contohnya seperti yang terjadi di negara-negara maju, mereka mengutamakan pendidikan karakter sejak dini
    Oleh karena itu kita perlu bergerak bersama untuk meningkatkan kualitas pendidikan
    Demikian yang dapat saya sampaikan
    Terima kasih atas perhatian Bapak dan Ibu sekalian
    Wassalamualaikum warahmatullahi wabarakatuh
    """

    print("PUBLIC SPEAKING STRUCTURE ANALYSIS")
    print("="*70)

    # Run the analysis
    result = analyze_speech(
        transcript=sample_transcript,
        model_path='./best_model',
        apply_rules=True,
        verbose=True
    )

    print("\nAnalysis finished!")