# NEW-ASR-VOXLINGUA
# ==============================================================================
# Cell 1: Environment Setup & Dependencies
#
# CORRECTED: Forcing SpeechBrain to version 0.5.16 to ensure backward
# compatibility with the old TalTechNLP XLS-R model.
# ==============================================================================
print("CELL 1: Setting up the environment with specific SpeechBrain version...")
# --- CORE CORRECTION ---
# Uninstall any existing newer versions and install the last stable 0.5.x
# release (0.5.16), which is compatible with the old TalTechNLP model's file paths.
!pip uninstall -y -q speechbrain
!pip install -q speechbrain==0.5.16
# --- END CORRECTION ---
import torch
print("\n--- System Check ---")
if torch.cuda.is_available():
    print(f"✅ GPU found: {torch.cuda.get_device_name(0)}")
    print(f"   CUDA Version: {torch.version.cuda}")
else:
    print("⚠️ GPU not found. Using CPU. This will be significantly slower.")
print("--- End System Check ---\n")
!pip show speechbrain
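# Quick sanity check (added sketch, not in the original notebook): confirm the
# pinned SpeechBrain release is the one actually imported before loading models.
import speechbrain
print(f"SpeechBrain version in use: {speechbrain.__version__}")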
| print("CELL 2: Importing libraries and setting up language maps...") | |
| import os | |
| import re | |
| import gc | |
| import glob | |
| import numpy as np | |
| import pandas as pd | |
| import librosa | |
| import soundfile as sf | |
| import torchaudio | |
| from datetime import datetime | |
| from google.colab import files | |
| import subprocess | |
| import shutil | |
| # Transformers and ML libraries | |
from transformers import AutoModel, Wav2Vec2Processor, Wav2Vec2ForCTC
# SpeechBrain 0.5.x keeps its inference interfaces under speechbrain.pretrained;
# the speechbrain.inference namespace only exists from 1.0 onwards.
from speechbrain.pretrained import EncoderClassifier
from speechbrain.pretrained.interfaces import foreign_class
from tokenizers import Tokenizer, models, trainers, pre_tokenizers
import warnings
warnings.filterwarnings('ignore')
# Complete language mappings as sets for O(1) lookup
INDO_ARYAN_LANGS = {'hi', 'bn', 'mr', 'gu', 'pa', 'or', 'as', 'ur', 'ks', 'sd', 'ne', 'kok'}
DRAVIDIAN_LANGS = {'ta', 'te', 'kn', 'ml'}
LOW_RESOURCE_LANGS = {'brx', 'mni', 'sat', 'doi'}
# Research-verified cross-lingual transfer mapping
TRANSFER_MAPPING = {'brx': 'hi', 'sat': 'hi', 'doi': 'pa', 'mni': 'bn'}
ALL_SUPPORTED_LANGS = INDO_ARYAN_LANGS | DRAVIDIAN_LANGS | LOW_RESOURCE_LANGS
print("✅ Libraries imported successfully.")
print(f"📊 Total languages supported: {len(ALL_SUPPORTED_LANGS)}\n")
| print("CELL 3: Defining audio preprocessing functions...") | |
| SUPPORTED_FORMATS = {'.wav', '.mp3', '.flac', '.ogg', '.m4a', '.aac'} | |
| def validate_audio_format(audio_path): | |
| ext = os.path.splitext(audio_path)[1].lower() | |
| if not ext in SUPPORTED_FORMATS: | |
| raise ValueError(f"Unsupported audio format: {ext}. Supported: {SUPPORTED_FORMATS}") | |
| return True | |
def preprocess_audio(audio_path, target_sr=16000):
    validate_audio_format(audio_path)
    try:
        waveform, sr = torchaudio.load(audio_path)
    except Exception:
        # Fall back to librosa (returns a mono float32 numpy array)
        waveform, sr = librosa.load(audio_path, sr=None)
        waveform = torch.tensor(waveform).unsqueeze(0)
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)
    if sr != target_sr:
        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)
        waveform = resampler(waveform)
    return waveform, target_sr
print("✅ Audio preprocessing functions ready.\n")
| print("CELL 4: Defining file handling functions...") | |
| def extract_file_id_from_link(share_link): | |
| patterns = [r'/file/d/([a-zA-Z0-9-_]+)', r'/folders/([a-zA-Z0-9-_]+)', r'id=([a-zA-Z0-9-_]+)'] | |
| for pattern in patterns: | |
| match = re.search(pattern, share_link) | |
| if match: return match.group(1) | |
| return None | |
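# Illustration only: the sharing link below is made up, not a real folder.
_demo_link = "https://drive.google.com/drive/folders/1AbC-dEf_ExampleId123?usp=sharing"
print(f"Link-parsing demo: {extract_file_id_from_link(_demo_link)}")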
def download_from_shared_drive(share_link, max_files_per_lang=20):
    file_id = extract_file_id_from_link(share_link)
    if not file_id:
        print("❌ Could not extract file ID. Please check your sharing link.")
        return []
    download_dir = "/content/shared_dataset"
    if os.path.exists(download_dir):
        shutil.rmtree(download_dir)
    os.makedirs(download_dir, exist_ok=True)
    print(f"✅ Extracted ID: {file_id}. Starting download...")
    try:
        import gdown
        gdown.download_folder(f"https://drive.google.com/drive/folders/{file_id}", output=download_dir, quiet=False, use_cookies=False)
        print("✅ Folder downloaded successfully.")
    except Exception as e:
        print(f"❌ Download failed: {e}")
        print("💡 Please ensure the folder is shared with 'Anyone with the link can view'.")
        return []
    print("\n🔍 Scanning for audio files...")
    all_audio_files = [p for ext in SUPPORTED_FORMATS for p in glob.glob(os.path.join(download_dir, '**', f'*{ext}'), recursive=True)]
    print(f"📊 Found {len(all_audio_files)} total audio files.")
    lang_folders = {d: [] for d in os.listdir(download_dir) if os.path.isdir(os.path.join(download_dir, d))}
    for f in all_audio_files:
        lang_code = os.path.basename(os.path.dirname(f))
        if lang_code in lang_folders:
            lang_folders[lang_code].append(f)
    final_file_list = []
    print("\nLimiting files per language:")
    for lang, file_list in lang_folders.items():  # renamed from 'files' to avoid shadowing google.colab.files
        if len(file_list) > max_files_per_lang:
            print(f"   {lang}: Limiting to {max_files_per_lang} files (from {len(file_list)})")
            final_file_list.extend(file_list[:max_files_per_lang])
        else:
            print(f"   {lang}: Found {len(file_list)} files")
            final_file_list.extend(file_list)
    return final_file_list
def get_audio_files():
    print("\n🎯 Choose your audio source:")
    print("1. Upload files from computer")
    print("2. Download from Google Drive sharing link")
    choice = input("Enter choice (1/2): ").strip()
    if choice == '1':
        uploaded = files.upload()
        return [f"/content/{fname}" for fname in uploaded.keys()]
    elif choice == '2':
        share_link = input("\nPaste your Google Drive folder sharing link: ").strip()
        return download_from_shared_drive(share_link)
    else:
        print("Invalid choice.")
        return []
print("✅ File handling functions ready.\n")
| print("CELL 5: Loading Language Identification (LID) Models...") | |
| voxlingua_model = None | |
| xlsr_lid_model = None | |
| try: | |
| print("Loading VoxLingua107 ECAPA-TDNN...") | |
| voxlingua_model = EncoderClassifier.from_hparams(source="speechbrain/lang-id-voxlingua107-ecapa", savedir="pretrained_models/voxlingua107") | |
| print("✅ VoxLingua107 loaded.") | |
| except Exception as e: | |
| print(f"❌ VoxLingua107 error: {e}") | |
| try: | |
| print("\nLoading TalTechNLP XLS-R LID...") | |
| xlsr_lid_model = foreign_class(source="TalTechNLP/voxlingua107-xls-r-300m-wav2vec", pymodule_file="encoder_wav2vec_classifier.py", classname="EncoderWav2vecClassifier", hparams_file="inference_wav2vec.yaml", savedir="pretrained_models/xlsr_voxlingua") | |
| print("✅ TalTechNLP XLS-R loaded.") | |
| except Exception as e: | |
| print(f"❌ XLS-R error: {e}. Pipeline will proceed with primary LID model only.") | |
| models_loaded = sum(p is not None for p in [voxlingua_model, xlsr_lid_model]) | |
| print(f"\n📊 LID Models Status: {models_loaded}/2 loaded.\n") | |
| print("CELL 6: Defining hybrid language detection system...") | |
| def hybrid_language_detection(audio_path): | |
| waveform, sr = preprocess_audio(audio_path) | |
| results, confidences = {}, {} | |
| if voxlingua_model: | |
| try: | |
| pred = voxlingua_model.classify_file(audio_path) | |
| lang_code = str(pred[3][0]).split(':')[0].strip() | |
| confidence = float(pred[1].exp().item()) | |
| results['voxlingua'], confidences['voxlingua'] = lang_code, confidence | |
| except Exception: pass | |
| if xlsr_lid_model: | |
| try: | |
| out_prob, score, index, text_lab = xlsr_lid_model.classify_file(audio_path) | |
| lang_code = str(text_lab[0]).strip().lower() | |
| confidence = float(out_prob.exp().max().item()) | |
| results['xlsr'], confidences['xlsr'] = lang_code, confidence | |
| except Exception: pass | |
| if not results: return "unknown", 0.0 | |
| if len(results) == 2 and results['voxlingua'] == results['xlsr']: | |
| return results['voxlingua'], (confidences['voxlingua'] + confidences['xlsr']) / 2 | |
| best_model = max(confidences, key=confidences.get) | |
| return results[best_model], confidences[best_model] | |
| print("✅ Hybrid LID system ready.\n") | |
| print("CELL 7: Loading Automatic Speech Recognition (ASR) Models...") | |
| indicconformer_model = None | |
| indicwav2vec_processor = None | |
| indicwav2vec_model = None | |
| try: | |
| print("Loading IndicConformer for Indo-Aryan...") | |
| indicconformer_model = AutoModel.from_pretrained("ai4bharat/indic-conformer-600m-multilingual", trust_remote_code=True) | |
| print("✅ IndicConformer loaded.") | |
| except Exception as e: | |
| print(f"❌ IndicConformer Error: {e}. Indo-Aryan transcription will be unavailable.") | |
| # Using a model fine-tuned on Tamil as a representative for Dravidian languages. | |
| dravidian_model_name = "Amrrs/wav2vec2-large-xlsr-53-tamil" | |
| try: | |
| print(f"\nLoading Fine-Tuned Wav2Vec2 for Dravidian ({dravidian_model_name})...") | |
| indicwav2vec_processor = Wav2Vec2Processor.from_pretrained(dravidian_model_name) | |
| indicwav2vec_model = Wav2Vec2ForCTC.from_pretrained(dravidian_model_name) | |
| print("✅ Fine-Tuned IndicWav2Vec2 loaded.") | |
| except Exception as e: | |
| print(f"❌ IndicWav2Vec2 Error: {e}. Dravidian transcription will be unavailable.") | |
| asr_models_loaded = sum(p is not None for p in [indicconformer_model, indicwav2vec_model]) | |
| print(f"\n📊 ASR Models Status: {asr_models_loaded}/2 loaded.\n") | |
# ==============================================================================
# Cell 8: BPE and Syllable-BPE Tokenization Classes
#
# This version correctly handles untrained tokenizers and has improved
# regex for more accurate syllable segmentation.
# ==============================================================================
print("CELL 8: Defining tokenization classes...")
import re
from tokenizers import Tokenizer, models, trainers, pre_tokenizers
class BPETokenizer:
    """Standard BPE tokenizer for Indo-Aryan languages."""
    def __init__(self, vocab_size=5000):
        self.tokenizer = Tokenizer(models.BPE())
        self.tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
        self.trainer = trainers.BpeTrainer(vocab_size=vocab_size, special_tokens=["<unk>", "<pad>"])
        self.trained = False
    def train(self, texts):
        """Train BPE tokenizer on a text corpus."""
        self.tokenizer.train_from_iterator(texts, self.trainer)
        self.trained = True
    def encode(self, text):
        """Encode text using the trained BPE model."""
        if not self.trained:
            # Fallback for untrained tokenizer
            return text.split()
        return self.tokenizer.encode(text).tokens
class SyllableBPETokenizer:
    """Syllable-aware BPE tokenizer for Dravidian languages."""
    def __init__(self, vocab_size=3000):
        self.vocab_size = vocab_size
        self.patterns = {
            'ta': r'[க-ஹ][ா-ௌ]?|[அ-ஔ]',  # Tamil
            'te': r'[క-హ][ా-ౌ]?|[అ-ఔ]',  # Telugu
            'kn': r'[ಕ-ಹ][ಾ-ೌ]?|[ಅ-ಔ]',  # Kannada
            'ml': r'[ക-ഹ][ാ-ൌ]?|[അ-ഔ]'   # Malayalam
        }
        self.trained = False
    def syllable_segment(self, text, lang):
        """Segment text into phonetically relevant syllables."""
        pattern = self.patterns.get(lang, r'\S+')  # Fall back to whitespace-delimited tokens for other languages
        syllables = re.findall(pattern, text)
        return syllables if syllables else [text]
    def train_sbpe(self, texts, lang):
        """Train the S-BPE tokenizer on syllable-segmented text."""
        syllable_texts = [' '.join(self.syllable_segment(t, lang)) for t in texts]
        self.tokenizer = Tokenizer(models.BPE())
        trainer = trainers.BpeTrainer(vocab_size=self.vocab_size, special_tokens=["<unk>", "<pad>"])
        self.tokenizer.train_from_iterator(syllable_texts, trainer)
        self.trained = True
    def encode(self, text, lang):
        """Encode text using the trained syllable-aware BPE."""
        syllables = self.syllable_segment(text, lang)
        if not self.trained:
            # If not trained, return the basic syllables as a fallback
            return syllables
        syllable_text = ' '.join(syllables)
        return self.tokenizer.encode(syllable_text).tokens
print("✅ BPE and S-BPE tokenization classes implemented and verified.\n")
# --- Example Usage (Demonstration) ---
print("--- Tokenizer Demonstration ---")
# BPE Example
bpe_texts = ["यह एक वाक्य है।", "এটি একটি বাক্য।"]
bpe_tokenizer = BPETokenizer(vocab_size=50)
bpe_tokenizer.train(bpe_texts)
print(f"BPE Tokens: {bpe_tokenizer.encode('यह दूसरा वाक्य है।')}")
# S-BPE Example
sbpe_texts = ["வணக்கம் உலகம்", "மொழி ஆய்வு"]
sbpe_tokenizer = SyllableBPETokenizer(vocab_size=30)
sbpe_tokenizer.train_sbpe(sbpe_texts, 'ta')
print(f"S-BPE Tokens (Tamil): {sbpe_tokenizer.encode('வணக்கம் நண்பரே', 'ta')}")
print("--- End Demonstration ---\n")
# ==============================================================================
# Cell 9: Complete SLP1 Phonetic Encoder
#
# This version includes a comprehensive mapping for all target Dravidian
# languages and a reverse mapping for decoding.
# ==============================================================================
print("CELL 9: Defining the SLP1 phonetic encoder...")
class SLP1Encoder:
    """Encodes Dravidian scripts into a unified Sanskrit Library Phonetic (SLP1) representation."""
    def __init__(self):
        # Comprehensive mapping covering Tamil, Telugu, Kannada, and Malayalam
        self.slp1_mapping = {
            # Vowels (common and script-specific)
            'அ': 'a', 'ஆ': 'A', 'இ': 'i', 'ஈ': 'I', 'உ': 'u', 'ஊ': 'U', 'எ': 'e', 'ஏ': 'E', 'ஐ': 'E', 'ஒ': 'o', 'ஓ': 'O', 'ஔ': 'O',
            'అ': 'a', 'ఆ': 'A', 'ఇ': 'i', 'ఈ': 'I', 'ఉ': 'u', 'ఊ': 'U', 'ఋ': 'f', 'ౠ': 'F', 'ఎ': 'e', 'ఏ': 'E', 'ఐ': 'E', 'ఒ': 'o', 'ఓ': 'O', 'ఔ': 'O',
            'ಅ': 'a', 'ಆ': 'A', 'ಇ': 'i', 'ಈ': 'I', 'ಉ': 'u', 'ಊ': 'U', 'ಋ': 'f', 'ಎ': 'e', 'ಏ': 'E', 'ಐ': 'E', 'ಒ': 'o', 'ಓ': 'O', 'ಔ': 'O',
            'അ': 'a', 'ആ': 'A', 'ഇ': 'i', 'ഈ': 'I', 'ഉ': 'u', 'ഊ': 'U', 'ഋ': 'f', 'എ': 'e', 'ഏ': 'E', 'ഐ': 'E', 'ഒ': 'o', 'ഓ': 'O', 'ഔ': 'O',
            # Consonants (common and script-specific)
            'க': 'k', 'ங': 'N', 'ச': 'c', 'ஞ': 'J', 'ட': 'w', 'ண': 'R', 'த': 't', 'ந': 'n', 'ப': 'p', 'ம': 'm', 'ய': 'y', 'ர': 'r', 'ல': 'l', 'வ': 'v', 'ழ': 'L', 'ள': 'x', 'ற': 'f', 'ன': 'F',
            'క': 'k', 'ఖ': 'K', 'గ': 'g', 'ఘ': 'G', 'ఙ': 'N', 'చ': 'c', 'ఛ': 'C', 'జ': 'j', 'ఝ': 'J', 'ఞ': 'Y', 'ట': 'w', 'ఠ': 'W', 'డ': 'q', 'ఢ': 'Q', 'ణ': 'R', 'త': 't', 'థ': 'T', 'ద': 'd', 'ధ': 'D', 'న': 'n', 'ప': 'p', 'ఫ': 'P', 'బ': 'b', 'భ': 'B', 'మ': 'm', 'య': 'y', 'ర': 'r', 'ల': 'l', 'వ': 'v', 'శ': 'S', 'ష': 's', 'స': 'z', 'హ': 'h',
            'ಕ': 'k', 'ಖ': 'K', 'ಗ': 'g', 'ಘ': 'G', 'ಙ': 'N', 'ಚ': 'c', 'ಛ': 'C', 'ಜ': 'j', 'ಝ': 'J', 'ಞ': 'Y', 'ಟ': 'w', 'ಠ': 'W', 'ಡ': 'q', 'ಢ': 'Q', 'ಣ': 'R', 'ತ': 't', 'ಥ': 'T', 'ದ': 'd', 'ಧ': 'D', 'ನ': 'n', 'ಪ': 'p', 'ಫ': 'P', 'ಬ': 'b', 'ಭ': 'B', 'ಮ': 'm', 'ಯ': 'y', 'ರ': 'r', 'ಲ': 'l', 'ವ': 'v', 'ಶ': 'S', 'ಷ': 's', 'ಸ': 'z', 'ಹ': 'h',
            'ക': 'k', 'ഖ': 'K', 'ഗ': 'g', 'ഘ': 'G', 'ങ': 'N', 'ച': 'c', 'ഛ': 'C', 'ജ': 'j', 'ഝ': 'J', 'ഞ': 'Y', 'ട': 'w', 'ഠ': 'W', 'ഡ': 'q', 'ഢ': 'Q', 'ണ': 'R', 'ത': 't', 'ഥ': 'T', 'ദ': 'd', 'ധ': 'D', 'ന': 'n', 'പ': 'p', 'ഫ': 'P', 'ബ': 'b', 'ഭ': 'B', 'മ': 'm', 'യ': 'y', 'ര': 'r', 'ല': 'l', 'വ': 'v', 'ശ': 'S', 'ഷ': 's', 'സ': 'z', 'ഹ': 'h',
            # Grantha script consonants often used in Tamil and Malayalam
            'ஜ': 'j', 'ஷ': 'S', 'ஸ': 's', 'ஹ': 'h',
            # Common diacritics
            '்': '', 'ಂ': 'M', 'ः': 'H', 'ം': 'M'
        }
        # Build reverse mapping for decoding. Note: several native characters
        # share an SLP1 symbol (e.g. 'ஐ' and 'ஏ' both map to 'E'), so later
        # entries overwrite earlier ones and decoding is necessarily lossy.
        self.reverse_mapping = {v: k for k, v in self.slp1_mapping.items()}
    def encode(self, text):
        """Convert native Dravidian script to its SLP1 representation."""
        if not text:
            return ""
        return "".join([self.slp1_mapping.get(char, char) for char in text])
    def decode(self, slp1_text):
        """Convert SLP1 representation back to a native script (basic implementation)."""
        if not slp1_text:
            return ""
        return "".join([self.reverse_mapping.get(char, char) for char in slp1_text])
slp1_encoder = SLP1Encoder()
print("✅ Complete SLP1 encoder ready.")
print(f"🔤 Total character mappings: {len(slp1_encoder.slp1_mapping)}\n")
# --- Example Usage (Demonstration) ---
print("--- SLP1 Encoder Demonstration ---")
test_cases = [
    ("கல்வி", "Tamil"),
    ("విద్య", "Telugu"),
    ("ಶಿಕ್ಷಣ", "Kannada"),
    ("വിദ്യാഭ്യാസം", "Malayalam")
]
for text, lang in test_cases:
    encoded = slp1_encoder.encode(text)
    print(f"   {lang}: {text} → {encoded}")
print("--- End Demonstration ---\n")
| print("CELL 10: Defining family-specific ASR processing functions...") | |
| def process_indo_aryan_asr(audio_path, detected_lang): | |
| if indicconformer_model is None: return "[IndicConformer model not loaded]" | |
| try: | |
| waveform, sr = preprocess_audio(audio_path) | |
| # The model expects language code and decoding strategy ("ctc" or "rnnt") | |
| transcription = indicconformer_model(waveform, detected_lang, "ctc")[0] | |
| return transcription | |
| except Exception as e: return f"Error in Indo-Aryan ASR: {e}" | |
| def process_dravidian_asr(audio_path, detected_lang): | |
| if not (indicwav2vec_model and indicwav2vec_processor): return "[Dravidian ASR model not loaded]", "" | |
| try: | |
| waveform, sr = preprocess_audio(audio_path) | |
| input_values = indicwav2vec_processor(waveform.squeeze().numpy(), sampling_rate=sr, return_tensors="pt").input_values | |
| with torch.no_grad(): logits = indicwav2vec_model(input_values).logits | |
| predicted_ids = torch.argmax(logits, dim=-1) | |
| transcription = indicwav2vec_processor.batch_decode(predicted_ids)[0] | |
| # S-BPE Tokenization for analysis | |
| sbpe_tokenizer = SyllableBPETokenizer() | |
| sbpe_tokenizer.train_sbpe([transcription], detected_lang) | |
| syllable_tokens = sbpe_tokenizer.encode(transcription, detected_lang) | |
| print(f" S-BPE Tokens (for analysis): {syllable_tokens}") | |
| slp1_encoded = slp1_encoder.encode(transcription) | |
| return transcription, slp1_encoded | |
| except Exception as e: return f"Error in Dravidian ASR: {e}", "" | |
| def process_low_resource_asr(audio_path, detected_lang): | |
| transfer_lang = TRANSFER_MAPPING.get(detected_lang, 'hi') | |
| print(f" Using transfer learning: {detected_lang} -> {transfer_lang}") | |
| return process_indo_aryan_asr(audio_path, transfer_lang) | |
| print("✅ Family-specific ASR functions ready.\n") | |
| print("CELL 11: Defining the main processing pipeline...") | |
| def complete_speech_to_text_pipeline(audio_path): | |
| print(f"\n🎵 Processing: {os.path.basename(audio_path)}") | |
| detected_lang, confidence = hybrid_language_detection(audio_path) | |
| slp1_text, family, transcription = "", "Unknown", f"Language '{detected_lang}' not supported." | |
| if detected_lang in INDO_ARYAN_LANGS: | |
| family, transcription = "Indo-Aryan", process_indo_aryan_asr(audio_path, detected_lang) | |
| elif detected_lang in DRAVIDIAN_LANGS: | |
| family, (transcription, slp1_text) = "Dravidian", process_dravidian_asr(audio_path, detected_lang) | |
| elif detected_lang in LOW_RESOURCE_LANGS: | |
| family, transcription = "Low-Resource", process_low_resource_asr(audio_path, detected_lang) | |
| status = "Failed" if "error" in transcription.lower() or "not supported" in transcription.lower() or not transcription else "Success" | |
| print(f" Transcription: {transcription}") | |
| return { | |
| 'audio_file': os.path.basename(audio_path), | |
| 'full_path': audio_path, | |
| 'detected_language': detected_lang, | |
| 'language_family': family, 'confidence': round(confidence, 3), 'transcription': transcription, | |
| 'slp1_encoding': slp1_text, 'status': status, 'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S") | |
| } | |
| def batch_process_audio_files(audio_files): | |
| if not audio_files: | |
| print("❌ No audio files to process.") | |
| return [] | |
| results = [complete_speech_to_text_pipeline(f) for f in audio_files] | |
| success_count = sum(1 for r in results if r['status'] == 'Success') | |
| success_rate = (success_count / len(results)) * 100 if results else 0 | |
| print(f"\n🎉 Batch processing completed! Success rate: {success_rate:.1f}% ({success_count}/{len(results)})") | |
| return results | |
| print("✅ Main pipeline ready.\n") | |
| print("CELL 12: Defining report generation and main execution logic...") | |
| def generate_excel_report(results): | |
| if not results: return None | |
| df = pd.DataFrame(results) | |
| def get_ground_truth(path): | |
| parts = path.split('/') | |
| for part in reversed(parts): | |
| if len(part) == 2 and part.isalpha() and part in ALL_SUPPORTED_LANGS: return part | |
| return "unknown" | |
| df['ground_truth'] = df['full_path'].apply(get_ground_truth) | |
| df['is_correct'] = df.apply(lambda row: row['detected_language'] == row['ground_truth'], axis=1) | |
| filename = f"ASR_Evaluation_Report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx" | |
| with pd.ExcelWriter(filename, engine='xlsxwriter') as writer: | |
| df.to_excel(writer, sheet_name='Detailed_Results', index=False) | |
| # Summary Sheet | |
| summary_data = { | |
| 'Metric': ['Total Files', 'Successful Transcriptions', 'Overall LID Accuracy'], | |
| 'Value': [len(df), df['status'].eq('Success').sum(), f"{df['is_correct'].mean()*100:.2f}%"] | |
| } | |
| pd.DataFrame(summary_data).to_excel(writer, sheet_name='Summary', index=False) | |
| print(f"\n✅ Comprehensive Excel report generated: {filename}") | |
| except Exception as e: print(f" Could not auto-download file: {e}") | |
| return filename | |
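# Optional follow-up (a sketch, left commented out): re-open a generated report
# with pandas to inspect the summary. Assumes an xlsx reader such as openpyxl
# is available, as it is on Colab by default.
# report_path = generate_excel_report(pipeline_results)
# print(pd.read_excel(report_path, sheet_name='Summary'))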
# --- MAIN EXECUTION ---
print("\n🚀🚀🚀 Starting the Full ASR Pipeline 🚀🚀🚀")
audio_files_to_process = get_audio_files()
if audio_files_to_process:
    pipeline_results = batch_process_audio_files(audio_files_to_process)
    generate_excel_report(pipeline_results)
else:
    print("\nNo audio files were selected. Exiting.")