Spaces:
				
			
			
	
			
			
					
		Running
		
	
	
	
			
			
	
	
	
	
		
		
					
		Running
		
	
		Joseph Pollack
		
	committed on
		
		
					adds additional components to the interface for recording
Browse files- interface.py +189 -68
 
    	
        interface.py
    CHANGED
    
    | 
         @@ -254,9 +254,9 @@ def start_voxtral_training( 
     | 
|
| 254 | 
         
             
            def load_multilingual_phrases(language="en", max_phrases=None, split="train"):
         
     | 
| 255 | 
         
             
                """Load phrases from various multilingual speech datasets.
         
     | 
| 256 | 
         | 
| 257 | 
         
            -
                 
     | 
| 258 | 
         
            -
                1.  
     | 
| 259 | 
         
            -
                2.  
     | 
| 260 | 
         
             
                3. Fallback to basic phrases
         
     | 
| 261 | 
         | 
| 262 | 
         
             
                Args:
         
     | 
| 
         @@ -272,70 +272,97 @@ def load_multilingual_phrases(language="en", max_phrases=None, split="train"): 
     | 
|
| 272 | 
         | 
| 273 | 
         
             
                # Language code mapping for different datasets
         
     | 
| 274 | 
         
             
                lang_mappings = {
         
     | 
| 275 | 
         
            -
                    "en": {" 
     | 
| 276 | 
         
            -
                    "de": {" 
     | 
| 277 | 
         
            -
                    "fr": {" 
     | 
| 278 | 
         
            -
                    "es": {" 
     | 
| 279 | 
         
            -
                    "it": {" 
     | 
| 280 | 
         
            -
                    "pt": {" 
     | 
| 281 | 
         
            -
                    "pl": {" 
     | 
| 282 | 
         
            -
                    "nl": {" 
     | 
| 283 | 
         
            -
                    "ru": {" 
     | 
| 284 | 
         
            -
                    "ar": {" 
     | 
| 285 | 
         
            -
                    "zh": {" 
     | 
| 286 | 
         
            -
                    "ja": {" 
     | 
| 287 | 
         
            -
                    "ko": {" 
     | 
| 288 | 
         
             
                }
         
     | 
| 289 | 
         | 
| 290 | 
         
            -
                lang_config = lang_mappings.get(language, {" 
     | 
| 291 | 
         | 
| 292 | 
         
            -
                # Try  
     | 
| 293 | 
         
             
                try:
         
     | 
| 294 | 
         
            -
                    print(f"Trying  
     | 
| 295 | 
         
            -
                     
     | 
| 296 | 
         
            -
                    ds = load_dataset(" 
     | 
| 297 | 
         | 
| 298 | 
         
             
                    phrases = []
         
     | 
| 299 | 
         
             
                    count = 0
         
     | 
| 
         | 
|
| 
         | 
|
| 300 | 
         
             
                    for example in ds:
         
     | 
| 301 | 
         
             
                        if max_phrases and count >= max_phrases:
         
     | 
| 302 | 
         
             
                            break
         
     | 
| 303 | 
         
            -
                         
     | 
| 304 | 
         
            -
                        if  
     | 
| 305 | 
         
            -
                            phrases.append( 
     | 
| 
         | 
|
| 306 | 
         
             
                            count += 1
         
     | 
| 307 | 
         | 
| 308 | 
         
             
                    if phrases:
         
     | 
| 309 | 
         
            -
                        print(f"Successfully loaded {len(phrases)} phrases from  
     | 
| 310 | 
         
             
                        random.shuffle(phrases)
         
     | 
| 311 | 
         
             
                        return phrases
         
     | 
| 312 | 
         | 
| 313 | 
         
             
                except Exception as e:
         
     | 
| 314 | 
         
            -
                    print(f" 
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 315 | 
         | 
| 316 | 
         
            -
                # Try  
     | 
| 317 | 
         
             
                try:
         
     | 
| 318 | 
         
            -
                    print(f"Trying  
     | 
| 319 | 
         
            -
                     
     | 
| 320 | 
         
            -
                    ds = load_dataset("google/fleurs", fleurs_lang, split=split, streaming=True)
         
     | 
| 321 | 
         | 
| 322 | 
         
             
                    phrases = []
         
     | 
| 323 | 
         
             
                    count = 0
         
     | 
| 324 | 
         
             
                    for example in ds:
         
     | 
| 325 | 
         
             
                        if max_phrases and count >= max_phrases:
         
     | 
| 326 | 
         
             
                            break
         
     | 
| 327 | 
         
            -
                        text = example.get(" 
     | 
| 328 | 
         
             
                        if text and len(text) > 10:  # Filter out very short phrases
         
     | 
| 329 | 
         
             
                            phrases.append(text)
         
     | 
| 330 | 
         
             
                            count += 1
         
     | 
| 331 | 
         | 
| 332 | 
         
             
                    if phrases:
         
     | 
| 333 | 
         
            -
                        print(f"Successfully loaded {len(phrases)} phrases from  
     | 
| 334 | 
         
             
                        random.shuffle(phrases)
         
     | 
| 335 | 
         
             
                        return phrases
         
     | 
| 336 | 
         | 
| 337 | 
         
             
                except Exception as e:
         
     | 
| 338 | 
         
            -
                    print(f" 
     | 
| 339 | 
         | 
| 340 | 
         
             
                # Final fallback to basic phrases
         
     | 
| 341 | 
         
             
                print("All dataset loading attempts failed, using fallback phrases")
         
     | 
| 
         @@ -434,17 +461,20 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo: 
     | 
|
| 434 | 
         
             
                # Recording grid with dynamic text readouts
         
     | 
| 435 | 
         
             
                phrase_texts_state = gr.State(ALL_PHRASES)
         
     | 
| 436 | 
         
             
                visible_rows_state = gr.State(10)  # Start with 10 visible rows
         
     | 
| 437 | 
         
            -
                 
     | 
| 
         | 
|
| 
         | 
|
| 438 | 
         
             
                phrase_markdowns: list[gr.Markdown] = []
         
     | 
| 439 | 
         
             
                rec_components = []
         
     | 
| 440 | 
         | 
| 441 | 
         
            -
                def create_recording_grid( 
     | 
| 442 | 
         
            -
                    """Create recording grid components  
     | 
| 443 | 
         
             
                    markdowns = []
         
     | 
| 444 | 
         
             
                    recordings = []
         
     | 
| 445 | 
         
            -
                    for idx 
     | 
| 446 | 
         
            -
                        visible = idx <  
     | 
| 447 | 
         
            -
                         
     | 
| 
         | 
|
| 448 | 
         
             
                        markdowns.append(md)
         
     | 
| 449 | 
         
             
                        comp = gr.Audio(sources="microphone", type="numpy", label=f"Recording {idx+1}", visible=visible)
         
     | 
| 450 | 
         
             
                        recordings.append(comp)
         
     | 
| 
         @@ -452,44 +482,41 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo: 
     | 
|
| 452 | 
         | 
| 453 | 
         
             
                # Initial grid creation
         
     | 
| 454 | 
         
             
                with gr.Column():
         
     | 
| 455 | 
         
            -
                    phrase_markdowns, rec_components = create_recording_grid( 
     | 
| 456 | 
         | 
| 457 | 
         
             
                # Add more rows button
         
     | 
| 458 | 
         
             
                add_rows_btn = gr.Button("➕ Add 10 More Rows", variant="secondary")
         
     | 
| 459 | 
         | 
| 460 | 
         
             
                def add_more_rows(current_visible, current_phrases):
         
     | 
| 461 | 
         
             
                    """Add 10 more rows by making them visible"""
         
     | 
| 462 | 
         
            -
                    new_visible = min(current_visible + 10, len(current_phrases))
         
     | 
| 
         | 
|
| 
         | 
|
| 463 | 
         
             
                    visibility_updates = []
         
     | 
| 464 | 
         
            -
                    for i in range( 
     | 
| 465 | 
         
            -
                        if i < new_visible:
         
     | 
| 466 | 
         
             
                            visibility_updates.append(gr.update(visible=True))
         
     | 
| 467 | 
         
             
                        else:
         
     | 
| 468 | 
         
             
                            visibility_updates.append(gr.update(visible=False))
         
     | 
| 
         | 
|
| 469 | 
         
             
                    return [new_visible] + visibility_updates
         
     | 
| 470 | 
         | 
| 471 | 
         
             
                def change_language(language):
         
     | 
| 472 | 
         
             
                    """Change the language and reload phrases from multilingual datasets"""
         
     | 
| 473 | 
         
             
                    new_phrases = load_multilingual_phrases(language, max_phrases=None)
         
     | 
| 474 | 
         
             
                    # Reset visible rows to 10
         
     | 
| 475 | 
         
            -
                    visible_count = min(10, len(new_phrases))
         
     | 
| 476 | 
         | 
| 477 | 
         
            -
                    # Create  
     | 
| 478 | 
         
            -
                    current_len = len(phrase_markdowns)
         
     | 
| 479 | 
         
             
                    combined_updates = []
         
     | 
| 480 | 
         
            -
             
     | 
| 481 | 
         
            -
             
     | 
| 482 | 
         
            -
             
     | 
| 483 | 
         
            -
                         
     | 
| 484 | 
         
            -
                             
     | 
| 485 | 
         
            -
                                combined_updates.append(gr.update(value=f"**{i+1}. {new_phrases[i]}**", visible=True))
         
     | 
| 486 | 
         
            -
                            else:
         
     | 
| 487 | 
         
            -
                                combined_updates.append(gr.update(visible=False))
         
     | 
| 488 | 
         
             
                        else:
         
     | 
| 489 | 
         
            -
                            combined_updates.append(gr.update(visible=False))
         
     | 
| 490 | 
         | 
| 491 | 
         
            -
                    # If we have more phrases than components, we can't update them via Gradio
         
     | 
| 492 | 
         
            -
                    # The interface will need to be reloaded for significantly different phrase counts
         
     | 
| 493 | 
         
             
                    return [new_phrases, visible_count] + combined_updates
         
     | 
| 494 | 
         | 
| 495 | 
         
             
                # Connect language change to phrase reloading
         
     | 
| 
         @@ -505,6 +532,56 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo: 
     | 
|
| 505 | 
         
             
                    outputs=[visible_rows_state] + phrase_markdowns + rec_components
         
     | 
| 506 | 
         
             
                )
         
     | 
| 507 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 508 | 
         
             
                # Advanced options accordion
         
     | 
| 509 | 
         
             
                with gr.Accordion("Advanced options", open=False):
         
     | 
| 510 | 
         
             
                    base_model = gr.Textbox(value="mistralai/Voxtral-Mini-3B-2507", label="Base Voxtral model")
         
     | 
| 
         @@ -576,22 +653,66 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo: 
     | 
|
| 576 | 
         
             
                    vp_btn = gr.Button("Use Multilingual Dataset Sample")
         
     | 
| 577 | 
         | 
| 578 | 
         
             
                    def _collect_multilingual_sample(lang_code: str, num_samples: int, split: str):
         
     | 
| 579 | 
         
            -
                        """Load sample from multilingual datasets ( 
     | 
| 580 | 
         
             
                        from datasets import load_dataset, Audio
         
     | 
| 581 | 
         
             
                        import random
         
     | 
| 582 | 
         | 
| 583 | 
         
            -
                        # Language code mapping for  
     | 
| 584 | 
         
            -
                         
     | 
| 585 | 
         
             
                            "en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it",
         
     | 
| 586 | 
         
             
                            "pl": "pl", "pt": "pt", "nl": "nl", "ru": "ru", "ar": "ar",
         
     | 
| 587 | 
         
            -
                            "zh": "zh 
     | 
| 588 | 
         
             
                        }
         
     | 
| 589 | 
         | 
| 590 | 
         
            -
                         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 591 | 
         | 
| 
         | 
|
| 592 | 
         
             
                        try:
         
     | 
| 593 | 
         
            -
                             
     | 
| 594 | 
         
            -
                            ds = load_dataset("mozilla-foundation/common_voice_11_0", cv_lang, split=split, streaming=True)
         
     | 
| 595 | 
         
             
                            ds = ds.cast_column("audio", Audio(sampling_rate=16000))
         
     | 
| 596 | 
         | 
| 597 | 
         
             
                            dataset_dir = PROJECT_ROOT / "datasets" / "voxtral_user"
         
     | 
| 
         @@ -605,7 +726,7 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo: 
     | 
|
| 605 | 
         | 
| 606 | 
         
             
                                audio = ex.get("audio") or {}
         
     | 
| 607 | 
         
             
                                path = audio.get("path")
         
     | 
| 608 | 
         
            -
                                text = ex.get(" 
     | 
| 609 | 
         | 
| 610 | 
         
             
                                if path and text and len(text) > 10:
         
     | 
| 611 | 
         
             
                                    rows.append({"audio_path": path, "text": text})
         
     | 
| 
         @@ -618,7 +739,7 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo: 
     | 
|
| 618 | 
         | 
| 619 | 
         
             
                                # Build markdown content updates for on-screen prompts
         
     | 
| 620 | 
         
             
                                combined_updates = []
         
     | 
| 621 | 
         
            -
                                for i in range( 
     | 
| 622 | 
         
             
                                    t = texts[i] if i < len(texts) else ""
         
     | 
| 623 | 
         
             
                                    if i < len(texts):
         
     | 
| 624 | 
         
             
                                        combined_updates.append(gr.update(value=f"**{i+1}. {t}**", visible=True))
         
     | 
| 
         @@ -628,7 +749,7 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo: 
     | 
|
| 628 | 
         
             
                                return (str(jsonl_path), texts, *combined_updates)
         
     | 
| 629 | 
         | 
| 630 | 
         
             
                        except Exception as e:
         
     | 
| 631 | 
         
            -
                            print(f" 
     | 
| 632 | 
         | 
| 633 | 
         
             
                        # Fallback: generate synthetic samples with text only
         
     | 
| 634 | 
         
             
                        print("Using fallback: generating text-only samples")
         
     | 
| 
         @@ -642,7 +763,7 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo: 
     | 
|
| 642 | 
         | 
| 643 | 
         
             
                        # Build markdown content updates for on-screen prompts
         
     | 
| 644 | 
         
             
                        combined_updates = []
         
     | 
| 645 | 
         
            -
                        for i in range( 
     | 
| 646 | 
         
             
                            t = texts[i] if i < len(texts) else ""
         
     | 
| 647 | 
         
             
                            if i < len(texts):
         
     | 
| 648 | 
         
             
                                combined_updates.append(gr.update(value=f"**{i+1}. {t}**", visible=True))
         
     | 
| 
         | 
|
| 254 | 
         
             
            def load_multilingual_phrases(language="en", max_phrases=None, split="train"):
         
     | 
| 255 | 
         
             
                """Load phrases from various multilingual speech datasets.
         
     | 
| 256 | 
         | 
| 257 | 
         
            +
                Uses datasets that work with current library versions:
         
     | 
| 258 | 
         
            +
                1. ML Commons Speech (modern format)
         
     | 
| 259 | 
         
            +
                2. Multilingual LibriSpeech (modern format)
         
     | 
| 260 | 
         
             
                3. Fallback to basic phrases
         
     | 
| 261 | 
         | 
| 262 | 
         
             
                Args:
         
     | 
| 
         | 
|
| 272 | 
         | 
| 273 | 
         
             
                # Language code mapping for different datasets
         
     | 
| 274 | 
         
             
                lang_mappings = {
         
     | 
| 275 | 
         
            +
                    "en": {"ml_speech": "en", "librispeech": "clean"},
         
     | 
| 276 | 
         
            +
                    "de": {"ml_speech": "de", "librispeech": None},
         
     | 
| 277 | 
         
            +
                    "fr": {"ml_speech": "fr", "librispeech": None},
         
     | 
| 278 | 
         
            +
                    "es": {"ml_speech": "es", "librispeech": None},
         
     | 
| 279 | 
         
            +
                    "it": {"ml_speech": "it", "librispeech": None},
         
     | 
| 280 | 
         
            +
                    "pt": {"ml_speech": "pt", "librispeech": None},
         
     | 
| 281 | 
         
            +
                    "pl": {"ml_speech": "pl", "librispeech": None},
         
     | 
| 282 | 
         
            +
                    "nl": {"ml_speech": "nl", "librispeech": None},
         
     | 
| 283 | 
         
            +
                    "ru": {"ml_speech": "ru", "librispeech": None},
         
     | 
| 284 | 
         
            +
                    "ar": {"ml_speech": "ar", "librispeech": None},
         
     | 
| 285 | 
         
            +
                    "zh": {"ml_speech": "zh", "librispeech": None},
         
     | 
| 286 | 
         
            +
                    "ja": {"ml_speech": "ja", "librispeech": None},
         
     | 
| 287 | 
         
            +
                    "ko": {"ml_speech": "ko", "librispeech": None},
         
     | 
| 288 | 
         
             
                }
         
     | 
| 289 | 
         | 
| 290 | 
         
            +
                lang_config = lang_mappings.get(language, {"ml_speech": language, "librispeech": None})
         
     | 
| 291 | 
         | 
| 292 | 
         
            +
                # Try ML Commons Speech first (modern format)
         
     | 
| 293 | 
         
             
                try:
         
     | 
| 294 | 
         
            +
                    print(f"Trying ML Commons Speech dataset for language: {language}")
         
     | 
| 295 | 
         
            +
                    ml_lang = lang_config["ml_speech"]
         
     | 
| 296 | 
         
            +
                    ds = load_dataset("mlcommons/ml_spoken_words", f"speech_commands_{ml_lang}", split=split, streaming=True)
         
     | 
| 297 | 
         | 
| 298 | 
         
             
                    phrases = []
         
     | 
| 299 | 
         
             
                    count = 0
         
     | 
| 300 | 
         
            +
                    seen_words = set()
         
     | 
| 301 | 
         
            +
             
     | 
| 302 | 
         
             
                    for example in ds:
         
     | 
| 303 | 
         
             
                        if max_phrases and count >= max_phrases:
         
     | 
| 304 | 
         
             
                            break
         
     | 
| 305 | 
         
            +
                        word = example.get("word", "").strip()
         
     | 
| 306 | 
         
            +
                        if word and len(word) > 2 and word not in seen_words:  # Filter duplicates and short words
         
     | 
| 307 | 
         
            +
                            phrases.append(word)
         
     | 
| 308 | 
         
            +
                            seen_words.add(word)
         
     | 
| 309 | 
         
             
                            count += 1
         
     | 
| 310 | 
         | 
| 311 | 
         
             
                    if phrases:
         
     | 
| 312 | 
         
            +
                        print(f"Successfully loaded {len(phrases)} phrases from ML Commons Speech")
         
     | 
| 313 | 
         
             
                        random.shuffle(phrases)
         
     | 
| 314 | 
         
             
                        return phrases
         
     | 
| 315 | 
         | 
| 316 | 
         
             
                except Exception as e:
         
     | 
| 317 | 
         
            +
                    print(f"ML Commons Speech failed: {e}")
         
     | 
| 318 | 
         
            +
             
     | 
| 319 | 
         
            +
                # Try Multilingual LibriSpeech as backup
         
     | 
| 320 | 
         
            +
                try:
         
     | 
| 321 | 
         
            +
                    if lang_config["librispeech"]:
         
     | 
| 322 | 
         
            +
                        print(f"Trying Multilingual LibriSpeech dataset for language: {language}")
         
     | 
| 323 | 
         
            +
                        librispeech_lang = lang_config["librispeech"]
         
     | 
| 324 | 
         
            +
                        ds = load_dataset("facebook/multilingual_librispeech", f"{language}", split=split, streaming=True)
         
     | 
| 325 | 
         
            +
             
     | 
| 326 | 
         
            +
                        phrases = []
         
     | 
| 327 | 
         
            +
                        count = 0
         
     | 
| 328 | 
         
            +
                        for example in ds:
         
     | 
| 329 | 
         
            +
                            if max_phrases and count >= max_phrases:
         
     | 
| 330 | 
         
            +
                                break
         
     | 
| 331 | 
         
            +
                            text = example.get("text", "").strip()
         
     | 
| 332 | 
         
            +
                            if text and len(text) > 10:  # Filter out very short phrases
         
     | 
| 333 | 
         
            +
                                phrases.append(text)
         
     | 
| 334 | 
         
            +
                                count += 1
         
     | 
| 335 | 
         
            +
             
     | 
| 336 | 
         
            +
                        if phrases:
         
     | 
| 337 | 
         
            +
                            print(f"Successfully loaded {len(phrases)} phrases from Multilingual LibriSpeech")
         
     | 
| 338 | 
         
            +
                            random.shuffle(phrases)
         
     | 
| 339 | 
         
            +
                            return phrases
         
     | 
| 340 | 
         
            +
             
     | 
| 341 | 
         
            +
                except Exception as e:
         
     | 
| 342 | 
         
            +
                    print(f"Multilingual LibriSpeech failed: {e}")
         
     | 
| 343 | 
         | 
| 344 | 
         
            +
                # Try TED Talk translations (works for many languages)
         
     | 
| 345 | 
         
             
                try:
         
     | 
| 346 | 
         
            +
                    print(f"Trying TED Talk translations for language: {language}")
         
     | 
| 347 | 
         
            +
                    ds = load_dataset("ted_talks_iwslt", language=[f"{language}_en"], split=split, streaming=True)
         
     | 
| 
         | 
|
| 348 | 
         | 
| 349 | 
         
             
                    phrases = []
         
     | 
| 350 | 
         
             
                    count = 0
         
     | 
| 351 | 
         
             
                    for example in ds:
         
     | 
| 352 | 
         
             
                        if max_phrases and count >= max_phrases:
         
     | 
| 353 | 
         
             
                            break
         
     | 
| 354 | 
         
            +
                        text = example.get("translation", {}).get(language, "").strip()
         
     | 
| 355 | 
         
             
                        if text and len(text) > 10:  # Filter out very short phrases
         
     | 
| 356 | 
         
             
                            phrases.append(text)
         
     | 
| 357 | 
         
             
                            count += 1
         
     | 
| 358 | 
         | 
| 359 | 
         
             
                    if phrases:
         
     | 
| 360 | 
         
            +
                        print(f"Successfully loaded {len(phrases)} phrases from TED Talks")
         
     | 
| 361 | 
         
             
                        random.shuffle(phrases)
         
     | 
| 362 | 
         
             
                        return phrases
         
     | 
| 363 | 
         | 
| 364 | 
         
             
                except Exception as e:
         
     | 
| 365 | 
         
            +
                    print(f"TED Talks failed: {e}")
         
     | 
| 366 | 
         | 
| 367 | 
         
             
                # Final fallback to basic phrases
         
     | 
| 368 | 
         
             
                print("All dataset loading attempts failed, using fallback phrases")
         
     | 
| 
         | 
|
| 461 | 
         
             
                # Recording grid with dynamic text readouts
         
     | 
| 462 | 
         
             
                phrase_texts_state = gr.State(ALL_PHRASES)
         
     | 
| 463 | 
         
             
                visible_rows_state = gr.State(10)  # Start with 10 visible rows
         
     | 
| 464 | 
         
            +
                MAX_COMPONENTS = 100  # Fixed maximum number of components
         
     | 
| 465 | 
         
            +
             
     | 
| 466 | 
         
            +
                # Create fixed number of components upfront
         
     | 
| 467 | 
         
             
                phrase_markdowns: list[gr.Markdown] = []
         
     | 
| 468 | 
         
             
                rec_components = []
         
     | 
| 469 | 
         | 
| 470 | 
         
            +
                def create_recording_grid(max_components=MAX_COMPONENTS):
         
     | 
| 471 | 
         
            +
                    """Create recording grid components with fixed maximum"""
         
     | 
| 472 | 
         
             
                    markdowns = []
         
     | 
| 473 | 
         
             
                    recordings = []
         
     | 
| 474 | 
         
            +
                    for idx in range(max_components):
         
     | 
| 475 | 
         
            +
                        visible = idx < 10  # Only first 10 visible initially
         
     | 
| 476 | 
         
            +
                        phrase_text = ALL_PHRASES[idx] if idx < len(ALL_PHRASES) else ""
         
     | 
| 477 | 
         
            +
                        md = gr.Markdown(f"**{idx+1}. {phrase_text}**", visible=visible)
         
     | 
| 478 | 
         
             
                        markdowns.append(md)
         
     | 
| 479 | 
         
             
                        comp = gr.Audio(sources="microphone", type="numpy", label=f"Recording {idx+1}", visible=visible)
         
     | 
| 480 | 
         
             
                        recordings.append(comp)
         
     | 
| 
         | 
|
| 482 | 
         | 
| 483 | 
         
             
                # Initial grid creation
         
     | 
| 484 | 
         
             
                with gr.Column():
         
     | 
| 485 | 
         
            +
                    phrase_markdowns, rec_components = create_recording_grid(MAX_COMPONENTS)
         
     | 
| 486 | 
         | 
| 487 | 
         
             
                # Add more rows button
         
     | 
| 488 | 
         
             
                add_rows_btn = gr.Button("➕ Add 10 More Rows", variant="secondary")
         
     | 
| 489 | 
         | 
| 490 | 
         
             
                def add_more_rows(current_visible, current_phrases):
         
     | 
| 491 | 
         
             
                    """Add 10 more rows by making them visible"""
         
     | 
| 492 | 
         
            +
                    new_visible = min(current_visible + 10, MAX_COMPONENTS, len(current_phrases))
         
     | 
| 493 | 
         
            +
             
     | 
| 494 | 
         
            +
                    # Create updates for all MAX_COMPONENTS
         
     | 
| 495 | 
         
             
                    visibility_updates = []
         
     | 
| 496 | 
         
            +
                    for i in range(MAX_COMPONENTS):
         
     | 
| 497 | 
         
            +
                        if i < len(current_phrases) and i < new_visible:
         
     | 
| 498 | 
         
             
                            visibility_updates.append(gr.update(visible=True))
         
     | 
| 499 | 
         
             
                        else:
         
     | 
| 500 | 
         
             
                            visibility_updates.append(gr.update(visible=False))
         
     | 
| 501 | 
         
            +
             
     | 
| 502 | 
         
             
                    return [new_visible] + visibility_updates
         
     | 
| 503 | 
         | 
| 504 | 
         
             
                def change_language(language):
         
     | 
| 505 | 
         
             
                    """Change the language and reload phrases from multilingual datasets"""
         
     | 
| 506 | 
         
             
                    new_phrases = load_multilingual_phrases(language, max_phrases=None)
         
     | 
| 507 | 
         
             
                    # Reset visible rows to 10
         
     | 
| 508 | 
         
            +
                    visible_count = min(10, len(new_phrases), MAX_COMPONENTS)
         
     | 
| 509 | 
         | 
| 510 | 
         
            +
                    # Create updates for all MAX_COMPONENTS
         
     | 
| 
         | 
|
| 511 | 
         
             
                    combined_updates = []
         
     | 
| 512 | 
         
            +
                    for i in range(MAX_COMPONENTS):
         
     | 
| 513 | 
         
            +
                        if i < len(new_phrases) and i < visible_count:
         
     | 
| 514 | 
         
            +
                            combined_updates.append(gr.update(value=f"**{i+1}. {new_phrases[i]}**", visible=True))
         
     | 
| 515 | 
         
            +
                        elif i < len(new_phrases):
         
     | 
| 516 | 
         
            +
                            combined_updates.append(gr.update(value=f"**{i+1}. {new_phrases[i]}**", visible=False))
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 517 | 
         
             
                        else:
         
     | 
| 518 | 
         
            +
                            combined_updates.append(gr.update(value=f"**{i+1}. **", visible=False))
         
     | 
| 519 | 
         | 
| 
         | 
|
| 
         | 
|
| 520 | 
         
             
                    return [new_phrases, visible_count] + combined_updates
         
     | 
| 521 | 
         | 
| 522 | 
         
             
                # Connect language change to phrase reloading
         
     | 
| 
         | 
|
| 532 | 
         
             
                    outputs=[visible_rows_state] + phrase_markdowns + rec_components
         
     | 
| 533 | 
         
             
                )
         
     | 
| 534 | 
         | 
| 535 | 
         
            +
                # Recording dataset creation button
         
     | 
| 536 | 
         
            +
                record_dataset_btn = gr.Button("🎙️ Create Dataset from Recordings", variant="primary")
         
     | 
| 537 | 
         
            +
             
     | 
| 538 | 
         
            +
                def create_recording_dataset(*recordings_and_state):
         
     | 
| 539 | 
         
            +
                    """Create dataset from visible recordings and phrases"""
         
     | 
| 540 | 
         
            +
                    try:
         
     | 
| 541 | 
         
            +
                        import soundfile as sf
         
     | 
| 542 | 
         
            +
             
     | 
| 543 | 
         
            +
                        # Extract recordings and state
         
     | 
| 544 | 
         
            +
                        recordings = recordings_and_state[:-1]  # All except the last item (phrases)
         
     | 
| 545 | 
         
            +
                        phrases = recordings_and_state[-1]      # Last item is phrases
         
     | 
| 546 | 
         
            +
             
     | 
| 547 | 
         
            +
                        dataset_dir = PROJECT_ROOT / "datasets" / "voxtral_user"
         
     | 
| 548 | 
         
            +
                        wav_dir = dataset_dir / "wavs"
         
     | 
| 549 | 
         
            +
                        wav_dir.mkdir(parents=True, exist_ok=True)
         
     | 
| 550 | 
         
            +
             
     | 
| 551 | 
         
            +
                        rows = []
         
     | 
| 552 | 
         
            +
                        successful_recordings = 0
         
     | 
| 553 | 
         
            +
             
     | 
| 554 | 
         
            +
                        # Process each recording
         
     | 
| 555 | 
         
            +
                        for i, rec in enumerate(recordings):
         
     | 
| 556 | 
         
            +
                            if rec is not None and i < len(phrases):
         
     | 
| 557 | 
         
            +
                                try:
         
     | 
| 558 | 
         
            +
                                    sr, data = rec
         
     | 
| 559 | 
         
            +
                                    out_path = wav_dir / f"recording_{i:04d}.wav"
         
     | 
| 560 | 
         
            +
                                    sf.write(str(out_path), data, sr)
         
     | 
| 561 | 
         
            +
                                    rows.append({"audio_path": str(out_path), "text": phrases[i]})
         
     | 
| 562 | 
         
            +
                                    successful_recordings += 1
         
     | 
| 563 | 
         
            +
                                except Exception as e:
         
     | 
| 564 | 
         
            +
                                    print(f"Error processing recording {i}: {e}")
         
     | 
| 565 | 
         
            +
             
     | 
| 566 | 
         
            +
                        if rows:
         
     | 
| 567 | 
         
            +
                            jsonl_path = dataset_dir / "recorded_data.jsonl"
         
     | 
| 568 | 
         
            +
                            _write_jsonl(rows, jsonl_path)
         
     | 
| 569 | 
         
            +
                            return f"✅ Dataset created successfully! {successful_recordings} recordings saved to {jsonl_path}"
         
     | 
| 570 | 
         
            +
                        else:
         
     | 
| 571 | 
         
            +
                            return "❌ No recordings found. Please record some audio first."
         
     | 
| 572 | 
         
            +
             
     | 
| 573 | 
         
            +
                    except Exception as e:
         
     | 
| 574 | 
         
            +
                        return f"❌ Error creating dataset: {str(e)}"
         
     | 
| 575 | 
         
            +
             
     | 
| 576 | 
         
            +
                # Status display for dataset creation
         
     | 
| 577 | 
         
            +
                dataset_status = gr.Textbox(label="Dataset Creation Status", interactive=False, visible=True)
         
     | 
| 578 | 
         
            +
             
     | 
| 579 | 
         
            +
                record_dataset_btn.click(
         
     | 
| 580 | 
         
            +
                    create_recording_dataset,
         
     | 
| 581 | 
         
            +
                    inputs=rec_components + [phrase_texts_state],
         
     | 
| 582 | 
         
            +
                    outputs=[dataset_status]
         
     | 
| 583 | 
         
            +
                )
         
     | 
| 584 | 
         
            +
             
     | 
| 585 | 
         
             
                # Advanced options accordion
         
     | 
| 586 | 
         
             
                with gr.Accordion("Advanced options", open=False):
         
     | 
| 587 | 
         
             
                    base_model = gr.Textbox(value="mistralai/Voxtral-Mini-3B-2507", label="Base Voxtral model")
         
     | 
| 
         | 
|
| 653 | 
         
             
                    vp_btn = gr.Button("Use Multilingual Dataset Sample")
         
     | 
| 654 | 
         | 
| 655 | 
         
             
                    def _collect_multilingual_sample(lang_code: str, num_samples: int, split: str):
         
     | 
| 656 | 
         
            +
                        """Load sample from multilingual datasets (ML Commons preferred)"""
         
     | 
| 657 | 
         
             
                        from datasets import load_dataset, Audio
         
     | 
| 658 | 
         
             
                        import random
         
     | 
| 659 | 
         | 
| 660 | 
         
            +
                        # Language code mapping for ML Commons Speech
         
     | 
| 661 | 
         
            +
                        ml_lang_map = {
         
     | 
| 662 | 
         
             
                            "en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it",
         
     | 
| 663 | 
         
             
                            "pl": "pl", "pt": "pt", "nl": "nl", "ru": "ru", "ar": "ar",
         
     | 
| 664 | 
         
            +
                            "zh": "zh", "ja": "ja", "ko": "ko"
         
     | 
| 665 | 
         
             
                        }
         
     | 
| 666 | 
         | 
| 667 | 
         
            +
                        ml_lang = ml_lang_map.get(lang_code, lang_code)
         
     | 
| 668 | 
         
            +
             
     | 
| 669 | 
         
            +
                        try:
         
     | 
| 670 | 
         
            +
                            # Try ML Commons Speech first
         
     | 
| 671 | 
         
            +
                            ds = load_dataset("mlcommons/ml_spoken_words", f"speech_commands_{ml_lang}", split=split, streaming=True)
         
     | 
| 672 | 
         
            +
                            ds = ds.cast_column("audio", Audio(sampling_rate=16000))
         
     | 
| 673 | 
         
            +
             
     | 
| 674 | 
         
            +
                            dataset_dir = PROJECT_ROOT / "datasets" / "voxtral_user"
         
     | 
| 675 | 
         
            +
                            rows: list[dict] = []
         
     | 
| 676 | 
         
            +
                            texts: list[str] = []
         
     | 
| 677 | 
         
            +
             
     | 
| 678 | 
         
            +
                            count = 0
         
     | 
| 679 | 
         
            +
                            seen_words = set()
         
     | 
| 680 | 
         
            +
             
     | 
| 681 | 
         
            +
                            for ex in ds:
         
     | 
| 682 | 
         
            +
                                if count >= num_samples:
         
     | 
| 683 | 
         
            +
                                    break
         
     | 
| 684 | 
         
            +
             
     | 
| 685 | 
         
            +
                                audio = ex.get("audio") or {}
         
     | 
| 686 | 
         
            +
                                path = audio.get("path")
         
     | 
| 687 | 
         
            +
                                word = ex.get("word", "").strip()
         
     | 
| 688 | 
         
            +
             
     | 
| 689 | 
         
            +
                                if path and word and len(word) > 2 and word not in seen_words:
         
     | 
| 690 | 
         
            +
                                    rows.append({"audio_path": path, "text": word})
         
     | 
| 691 | 
         
            +
                                    texts.append(str(word))
         
     | 
| 692 | 
         
            +
                                    seen_words.add(word)
         
     | 
| 693 | 
         
            +
                                    count += 1
         
     | 
| 694 | 
         
            +
             
     | 
| 695 | 
         
            +
                            if rows:
         
     | 
| 696 | 
         
            +
                                jsonl_path = dataset_dir / "data.jsonl"
         
     | 
| 697 | 
         
            +
                                _write_jsonl(rows, jsonl_path)
         
     | 
| 698 | 
         
            +
             
     | 
| 699 | 
         
            +
                                # Build markdown content updates for on-screen prompts
         
     | 
| 700 | 
         
            +
                                combined_updates = []
         
     | 
| 701 | 
         
            +
                                for i in range(MAX_COMPONENTS):
         
     | 
| 702 | 
         
            +
                                    t = texts[i] if i < len(texts) else ""
         
     | 
| 703 | 
         
            +
                                    if i < len(texts):
         
     | 
| 704 | 
         
            +
                                        combined_updates.append(gr.update(value=f"**{i+1}. {t}**", visible=True))
         
     | 
| 705 | 
         
            +
                                    else:
         
     | 
| 706 | 
         
            +
                                        combined_updates.append(gr.update(visible=False))
         
     | 
| 707 | 
         
            +
             
     | 
| 708 | 
         
            +
                                return (str(jsonl_path), texts, *combined_updates)
         
     | 
| 709 | 
         
            +
             
     | 
| 710 | 
         
            +
                        except Exception as e:
         
     | 
| 711 | 
         
            +
                            print(f"ML Commons Speech sample loading failed: {e}")
         
     | 
| 712 | 
         | 
| 713 | 
         
            +
                        # Try Multilingual LibriSpeech as backup
         
     | 
| 714 | 
         
             
                        try:
         
     | 
| 715 | 
         
            +
                            ds = load_dataset("facebook/multilingual_librispeech", f"{lang_code}", split=split, streaming=True)
         
     | 
| 
         | 
|
| 716 | 
         
             
                            ds = ds.cast_column("audio", Audio(sampling_rate=16000))
         
     | 
| 717 | 
         | 
| 718 | 
         
             
                            dataset_dir = PROJECT_ROOT / "datasets" / "voxtral_user"
         
     | 
| 
         | 
|
| 726 | 
         | 
| 727 | 
         
             
                                audio = ex.get("audio") or {}
         
     | 
| 728 | 
         
             
                                path = audio.get("path")
         
     | 
| 729 | 
         
            +
                                text = ex.get("text", "").strip()
         
     | 
| 730 | 
         | 
| 731 | 
         
             
                                if path and text and len(text) > 10:
         
     | 
| 732 | 
         
             
                                    rows.append({"audio_path": path, "text": text})
         
     | 
| 
         | 
|
| 739 | 
         | 
| 740 | 
         
             
                                # Build markdown content updates for on-screen prompts
         
     | 
| 741 | 
         
             
                                combined_updates = []
         
     | 
| 742 | 
         
            +
                                for i in range(MAX_COMPONENTS):
         
     | 
| 743 | 
         
             
                                    t = texts[i] if i < len(texts) else ""
         
     | 
| 744 | 
         
             
                                    if i < len(texts):
         
     | 
| 745 | 
         
             
                                        combined_updates.append(gr.update(value=f"**{i+1}. {t}**", visible=True))
         
     | 
| 
         | 
|
| 749 | 
         
             
                                return (str(jsonl_path), texts, *combined_updates)
         
     | 
| 750 | 
         | 
| 751 | 
         
             
                        except Exception as e:
         
     | 
| 752 | 
         
            +
                            print(f"Multilingual LibriSpeech failed: {e}")
         
     | 
| 753 | 
         | 
| 754 | 
         
             
                        # Fallback: generate synthetic samples with text only
         
     | 
| 755 | 
         
             
                        print("Using fallback: generating text-only samples")
         
     | 
| 
         | 
|
| 763 | 
         | 
| 764 | 
         
             
                        # Build markdown content updates for on-screen prompts
         
     | 
| 765 | 
         
             
                        combined_updates = []
         
     | 
| 766 | 
         
            +
                        for i in range(MAX_COMPONENTS):
         
     | 
| 767 | 
         
             
                            t = texts[i] if i < len(texts) else ""
         
     | 
| 768 | 
         
             
                            if i < len(texts):
         
     | 
| 769 | 
         
             
                                combined_updates.append(gr.update(value=f"**{i+1}. {t}**", visible=True))
         
     |