Update app.py

app.py CHANGED
@@ -13,35 +13,47 @@ model_urls = [
     "kiranpantha/whisper-large-v3-turbo-nepali",
 ]
 
+# Mapping model names correctly
+processor_mappings = {
+    "kiranpantha/whisper-tiny-ne": "openai/whisper-tiny",
+    "kiranpantha/whisper-base-ne": "openai/whisper-base",
+    "kiranpantha/whisper-small-np": "openai/whisper-small",
+    "kiranpantha/whisper-medium-nepali": "openai/whisper-medium",
+    "kiranpantha/whisper-large-v3-nepali": "openai/whisper-large-v3",
+    "kiranpantha/whisper-large-v3-turbo-nepali": "openai/whisper-large-v3",
+}
+
 # Cache models and processors
 model_cache = {}
 
 def load_model(model_name):
     """Loads and caches the model and processor with proper device management."""
     if model_name not in model_cache:
-        processor_name = model_name.replace(
-            "-nepali", "").replace("-ne", "").replace("-np", "")
+        processor_name = processor_mappings.get(model_name, model_name)  # Handle mapping
 
-        # Load processor and model
         processor = AutoProcessor.from_pretrained(processor_name)
-        device = "cuda" if torch.cuda.is_available() else "cpu"
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name).to(device)
+
         model_cache[model_name] = (processor, model, device)
+
     return model_cache[model_name]
 
 def create_pipeline(model_name):
     """Creates an ASR pipeline with proper configuration."""
     processor, model, device = load_model(model_name)
+
     return AutomaticSpeechRecognitionPipeline(
         model=model,
         processor=processor,
-        device=device,
-        generate_kwargs={"task": "transcribe", "language": "nepali"}
+        device=device.index if device.type == "cuda" else -1,  # Ensure compatibility
+        generate_kwargs={"task": "transcribe", "language": "ne"}  # "nepali" might not work
     )
 
 def process_audio(model_url, audio_chunk):
     """Processes audio and returns transcription with error handling."""
     try:
+        # Unpack audio_chunk (tuple) into audio array and sample rate
        audio_array, sample_rate = audio_chunk
 
         # Convert stereo to mono
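Review note on this hunk: the deleted suffix-stripping lines are truncated in the page rendering (model_name.replace( is a reconstruction from the surviving fragment), and they produced ids like kiranpantha/whisper-large-v3-turbo, which need not host processor files on the Hub, so the explicit processor_mappings lookup is the fix. A minimal sketch of the two behaviors, using only ids already listed in the diff:

model_name = "kiranpantha/whisper-large-v3-turbo-nepali"

# Old approach (reconstructed): strip locale suffixes from the repo id.
stripped = model_name.replace("-nepali", "").replace("-ne", "").replace("-np", "")
print(stripped)  # kiranpantha/whisper-large-v3-turbo -- likely no processor repo there

# New approach: explicit mapping, falling back to the model's own repo.
processor_mappings = {"kiranpantha/whisper-large-v3-turbo-nepali": "openai/whisper-large-v3"}
print(processor_mappings.get(model_name, model_name))  # openai/whisper-large-v3

One caveat worth checking on the device handling: torch.device("cuda") with no explicit index has device.index == None, so the pipeline's device argument may need "cuda:0" (or device.index or 0) on single-GPU machines; recent transformers releases also accept a torch.device directly.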
@@ -51,7 +63,7 @@ def process_audio(model_url, audio_chunk):
         # Resample to 16kHz if needed
         if sample_rate != 16000:
             resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
-            audio_array = resampler(torch.tensor(audio_array)).numpy()
+            audio_array = resampler(torch.tensor(audio_array).unsqueeze(0)).squeeze(0).numpy()
 
         # Create pipeline and process
         asr_pipeline = create_pipeline(model_url)
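The one-line change here gives torchaudio an explicit channel dimension: Resample treats the last dimension as time, so the unsqueeze(0)/squeeze(0) pair keeps the shape handling unambiguous for mono input. A self-contained check with synthetic audio (not from app.py):

import torch
import torchaudio

sample_rate = 44100
audio_array = torch.randn(sample_rate).numpy()  # 1 s of float32 mono noise

resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
resampled = resampler(torch.tensor(audio_array).unsqueeze(0)).squeeze(0).numpy()
print(resampled.shape)  # (16000,)

Two caveats: Resample expects floating-point input, while Gradio's numpy audio is typically int16, so a cast may be needed; and gr.Audio(type="numpy") passes (sample_rate, data) in current Gradio releases, which would make the unpack order in process_audio reversed. Both are worth verifying against the installed Gradio version.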
@@ -65,24 +77,15 @@ def process_audio(model_url, audio_chunk):
 with gr.Blocks() as demo:
     gr.Markdown("# Nepali Speech Recognition with Whisper Models")
 
-    model_dropdown = gr.Dropdown(
-        choices=model_urls,
-        label="Select Model",
-        value=model_urls[0]
-    )
-
-    audio_input = gr.Audio(
-        type="numpy",
-        label="Input Audio",
-        streaming=True
-    )
-
+    model_dropdown = gr.Dropdown(choices=model_urls, label="Select Model", value=model_urls[0])
+    audio_input = gr.Audio(type="numpy", label="Input Audio")
     output_text = gr.Textbox(label="Transcription")
-
-    audio_input.stream(
+    transcribe_button = gr.Button("Transcribe")
+
+    transcribe_button.click(
         fn=process_audio,
         inputs=[model_dropdown, audio_input],
         outputs=output_text,
     )
 
-    demo.launch()
+demo.launch()
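The deleted event binding is truncated on this page; audio_input.stream( is an inference from the removed streaming=True flag and the preserved fn/inputs/outputs arguments. The switch to an explicit button means the handler now receives the whole recording once instead of per-chunk callbacks. A runnable toy version of the new wiring, with a hypothetical echo handler standing in for process_audio:

import gradio as gr

def echo(choice, audio):  # placeholder for process_audio
    return f"model={choice}, received audio: {audio is not None}"

with gr.Blocks() as demo:
    model_dropdown = gr.Dropdown(choices=["a", "b"], label="Select Model", value="a")
    audio_input = gr.Audio(type="numpy", label="Input Audio")
    output_text = gr.Textbox(label="Transcription")
    transcribe_button = gr.Button("Transcribe")

    # Button-triggered inference: the clip arrives as one tuple on click.
    transcribe_button.click(fn=echo, inputs=[model_dropdown, audio_input], outputs=output_text)

demo.launch()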
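For completeness, a hypothetical local smoke test of the updated process_audio (not part of the commit; it downloads the selected checkpoint on first run and assumes app.py's imports are in scope):

import numpy as np

sr = 22050
t = np.linspace(0, 1, sr, dtype=np.float32)
tone = 0.1 * np.sin(2 * np.pi * 440 * t)  # 1 s, 440 Hz mono tone

# process_audio unpacks (audio_array, sample_rate); swap the tuple if your
# Gradio version delivers (sample_rate, data) instead.
print(process_audio("kiranpantha/whisper-large-v3-turbo-nepali", (tone, sr)))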