Spaces:

hash-map
/

audio_to_phonome

Sleeping

App Files Community

hash-map committed on Sep 7

Commit

44938e7

verified ·

1 Parent(s): a32630e

Update infer.py

Browse files

Files changed (1) hide show

infer.py +128 -152

infer.py CHANGED Viewed

@@ -1,153 +1,129 @@
-import gradio as gr
-import torch
-import json
-import numpy as np
-import os
-from datetime import datetime
-from model import Image2Phoneme
-from utils import ctc_post_process, audio_to_mel, mel_to_image, text_to_phonemes
-import soundfile as sf
-import shutil
-import pronouncing
-import time
-# Configuration
-DEVICE = torch.device("cpu")
-PHMAP = "phoneme_to_id.json"
-AUDIO_DIR = "audio_inputs"
-# Ensure audio directory exists
-os.makedirs(AUDIO_DIR, exist_ok=True)
-# Load phoneme vocabulary
-try:
-    vocab = json.load(open(PHMAP, "r"))
-    id_to_ph = {v: k for k, v in vocab.items()}
-except FileNotFoundError:
-    raise FileNotFoundError(f"Phoneme mapping file not found at {PHMAP}")
-# Build model
-vocab_size = max(vocab.values()) + 1
-model = Image2Phoneme(vocab_size=vocab_size).to(DEVICE)
-try:
-    ckpt = torch.load("last_checkpoint.pt", map_location=DEVICE, weights_only=True)
-    model.load_state_dict(ckpt["model_state_dict"])
-    model.eval()
-except FileNotFoundError:
-    raise FileNotFoundError(f"Checkpoint file not found at last_checkpoint.pt")
-def process_audio(audio_input):
-    """Process audio to predict phonemes and display mel spectrogram."""
-    try:
-        print(f"Received audio_input before processing: {audio_input}")
-        # Generate unique filename based on timestamp
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        audio_path = os.path.join(AUDIO_DIR, f"input_{timestamp}.wav")
-        # Handle audio input
-        if audio_input is None:
-            print("Audio input is None after stopping recording")
-            return {"error": "No audio input provided"}, None, None, None
-        if isinstance(audio_input, str):
-            # File upload: Copy the uploaded file to audio_inputs/
-            print(f"Processing uploaded file: {audio_input}")
-            if not os.path.exists(audio_input):
-                return {"error": f"Uploaded file not found: {audio_input}"}, None, None, None
-            if audio_input.endswith(".mp3"):
-                print("Converting .mp3 to .wav")
-                from pydub import AudioSegment
-                audio = AudioSegment.from_mp3(audio_input)
-                audio_path = audio_path.replace(".wav", "_converted.wav")
-                audio.export(audio_path, format="wav")
-                print(f"Converted file saved to: {audio_path}")
-            else:
-                shutil.copy(audio_input, audio_path)
-                print(f"Copied file to: {audio_path}")
-        else:
-            # Microphone input: (sample_rate, audio_data)
-            print("Processing microphone input")
-            sample_rate, audio_data = audio_input
-            print(f"Sample rate: {sample_rate}, Audio data shape: {audio_data.shape if hasattr(audio_data, 'shape') else 'None'}")
-            if audio_data is None or len(audio_data) == 0:
-                print("Microphone audio data is empty or invalid")
-                return {"error": "Microphone input data is empty or invalid"}, None, None, None
-            # Add a small delay to ensure audio data is fully captured
-            time.sleep(1)
-            sf.write(audio_path, audio_data, sample_rate)
-            print(f"Saved microphone audio to: {audio_path}")
-            # Verify the file exists
-            if not os.path.exists(audio_path):
-                print(f"Failed to save audio file at: {audio_path}")
-                return {"error": "Failed to save recorded audio file"}, None, None, None
-        # Process audio to mel spectrogram
-        mel_path = audio_to_mel(audio_path)
-        print(f"Generated mel spectrogram: {mel_path}")
-        if not os.path.exists(mel_path):
-            return {"error": f"Mel spectrogram file not found: {mel_path}"}, None, None, None
-        mel_image_path = mel_to_image(mel_path)
-        print(f"Generated mel spectrogram image: {mel_image_path}")
-        if not os.path.exists(mel_image_path):
-            return {"error": f"Mel spectrogram image not found: {mel_image_path}"}, None, None, None
-        # Load mel spectrogram
-        mel = np.load(mel_path)  # shape (n_mels, T)
-        print(f"Loaded mel spectrogram shape: {mel.shape}")
-        mel_tensor = torch.tensor(mel).unsqueeze(0).to(DEVICE)  # add batch dim
-        mel_lens = torch.tensor([mel.shape[1]]).to(DEVICE)
-        # Predict phonemes
-        with torch.no_grad():
-            ph_pred = model(mel_tensor)  # shape (B, seq_len, vocab_size)
-            ph_ids = ph_pred.argmax(-1)[0].cpu().numpy()  # pick first batch
-            print(f"Predicted phoneme IDs: {ph_ids}")
-        # Convert IDs to phonemes
-        ph_seq = [id_to_ph[i] for i in ph_ids if i > 0]
-        print(f"Raw phonemes: {ph_seq}")
-        # Post-process phonemes
-        post_processed = ctc_post_process(ph_seq)
-        print(f"Post-processed phonemes: {post_processed}")
-        # Return results
-        return {
-            "audio_path": audio_path,
-            "phonemes": " ".join(ph_seq),
-            "post_processed_phonemes": " ".join(post_processed)
-        }, mel_image_path, " ".join(ph_seq), " ".join(post_processed)
-    except Exception as e:
-        print(f"Error in process_audio: {str(e)}")
-        return {"error": f"Processing failed: {str(e)}"}, None, None, None
-# Gradio interface
-with gr.Blocks() as iface:
-    gr.Markdown("# Speech to Phonemes Converter")
-    gr.Markdown("Record or upload audio to predict phonemes and display mel spectrogram. Paste input text if available.")
-    audio_input = gr.Audio(sources=[ "upload"], type="filepath", label="Upload Audio (.wav or .mp3)", interactive=True)
-    text_input = gr.Textbox(label="Enter Text", placeholder="Type a sentence to convert to phonemes")
-    process_button = gr.Button("Process")
-    audio_output = gr.JSON(label="Audio Processing Results (Audio Path, Phonemes, Post-Processed Phonemes)")
-    mel_image = gr.Image(label="Mel Spectrogram", type="filepath")
-    raw_phonemes = gr.Textbox(label="Raw Phonemes")
-    post_processed_phonemes = gr.Textbox(label="Post-Processed Phonemes")
-    text_output = gr.JSON(label="Text-to-Phoneme Results")
-    def process(audio_input, text_input):
-        print(f"Processing inputs - Audio: {audio_input}, Text: {text_input}")
-        audio_result, mel_image_path, raw_ph, post_ph = process_audio(audio_input) if audio_input else ({}, None, None, None)
-        text_result = text_to_phonemes(text_input) if text_input else {}
-        return audio_result, mel_image_path, raw_ph, post_ph, text_result
-    process_button.click(
-        fn=process,
-        inputs=[audio_input, text_input],
-        outputs=[audio_output, mel_image, raw_phonemes, post_processed_phonemes, text_output]
-    )
-if __name__ == "__main__":
     iface.launch(debug=True)

+import gradio as gr
+import torch
+import json
+import numpy as np
+import os
+from datetime import datetime
+from model import Image2Phoneme
+from utils import ctc_post_process, audio_to_mel, mel_to_image, text_to_phonemes
+import soundfile as sf
+import shutil
+import time
+# Configuration
+DEVICE = torch.device("cpu")
+PHMAP = "phoneme_to_id.json"
+AUDIO_DIR = "audio_inputs"
+# Ensure audio directory exists
+os.makedirs(AUDIO_DIR, exist_ok=True)
+# Load phoneme vocabulary
+try:
+    vocab = json.load(open(PHMAP, "r"))
+    id_to_ph = {v: k for k, v in vocab.items()}
+except FileNotFoundError:
+    raise FileNotFoundError(f"Phoneme mapping file not found at {PHMAP}")
+# Build model
+vocab_size = max(vocab.values()) + 1
+model = Image2Phoneme(vocab_size=vocab_size).to(DEVICE)
+try:
+    ckpt = torch.load("last_checkpoint.pt", map_location=DEVICE, weights_only=True)
+    model.load_state_dict(ckpt["model_state_dict"])
+    model.eval()
+except FileNotFoundError:
+    raise FileNotFoundError(f"Checkpoint file not found at last_checkpoint.pt")
+def process_audio(audio_input):
+    """Process audio to predict phonemes and display mel spectrogram."""
+    try:
+        print(f"Received audio_input before processing: {audio_input}")
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        audio_path = os.path.join(AUDIO_DIR, f"input_{timestamp}.wav")
+        if audio_input is None:
+            print("Audio input is None")
+            return {"error": "No audio input provided"}, None, None, None
+        if isinstance(audio_input, str):
+            print(f"Processing uploaded file: {audio_input}")
+            if not os.path.exists(audio_input):
+                return {"error": f"Uploaded file not found: {audio_input}"}, None, None, None
+            if audio_input.endswith(".mp3"):
+                print("Converting .mp3 to .wav")
+                from pydub import AudioSegment
+                audio = AudioSegment.from_mp3(audio_input)
+                audio_path = audio_path.replace(".wav", "_converted.wav")
+                audio.export(audio_path, format="wav")
+                print(f"Converted file saved to: {audio_path}")
+            else:
+                shutil.copy(audio_input, audio_path)
+                print(f"Copied file to: {audio_path}")
+        else:
+            raise ValueError("Microphone input not supported in this configuration")
+        mel_path = audio_to_mel(audio_path)
+        print(f"Generated mel spectrogram: {mel_path}")
+        if not os.path.exists(mel_path):
+            return {"error": f"Mel spectrogram file not found: {mel_path}"}, None, None, None
+        mel_image_path = mel_to_image(mel_path)
+        print(f"Generated mel spectrogram image: {mel_image_path}")
+        if not os.path.exists(mel_image_path):
+            return {"error": f"Mel spectrogram image not found: {mel_image_path}"}, None, None, None
+        mel = np.load(mel_path)
+        print(f"Loaded mel spectrogram shape: {mel.shape}")
+        mel_tensor = torch.tensor(mel).unsqueeze(0).to(DEVICE)
+        mel_lens = torch.tensor([mel.shape[1]]).to(DEVICE)
+        with torch.no_grad():
+            ph_pred = model(mel_tensor)
+            ph_ids = ph_pred.argmax(-1)[0].cpu().numpy()
+            print(f"Predicted phoneme IDs: {ph_ids}")
+        ph_seq = [id_to_ph[i] for i in ph_ids if i > 0]
+        print(f"Raw phonemes: {ph_seq}")
+        post_processed = ctc_post_process(ph_seq)
+        print(f"Post-processed phonemes: {post_processed}")
+        return {
+            "audio_path": audio_path,
+            "phonemes": " ".join(ph_seq),
+            "post_processed_phonemes": " ".join(post_processed)
+        }, mel_image_path, " ".join(ph_seq), " ".join(post_processed)
+    except Exception as e:
+        print(f"Error in process_audio: {str(e)}")
+        return {"error": f"Processing failed: {str(e)}"}, None, None, None
+# Gradio interface
+with gr.Blocks() as iface:
+    gr.Markdown("# Speech to Phonemes Converter")
+    gr.Markdown("Upload audio to predict phonemes and display mel spectrogram. Enter text to convert to phonemes.")
+    audio_input = gr.Audio(sources=["upload"], type="filepath", label="Upload Audio (.wav or .mp3)", interactive=True)
+    text_input = gr.Textbox(label="Enter Text", placeholder="Type a sentence to convert to phonemes")
+    process_button = gr.Button("Process")
+    audio_output = gr.JSON(label="Audio Processing Results (Audio Path, Phonemes, Post-Processed Phonemes)")
+    mel_image = gr.Image(label="Mel Spectrogram", type="filepath")
+    raw_phonemes = gr.Textbox(label="Raw Phonemes")
+    post_processed_phonemes = gr.Textbox(label="Post-Processed Phonemes")
+    text_output = gr.JSON(label="Text-to-Phoneme Results")
+    def process(audio_input, text_input):
+        print(f"Processing inputs - Audio: {audio_input}, Text: {text_input}")
+        audio_result, mel_image_path, raw_ph, post_ph = process_audio(audio_input) if audio_input else ({}, None, None, None)
+        text_result = text_to_phonemes(text_input) if text_input and text_input.strip() else {}
+        return audio_result, mel_image_path, raw_ph, post_ph, text_result
+    process_button.click(
+        fn=process,
+        inputs=[audio_input, text_input],
+        outputs=[audio_output, mel_image, raw_phonemes, post_processed_phonemes, text_output]
+    )
+if __name__ == "__main__":
     iface.launch(debug=True)