Spaces:

hash-map
/

audio_to_phonome

Sleeping

App Files Files Community

hash-map committed on Sep 7

Commit

a32630e

verified ·

1 Parent(s): 944a05e

Update utils.py

Browse files

Files changed (1) hide show

utils.py +134 -133

utils.py CHANGED Viewed

@@ -1,133 +1,134 @@
-# utils.py
-import numpy as np
-import librosa
-from matplotlib import pyplot as plt
-SR = 22050
-HOP_LENGTH = 256
-def mel_to_audio(mel_db, sr=22050, n_fft=1024, hop_length=256, win_length=1024, n_iter=60):
-    # mel_db: (n_mels, T) in dB (like saved from preprocess)
-    S = librosa.db_to_power(mel_db)
-    # invert mel to linear spectrogram
-    mel_basis = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=S.shape[0])
-    # Approximate inverse using pseudo inverse
-    inv_mel = np.maximum(1e-10, np.linalg.pinv(mel_basis).dot(S))
-    # Griffin-Lim
-    audio = librosa.griffinlim(inv_mel, n_iter=n_iter, hop_length=hop_length, win_length=win_length)
-    return audio
-from g2p_en import G2p
-g2p = G2p()
-def text_to_phonemes(text):
-    ph = g2p(text)
-    # Remove spaces/punct tokens produced by g2p_en
-    ph = [p for p in ph if p.isalpha()]
-    return " ".join(ph)
-import librosa
-import numpy as np
-import os
-def audio_to_mel(audio_path, save_dir="mels", sr=22050, n_fft=1024, hop_length=256, win_length=1024, n_mels=80):
-    # Load audio
-    y, _ = librosa.load(audio_path, sr=sr)
-    # Compute STFT magnitude
-    S = np.abs(librosa.stft(y, n_fft=n_fft, hop_length=hop_length, win_length=win_length))
-    # Convert to mel spectrogram
-    mel_basis = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels)
-    mel = np.dot(mel_basis, S)
-    # Convert to dB
-    mel_db = librosa.power_to_db(mel)
-    # Make sure save directory exists
-    os.makedirs(save_dir, exist_ok=True)
-    # Save mel as .npy file
-    base_name = os.path.splitext(os.path.basename(audio_path))[0]
-    mel_path = os.path.join(save_dir, base_name + "_mel.npy")
-    np.save(mel_path, mel_db)
-    return mel_path
-def ctc_post_process(phonemes):
-    """
-    Collapse repeats + remove blanks ('-') in CTC output.
-    phonemes: list of predicted phoneme tokens
-    """
-    new_seq = []
-    prev = None
-    for p in phonemes:
-        if p == "-" or p == prev:
-            continue
-        new_seq.append(p)
-        prev = p
-    return new_seq
-import numpy as np
-import matplotlib.pyplot as plt
-import librosa.display
-def mel_to_image(mel_path, sr=22050, hop_length=256, save_fig=True):
-    # Load mel spectrogram from .npy
-    mel_db = np.load(mel_path)
-    # Create figure
-    plt.figure(figsize=(14, 6))
-    # Plot mel spectrogram
-    librosa.display.specshow(mel_db, sr=sr, hop_length=hop_length, x_axis='time', y_axis='mel', cmap='magma')
-    plt.title("Mel Spectrogram (dB)")
-    plt.colorbar(format="%+2.0f dB")
-    save_path = mel_path.replace('.npy', '_mel.png')
-    plt.savefig(save_path)
-    print(f"Saved mel spectrogram image at: {save_path}")
-    """plt.show()"""
-    return save_path
-# load reverse lexicon: phoneme_seq -> [words]
-import nltk
-from collections import defaultdict
-nltk.download('cmudict')
-arpabet = nltk.corpus.cmudict.dict()
-# Build reverse lexicon
-reverse_lex = defaultdict(list)
-for word, pron_list in arpabet.items():
-    for pron in pron_list:
-        reverse_lex[tuple(pron)].append(word)
-def split_on_boundaries(phoneme_stream, boundary_token="<w>"):
-    """Split on a special token representing word boundaries."""
-    words = []
-    current = []
-    for phon in phoneme_stream:
-        if phon == boundary_token:
-            if current:
-                words.append(current)
-                current = []
-        else:
-            current.append(phon)
-    if current:
-        words.append(current)
-    return words
-def p2g_fallback(phoneme_word):
-    # Placeholder for fallback pronunciation-to-spelling
-    return "".join(phoneme_word).lower()
-def phonemes_to_text(phoneme_stream):
-    words = []
-    for phoneme_word in split_on_boundaries(phoneme_stream):
-        candidates = reverse_lex.get(tuple(phoneme_word), [])
-        if candidates:
-            words.append(candidates[0])
-        else:
-            words.append(p2g_fallback(phoneme_word))
-    return " ".join(words)

+# utils.py
+import numpy as np
+import librosa
+from matplotlib import pyplot as plt
+SR = 22050
+HOP_LENGTH = 256
+def mel_to_audio(mel_db, sr=22050, n_fft=1024, hop_length=256, win_length=1024, n_iter=60):
+    # mel_db: (n_mels, T) in dB (like saved from preprocess)
+    S = librosa.db_to_power(mel_db)
+    # invert mel to linear spectrogram
+    mel_basis = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=S.shape[0])
+    # Approximate inverse using pseudo inverse
+    inv_mel = np.maximum(1e-10, np.linalg.pinv(mel_basis).dot(S))
+    # Griffin-Lim
+    audio = librosa.griffinlim(inv_mel, n_iter=n_iter, hop_length=hop_length, win_length=win_length)
+    return audio
+import nltk
+nltk.download('averaged_perceptron_tagger_eng')  # runs at import time, before g2p_en is imported; presumably G2p needs this tagger resource -- confirm
+from g2p_en import G2p
+g2p = G2p()  # single module-level instance, called by text_to_phonemes
+def text_to_phonemes(text):
+    ph = g2p(text)
+    # Remove spaces/punct tokens produced by g2p_en
+    ph = [p for p in ph if p.isalpha()]
+    return " ".join(ph)
+import librosa
+import numpy as np
+import os
+def audio_to_mel(audio_path, save_dir="mels", sr=22050, n_fft=1024, hop_length=256, win_length=1024, n_mels=80):
+    # Load audio
+    y, _ = librosa.load(audio_path, sr=sr)
+    # Compute STFT magnitude
+    S = np.abs(librosa.stft(y, n_fft=n_fft, hop_length=hop_length, win_length=win_length))
+    # Convert to mel spectrogram
+    mel_basis = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels)
+    mel = np.dot(mel_basis, S)
+    # Convert to dB
+    mel_db = librosa.power_to_db(mel)
+    # Make sure save directory exists
+    os.makedirs(save_dir, exist_ok=True)
+    # Save mel as .npy file
+    base_name = os.path.splitext(os.path.basename(audio_path))[0]
+    mel_path = os.path.join(save_dir, base_name + "_mel.npy")
+    np.save(mel_path, mel_db)
+    return mel_path
+def ctc_post_process(phonemes):
+    """
+    Collapse repeats + remove blanks ('-') in CTC output.
+    phonemes: list of predicted phoneme tokens
+    """
+    new_seq = []
+    prev = None
+    for p in phonemes:
+        if p == "-" or p == prev:
+            continue
+        new_seq.append(p)
+        prev = p
+    return new_seq
+import numpy as np
+import matplotlib.pyplot as plt
+import librosa.display
+def mel_to_image(mel_path, sr=22050, hop_length=256, save_fig=True):
+    # Load mel spectrogram from .npy
+    mel_db = np.load(mel_path)
+    # Create figure
+    plt.figure(figsize=(14, 6))
+    # Plot mel spectrogram
+    librosa.display.specshow(mel_db, sr=sr, hop_length=hop_length, x_axis='time', y_axis='mel', cmap='magma')
+    plt.title("Mel Spectrogram (dB)")
+    plt.colorbar(format="%+2.0f dB")
+    save_path = mel_path.replace('.npy', '_mel.png')
+    plt.savefig(save_path)
+    print(f"Saved mel spectrogram image at: {save_path}")
+    """plt.show()"""
+    return save_path
+# Reverse lexicon built from CMUdict: pronunciation (tuple of phonemes) -> [words]
+import nltk
+from collections import defaultdict
+nltk.download('cmudict')  # executed whenever this module is imported
+arpabet = nltk.corpus.cmudict.dict()  # maps each word to a list of pronunciations
+# Build reverse lexicon
+reverse_lex = defaultdict(list)
+for word, pron_list in arpabet.items():
+    for pron in pron_list:
+        reverse_lex[tuple(pron)].append(word)  # tuple() makes the pronunciation hashable; words sharing it accumulate in insertion order
+def split_on_boundaries(phoneme_stream, boundary_token="<w>"):
+    """Split on a special token representing word boundaries."""
+    words = []
+    current = []
+    for phon in phoneme_stream:
+        if phon == boundary_token:
+            if current:
+                words.append(current)
+                current = []
+        else:
+            current.append(phon)
+    if current:
+        words.append(current)
+    return words
+def p2g_fallback(phoneme_word):
+    # Placeholder for fallback pronunciation-to-spelling
+    return "".join(phoneme_word).lower()
+def phonemes_to_text(phoneme_stream):
+    words = []
+    for phoneme_word in split_on_boundaries(phoneme_stream):
+        candidates = reverse_lex.get(tuple(phoneme_word), [])
+        if candidates:
+            words.append(candidates[0])
+        else:
+            words.append(p2g_fallback(phoneme_word))
+    return " ".join(words)