hash-map committed on
Commit a32630e · verified · 1 Parent(s): 944a05e

Update utils.py

Files changed (1)
  1. utils.py +134 -133
utils.py CHANGED
@@ -1,133 +1,134 @@
-# utils.py
-import numpy as np
-import librosa
-from matplotlib import pyplot as plt
-SR = 22050
-HOP_LENGTH = 256
-def mel_to_audio(mel_db, sr=22050, n_fft=1024, hop_length=256, win_length=1024, n_iter=60):
-    # mel_db: (n_mels, T) in dB (like saved from preprocess)
-    S = librosa.db_to_power(mel_db)
-    # invert mel to linear spectrogram
-    mel_basis = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=S.shape[0])
-    # Approximate inverse using pseudo inverse
-    inv_mel = np.maximum(1e-10, np.linalg.pinv(mel_basis).dot(S))
-    # Griffin-Lim
-    audio = librosa.griffinlim(inv_mel, n_iter=n_iter, hop_length=hop_length, win_length=win_length)
-    return audio
-
-from g2p_en import G2p
-g2p = G2p()
-
-def text_to_phonemes(text):
-    ph = g2p(text)
-    # Remove spaces/punct tokens produced by g2p_en
-    ph = [p for p in ph if p.isalpha()]
-    return " ".join(ph)
-
-import librosa
-import numpy as np
-import os
-
-def audio_to_mel(audio_path, save_dir="mels", sr=22050, n_fft=1024, hop_length=256, win_length=1024, n_mels=80):
-    # Load audio
-    y, _ = librosa.load(audio_path, sr=sr)
-
-    # Compute STFT magnitude
-    S = np.abs(librosa.stft(y, n_fft=n_fft, hop_length=hop_length, win_length=win_length))
-
-    # Convert to mel spectrogram
-    mel_basis = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels)
-    mel = np.dot(mel_basis, S)
-
-    # Convert to dB
-    mel_db = librosa.power_to_db(mel)
-
-    # Make sure save directory exists
-    os.makedirs(save_dir, exist_ok=True)
-
-    # Save mel as .npy file
-    base_name = os.path.splitext(os.path.basename(audio_path))[0]
-    mel_path = os.path.join(save_dir, base_name + "_mel.npy")
-    np.save(mel_path, mel_db)
-
-    return mel_path
-
-
-def ctc_post_process(phonemes):
-    """
-    Collapse repeats + remove blanks ('-') in CTC output.
-    phonemes: list of predicted phoneme tokens
-    """
-    new_seq = []
-    prev = None
-    for p in phonemes:
-        if p == "-" or p == prev:
-            continue
-        new_seq.append(p)
-        prev = p
-    return new_seq
-
-
-import numpy as np
-import matplotlib.pyplot as plt
-import librosa.display
-
-def mel_to_image(mel_path, sr=22050, hop_length=256, save_fig=True):
-    # Load mel spectrogram from .npy
-    mel_db = np.load(mel_path)
-
-    # Create figure
-    plt.figure(figsize=(14, 6))
-
-    # Plot mel spectrogram
-    librosa.display.specshow(mel_db, sr=sr, hop_length=hop_length, x_axis='time', y_axis='mel', cmap='magma')
-    plt.title("Mel Spectrogram (dB)")
-    plt.colorbar(format="%+2.0f dB")
-
-    save_path = mel_path.replace('.npy', '_mel.png')
-    plt.savefig(save_path)
-    print(f"Saved mel spectrogram image at: {save_path}")
-
-    """plt.show()"""
-    return save_path
-# load reverse lexicon: phoneme_seq -> [words]
-import nltk
-from collections import defaultdict
-
-nltk.download('cmudict')
-arpabet = nltk.corpus.cmudict.dict()
-
-# Build reverse lexicon
-reverse_lex = defaultdict(list)
-for word, pron_list in arpabet.items():
-    for pron in pron_list:
-        reverse_lex[tuple(pron)].append(word)
-
-def split_on_boundaries(phoneme_stream, boundary_token="<w>"):
-    """Split on a special token representing word boundaries."""
-    words = []
-    current = []
-    for phon in phoneme_stream:
-        if phon == boundary_token:
-            if current:
-                words.append(current)
-            current = []
-        else:
-            current.append(phon)
-    if current:
-        words.append(current)
-    return words
-
-def p2g_fallback(phoneme_word):
-    # Placeholder for fallback pronunciation-to-spelling
-    return "".join(phoneme_word).lower()
-
-def phonemes_to_text(phoneme_stream):
-    words = []
-    for phoneme_word in split_on_boundaries(phoneme_stream):
-        candidates = reverse_lex.get(tuple(phoneme_word), [])
-        if candidates:
-            words.append(candidates[0])
-        else:
-            words.append(p2g_fallback(phoneme_word))
-    return " ".join(words)
 
 
+# utils.py
+import numpy as np
+import librosa
+from matplotlib import pyplot as plt
+SR = 22050
+HOP_LENGTH = 256
+def mel_to_audio(mel_db, sr=22050, n_fft=1024, hop_length=256, win_length=1024, n_iter=60):
+    # mel_db: (n_mels, T) in dB (like saved from preprocess)
+    S = librosa.db_to_power(mel_db)
+    # invert mel to linear spectrogram
+    mel_basis = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=S.shape[0])
+    # Approximate inverse using pseudo inverse
+    inv_mel = np.maximum(1e-10, np.linalg.pinv(mel_basis).dot(S))
+    # Griffin-Lim
+    audio = librosa.griffinlim(inv_mel, n_iter=n_iter, hop_length=hop_length, win_length=win_length)
+    return audio
+import nltk
+nltk.download('averaged_perceptron_tagger_eng')
+from g2p_en import G2p
+g2p = G2p()
+
+def text_to_phonemes(text):
+    ph = g2p(text)
+    # Remove spaces/punct tokens produced by g2p_en
+    ph = [p for p in ph if p.isalpha()]
+    return " ".join(ph)
+
+import librosa
+import numpy as np
+import os
+
+def audio_to_mel(audio_path, save_dir="mels", sr=22050, n_fft=1024, hop_length=256, win_length=1024, n_mels=80):
+    # Load audio
+    y, _ = librosa.load(audio_path, sr=sr)
+
+    # Compute STFT magnitude
+    S = np.abs(librosa.stft(y, n_fft=n_fft, hop_length=hop_length, win_length=win_length))
+
+    # Convert to mel spectrogram
+    mel_basis = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels)
+    mel = np.dot(mel_basis, S)
+
+    # Convert to dB
+    mel_db = librosa.power_to_db(mel)
+
+    # Make sure save directory exists
+    os.makedirs(save_dir, exist_ok=True)
+
+    # Save mel as .npy file
+    base_name = os.path.splitext(os.path.basename(audio_path))[0]
+    mel_path = os.path.join(save_dir, base_name + "_mel.npy")
+    np.save(mel_path, mel_db)
+
+    return mel_path
+
+
+def ctc_post_process(phonemes):
+    """
+    Collapse repeats + remove blanks ('-') in CTC output.
+    phonemes: list of predicted phoneme tokens
+    """
+    new_seq = []
+    prev = None
+    for p in phonemes:
+        if p == "-" or p == prev:
+            continue
+        new_seq.append(p)
+        prev = p
+    return new_seq
+
+
+import numpy as np
+import matplotlib.pyplot as plt
+import librosa.display
+
+def mel_to_image(mel_path, sr=22050, hop_length=256, save_fig=True):
+    # Load mel spectrogram from .npy
+    mel_db = np.load(mel_path)
+
+    # Create figure
+    plt.figure(figsize=(14, 6))
+
+    # Plot mel spectrogram
+    librosa.display.specshow(mel_db, sr=sr, hop_length=hop_length, x_axis='time', y_axis='mel', cmap='magma')
+    plt.title("Mel Spectrogram (dB)")
+    plt.colorbar(format="%+2.0f dB")
+
+    save_path = mel_path.replace('.npy', '_mel.png')
+    plt.savefig(save_path)
+    print(f"Saved mel spectrogram image at: {save_path}")
+
+    """plt.show()"""
+    return save_path
+# load reverse lexicon: phoneme_seq -> [words]
+import nltk
+from collections import defaultdict
+
+nltk.download('cmudict')
+arpabet = nltk.corpus.cmudict.dict()
+
+# Build reverse lexicon
+reverse_lex = defaultdict(list)
+for word, pron_list in arpabet.items():
+    for pron in pron_list:
+        reverse_lex[tuple(pron)].append(word)
+
+def split_on_boundaries(phoneme_stream, boundary_token="<w>"):
+    """Split on a special token representing word boundaries."""
+    words = []
+    current = []
+    for phon in phoneme_stream:
+        if phon == boundary_token:
+            if current:
+                words.append(current)
+            current = []
+        else:
+            current.append(phon)
+    if current:
+        words.append(current)
+    return words
+
+def p2g_fallback(phoneme_word):
+    # Placeholder for fallback pronunciation-to-spelling
+    return "".join(phoneme_word).lower()
+
+def phonemes_to_text(phoneme_stream):
+    words = []
+    for phoneme_word in split_on_boundaries(phoneme_stream):
+        candidates = reverse_lex.get(tuple(phoneme_word), [])
+        if candidates:
+            words.append(candidates[0])
+        else:
+            words.append(p2g_fallback(phoneme_word))
+    return " ".join(words)
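
The only functional change in this commit is the new import nltk / nltk.download('averaged_perceptron_tagger_eng') pair ahead of the g2p_en import, presumably because recent NLTK releases ship the English POS tagger as the separate averaged_perceptron_tagger_eng resource that g2p_en needs at runtime. Below is a minimal usage sketch, not part of the commit: it assumes utils.py is importable, that a mono WAV exists at the hypothetical path sample.wav, and that the soundfile package is installed for writing the reconstructed audio.

# usage_sketch.py -- illustrative only, not part of this repository
import numpy as np
import soundfile as sf  # assumption: soundfile is installed for writing WAV output

from utils import (audio_to_mel, mel_to_image, mel_to_audio,
                   text_to_phonemes, ctc_post_process, phonemes_to_text)

# Text -> phoneme string via g2p_en (note: tokens carrying stress digits,
# e.g. "AH0", fail the isalpha() filter inside text_to_phonemes and are dropped)
print(text_to_phonemes("hello world"))

# Audio -> mel .npy -> PNG preview, then a rough waveform back via Griffin-Lim
mel_path = audio_to_mel("sample.wav")          # hypothetical input file
mel_to_image(mel_path)
audio = mel_to_audio(np.load(mel_path))
sf.write("reconstructed.wav", audio, 22050)    # sample rate matches the module default

# CTC cleanup + reverse-lexicon lookup ("<w>" marks word boundaries)
tokens = ["-", "HH", "HH", "AH0", "L", "OW1", "-", "<w>", "W", "ER1", "L", "D"]
print(phonemes_to_text(ctc_post_process(tokens)))   # first CMUdict homophone per word, e.g. "hello world"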