KevinGeng committed on
Commit
5407cce
1 Parent(s): 5171765

support pitch contour and db plotting

Browse files
Files changed (2) hide show
  1. app.py +10 -65
  2. local/pitch_contour.py +107 -0
app.py CHANGED
@@ -8,7 +8,7 @@ import pdb
8
  import jiwer
9
  from local.convert_metrics import nat2avaMOS, WER2INTELI
10
  from local.indicator_plot import Intelligibility_Plot, Naturalness_Plot
11
-
12
  # ASR part
13
  from transformers import pipeline
14
  p = pipeline("automatic-speech-recognition")
@@ -85,77 +85,21 @@ def calc_mos(audio_path, ref):
85
  phone_transcription = processor.batch_decode(phone_predicted_ids)
86
  lst_phonemes = phone_transcription[0].split(" ")
87
  wav_vad = torchaudio.functional.vad(wav, sample_rate=sr)
88
- import matplotlib.pyplot as plt
89
-
90
- fig = plt.figure(figsize=(30, 10))
91
- # ax = fig.subplots(1, 1)
92
- # pdb.set_trace()
93
-
94
- # time_x = torch.arange(wav.shape[-1]) / sr
95
- # # ax.plot(time_x, wav_vad.squeeze())
96
- # pdb.set_trace()
97
- # ax.plot(time_x, wav.squeeze(), alpha=0.5)
98
- # get f0
99
- f0 = torchaudio.functional.compute_kaldi_pitch(wav, frame_length=25, frame_shift=20, min_f0=20, max_f0=600, sample_rate=sr)[0, :, 1]
100
- # # get f0 time x axis
101
- # time_x_f0 = torch.arange(f0.shape[-1]) * 20 / 1000
102
- # plot f0 with x axis as time
103
-
104
- # spectrogram with x axis as time
105
- # pdb.set_trace()
106
- spectrogram = torchaudio.transforms.MelSpectrogram(sample_rate=sr, n_fft=400, hop_length=160, n_mels=80)(wav)
107
-
108
- spectrogram = torchaudio.transforms.AmplitudeToDB(stype="power", top_db=80)(spectrogram)
109
-
110
- # plot spectrogram with x axis as time, y axis as frequency bins
111
- ax2 = fig.add_subplot(212)
112
- ax2.set_xlabel("Time (s)")
113
- ax2.set_ylabel("Frequency (Hz)")
114
- ax2.set_title("Spectrogram")
115
- ax2.set_xticks(torch.arange(0, spectrogram.shape[-1], 100))
116
- ax2.set_xticklabels(torch.arange(0, spectrogram.shape[-1], 100) * 20 / 1000)
117
- ax2.set_yticks(torch.arange(0, spectrogram.shape[1], 10))
118
- ax2.set_yticklabels(torch.arange(0, spectrogram.shape[1], 10) * 800 / 80)
119
-
120
- # add colorbar to spectrogram with limitation from -80 to 0
121
- cbar = plt.colorbar(ax2.imshow(spectrogram.squeeze().numpy(), aspect='auto', origin='lower'))
122
- cbar.set_label("dB")
123
- ax2.grid()
124
-
125
- # plot f0 with x axis as time, y axis as frequency bins, y is limited from 0 to 600
126
- ax1 = fig.add_subplot(211)
127
- ax1.set_xlabel("Time (s)")
128
- ax1.set_ylabel("Frequency (Hz)")
129
- ax1.set_title("F0")
130
- ax1.set_xticks(torch.arange(0, f0.shape[-1], 100))
131
- ax1.set_xticklabels(torch.arange(0, f0.shape[-1], 100) * 20 / 1000)
132
- ax1.set_yticks(torch.arange(0, 600, 50))
133
- ax1.set_yticklabels(torch.arange(0, 600, 50))
134
 
135
- # add colorbar to f0 with limitation from 0 to 600
136
- # cbar = plt.colorbar(ax1.imshow(f0.squeeze().numpy(), aspect='auto', origin='lower'))
137
- # cbar.set_label("Hz")
138
- ax1.grid()
139
-
140
- # remove unvoiced part based on vad
141
-
142
- # plot f0 with x axis as time
143
-
144
- # time_x = torch.arange(f0.shape[-1]) * 20 / 1000
145
- # plt.plot(time_x, f0.squeeze())
146
- # fig.savefig("vad.png")
147
- # pdb.set_trace()
148
  ppm = len(lst_phonemes) / (wav_vad.shape[-1] / sr) * 60
149
 
150
  # pdb.set_trace()
151
- return AVA_MOS, MOS_fig, INTELI_score, INT_fig, trans, phone_transcription, ppm
152
 
153
 
154
  with open("local/description.md") as f:
155
  description = f.read()
156
 
157
- # calc_mos("JOHN1.wav", "he would answer in a soft voice, 'I don't know.'")
158
-
159
 
160
  examples = [
161
  ["local/Julianna_Set1_Author_01.wav", "Once upon a time, there was a young rat named Arthur who couldn't make up his mind."],
@@ -171,11 +115,12 @@ iface = gr.Interface(
171
  gr.Plot(label="Intelligibility Score, range from 0 to 100, the higher the better", show_label=True, container=True),
172
  gr.Textbox(placeholder="Hypothesis", label="Hypothesis"),
173
  gr.Textbox(placeholder="Predicted Phonemes", label="Predicted Phonemes", visible=False),
174
- gr.Textbox(placeholder="Speaking Rate, Phonemes per minutes", label="Speaking Rate, Phonemes per minutes", visible=False)],
 
175
  title="Speech Analysis by Laronix AI",
176
  description=description,
177
  allow_flagging="auto",
178
  examples=examples,
179
  )
180
  # add password to protect the interface
181
- iface.launch(share=False, auth=['Laronix', 'LaronixSLP'], auth_message="Authentication Required, ask kevin@laronix.com for password,\n Thanks for your cooperation!")
 
8
  import jiwer
9
  from local.convert_metrics import nat2avaMOS, WER2INTELI
10
  from local.indicator_plot import Intelligibility_Plot, Naturalness_Plot
11
+ from local.pitch_contour import draw_spec_db_pitch
12
  # ASR part
13
  from transformers import pipeline
14
  p = pipeline("automatic-speech-recognition")
 
85
  phone_transcription = processor.batch_decode(phone_predicted_ids)
86
  lst_phonemes = phone_transcription[0].split(" ")
87
  wav_vad = torchaudio.functional.vad(wav, sample_rate=sr)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
 
89
+ # draw f0 and db analysis plot
90
+ f0_db_fig = draw_spec_db_pitch(audio_path, save_fig_path=None)
91
+
 
 
 
 
 
 
 
 
 
 
92
  ppm = len(lst_phonemes) / (wav_vad.shape[-1] / sr) * 60
93
 
94
  # pdb.set_trace()
95
+ return AVA_MOS, MOS_fig, INTELI_score, INT_fig, trans, phone_transcription, ppm , f0_db_fig
96
 
97
 
98
  with open("local/description.md") as f:
99
  description = f.read()
100
 
101
+ # x = calc_mos("JOHN1.wav", "he would answer in a soft voice, 'I don't know.'")
102
+ # pdb.set_trace()
103
 
104
  examples = [
105
  ["local/Julianna_Set1_Author_01.wav", "Once upon a time, there was a young rat named Arthur who couldn't make up his mind."],
 
115
  gr.Plot(label="Intelligibility Score, range from 0 to 100, the higher the better", show_label=True, container=True),
116
  gr.Textbox(placeholder="Hypothesis", label="Hypothesis"),
117
  gr.Textbox(placeholder="Predicted Phonemes", label="Predicted Phonemes", visible=False),
118
+ gr.Textbox(placeholder="Speaking Rate, Phonemes per minutes", label="Speaking Rate, Phonemes per minutes", visible=False),
119
+ gr.Plot(label="Pitch Contour and dB Analysis", show_label=True, container=True)],
120
  title="Speech Analysis by Laronix AI",
121
  description=description,
122
  allow_flagging="auto",
123
  examples=examples,
124
  )
125
  # add password to protect the interface
126
+ iface.launch(share=False, auth=['Laronix', 'LaronixSLP'], auth_message="Authentication Required, ask kevin@laronix.com for password.\n Thanks for your cooperation!")
local/pitch_contour.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### Kevin @ Laronix
2
+
3
+ from glob import glob
4
+ from pathlib import Path
5
+ import matplotlib
6
+ from matplotlib.transforms import Bbox
7
+ import matplotlib.pyplot as plt
8
+
9
+ import numpy as np
10
+ import pdb
11
+ import parselmouth
12
+
13
def draw_spectrogram(spectrogram, dynamic_range=80):
    """Render a parselmouth spectrogram in dB onto the current axes.

    Parameters
    ----------
    spectrogram : parselmouth.Spectrogram
        Spectrogram whose power values are converted to dB for display.
    dynamic_range : float, optional
        dB span shown below the maximum; quieter bins are clipped by the
        colormap's ``vmin`` (default 80).
    """
    X, Y = spectrogram.x_grid(), spectrogram.y_grid()
    # Floor the power values so log10 never sees zero (silent frames would
    # produce -inf and runtime warnings); the floor is far below the shown
    # dynamic range, so the rendered image is unchanged.
    sg_db = 10 * np.log10(np.maximum(spectrogram.values, 1e-30))
    plt.pcolormesh(X, Y, sg_db, vmin=sg_db.max() - dynamic_range)
    plt.ylim([spectrogram.ymin, spectrogram.ymax])
    # TODO add colorbar to spectrogram with limitation from -40 to 0
    plt.ylabel("frequency [Hz]")
22
+
23
def draw_intensity(intensity):
    """Overlay the intensity (dB) contour on the current axes.

    The contour is drawn as a red line with a thinner white core, and the
    minimum and maximum intensity are marked with dotted projection lines
    and value labels at the right edge of the plot.
    """
    times = intensity.xs()
    values = intensity.values.T
    # Red outline with a white centre line for contrast.
    plt.plot(times, values, linewidth=3, color='r')
    plt.plot(times, values, linewidth=1, color="w")

    lo = np.nanmin(values)
    hi = np.nanmax(values)
    # Time of the first frame at which each extreme occurs.
    lo_time = times[np.where(values == lo)[0]][0]
    hi_time = times[np.where(values == hi)[0]][0]

    # Dotted horizontal projections from the extremes to the right edge.
    plt.plot([intensity.xmax, lo_time], [lo, lo], linewidth=1, linestyle='dotted', color='red')
    plt.plot([intensity.xmax, hi_time], [hi, hi], linewidth=1, linestyle='dotted', color='red')
    # Annotate the extreme values at the right edge.
    plt.text(intensity.xmax, lo, str(round(lo, 1)), color='red')
    plt.text(intensity.xmax, hi, str(round(hi, 1)), color='red')

    plt.grid(False)
    plt.ylim(0)
    plt.ylabel("intensity [dB]")
47
+
48
def draw_pitch(pitch):
    """Plot the F0 (pitch) contour on the current axes.

    Unvoiced frames (frequency == 0) are replaced with NaN so matplotlib
    leaves gaps instead of connecting them. The minimum and maximum F0 are
    highlighted with markers, dotted projection lines toward the left edge,
    and "f0min"/"f0max" value labels.
    """
    freqs = pitch.selected_array['frequency']
    freqs[freqs == 0] = np.nan  # hide unvoiced frames
    f0_lo = np.nanmin(freqs)
    f0_hi = np.nanmax(freqs)

    times = pitch.xs()
    plt.plot(times, freqs, markersize=5, color='blue')

    # Time of the first frame at which each extreme occurs.
    lo_t = times[np.where(freqs == f0_lo)[0]][0]
    hi_t = times[np.where(freqs == f0_hi)[0]][0]

    # Dotted projections from the left edge to the extreme points.
    plt.plot([pitch.xmin, lo_t], [f0_lo, f0_lo], linewidth=1, linestyle='dotted', color='blue')
    plt.plot([pitch.xmin, hi_t], [f0_hi, f0_hi], linewidth=1, linestyle='dotted', color='blue')

    # Emphasize and label the extremes (offsets place the text clear of the curve).
    plt.scatter(lo_t, f0_lo, color='blue', s=100)
    plt.scatter(hi_t, f0_hi, color='blue', s=100)
    plt.text(lo_t - 0.2, f0_lo - 30, "f0min = " + str(round(f0_lo, 1)), color='blue', fontsize=12)
    plt.text(hi_t - 0.2, f0_hi + 30, "f0max = " + str(round(f0_hi, 1)), color='blue', fontsize=12)

    plt.grid(False)
    plt.ylim(max([0, f0_lo - 50]), f0_hi + 50)
    plt.ylabel("fundamental frequency [Hz]")
76
+
77
def draw_spec_db_pitch(wav, save_fig_path=None):
    """Build a two-panel analysis figure for an audio file.

    Top panel: pre-emphasized spectrogram with the intensity (dB) contour on
    a twin y-axis. Bottom panel: pitch (F0) contour.

    Parameters
    ----------
    wav : str or Path
        Path to the audio file to analyse.
    save_fig_path : str or Path, optional
        If given, the figure is also written to this path. (Fix: the
        argument was previously accepted but silently ignored.)

    Returns
    -------
    matplotlib.figure.Figure
        The assembled figure.
    """
    fig = plt.figure(figsize=(10, 5))
    fig.tight_layout()

    # Analyse with Praat (via parselmouth): pitch, intensity, and a
    # spectrogram computed on a pre-emphasized copy so higher frequencies
    # remain visible.
    snd = parselmouth.Sound(str(wav))
    pitch = snd.to_pitch()
    intensity = snd.to_intensity()
    pre_emphasized_snd = snd.copy()
    pre_emphasized_snd.pre_emphasize()
    spectrogram = pre_emphasized_snd.to_spectrogram(window_length=0.1)

    # Top panel: spectrogram with the dB contour on a twin y-axis.
    plt.subplot(2, 1, 1)
    draw_spectrogram(spectrogram)
    plt.twinx()
    draw_intensity(intensity)
    plt.xlim([snd.xmin, snd.xmax])

    # Bottom panel: pitch contour, sharing the same time range.
    plt.subplot(2, 1, 2)
    draw_pitch(pitch)
    plt.xlim([snd.xmin, snd.xmax])
    plt.xlabel("time [s]")

    # Honour save_fig_path instead of ignoring it; default None keeps the
    # previous behaviour (no file written).
    if save_fig_path is not None:
        fig.savefig(save_fig_path)

    return fig
104
+
105
+ # f = draw_spec_db_pitch("./test.wav")
106
+ # plt.savefig("y.png")
107
+