wietsedv committed
Commit 39a0f1b
Parent(s): a991354

Update neural_acoustic_distance.py

Files changed (1): neural_acoustic_distance.py (+122 −119)
neural_acoustic_distance.py CHANGED
@@ -107,126 +107,129 @@ def run(model_id, layer, filename_x, filename_y):
     return d, c, n


-st.title("Word-level Neural Acoustic Distance Visualizer")
-
-st.write(
-    "This tool visualizes pronunciation differences between two recordings of the same word. The two recordings have to be wave files containing a single spoken word. \n\n\
-Choose any wav2vec 2.0 compatible model identifier on the [Hugging Face Model Hub](https://huggingface.co/models?filter=wav2vec2) and select the output layer you want to use.\n\n\
-To upload your own recordings select 'custom upload' in the audio file selection step. The first recording is put on the x-axis of the plot and the second one will be the reference recording for computing distance.\n\
-You should already see an example plot of two sample recordings.\n\n\
-This visualization tool is part of [neural representations for modeling variation in speech](https://doi.org/10.1016/j.wocn.2022.101137). \n\
-Please see our paper for further details.")
-
-st.subheader("Model selection:")
-
-model_id = st.selectbox("Select the wav2vec 2.0 model you want to use:",
-                        ("facebook/wav2vec2-large-960h", "facebook/wav2vec2-large", "facebook/wav2vec2-large-xlsr-53",
-                         "facebook/wav2vec2-xls-r-300m", "other"),
-                        index=0)
-
-if model_id == "other":
-    model_id = st.text_input("Enter the wav2vec 2.0 model you want to use:",
-                             value="facebook/wav2vec2-large-960h",
-                             key="model")
-
-print(f"\n### Start new run\n") # test
-
-try:
-    cfg = AutoConfig.from_pretrained(model_id)
-    layer = st.number_input("Select the layer you want to use:", min_value=1, max_value=cfg.num_hidden_layers, value=10)
-except OSError:
-    st.error(
-        "Please select a wav2vec 2.0 compatible model identifier on the [Hugging Face Model Hub](https://huggingface.co/models?filter=wav2vec2)."
-    )
-    layer = None
-
-print('1. Model selected', datetime.now().strftime('%d-%m-%Y %H:%M:%S')) # test
-
-st.subheader("Audio file selection:")
-
-filename_x = st.selectbox("Filename (x-axis):",
-                          ("falling_huud_mobiel_201145.wav", "falling_hood_mobiel_203936.wav", "custom upload"))
-
-if filename_x == "falling_huud_mobiel_201145.wav":
-    filename_x = "./examples/falling_huud_mobiel_201145.wav"
-    play_audio(filename_x)
-if filename_x == "falling_hood_mobiel_203936.wav":
-    filename_x = "./examples/falling_hood_mobiel_203936.wav"
-    play_audio(filename_x)
-
-filename_y = st.selectbox("Filename (y-axis):",
-                          ("falling_hood_mobiel_203936.wav", "falling_huud_mobiel_201145.wav", "custom upload"))
-
-if filename_y == "falling_huud_mobiel_201145.wav":
-    filename_y = "./examples/falling_huud_mobiel_201145.wav"
-    play_audio(filename_y)
-if filename_y == "falling_hood_mobiel_203936.wav":
-    filename_y = "./examples/falling_hood_mobiel_203936.wav"
-    play_audio(filename_y)
-
-if filename_x == "custom upload":
-    filename_x = st.file_uploader("Choose a file (x-axis)", key="f_x")
-if filename_y == "custom upload":
-    filename_y = st.file_uploader("Choose a file (y-axis)", key="f_y")
-
-print('2. Files selected', datetime.now().strftime('%d-%m-%Y %H:%M:%S')) # test
-
-if filename_x is not None and filename_y is not None and layer is not None:
-    print(f"\nX: {filename_x}\nY: {filename_y}")
-
-    d, c, n = run(model_id, layer, filename_x, filename_y)
-    # d_b, c_b, n_b = run(featurizer_b)
-
-    fig, axes = plt.subplots(figsize=(4, 2.5))
-
-    print('6. Plot init', datetime.now().strftime('%d-%m-%Y %H:%M:%S')) # test
-
-    window_size = 9
-    rate = 20
-    x = np.arange(0, len(c) * rate, rate)
-    offset = (window_size - 1) // 2
-    x_ = x[offset:-offset]
-
-    # Target layer
-    axes.plot(x, c, alpha=0.5, color="gray", linestyle="--")
-    axes.scatter(x, c, np.array(n) * 10, color="gray")
-    c_ = np.convolve(c, np.ones(window_size) / window_size, mode="valid")
-    axes.plot(x_, c_)
-
-    # Last layer
-    # axes.plot(x, c_b, alpha=0.5, color="gray", linestyle="--")
-    # axes.scatter(x, c_b, np.array(n_b) * 10, color="gray")
-    # c_b_ = np.convolve(c_b, np.ones(window_size) / window_size, mode="valid")
-    # axes.plot(x_, c_b_, linestyle="--")
-
-    axes.set_xlabel("time (ms)")
-    axes.set_ylabel("distance per frame")
-    axes.hlines(y=d, xmin=0, xmax=np.max(x), linestyles="dashdot")
-
-    plt.tight_layout(pad=0)
-    plt_id = randrange(0, 10)
-    plt.savefig("./output/plot" + str(plt_id) + ".pdf")
-    st.pyplot(fig)
-
-    print('7. Plot filled', datetime.now().strftime('%d-%m-%Y %H:%M:%S')) # test
-
-    if os.path.isfile("./output/plot.pdf"):
-        st.caption(" Visualization of neural acoustic distances\
-        per frame (based on wav2vec 2.0) with the pronunciation of\
-        the first filename on the x-axis and distances to the pronunciation\
-        of second filename on the y-axis. The horizontal line represents\
-        the global distance value (i.e. the average of all individual frames).\
-        The blue continuous line represents the moving average distance based on 9 frames,\
-        corresponding to 180ms. As a result of the moving average, the blue line does not cover the entire duration of\
-        the sample. Larger bullet sizes indicate that multiple\
-        frames in the pronunciation on the y-axis are aligned to a single frame in the pronunciation on the x-axis.")
-
-        with open("./output/plot.pdf", "rb") as file:
-            btn = st.download_button(label="Download plot", data=file, file_name="plot.pdf", mime="image/pdf")
-
-    print('8. End', datetime.now().strftime('%d-%m-%Y %H:%M:%S')) # test
-    print(f"9. RAM used: {psutil.Process().memory_info().rss / (1024 * 1024):.2f} MB") # test
+def main():
+    st.title("Word-level Neural Acoustic Distance Visualizer")
+
+    st.write(
+        "This tool visualizes pronunciation differences between two recordings of the same word. The two recordings have to be wave files containing a single spoken word. \n\n\
+Choose any wav2vec 2.0 compatible model identifier on the [Hugging Face Model Hub](https://huggingface.co/models?filter=wav2vec2) and select the output layer you want to use.\n\n\
+To upload your own recordings select 'custom upload' in the audio file selection step. The first recording is put on the x-axis of the plot and the second one will be the reference recording for computing distance.\n\
+You should already see an example plot of two sample recordings.\n\n\
+This visualization tool is part of [neural representations for modeling variation in speech](https://doi.org/10.1016/j.wocn.2022.101137). \n\
+Please see our paper for further details.")
+
+    st.subheader("Model selection:")
+
+    model_id = st.selectbox("Select the wav2vec 2.0 model you want to use:",
+                            ("facebook/wav2vec2-large-960h", "facebook/wav2vec2-large", "facebook/wav2vec2-large-xlsr-53",
+                             "facebook/wav2vec2-xls-r-300m", "other"),
+                            index=0)
+
+    if model_id == "other":
+        model_id = st.text_input("Enter the wav2vec 2.0 model you want to use:",
+                                 value="facebook/wav2vec2-large-960h",
+                                 key="model")
+
+    print(f"\n### Start new run\n") # test
+
+    try:
+        cfg = AutoConfig.from_pretrained(model_id)
+        layer = st.number_input("Select the layer you want to use:", min_value=1, max_value=cfg.num_hidden_layers, value=10)
+    except OSError:
+        st.error(
+            "Please select a wav2vec 2.0 compatible model identifier on the [Hugging Face Model Hub](https://huggingface.co/models?filter=wav2vec2)."
+        )
+        layer = None
+
+    print('1. Model selected', datetime.now().strftime('%d-%m-%Y %H:%M:%S')) # test
+
+    st.subheader("Audio file selection:")
+
+    filename_x = st.selectbox("Filename (x-axis):",
+                              ("falling_huud_mobiel_201145.wav", "falling_hood_mobiel_203936.wav", "custom upload"))
+
+    if filename_x == "falling_huud_mobiel_201145.wav":
+        filename_x = "./examples/falling_huud_mobiel_201145.wav"
+        play_audio(filename_x)
+    if filename_x == "falling_hood_mobiel_203936.wav":
+        filename_x = "./examples/falling_hood_mobiel_203936.wav"
+        play_audio(filename_x)
+
+    filename_y = st.selectbox("Filename (y-axis):",
+                              ("falling_hood_mobiel_203936.wav", "falling_huud_mobiel_201145.wav", "custom upload"))
+
+    if filename_y == "falling_huud_mobiel_201145.wav":
+        filename_y = "./examples/falling_huud_mobiel_201145.wav"
+        play_audio(filename_y)
+    if filename_y == "falling_hood_mobiel_203936.wav":
+        filename_y = "./examples/falling_hood_mobiel_203936.wav"
+        play_audio(filename_y)
+
+    if filename_x == "custom upload":
+        filename_x = st.file_uploader("Choose a file (x-axis)", key="f_x")
+    if filename_y == "custom upload":
+        filename_y = st.file_uploader("Choose a file (y-axis)", key="f_y")
+
+    print('2. Files selected', datetime.now().strftime('%d-%m-%Y %H:%M:%S')) # test
+
+    if filename_x is not None and filename_y is not None and layer is not None:
+        print(f"\nX: {filename_x}\nY: {filename_y}")
+
+        d, c, n = run(model_id, layer, filename_x, filename_y)
+        # d_b, c_b, n_b = run(featurizer_b)
+
+        fig, axes = plt.subplots(figsize=(4, 2.5))
+
+        print('6. Plot init', datetime.now().strftime('%d-%m-%Y %H:%M:%S')) # test
+
+        window_size = 9
+        rate = 20
+        x = np.arange(0, len(c) * rate, rate)
+        offset = (window_size - 1) // 2
+        x_ = x[offset:-offset]
+
+        # Target layer
+        axes.plot(x, c, alpha=0.5, color="gray", linestyle="--")
+        axes.scatter(x, c, np.array(n) * 10, color="gray")
+        c_ = np.convolve(c, np.ones(window_size) / window_size, mode="valid")
+        axes.plot(x_, c_)
+
+        # Last layer
+        # axes.plot(x, c_b, alpha=0.5, color="gray", linestyle="--")
+        # axes.scatter(x, c_b, np.array(n_b) * 10, color="gray")
+        # c_b_ = np.convolve(c_b, np.ones(window_size) / window_size, mode="valid")
+        # axes.plot(x_, c_b_, linestyle="--")
+
+        axes.set_xlabel("time (ms)")
+        axes.set_ylabel("distance per frame")
+        axes.hlines(y=d, xmin=0, xmax=np.max(x), linestyles="dashdot")
+
+        plt.tight_layout(pad=0)
+        plt_id = randrange(0, 10)
+        plt.savefig("./output/plot" + str(plt_id) + ".pdf")
+        st.pyplot(fig)
+
+        print('7. Plot filled', datetime.now().strftime('%d-%m-%Y %H:%M:%S')) # test
+
+        if os.path.isfile("./output/plot.pdf"):
+            st.caption(" Visualization of neural acoustic distances\
+            per frame (based on wav2vec 2.0) with the pronunciation of\
+            the first filename on the x-axis and distances to the pronunciation\
+            of second filename on the y-axis. The horizontal line represents\
+            the global distance value (i.e. the average of all individual frames).\
+            The blue continuous line represents the moving average distance based on 9 frames,\
+            corresponding to 180ms. As a result of the moving average, the blue line does not cover the entire duration of\
+            the sample. Larger bullet sizes indicate that multiple\
+            frames in the pronunciation on the y-axis are aligned to a single frame in the pronunciation on the x-axis.")
+
+            with open("./output/plot.pdf", "rb") as file:
+                btn = st.download_button(label="Download plot", data=file, file_name="plot.pdf", mime="image/pdf")
+
+        print('8. End', datetime.now().strftime('%d-%m-%Y %H:%M:%S')) # test
+        print(f"9. RAM used: {psutil.Process().memory_info().rss / (1024 * 1024):.2f} MB") # test
+
+main()
+
 for name in dir():
     if not name.startswith('_'):
         del globals()[name]
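
For context on the values plotted above: `run()` (defined above this hunk) returns a global distance `d`, per-frame distances `c`, and alignment counts `n`, and the caption describes `d` as the average over all frames, with larger bullets marking multiple y-frames aligned to a single x-frame. Below is a minimal pure-NumPy sketch of how outputs with those shapes can fall out of a dynamic-time-warping alignment; it is a hypothetical illustration, not the app's actual `run()`.

```python
import numpy as np

def dtw_outputs(X, Y):
    """Hypothetical stand-in for run()'s (d, c, n), for illustration only.

    X: (T, dim) features of the x-axis recording.
    Y: (S, dim) features of the reference (y-axis) recording.
    """
    # Pairwise Euclidean distances between every x-frame and y-frame.
    D = np.linalg.norm(X[:, None, :] - Y[None, :, :], axis=-1)
    T, S = D.shape

    # Standard DTW cost accumulation.
    acc = np.full((T + 1, S + 1), np.inf)
    acc[0, 0] = 0.0
    for i in range(1, T + 1):
        for j in range(1, S + 1):
            acc[i, j] = D[i - 1, j - 1] + min(acc[i - 1, j - 1], acc[i - 1, j], acc[i, j - 1])

    # Backtrack the optimal warping path from (T, S) to (0, 0).
    path, i, j = [], T, S
    while i > 0 and j > 0:
        path.append((i - 1, j - 1))
        step = int(np.argmin([acc[i - 1, j - 1], acc[i - 1, j], acc[i, j - 1]]))
        if step == 0:
            i, j = i - 1, j - 1
        elif step == 1:
            i -= 1
        else:
            j -= 1

    # c: mean distance per x-frame; n: how many y-frames align to each x-frame.
    c = np.zeros(T)
    n = np.zeros(T, dtype=int)
    for ix, iy in path:
        c[ix] += D[ix, iy]
        n[ix] += 1
    c /= n
    return c.mean(), c, n  # global distance d, per-frame c, counts n

rng = np.random.default_rng(0)
d, c, n = dtw_outputs(rng.normal(size=(30, 8)), rng.normal(size=(36, 8)))
print(f"d={d:.3f}, frames={len(c)}, max alignment count={n.max()}")
```

Each wav2vec 2.0 output frame corresponds to a 20 ms step, which is what the plotting code's `rate = 20` uses to convert frame indices into milliseconds.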
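
The smoothing step in the plot pairs `np.convolve(..., mode="valid")` with `x_ = x[offset:-offset]`: a valid-mode convolution returns `len(c) - window_size + 1` points, so `offset` frames must be trimmed from each end of the time axis, which is why the caption notes that the blue line does not cover the entire sample. A self-contained check, using made-up distances in place of the app's `c`:

```python
import numpy as np

# Made-up per-frame distances standing in for `c` from run().
c = np.array([0.2, 0.4, 0.3, 0.5, 0.6, 0.4, 0.3, 0.2, 0.4, 0.5, 0.3, 0.2])

window_size = 9  # moving-average width in frames (9 * 20 ms = 180 ms)
rate = 20        # wav2vec 2.0 produces one output frame per 20 ms

x = np.arange(0, len(c) * rate, rate)  # frame times in ms: 0, 20, ..., 220
offset = (window_size - 1) // 2        # 4 frames lost at each edge

# "valid" convolution yields len(c) - window_size + 1 smoothed points,
# which matches x trimmed by `offset` frames on both sides.
c_ = np.convolve(c, np.ones(window_size) / window_size, mode="valid")
x_ = x[offset:-offset]

assert len(c_) == len(x_) == len(c) - window_size + 1
print(x_)  # [ 80 100 120 140]
```

With `window_size = 9` and 20 ms frames, the smoothed line is 80 ms shorter than the sample at each end, 160 ms in total.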