Martijn Bartelds committed on
Commit
e15a3a6
·
1 Parent(s): a0036b2

Add app files

Browse files
examples/falling_hood_mobiel_203936.wav ADDED
Binary file (51.3 kB). View file
 
examples/falling_huud_mobiel_201145.wav ADDED
Binary file (35.6 kB). View file
 
neural_acoustic_distance.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import os.path
5
+
6
+ from dtw import dtw
7
+ import matplotlib.pyplot as plt
8
+ import transformers
9
+ from typing import Any, Optional
10
+ from transformers import AutoConfig
11
+
12
# Page header followed by a Markdown introduction to the tool.
st.title("Word-level Neural Acoustic Distance Visualizer")

# The introduction is a single Markdown string; adjacent literals are joined
# by the parser, so each source line below is one line of the rendered text.
st.write(
    "This tool visualizes pronunciation differences between two recordings of the same word. The two recordings have to be wave files (mono 16-bit PCM at 16 kHz) containing a single spoken word. \n\n"
    "Choose any wav2vec 2.0 compatible model identifier on the [Hugging Face Model Hub](https://huggingface.co/models?filter=wav2vec2) and select the output layer you want to use.\n\n"
    "To upload your own recordings select 'custom upload' in the audio file selection step. The first recording is put on the x-axis of the plot and the second one will be the reference recording for computing distance.\n"
    "You should already see an example plot of two sample recordings.\n\n"
    "This visualization tool is part of [neural representations for modeling variation in speech](https://doi.org/10.1016/j.wocn.2022.101137). \n"
    "Please see our paper for further details."
)
20
+
21
st.subheader("Model selection:")

# Drop-down with preset checkpoints; the final "other" entry switches to a
# free-text field so any model identifier can be entered.
model_id = st.selectbox(
    "Select the wav2vec 2.0 model you want to use:",
    (
        "facebook/wav2vec2-large-960h",
        "facebook/wav2vec2-large",
        "facebook/wav2vec2-large-xlsr-53",
        "facebook/wav2vec2-xls-r-300m",
        "other",
    ),
    index=0,
)

if model_id == "other":
    # The text box is pre-filled with the first preset.
    model_id = st.text_input(
        "Enter the wav2vec 2.0 model you want to use:",
        value="facebook/wav2vec2-large-960h",
        key="model",
    )
29
+
30
def load_wav2vec2_featurizer(model_id: str, layer: Optional[int] = None):
    """Load a wav2vec 2.0 model and return a function that featurizes audio.

    Args:
        model_id: Model identifier handed to ``Wav2Vec2Model.from_pretrained``.
        layer: Selects which representation the returned function produces.
            ``None`` returns the hidden states of every layer; a positive
            value loads only that many Transformer layers and returns the
            model's ``last_hidden_state``; ``0`` loads no Transformer layers
            and also returns ``last_hidden_state``; ``-1`` returns the output
            of the feature projection; any other negative value returns the
            (transposed) output of the convolutional feature extractor.

    Returns:
        A function ``_featurize(path)`` mapping an audio file (path or
        file-like object accepted by ``soundfile.read``) to NumPy features.
    """
    # Heavy dependencies are imported lazily, only when a model is loaded.
    from transformers.models.wav2vec2 import Wav2Vec2Model
    import soundfile as sf
    from scipy import signal
    import torch

    transformers.logging.set_verbosity(transformers.logging.ERROR)

    model_kwargs = {}
    if layer is not None:
        # Negative layers only need the convolutional front end, so no
        # Transformer layers are instantiated for them.
        model_kwargs["num_hidden_layers"] = max(layer, 0)

    with st.spinner("Loading..."):
        model = Wav2Vec2Model.from_pretrained(model_id, **model_kwargs)
        model.eval()
        if torch.cuda.is_available():
            model.cuda()
    st.success("Done!")

    @torch.no_grad()
    def _featurize(path):
        """Read one recording and return its wav2vec 2.0 features."""
        input_values, rate = sf.read(path, dtype=np.float32)
        if input_values.ndim == 2:
            # Multi-channel audio: average the channels into one signal.
            input_values = input_values.mean(1)
        if rate != 16_000:
            new_length = int(input_values.shape[0] / rate * 16_000)
            input_values = signal.resample(input_values, new_length)
            # Keep the waveform in float32 regardless of what the resampler
            # returns, so the tensor dtype matches the model weights.
            input_values = input_values.astype(np.float32, copy=False)

        input_values = torch.from_numpy(input_values).unsqueeze(0)
        if torch.cuda.is_available():
            input_values = input_values.cuda()

        if layer is None:
            hidden_states = model(input_values, output_hidden_states=True).hidden_states
            return [s.squeeze(0).cpu().numpy() for s in hidden_states]

        if layer >= 0:
            return model(input_values).last_hidden_state.squeeze(0).cpu().numpy()

        hidden_state = model.feature_extractor(input_values)
        hidden_state = hidden_state.transpose(1, 2)
        if layer == -1:
            hidden_state = model.feature_projection(hidden_state)
            # NOTE(review): newer transformers releases appear to return a
            # tuple here; keep only the projected states in that case.
            if isinstance(hidden_state, tuple):
                hidden_state = hidden_state[0]
        return hidden_state.squeeze(0).cpu().numpy()

    return _featurize


try:
    cfg = AutoConfig.from_pretrained(model_id)
    print(cfg.num_hidden_layers)
    layer = st.number_input(
        "Select the layer you want to use:",
        min_value=1,
        max_value=cfg.num_hidden_layers,
        # Clamp the default so models with fewer than 10 layers do not make
        # the widget fail with a default above max_value.
        value=min(10, cfg.num_hidden_layers),
    )
    featurizer_a = load_wav2vec2_featurizer(model_id, layer)
except OSError:
    st.error("Please select a wav2vec 2.0 compatible model identifier on the [Hugging Face Model Hub](https://huggingface.co/models?filter=wav2vec2).")
    featurizer_a = None
91
+
92
def aligner(x, y) -> Any:
    """Align feature sequences ``x`` and ``y`` with ``dtw``.

    ``keep_internals=True`` is passed through to ``dtw``; the returned
    object is what ``compute_costs`` later reads ``localCostMatrix`` from.
    """
    alignment = dtw(x, y, keep_internals=True)
    return alignment
94
+
95
def compute_costs(gcm):
    """Summarize the local alignment cost for every frame of the first input.

    Walks the warping path stored in ``gcm.index1`` / ``gcm.index2`` and
    groups the matching entries of ``gcm.localCostMatrix`` by their
    ``index1`` value.

    Returns:
        A pair ``(means, counts)`` of lists with ``gcm.N`` entries each: the
        mean local cost per ``index1`` frame, and how many path steps were
        aligned to that frame.
    """
    per_frame = [[] for _ in range(gcm.N)]

    for row, col in zip(gcm.index1, gcm.index2):
        per_frame[row].append(gcm.localCostMatrix[row, col])

    counts = [len(costs) for costs in per_frame]
    means = [np.mean(costs) for costs in per_frame]
    return means, counts
105
+
106
st.subheader("Audio file selection:")

# Bundled example recordings: drop-down label -> path on disk. Any other
# choice (i.e. "custom upload") is left untouched by the lookup below.
_example_paths = {
    "falling_huud_mobiel_201145.wav": "./examples/falling_huud_mobiel_201145.wav",
    "falling_hood_mobiel_203936.wav": "./examples/falling_hood_mobiel_203936.wav",
}

filename_x = st.selectbox(
    "Filename (x-axis):",
    ("falling_huud_mobiel_201145.wav", "falling_hood_mobiel_203936.wav", "custom upload"),
)
filename_x = _example_paths.get(filename_x, filename_x)

filename_y = st.selectbox(
    "Filename (y-axis):",
    ("falling_hood_mobiel_203936.wav", "falling_huud_mobiel_201145.wav", "custom upload"),
)
filename_y = _example_paths.get(filename_y, filename_y)

# For a custom upload the variable holds whatever the uploader returns,
# which is None until the user has provided a file.
if filename_x == "custom upload":
    filename_x = st.file_uploader("Choose a file", key="f_x")
if filename_y == "custom upload":
    filename_y = st.file_uploader("Choose a file", key="f_y")
130
+
131
# Run the comparison only once both recordings and a model are available.
if filename_x is not None and filename_y is not None and featurizer_a is not None:
    print(f"\nX: {filename_x}\nY: {filename_y}")

    def run(featurizer):
        """Featurize both recordings, align them and summarize the costs.

        Returns:
            ``(d, c, n)``: the alignment's normalized distance, the mean
            local cost per frame of the first recording, and the number of
            path steps aligned to each of those frames.
        """
        feats_x = featurizer(filename_x)
        feats_y = featurizer(filename_y)
        gcm = aligner(feats_x, feats_y)

        d = gcm.normalizedDistance
        print("\nDistance:", d)

        c, n = compute_costs(gcm)
        return d, c, n

    d, c, n = run(featurizer_a)

    fig, axes = plt.subplots(figsize=(4, 2.5))

    window_size = 9  # moving-average width in frames
    rate = 20  # spacing between frames on the time axis, in ms
    x = np.arange(0, len(c) * rate, rate)
    offset = (window_size - 1) // 2

    # Per-frame distances; the marker size grows with the number of frames
    # of the second recording aligned to each frame.
    axes.plot(x, c, alpha=0.5, color="gray", linestyle="--")
    axes.scatter(x, c, np.array(n) * 10, color="gray")

    # The moving average is only defined when there are at least
    # window_size frames; with fewer, "valid" convolution would return a
    # result whose length no longer matches the trimmed time axis.
    if len(c) >= window_size:
        c_ = np.convolve(c, np.ones(window_size) / window_size, mode="valid")
        x_ = x[offset:len(x) - offset]
        axes.plot(x_, c_)

    axes.set_xlabel("time (ms)")
    axes.set_ylabel("distance per frame")
    # Horizontal reference line at the overall normalized distance.
    axes.hlines(y=d, xmin=0, xmax=np.max(x), linestyles="dashdot")

    fig.tight_layout(pad=0)
    # Make sure the target directory exists before writing the PDF.
    os.makedirs("./output", exist_ok=True)
    fig.savefig("./output/plot.pdf")
    st.pyplot(fig)
    # The script is re-executed on every interaction; close the figure so
    # matplotlib does not keep accumulating open figures.
    plt.close(fig)
175
+
176
# Offer an explanation and a download link once a rendered plot exists.
if os.path.isfile("./output/plot.pdf"):
    if st.button("Info"):
        st.write(
            " Visualization of neural acoustic distances "
            "per frame (based on wav2vec 2.0) with the pronunciation "
            "of the first filename on the x-axis and distances to the pronunciation "
            "of second filename on the y-axis. The horizontal line represents "
            "the global distance value (i.e. the average of all individual frames). "
            "The blue continuous line represents the moving average distance based on 9 frames, "
            "corresponding to 180ms. As a result of the moving average, the blue line does not cover the entire duration of "
            "the sample. Larger bullet sizes indicate that multiple "
            "frames in the pronunciation on the y-axis are aligned to a single frame in the pronunciation on the x-axis."
        )

    with open("./output/plot.pdf", "rb") as file:
        st.download_button(
            label="Download plot",
            data=file,
            file_name="plot.pdf",
            # PDF is registered as application/pdf; "image/pdf" is not a
            # registered media type.
            mime="application/pdf",
        )
output/plot.pdf ADDED
Binary file (20.5 kB). View file
 
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dtw-python==1.1.6
2
+ editdistance==0.5.3
3
+ fairseq @ git+https://github.com/pytorch/fairseq@aa39ab1b4568479bf9a1360cfcdd4f4fce5f1838
4
+ matplotlib==3.3.2
5
+ numpy==1.19.1
6
+ onnxruntime==1.8.1
7
+ pandas==1.1.3
8
+ scipy==1.5.2
9
+ seaborn==0.11.0
10
+ SoundFile==0.10.2
11
+ torch==1.6.0
12
+ tqdm==4.50.2