Wietse de Vries committed
Commit 1eec854
1 Parent(s): da8f6fb

add caching

Files changed (1)
  1. neural_acoustic_distance.py +118 -115
neural_acoustic_distance.py CHANGED
@@ -1,97 +1,28 @@
-from unicodedata import name
-import streamlit as st
-import pandas as pd
-import numpy as np
 import os.path
+from typing import Optional

-from dtw import dtw
 import matplotlib.pyplot as plt
+import numpy as np
+import soundfile as sf
+import streamlit as st
+import torch
 import transformers
-from typing import Any, Optional
+from dtw import dtw
+from scipy import signal
 from transformers import AutoConfig
+from transformers.models.wav2vec2 import Wav2Vec2Model

-st.title("Word-level Neural Acoustic Distance Visualizer")
-
-st.write("This tool visualizes pronunciation differences between two recordings of the same word. The two recordings have to be wave files containing a single spoken word. \n\n\
-Choose any wav2vec 2.0 compatible model identifier on the [Hugging Face Model Hub](https://huggingface.co/models?filter=wav2vec2) and select the output layer you want to use.\n\n\
-To upload your own recordings select 'custom upload' in the audio file selection step. The first recording is put on the x-axis of the plot and the second one will be the reference recording for computing distance.\n\
-You should already see an example plot of two sample recordings.\n\n\
-This visualization tool is part of [neural representations for modeling variation in speech](https://doi.org/10.1016/j.wocn.2022.101137). \n\
-Please see our paper for further details.")
-
-st.subheader("Model selection:")
-
-model_id = st.selectbox(
-    "Select the wav2vec 2.0 model you want to use:",
-    ("facebook/wav2vec2-large-960h", "facebook/wav2vec2-large", "facebook/wav2vec2-large-xlsr-53", "facebook/wav2vec2-xls-r-300m", "other"), index = 0)
-
-if model_id == "other":
-    model_id = st.text_input("Enter the wav2vec 2.0 model you want to use:", value = "facebook/wav2vec2-large-960h", key = "model")
-
-try:
-    cfg = AutoConfig.from_pretrained(model_id)
-    layer = st.number_input("Select the layer you want to use:",
-                            min_value = 1, max_value = cfg.num_hidden_layers, value=10)
-
-    def load_wav2vec2_featurizer(model_id: str, layer: Optional[int] = None):
-        from transformers.models.wav2vec2 import Wav2Vec2Model
-        import soundfile as sf
-        from scipy import signal
-        import torch
-        import numpy as np
-
-        transformers.logging.set_verbosity(transformers.logging.ERROR)
-
-        model_kwargs = {}
-        if layer is not None:
-            model_kwargs["num_hidden_layers"] = layer if layer > 0 else 0
-
-        with st.spinner("Loading..."):
-            model = Wav2Vec2Model.from_pretrained(model_id, **model_kwargs)
-            model.eval()
-            if torch.cuda.is_available():
-                model.cuda()
-        st.success("Done!")
-
-        @torch.no_grad()
-        def _featurize(path):
-            input_values, rate = sf.read(path, dtype=np.float32)
-            if len(input_values.shape) == 2:
-                input_values = input_values.mean(1)
-            if rate != 16_000:
-                new_length = int(input_values.shape[0] / rate * 16_000)
-                input_values = signal.resample(input_values, new_length)
-
-            input_values = torch.from_numpy(input_values).unsqueeze(0)
-            if torch.cuda.is_available():
-                input_values = input_values.cuda()
-
-            if layer is None:
-                hidden_states = model(input_values, output_hidden_states=True).hidden_states
-                hidden_states = [s.squeeze(0).cpu().numpy() for s in hidden_states]
-                return hidden_states
-
-            if layer >= 0:
-                hidden_state = model(input_values).last_hidden_state.squeeze(0).cpu().numpy()
-            else:
-                hidden_state = model.feature_extractor(input_values)
-                hidden_state = hidden_state.transpose(1, 2)
-                if layer == -1:
-                    hidden_state = model.feature_projection(hidden_state)
-                hidden_state = hidden_state.squeeze(0).cpu().numpy()
-
-            return hidden_state
-
-        return _featurize
-
-    featurizer_a = load_wav2vec2_featurizer(model_id, layer)
-except OSError:
-    st.error("Please select a wav2vec 2.0 compatible model identifier on the [Hugging Face Model Hub](https://huggingface.co/models?filter=wav2vec2).")
-    featurizer_a = None
+
+def play_audio(filename):
+    audio_file = open(filename, "rb")
+    audio_bytes = audio_file.read()
+    st.audio(audio_bytes, format="audio/wav")

-def aligner(x, y) -> Any:
+
+def aligner(x, y):
     return dtw(x, y, keep_internals=True)

+
 def compute_costs(gcm):
     res = [[] for _ in range(gcm.N)]
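For context on the `aligner` and `compute_costs` helpers above: `dtw` here is the dtw-python package, and the alignment object it returns carries the normalized distance plus the internals (`N`, the cost matrix, the warping indices) that `compute_costs` consumes. A minimal sketch (not part of the commit) on made-up toy features:

```python
import numpy as np
from dtw import dtw  # dtw-python

# Two toy "recordings": 12 and 9 frames of 4-dimensional features,
# standing in for wav2vec 2.0 hidden states.
x = np.random.rand(12, 4)
y = np.random.rand(9, 4)

alignment = dtw(x, y, keep_internals=True)

print(alignment.normalizedDistance)  # the word-level distance used by run() below
print(alignment.N)                   # frame count of x; compute_costs() loops over it
print(alignment.index1[:5], alignment.index2[:5])  # start of the warping path
```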
@@ -103,16 +34,105 @@ def compute_costs(gcm):
     res = [np.mean(x) for x in res]
     return res, n

-def play_audio(filename):
-    audio_file = open(filename, "rb")
-    audio_bytes = audio_file.read()
-    st.audio(audio_bytes, format="audio/wav")
+
+@st.cache(show_spinner=False, hash_funcs={torch.nn.parameter.Parameter: lambda _: None})
+def load_wav2vec2_featurizer(model_id: str, layer: Optional[int] = None):
+    transformers.logging.set_verbosity(transformers.logging.ERROR)
+
+    model_kwargs = {}
+    if layer is not None:
+        model_kwargs["num_hidden_layers"] = int(layer) if layer > 0 else 0
+
+    with st.spinner("Loading model..."):
+        model = Wav2Vec2Model.from_pretrained(model_id, **model_kwargs)
+        model.eval()
+        if torch.cuda.is_available():
+            model.cuda()
+    # st.success("Done!")
+
+    @torch.no_grad()
+    def _featurize(path):
+        input_values, rate = sf.read(path, dtype=np.float32)
+        if len(input_values.shape) == 2:
+            input_values = input_values.mean(1)
+        if rate != 16_000:
+            new_length = int(input_values.shape[0] / rate * 16_000)
+            input_values = signal.resample(input_values, new_length)
+
+        input_values = torch.from_numpy(input_values).unsqueeze(0)
+        if torch.cuda.is_available():
+            input_values = input_values.cuda()
+
+        if layer is None:
+            hidden_states = model(input_values, output_hidden_states=True).hidden_states
+            hidden_states = [s.squeeze(0).cpu().numpy() for s in hidden_states]
+            return hidden_states
+
+        if layer >= 0:
+            hidden_state = model(input_values).last_hidden_state.squeeze(0).cpu().numpy()
+        else:
+            hidden_state = model.feature_extractor(input_values)
+            hidden_state = hidden_state.transpose(1, 2)
+            if layer == -1:
+                hidden_state = model.feature_projection(hidden_state)
+            hidden_state = hidden_state.squeeze(0).cpu().numpy()
+
+        return hidden_state
+
+    return _featurize
+
+
+@st.cache(persist=True, show_spinner=False)
+def run(model_id, layer, filename_x, filename_y):
+    featurizer = load_wav2vec2_featurizer(model_id, layer)
+
+    with st.spinner("Measuring distance..."):
+        feats_x = featurizer(filename_x)
+        feats_y = featurizer(filename_y)
+        gcm = aligner(feats_x, feats_y)
+
+        d = gcm.normalizedDistance
+        print("Distance:", d)
+
+        c, n = compute_costs(gcm)
+        return d, c, n
+
+
+st.title("Word-level Neural Acoustic Distance Visualizer")
+
+st.write(
+    "This tool visualizes pronunciation differences between two recordings of the same word. The two recordings have to be wave files containing a single spoken word. \n\n\
+Choose any wav2vec 2.0 compatible model identifier on the [Hugging Face Model Hub](https://huggingface.co/models?filter=wav2vec2) and select the output layer you want to use.\n\n\
+To upload your own recordings select 'custom upload' in the audio file selection step. The first recording is put on the x-axis of the plot and the second one will be the reference recording for computing distance.\n\
+You should already see an example plot of two sample recordings.\n\n\
+This visualization tool is part of [neural representations for modeling variation in speech](https://doi.org/10.1016/j.wocn.2022.101137). \n\
+Please see our paper for further details.")
+
+st.subheader("Model selection:")
+
+model_id = st.selectbox("Select the wav2vec 2.0 model you want to use:",
+                        ("facebook/wav2vec2-large-960h", "facebook/wav2vec2-large", "facebook/wav2vec2-large-xlsr-53",
+                         "facebook/wav2vec2-xls-r-300m", "other"),
+                        index=0)
+
+if model_id == "other":
+    model_id = st.text_input("Enter the wav2vec 2.0 model you want to use:",
+                             value="facebook/wav2vec2-large-960h",
+                             key="model")
+
+try:
+    cfg = AutoConfig.from_pretrained(model_id)
+    layer = st.number_input("Select the layer you want to use:", min_value=1, max_value=cfg.num_hidden_layers, value=10)
+except OSError:
+    st.error(
+        "Please select a wav2vec 2.0 compatible model identifier on the [Hugging Face Model Hub](https://huggingface.co/models?filter=wav2vec2)."
+    )
+    layer = None

 st.subheader("Audio file selection:")

-filename_x = st.selectbox(
-    "Filename (x-axis):",
-    ("falling_huud_mobiel_201145.wav", "falling_hood_mobiel_203936.wav", "custom upload"))
+filename_x = st.selectbox("Filename (x-axis):",
+                          ("falling_huud_mobiel_201145.wav", "falling_hood_mobiel_203936.wav", "custom upload"))

 if filename_x == "falling_huud_mobiel_201145.wav":
     filename_x = "./examples/falling_huud_mobiel_201145.wav"
@@ -121,9 +141,8 @@ if filename_x == "falling_hood_mobiel_203936.wav":
     filename_x = "./examples/falling_hood_mobiel_203936.wav"
     play_audio(filename_x)

-filename_y = st.selectbox(
-    "Filename (y-axis):",
-    ("falling_hood_mobiel_203936.wav", "falling_huud_mobiel_201145.wav", "custom upload"))
+filename_y = st.selectbox("Filename (y-axis):",
+                          ("falling_hood_mobiel_203936.wav", "falling_huud_mobiel_201145.wav", "custom upload"))

 if filename_y == "falling_huud_mobiel_201145.wav":
     filename_y = "./examples/falling_huud_mobiel_201145.wav"
@@ -133,28 +152,17 @@ if filename_y == "falling_hood_mobiel_203936.wav":
     play_audio(filename_y)

 if filename_x == "custom upload":
-    filename_x = st.file_uploader("Choose a file (x-axis)", key = "f_x")
+    filename_x = st.file_uploader("Choose a file (x-axis)", key="f_x")
 if filename_y == "custom upload":
-    filename_y = st.file_uploader("Choose a file (y-axis)", key = "f_y")
+    filename_y = st.file_uploader("Choose a file (y-axis)", key="f_y")

-if filename_x is not None and filename_y is not None and featurizer_a is not None:
+if filename_x is not None and filename_y is not None and layer is not None:
     print(f"\nX: {filename_x}\nY: {filename_y}")

-    def run(featurizer):
-        feats_x = featurizer(filename_x)
-        feats_y = featurizer(filename_y)
-        gcm = aligner(feats_x, feats_y)
-
-        d = gcm.normalizedDistance
-        print("\nDistance:", d)
-
-        c, n = compute_costs(gcm)
-        return d, c, n
-
-    d, c, n = run(featurizer_a)
+    d, c, n = run(model_id, layer, filename_x, filename_y)
     # d_b, c_b, n_b = run(featurizer_b)

-    fig, axes = plt.subplots(figsize=(4,2.5))
+    fig, axes = plt.subplots(figsize=(4, 2.5))

     window_size = 9
     rate = 20
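The plotting code that uses `window_size` and `rate` sits between the changed hunks and is not shown on this page. Presumably `window_size = 9` smooths the per-frame costs `c` before plotting; a sketch of a centered moving average under that assumption (the actual smoothing in the file may differ):

```python
import numpy as np

def smooth(costs, window_size=9):
    # Centered moving average over the per-frame alignment costs.
    # np.convolve with mode="same" zero-pads the edges, so the first and
    # last few values are damped; acceptable for visual smoothing.
    kernel = np.ones(window_size) / window_size
    return np.convolve(np.asarray(costs, dtype=float), kernel, mode="same")
```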
@@ -194,9 +202,4 @@ if filename_x is not None and filename_y is not None and featurizer_a is not None:
     frames in the pronunciation on the y-axis are aligned to a single frame in the pronunciation on the x-axis.")

     with open("./output/plot.pdf", "rb") as file:
-        btn = st.download_button(
-            label="Download plot",
-            data=file,
-            file_name="plot.pdf",
-            mime="image/pdf"
-        )
+        btn = st.download_button(label="Download plot", data=file, file_name="plot.pdf", mime="image/pdf")
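Finally, the subtlety the cached featurizer relies on: passing `num_hidden_layers=layer` to `from_pretrained` truncates the transformer encoder, so `last_hidden_state` is the representation of the selected layer rather than of the full stack. A self-contained sketch with a tiny random-weight config (all sizes made up, nothing downloaded; not part of the commit):

```python
import torch
from transformers import Wav2Vec2Config
from transformers.models.wav2vec2 import Wav2Vec2Model

# Tiny 4-layer encoder standing in for a truncated wav2vec 2.0 model.
cfg = Wav2Vec2Config(num_hidden_layers=4, hidden_size=32, num_attention_heads=4,
                     intermediate_size=64, num_feat_extract_layers=2,
                     conv_dim=(32, 32), conv_kernel=(3, 3), conv_stride=(2, 2))
model = Wav2Vec2Model(cfg).eval()

wave = torch.randn(1, 1600)  # 0.1 s of fake 16 kHz audio
with torch.no_grad():
    out = model(wave, output_hidden_states=True)

# One entry per transformer layer plus the initial feature projection; with
# the encoder truncated to `layer` blocks, last_hidden_state is that layer's output.
print(len(out.hidden_states), out.last_hidden_state.shape)
```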