Martijn Bartelds committed on
Commit 575567c
1 Parent(s): 9c307a0

Update app

Files changed (1):
  1. neural_acoustic_distance.py +70 -71
neural_acoustic_distance.py CHANGED
@@ -27,66 +27,66 @@ model_id = st.selectbox(
 if model_id == "other":
     model_id = st.text_input("Enter the wav2vec 2.0 model you want to use:", value = "facebook/wav2vec2-large-960h", key = "model")
 
-# try:
-cfg = AutoConfig.from_pretrained(model_id)
-layer = st.number_input("Select the layer you want to use:",
-    min_value = 1, max_value = cfg.num_hidden_layers, value=10)
-
-def load_wav2vec2_featurizer(model_id: str, layer: Optional[int] = None):
-    from transformers.models.wav2vec2 import Wav2Vec2Model
-    import soundfile as sf
-    from scipy import signal
-    import torch
-    import numpy as np
-
-    transformers.logging.set_verbosity(transformers.logging.ERROR)
-
-    model_kwargs = {}
-    if layer is not None:
-        model_kwargs["num_hidden_layers"] = layer if layer > 0 else 0
-
-    with st.spinner("Loading..."):
-        model = Wav2Vec2Model.from_pretrained(model_id, **model_kwargs)
-        model.eval()
-        if torch.cuda.is_available():
-            model.cuda()
-    st.success("Done!")
-
-    @torch.no_grad()
-    def _featurize(path):
-        input_values, rate = sf.read(path, dtype=np.float32)
-        if len(input_values.shape) == 2:
-            input_values = input_values.mean(1)
-        if rate != 16_000:
-            new_length = int(input_values.shape[0] / rate * 16_000)
-            input_values = signal.resample(input_values, new_length)
-
-        input_values = torch.from_numpy(input_values).unsqueeze(0)
-        if torch.cuda.is_available():
-            input_values = input_values.cuda()
-
-        if layer is None:
-            hidden_states = model(input_values, output_hidden_states=True).hidden_states
-            hidden_states = [s.squeeze(0).cpu().numpy() for s in hidden_states]
-            return hidden_states
-
-        if layer >= 0:
-            hidden_state = model(input_values).last_hidden_state.squeeze(0).cpu().numpy()
-        else:
-            hidden_state = model.feature_extractor(input_values)
-            hidden_state = hidden_state.transpose(1, 2)
-            if layer == -1:
-                hidden_state = model.feature_projection(hidden_state)
-            hidden_state = hidden_state.squeeze(0).cpu().numpy()
-
-        return hidden_state
-
-    return _featurize
-
-featurizer_a = load_wav2vec2_featurizer(model_id, layer)
-# except OSError:
-#     st.error("Please select a wav2vec 2.0 compatible model identifier on the [Hugging Face Model Hub](https://huggingface.co/models?filter=wav2vec2).")
-#     featurizer_a = None
+try:
+    cfg = AutoConfig.from_pretrained(model_id)
+    layer = st.number_input("Select the layer you want to use:",
+        min_value = 1, max_value = cfg.num_hidden_layers, value=10)
+
+    def load_wav2vec2_featurizer(model_id: str, layer: Optional[int] = None):
+        from transformers.models.wav2vec2 import Wav2Vec2Model
+        import soundfile as sf
+        from scipy import signal
+        import torch
+        import numpy as np
+
+        transformers.logging.set_verbosity(transformers.logging.ERROR)
+
+        model_kwargs = {}
+        if layer is not None:
+            model_kwargs["num_hidden_layers"] = layer if layer > 0 else 0
+
+        with st.spinner("Loading..."):
+            model = Wav2Vec2Model.from_pretrained(model_id, **model_kwargs)
+            model.eval()
+            if torch.cuda.is_available():
+                model.cuda()
+        st.success("Done!")
+
+        @torch.no_grad()
+        def _featurize(path):
+            input_values, rate = sf.read(path, dtype=np.float32)
+            if len(input_values.shape) == 2:
+                input_values = input_values.mean(1)
+            if rate != 16_000:
+                new_length = int(input_values.shape[0] / rate * 16_000)
+                input_values = signal.resample(input_values, new_length)
+
+            input_values = torch.from_numpy(input_values).unsqueeze(0)
+            if torch.cuda.is_available():
+                input_values = input_values.cuda()
+
+            if layer is None:
+                hidden_states = model(input_values, output_hidden_states=True).hidden_states
+                hidden_states = [s.squeeze(0).cpu().numpy() for s in hidden_states]
+                return hidden_states
+
+            if layer >= 0:
+                hidden_state = model(input_values).last_hidden_state.squeeze(0).cpu().numpy()
+            else:
+                hidden_state = model.feature_extractor(input_values)
+                hidden_state = hidden_state.transpose(1, 2)
+                if layer == -1:
+                    hidden_state = model.feature_projection(hidden_state)
+                hidden_state = hidden_state.squeeze(0).cpu().numpy()
+
+            return hidden_state
+
+        return _featurize
+
+    featurizer_a = load_wav2vec2_featurizer(model_id, layer)
+except OSError:
+    st.error("Please select a wav2vec 2.0 compatible model identifier on the [Hugging Face Model Hub](https://huggingface.co/models?filter=wav2vec2).")
+    featurizer_a = None
 
 def aligner(x, y) -> Any:
     return dtw(x, y, keep_internals=True)
@@ -173,16 +173,15 @@ if filename_x is not None and filename_y is not None and featurizer_a is not Non
     st.pyplot(fig)
 
     if os.path.isfile("./output/plot.pdf"):
-        if st.button("Info"):
-            st.write(" Visualization of neural acoustic distances\
-                per frame (based on wav2vec 2.0) with the pronunciation of\
-                of the first filename on the x-axis and distances to the pronunciation\
-                of second filename on the y-axis. The horizontal line represents\
-                the global distance value (i.e. the average of all individual frames).\
-                The blue continuous line represents the moving average distance based on 9 frames,\
-                corresponding to 180ms. As a result of the moving average, the blue line does not cover the entire duration of\
-                the sample. Larger bullet sizes indicate that multiple\
-                frames in the pronunciation on the y-axis are aligned to a single frame in the pronunciation on the x-axis.")
+        st.caption(" Visualization of neural acoustic distances\
+            per frame (based on wav2vec 2.0) with the pronunciation of\
+            of the first filename on the x-axis and distances to the pronunciation\
+            of second filename on the y-axis. The horizontal line represents\
+            the global distance value (i.e. the average of all individual frames).\
+            The blue continuous line represents the moving average distance based on 9 frames,\
+            corresponding to 180ms. As a result of the moving average, the blue line does not cover the entire duration of\
+            the sample. Larger bullet sizes indicate that multiple\
+            frames in the pronunciation on the y-axis are aligned to a single frame in the pronunciation on the x-axis.")
 
         with open("./output/plot.pdf", "rb") as file:
             btn = st.download_button(
@@ -190,4 +189,4 @@ if filename_x is not None and filename_y is not None and featurizer_a is not Non
                 data=file,
                 file_name="plot.pdf",
                 mime="image/pdf"
-                )
+            )
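For context: once the model loads, featurizer_a maps a wav path to a (frames x hidden_size) feature matrix, and aligner() warps two such matrices with dynamic time warping. Below is a minimal sketch of the same pipeline outside Streamlit; the file names are hypothetical, and it assumes the dtw-python package that aligner() relies on:

    import numpy as np
    from dtw import dtw  # dtw-python, the same package aligner() uses

    # Hypothetical inputs: two recordings of the same word by different speakers.
    featurizer = load_wav2vec2_featurizer("facebook/wav2vec2-large-960h", layer=10)
    feats_x = featurizer("speaker_a.wav")  # shape (frames_x, hidden_size)
    feats_y = featurizer("speaker_b.wav")  # shape (frames_y, hidden_size)

    # Align the two feature sequences and average the per-frame costs along
    # the warping path to obtain a single global distance value.
    alignment = dtw(feats_x, feats_y, keep_internals=True)
    path_costs = alignment.localCostMatrix[alignment.index1, alignment.index2]
    print("global neural acoustic distance:", np.mean(path_costs))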
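The new try/except leans on how transformers reports a bad identifier: AutoConfig.from_pretrained raises OSError when the name resolves to neither a Hub repository nor a local model directory, which is exactly what the commit catches. A standalone sketch of that pattern; validate_model_id is a hypothetical helper, not part of the app:

    from transformers import AutoConfig

    def validate_model_id(model_id: str) -> bool:
        # Hypothetical helper: AutoConfig.from_pretrained raises OSError when
        # model_id is neither a Hub repository nor a local model directory.
        try:
            AutoConfig.from_pretrained(model_id)
            return True
        except OSError:
            return False

    print(validate_model_id("facebook/wav2vec2-large-960h"))  # True
    print(validate_model_id("definitely/not-a-real-model"))   # False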