Martijn Bartelds committed
Commit 160f237
Parent: b341c9c

Update app

Files changed (1)
  1. neural_acoustic_distance.py +60 -60
neural_acoustic_distance.py CHANGED
@@ -27,66 +27,66 @@ model_id = st.selectbox(
 if model_id == "other":
     model_id = st.text_input("Enter the wav2vec 2.0 model you want to use:", value = "facebook/wav2vec2-large-960h", key = "model")

-try:
-    # cfg = AutoConfig.from_pretrained(model_id)
-    layer = st.number_input("Select the layer you want to use:",
-                            min_value = 1, max_value = 24, value=10)
-
-    def load_wav2vec2_featurizer(model_id: str, layer: Optional[int] = None):
-        from transformers.models.wav2vec2 import Wav2Vec2Model
-        import soundfile as sf
-        from scipy import signal
-        import torch
-        import numpy as np
-
-        transformers.logging.set_verbosity(transformers.logging.ERROR)
-
-        model_kwargs = {}
-        if layer is not None:
-            model_kwargs["num_hidden_layers"] = layer if layer > 0 else 0
-
-        with st.spinner("Loading..."):
-            model = Wav2Vec2Model.from_pretrained(model_id, **model_kwargs)
-            model.eval()
-            if torch.cuda.is_available():
-                model.cuda()
-        st.success("Done!")
-
-        @torch.no_grad()
-        def _featurize(path):
-            input_values, rate = sf.read(path, dtype=np.float32)
-            if len(input_values.shape) == 2:
-                input_values = input_values.mean(1)
-            if rate != 16_000:
-                new_length = int(input_values.shape[0] / rate * 16_000)
-                input_values = signal.resample(input_values, new_length)
-
-            input_values = torch.from_numpy(input_values).unsqueeze(0)
-            if torch.cuda.is_available():
-                input_values = input_values.cuda()
-
-            if layer is None:
-                hidden_states = model(input_values, output_hidden_states=True).hidden_states
-                hidden_states = [s.squeeze(0).cpu().numpy() for s in hidden_states]
-                return hidden_states
-
-            if layer >= 0:
-                hidden_state = model(input_values).last_hidden_state.squeeze(0).cpu().numpy()
-            else:
-                hidden_state = model.feature_extractor(input_values)
-                hidden_state = hidden_state.transpose(1, 2)
-                if layer == -1:
-                    hidden_state = model.feature_projection(hidden_state)
-                hidden_state = hidden_state.squeeze(0).cpu().numpy()
-
-            return hidden_state
-
-        return _featurize
-
-    featurizer_a = load_wav2vec2_featurizer(model_id, layer)
-except OSError:
-    st.error("Please select a wav2vec 2.0 compatible model identifier on the [Hugging Face Model Hub](https://huggingface.co/models?filter=wav2vec2).")
-    featurizer_a = None
+# try:
+cfg = AutoConfig.from_pretrained(model_id)
+layer = st.number_input("Select the layer you want to use:",
+                        min_value = 1, max_value = cfg.num_hidden_layers, value=10)
+
+def load_wav2vec2_featurizer(model_id: str, layer: Optional[int] = None):
+    from transformers.models.wav2vec2 import Wav2Vec2Model
+    import soundfile as sf
+    from scipy import signal
+    import torch
+    import numpy as np
+
+    transformers.logging.set_verbosity(transformers.logging.ERROR)
+
+    model_kwargs = {}
+    if layer is not None:
+        model_kwargs["num_hidden_layers"] = layer if layer > 0 else 0
+
+    with st.spinner("Loading..."):
+        model = Wav2Vec2Model.from_pretrained(model_id, **model_kwargs)
+        model.eval()
+        if torch.cuda.is_available():
+            model.cuda()
+    st.success("Done!")
+
+    @torch.no_grad()
+    def _featurize(path):
+        input_values, rate = sf.read(path, dtype=np.float32)
+        if len(input_values.shape) == 2:
+            input_values = input_values.mean(1)
+        if rate != 16_000:
+            new_length = int(input_values.shape[0] / rate * 16_000)
+            input_values = signal.resample(input_values, new_length)
+
+        input_values = torch.from_numpy(input_values).unsqueeze(0)
+        if torch.cuda.is_available():
+            input_values = input_values.cuda()
+
+        if layer is None:
+            hidden_states = model(input_values, output_hidden_states=True).hidden_states
+            hidden_states = [s.squeeze(0).cpu().numpy() for s in hidden_states]
+            return hidden_states
+
+        if layer >= 0:
+            hidden_state = model(input_values).last_hidden_state.squeeze(0).cpu().numpy()
+        else:
+            hidden_state = model.feature_extractor(input_values)
+            hidden_state = hidden_state.transpose(1, 2)
+            if layer == -1:
+                hidden_state = model.feature_projection(hidden_state)
+            hidden_state = hidden_state.squeeze(0).cpu().numpy()
+
+        return hidden_state
+
+    return _featurize
+
+featurizer_a = load_wav2vec2_featurizer(model_id, layer)
+# except OSError:
+#     st.error("Please select a wav2vec 2.0 compatible model identifier on the [Hugging Face Model Hub](https://huggingface.co/models?filter=wav2vec2).")
+#     featurizer_a = None

 def aligner(x, y) -> Any:
     return dtw(x, y, keep_internals=True)
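
The change reads the layer bound from the model's configuration instead of hard-coding max_value = 24, so the selector adapts to any wav2vec 2.0 checkpoint. Because the old try/except guard is now commented out, however, a mistyped model identifier makes AutoConfig.from_pretrained raise an uncaught OSError rather than showing the in-app error message. A minimal sketch of how the dynamic bound and the old guard could coexist — not part of this commit, reusing model_id and load_wav2vec2_featurizer from the file above:

    import streamlit as st
    from transformers import AutoConfig

    # Sketch only: keep the new AutoConfig lookup, but inside the old
    # OSError guard so invalid identifiers still show the error message.
    try:
        cfg = AutoConfig.from_pretrained(model_id)
        layer = st.number_input("Select the layer you want to use:",
                                min_value = 1, max_value = cfg.num_hidden_layers, value=10)
        featurizer_a = load_wav2vec2_featurizer(model_id, layer)
    except OSError:
        st.error("Please select a wav2vec 2.0 compatible model identifier on the "
                 "[Hugging Face Model Hub](https://huggingface.co/models?filter=wav2vec2).")
        featurizer_a = None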
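
For reference, a hedged usage sketch of the two pieces shown in the diff: load_wav2vec2_featurizer turns a recording into a sequence of frame-level hidden states, and aligner warps two such sequences with dynamic time warping (via the dtw-python package). The file names and layer choice are illustrative, not from the commit, and the snippet assumes it runs in the scope of neural_acoustic_distance.py:

    featurize = load_wav2vec2_featurizer("facebook/wav2vec2-large-960h", layer=10)

    # Each call returns a (num_frames, hidden_size) NumPy array for one recording.
    feats_a = featurize("word_a.wav")  # hypothetical file
    feats_b = featurize("word_b.wav")  # hypothetical file

    # DTW aligns the two frame sequences; the normalized distance of the
    # optimal warping path serves as the neural acoustic distance.
    alignment = aligner(feats_a, feats_b)
    print(alignment.normalizedDistance)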