Spaces:

GroNLP
/

neural-acoustic-distance

Running

App Files Community

Wietse de Vries committed on Mar 14, 2022

Commit

1eec854

1 Parent(s): da8f6fb

add caching

Browse files

Files changed (1) hide show

neural_acoustic_distance.py +118 -115

neural_acoustic_distance.py CHANGED Viewed

@@ -1,97 +1,28 @@
-from unicodedata import name
-import streamlit as st
-import pandas as pd
-import numpy as np
 import os.path
-from dtw import dtw
 import matplotlib.pyplot as plt
 import transformers
-from typing import Any, Optional
 from transformers import AutoConfig
-st.title("Word-level Neural Acoustic Distance Visualizer")
-st.write("This tool visualizes pronunciation differences between two recordings of the same word. The two recordings have to be wave files containing a single spoken word. \n\n\
-Choose any wav2vec 2.0 compatible model identifier on the [Hugging Face Model Hub](https://huggingface.co/models?filter=wav2vec2) and select the output layer you want to use.\n\n\
-To upload your own recordings select 'custom upload' in the audio file selection step. The first recording is put on the x-axis of the plot and the second one will be the reference recording for computing distance.\n\
-You should already see an example plot of two sample recordings.\n\n\
-This visualization tool is part of [neural representations for modeling variation in speech](https://doi.org/10.1016/j.wocn.2022.101137). \n\
-Please see our paper for further details.")
-st.subheader("Model selection:")
-model_id = st.selectbox(
-    "Select the wav2vec 2.0 model you want to use:",
-    ("facebook/wav2vec2-large-960h", "facebook/wav2vec2-large", "facebook/wav2vec2-large-xlsr-53", "facebook/wav2vec2-xls-r-300m", "other"), index = 0)
-if model_id == "other":
-    model_id = st.text_input("Enter the wav2vec 2.0 model you want to use:", value = "facebook/wav2vec2-large-960h", key = "model")
-try:
-    cfg = AutoConfig.from_pretrained(model_id)
-    layer = st.number_input("Select the layer you want to use:",
-        min_value = 1, max_value = cfg.num_hidden_layers, value=10)
-    def load_wav2vec2_featurizer(model_id: str, layer: Optional[int] = None):
-        from transformers.models.wav2vec2 import Wav2Vec2Model
-        import soundfile as sf
-        from scipy import signal
-        import torch
-        import numpy as np
-        transformers.logging.set_verbosity(transformers.logging.ERROR)
-        model_kwargs = {}
-        if layer is not None:
-            model_kwargs["num_hidden_layers"] = layer if layer > 0 else 0
-        with st.spinner("Loading..."):
-            model = Wav2Vec2Model.from_pretrained(model_id, **model_kwargs)
-            model.eval()
-            if torch.cuda.is_available():
-                model.cuda()
-        st.success("Done!")
-        @torch.no_grad()
-        def _featurize(path):
-            input_values, rate = sf.read(path, dtype=np.float32)
-            if len(input_values.shape) == 2:
-                input_values = input_values.mean(1)
-            if rate != 16_000:
-                new_length = int(input_values.shape[0] / rate * 16_000)
-                input_values = signal.resample(input_values, new_length)
-            input_values = torch.from_numpy(input_values).unsqueeze(0)
-            if torch.cuda.is_available():
-                input_values = input_values.cuda()
-            if layer is None:
-                hidden_states = model(input_values, output_hidden_states=True).hidden_states
-                hidden_states = [s.squeeze(0).cpu().numpy() for s in hidden_states]
-                return hidden_states
-            if layer >= 0:
-                hidden_state = model(input_values).last_hidden_state.squeeze(0).cpu().numpy()
-            else:
-                hidden_state = model.feature_extractor(input_values)
-                hidden_state = hidden_state.transpose(1, 2)
-                if layer == -1:
-                    hidden_state = model.feature_projection(hidden_state)
-                hidden_state = hidden_state.squeeze(0).cpu().numpy()
-            return hidden_state
-        return _featurize
-    featurizer_a = load_wav2vec2_featurizer(model_id, layer)
-except OSError:
-    st.error("Please select a wav2vec 2.0 compatible model identifier on the [Hugging Face Model Hub](https://huggingface.co/models?filter=wav2vec2).")
-    featurizer_a = None
-def aligner(x, y) -> Any:
     return dtw(x, y, keep_internals=True)
 def compute_costs(gcm):
     res = [[] for _ in range(gcm.N)]
@@ -103,16 +34,105 @@ def compute_costs(gcm):
     res = [np.mean(x) for x in res]
     return res, n
-def play_audio(filename):
-    audio_file = open(filename, "rb")
-    audio_bytes = audio_file.read()
-    st.audio(audio_bytes, format="audio/wav")
 st.subheader("Audio file selection:")
-filename_x = st.selectbox(
-    "Filename (x-axis):",
-    ("falling_huud_mobiel_201145.wav", "falling_hood_mobiel_203936.wav", "custom upload"))
 if filename_x == "falling_huud_mobiel_201145.wav":
     filename_x = "./examples/falling_huud_mobiel_201145.wav"
@@ -121,9 +141,8 @@ if filename_x == "falling_hood_mobiel_203936.wav":
     filename_x = "./examples/falling_hood_mobiel_203936.wav"
     play_audio(filename_x)
-filename_y = st.selectbox(
-"Filename (y-axis):",
-("falling_hood_mobiel_203936.wav", "falling_huud_mobiel_201145.wav", "custom upload"))
 if filename_y == "falling_huud_mobiel_201145.wav":
     filename_y = "./examples/falling_huud_mobiel_201145.wav"
@@ -133,28 +152,17 @@ if filename_y == "falling_hood_mobiel_203936.wav":
     play_audio(filename_y)
 if filename_x == "custom upload":
-    filename_x = st.file_uploader("Choose a file (x-axis)", key = "f_x")
 if filename_y == "custom upload":
-    filename_y = st.file_uploader("Choose a file (y-axis)", key = "f_y")
-if filename_x is not None and filename_y is not None and featurizer_a is not None:
     print(f"\nX: {filename_x}\nY: {filename_y}")
-    def run(featurizer):
-        feats_x = featurizer(filename_x)
-        feats_y = featurizer(filename_y)
-        gcm = aligner(feats_x, feats_y)
-        d = gcm.normalizedDistance
-        print("\nDistance:", d)
-        c, n = compute_costs(gcm)
-        return d, c, n
-    d, c, n = run(featurizer_a)
     # d_b, c_b, n_b = run(featurizer_b)
-    fig, axes = plt.subplots(figsize=(4,2.5))
     window_size = 9
     rate = 20
@@ -194,9 +202,4 @@ if filename_x is not None and filename_y is not None and featurizer_a is not Non
         frames in the pronunciation on the y-axis are aligned to a single frame in the pronunciation on the x-axis.")
     with open("./output/plot.pdf", "rb") as file:
-        btn = st.download_button(
-                label="Download plot",
-                data=file,
-                file_name="plot.pdf",
-                mime="image/pdf"
-            )

 import os.path
+from typing import Optional
 import matplotlib.pyplot as plt
+import numpy as np
+import soundfile as sf
+import streamlit as st
+import torch
 import transformers
+from dtw import dtw
+from scipy import signal
 from transformers import AutoConfig
+from transformers.models.wav2vec2 import Wav2Vec2Model
+def play_audio(filename):
+    audio_file = open(filename, "rb")
+    audio_bytes = audio_file.read()
+    st.audio(audio_bytes, format="audio/wav")
+def aligner(x, y):
     return dtw(x, y, keep_internals=True)
 def compute_costs(gcm):
     res = [[] for _ in range(gcm.N)]
     res = [np.mean(x) for x in res]
     return res, n
+@st.cache(show_spinner=False, hash_funcs={torch.nn.parameter.Parameter: lambda _: None})
+def load_wav2vec2_featurizer(model_id: str, layer: Optional[int] = None):
+    transformers.logging.set_verbosity(transformers.logging.ERROR)
+    model_kwargs = {}
+    if layer is not None:
+        model_kwargs["num_hidden_layers"] = int(layer) if layer > 0 else 0
+    with st.spinner("Loading model..."):
+        model = Wav2Vec2Model.from_pretrained(model_id, **model_kwargs)
+        model.eval()
+        if torch.cuda.is_available():
+            model.cuda()
+    # st.success("Done!")
+    @torch.no_grad()
+    def _featurize(path):
+        input_values, rate = sf.read(path, dtype=np.float32)
+        if len(input_values.shape) == 2:
+            input_values = input_values.mean(1)
+        if rate != 16_000:
+            new_length = int(input_values.shape[0] / rate * 16_000)
+            input_values = signal.resample(input_values, new_length)
+        input_values = torch.from_numpy(input_values).unsqueeze(0)
+        if torch.cuda.is_available():
+            input_values = input_values.cuda()
+        if layer is None:
+            hidden_states = model(input_values, output_hidden_states=True).hidden_states
+            hidden_states = [s.squeeze(0).cpu().numpy() for s in hidden_states]
+            return hidden_states
+        if layer >= 0:
+            hidden_state = model(input_values).last_hidden_state.squeeze(0).cpu().numpy()
+        else:
+            hidden_state = model.feature_extractor(input_values)
+            hidden_state = hidden_state.transpose(1, 2)
+            if layer == -1:
+                hidden_state = model.feature_projection(hidden_state)
+            hidden_state = hidden_state.squeeze(0).cpu().numpy()
+        return hidden_state
+    return _featurize
+@st.cache(persist=True, show_spinner=False)
+def run(model_id, layer, filename_x, filename_y):
+    featurizer = load_wav2vec2_featurizer(model_id, layer)
+    with st.spinner("Measuring distance..."):
+        feats_x = featurizer(filename_x)
+        feats_y = featurizer(filename_y)
+        gcm = aligner(feats_x, feats_y)
+        d = gcm.normalizedDistance
+        print("Distance:", d)
+        c, n = compute_costs(gcm)
+    return d, c, n
+st.title("Word-level Neural Acoustic Distance Visualizer")
+st.write(
+    "This tool visualizes pronunciation differences between two recordings of the same word. The two recordings have to be wave files containing a single spoken word. \n\n\
+Choose any wav2vec 2.0 compatible model identifier on the [Hugging Face Model Hub](https://huggingface.co/models?filter=wav2vec2) and select the output layer you want to use.\n\n\
+To upload your own recordings select 'custom upload' in the audio file selection step. The first recording is put on the x-axis of the plot and the second one will be the reference recording for computing distance.\n\
+You should already see an example plot of two sample recordings.\n\n\
+This visualization tool is part of [neural representations for modeling variation in speech](https://doi.org/10.1016/j.wocn.2022.101137). \n\
+Please see our paper for further details.")
+st.subheader("Model selection:")
+model_id = st.selectbox("Select the wav2vec 2.0 model you want to use:",
+                        ("facebook/wav2vec2-large-960h", "facebook/wav2vec2-large", "facebook/wav2vec2-large-xlsr-53",
+                         "facebook/wav2vec2-xls-r-300m", "other"),
+                        index=0)
+if model_id == "other":
+    model_id = st.text_input("Enter the wav2vec 2.0 model you want to use:",
+                             value="facebook/wav2vec2-large-960h",
+                             key="model")
+try:
+    cfg = AutoConfig.from_pretrained(model_id)
+    layer = st.number_input("Select the layer you want to use:", min_value=1, max_value=cfg.num_hidden_layers, value=10)
+except OSError:
+    st.error(
+        "Please select a wav2vec 2.0 compatible model identifier on the [Hugging Face Model Hub](https://huggingface.co/models?filter=wav2vec2)."
+    )
+    layer = None
 st.subheader("Audio file selection:")
+filename_x = st.selectbox("Filename (x-axis):",
+                          ("falling_huud_mobiel_201145.wav", "falling_hood_mobiel_203936.wav", "custom upload"))
 if filename_x == "falling_huud_mobiel_201145.wav":
     filename_x = "./examples/falling_huud_mobiel_201145.wav"
     filename_x = "./examples/falling_hood_mobiel_203936.wav"
     play_audio(filename_x)
+filename_y = st.selectbox("Filename (y-axis):",
+                          ("falling_hood_mobiel_203936.wav", "falling_huud_mobiel_201145.wav", "custom upload"))
 if filename_y == "falling_huud_mobiel_201145.wav":
     filename_y = "./examples/falling_huud_mobiel_201145.wav"
     play_audio(filename_y)
 if filename_x == "custom upload":
+    filename_x = st.file_uploader("Choose a file (x-axis)", key="f_x")
 if filename_y == "custom upload":
+    filename_y = st.file_uploader("Choose a file (y-axis)", key="f_y")
+if filename_x is not None and filename_y is not None and layer is not None:
     print(f"\nX: {filename_x}\nY: {filename_y}")
+    d, c, n = run(model_id, layer, filename_x, filename_y)
     # d_b, c_b, n_b = run(featurizer_b)
+    fig, axes = plt.subplots(figsize=(4, 2.5))
     window_size = 9
     rate = 20
         frames in the pronunciation on the y-axis are aligned to a single frame in the pronunciation on the x-axis.")
     with open("./output/plot.pdf", "rb") as file:
+        btn = st.download_button(label="Download plot", data=file, file_name="plot.pdf", mime="image/pdf")