Martijn Bartelds committed
Commit e15a3a6
1 Parent(s): a0036b2

Add app files

examples/falling_hood_mobiel_203936.wav ADDED
Binary file (51.3 kB).
examples/falling_huud_mobiel_201145.wav ADDED
Binary file (35.6 kB).
neural_acoustic_distance.py ADDED
@@ -0,0 +1,194 @@
+import streamlit as st
+import pandas as pd
+import numpy as np
+import os.path
+
+from dtw import dtw
+import matplotlib.pyplot as plt
+import transformers
+from typing import Any, Optional
+from transformers import AutoConfig
+
+st.title("Word-level Neural Acoustic Distance Visualizer")
+
+st.write("This tool visualizes pronunciation differences between two recordings of the same word. The two recordings have to be wave files (mono 16-bit PCM at 16 kHz) containing a single spoken word. \n\n\
+Choose any wav2vec 2.0 compatible model identifier on the [Hugging Face Model Hub](https://huggingface.co/models?filter=wav2vec2) and select the output layer you want to use.\n\n\
+To upload your own recordings select 'custom upload' in the audio file selection step. The first recording is put on the x-axis of the plot and the second one will be the reference recording for computing distance.\n\
+You should already see an example plot of two sample recordings.\n\n\
+This visualization tool is part of [neural representations for modeling variation in speech](https://doi.org/10.1016/j.wocn.2022.101137). \n\
+Please see our paper for further details.")
+
+st.subheader("Model selection:")
+
+model_id = st.selectbox(
+    "Select the wav2vec 2.0 model you want to use:",
+    ("facebook/wav2vec2-large-960h", "facebook/wav2vec2-large", "facebook/wav2vec2-large-xlsr-53", "facebook/wav2vec2-xls-r-300m", "other"), index=0)
+
+if model_id == "other":
+    model_id = st.text_input("Enter the wav2vec 2.0 model you want to use:", value="facebook/wav2vec2-large-960h", key="model")
+
+try:
+    cfg = AutoConfig.from_pretrained(model_id)
+    print(cfg.num_hidden_layers)
+    layer = st.number_input("Select the layer you want to use:",
+                            min_value=1, max_value=cfg.num_hidden_layers, value=10)
+
+    def load_wav2vec2_featurizer(model_id: str, layer: Optional[int] = None):
+        from transformers.models.wav2vec2 import Wav2Vec2Model
+        import soundfile as sf
+        from scipy import signal
+        import torch
+        import numpy as np
+
+        transformers.logging.set_verbosity(transformers.logging.ERROR)
+
+        model_kwargs = {}
+        if layer is not None:
+            model_kwargs["num_hidden_layers"] = layer if layer > 0 else 0
+
+        with st.spinner("Loading..."):
+            model = Wav2Vec2Model.from_pretrained(model_id, **model_kwargs)
+            model.eval()
+            if torch.cuda.is_available():
+                model.cuda()
+        st.success("Done!")
+
+        @torch.no_grad()
+        def _featurize(path):
+            input_values, rate = sf.read(path, dtype=np.float32)
+            if len(input_values.shape) == 2:
+                input_values = input_values.mean(1)
+            if rate != 16_000:
+                new_length = int(input_values.shape[0] / rate * 16_000)
+                input_values = signal.resample(input_values, new_length)
+
+            input_values = torch.from_numpy(input_values).unsqueeze(0)
+            if torch.cuda.is_available():
+                input_values = input_values.cuda()
+
+            if layer is None:
+                hidden_states = model(input_values, output_hidden_states=True).hidden_states
+                hidden_states = [s.squeeze(0).cpu().numpy() for s in hidden_states]
+                return hidden_states
+
+            if layer >= 0:
+                hidden_state = model(input_values).last_hidden_state.squeeze(0).cpu().numpy()
+            else:
+                hidden_state = model.feature_extractor(input_values)
+                hidden_state = hidden_state.transpose(1, 2)
+                if layer == -1:
+                    hidden_state = model.feature_projection(hidden_state)
+                hidden_state = hidden_state.squeeze(0).cpu().numpy()
+
+            return hidden_state
+
+        return _featurize
+
+    featurizer_a = load_wav2vec2_featurizer(model_id, layer)
+except OSError:
+    st.error("Please select a wav2vec 2.0 compatible model identifier on the [Hugging Face Model Hub](https://huggingface.co/models?filter=wav2vec2).")
+    featurizer_a = None
+
+def aligner(x, y) -> Any:
+    return dtw(x, y, keep_internals=True)
+
+def compute_costs(gcm):
+    res = [[] for _ in range(gcm.N)]
+
+    for i in range(gcm.index1.shape[0]):
+        d = gcm.localCostMatrix[gcm.index1[i], gcm.index2[i]]
+        res[gcm.index1[i]].append(d)
+
+    n = [len(x) for x in res]
+    res = [np.mean(x) for x in res]
+    return res, n
+
+st.subheader("Audio file selection:")
+
+filename_x = st.selectbox(
+    "Filename (x-axis):",
+    ("falling_huud_mobiel_201145.wav", "falling_hood_mobiel_203936.wav", "custom upload"))
+
+if filename_x == "falling_huud_mobiel_201145.wav":
+    filename_x = "./examples/falling_huud_mobiel_201145.wav"
+if filename_x == "falling_hood_mobiel_203936.wav":
+    filename_x = "./examples/falling_hood_mobiel_203936.wav"
+
+filename_y = st.selectbox(
+    "Filename (y-axis):",
+    ("falling_hood_mobiel_203936.wav", "falling_huud_mobiel_201145.wav", "custom upload"))
+
+if filename_y == "falling_huud_mobiel_201145.wav":
+    filename_y = "./examples/falling_huud_mobiel_201145.wav"
+if filename_y == "falling_hood_mobiel_203936.wav":
+    filename_y = "./examples/falling_hood_mobiel_203936.wav"
+
+if filename_x == "custom upload":
+    filename_x = st.file_uploader("Choose a file", key="f_x")
+if filename_y == "custom upload":
+    filename_y = st.file_uploader("Choose a file", key="f_y")
+
+if filename_x is not None and filename_y is not None and featurizer_a is not None:
+    print(f"\nX: {filename_x}\nY: {filename_y}")
+
+    def run(featurizer):
+        feats_x = featurizer(filename_x)
+        feats_y = featurizer(filename_y)
+        gcm = aligner(feats_x, feats_y)
+
+        d = gcm.normalizedDistance
+        print("\nDistance:", d)
+
+        c, n = compute_costs(gcm)
+        return d, c, n
+
+    d, c, n = run(featurizer_a)
+    # d_b, c_b, n_b = run(featurizer_b)
+
+    fig, axes = plt.subplots(figsize=(4, 2.5))
+
+    window_size = 9
+    rate = 20
+    x = np.arange(0, len(c) * rate, rate)
+    offset = (window_size - 1) // 2
+    x_ = x[offset:-offset]
+
+    # Target layer
+    axes.plot(x, c, alpha=0.5, color="gray", linestyle="--")
+    axes.scatter(x, c, np.array(n) * 10, color="gray")
+    c_ = np.convolve(c, np.ones(window_size) / window_size, mode="valid")
+    axes.plot(x_, c_)
+
+    # Last layer
+    # axes.plot(x, c_b, alpha=0.5, color="gray", linestyle="--")
+    # axes.scatter(x, c_b, np.array(n_b) * 10, color="gray")
+    # c_b_ = np.convolve(c_b, np.ones(window_size) / window_size, mode="valid")
+    # axes.plot(x_, c_b_, linestyle="--")
+
+    axes.set_xlabel("time (ms)")
+    axes.set_ylabel("distance per frame")
+    axes.hlines(y=d, xmin=0, xmax=np.max(x), linestyles="dashdot")
+
+    plt.tight_layout(pad=0)
+    plt.savefig("./output/plot.pdf")
+    st.pyplot(fig)
+
+    if os.path.isfile("./output/plot.pdf"):
+        if st.button("Info"):
+            st.write("Visualization of neural acoustic distances per frame \
+                (based on wav2vec 2.0) with the pronunciation of the first filename \
+                on the x-axis and distances to the pronunciation of the second filename \
+                on the y-axis. The horizontal line represents the global distance value \
+                (i.e. the average of all individual frames). The blue continuous line \
+                represents the moving average distance based on 9 frames, corresponding to 180 ms. \
+                As a result of the moving average, the blue line does not cover the entire duration of \
+                the sample. Larger bullet sizes indicate that multiple frames in the pronunciation \
+                on the y-axis are aligned to a single frame in the pronunciation on the x-axis.")
+
+        with open("./output/plot.pdf", "rb") as file:
+            btn = st.download_button(
+                label="Download plot",
+                data=file,
+                file_name="plot.pdf",
+                mime="application/pdf"
+            )
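
For reference, the core computation of neural_acoustic_distance.py can be reproduced outside Streamlit. The following is a minimal sketch, not part of the commit: the model identifier, layer choice, and file paths are illustrative only, and the recordings are assumed to already be mono 16 kHz wave files. It extracts wav2vec 2.0 frame-level features for two recordings and aligns them with DTW to obtain the normalized distance that the app draws as the horizontal line.

import soundfile as sf
import torch
from dtw import dtw
from transformers import Wav2Vec2Model

# Truncate the model to 10 transformer layers so last_hidden_state is the layer-10 output.
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-large-960h", num_hidden_layers=10)
model.eval()

@torch.no_grad()
def featurize(path):
    # Assumes a mono 16 kHz recording; the app additionally handles stereo input and resampling.
    wav, rate = sf.read(path, dtype="float32")
    assert rate == 16_000, "recording must be sampled at 16 kHz"
    return model(torch.from_numpy(wav).unsqueeze(0)).last_hidden_state.squeeze(0).numpy()

feats_x = featurize("./examples/falling_huud_mobiel_201145.wav")
feats_y = featurize("./examples/falling_hood_mobiel_203936.wav")
alignment = dtw(feats_x, feats_y, keep_internals=True)
print("Normalized distance:", alignment.normalizedDistance)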
output/plot.pdf ADDED
Binary file (20.5 kB).
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ dtw-python==1.1.6
+ editdistance==0.5.3
+ fairseq @ git+https://github.com/pytorch/fairseq@aa39ab1b4568479bf9a1360cfcdd4f4fce5f1838
+ matplotlib==3.3.2
+ numpy==1.19.1
+ onnxruntime==1.8.1
+ pandas==1.1.3
+ scipy==1.5.2
+ seaborn==0.11.0
+ SoundFile==0.10.2
+ torch==1.6.0
+ tqdm==4.50.2
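
To run the app locally, installing the pinned dependencies with pip install -r requirements.txt and starting the app with streamlit run neural_acoustic_distance.py should suffice. Note that streamlit and transformers are imported by the app but are not pinned in this list, so they may need to be installed separately.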