File size: 3,793 Bytes
3d4323f
 
403d26a
3d4323f
 
de66b6c
403d26a
3d4323f
403d26a
 
6377492
 
3d4323f
 
403d26a
 
 
 
 
3d4323f
be40fff
 
bed73f4
be40fff
3d4323f
 
 
 
 
 
 
 
987011f
 
 
 
 
 
 
3d4323f
987011f
 
 
 
 
 
 
3d4323f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42fc253
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ddc7483
 
 
 
42fc253
 
 
3d4323f
42fc253
 
3d4323f
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import gradio as gr
import pandas as pd
import json
import collections
import scipy.signal
import numpy as np

from functools import partial
import importlib.resources as ir

import openwakeword.utils

from openwakeword.model import Model

# Fetch the pre-trained models (all of them, or only select models) once:
# the download is skipped when the package's `resources` directory exists
with ir.path("openwakeword", "resources") as oir:
    if not oir.is_dir():
        openwakeword.utils.download_models()

# Load openWakeWord models
# (`wakeword_models` can also be left empty to load all of the included
# pre-trained models)
model = Model(
    wakeword_models=[
        "borah_da.onnx",
        "prin_hawn_da_max_en.onnx",
        "max_en.onnx",
    ],
    inference_framework="onnx",
)

# Define function to process audio

_TARGET_SAMPLE_RATE = 16000  # Hz; audio at any other rate is resampled to this
_FRAME_SAMPLES = 1280  # samples passed to model.predict per call (80 ms at 16 kHz)
_HISTORY_FRAMES = 60  # scores kept per model; matches the deque maxlen in the default state


def _prepare_audio(audio):
    """Return the first channel of `audio` as a 1-D array sampled at 16 kHz.

    `audio` is a `(sample_rate, samples)` pair where `samples` is a numpy
    array that is either 1-D (mono) or 2-D with one column per channel.
    """
    sample_rate, samples = audio[0], audio[1]

    # Just keep one channel of audio. Checking the number of dimensions (and
    # not the size of the last axis) keeps a 2-sample mono clip from being
    # mistaken for stereo.
    if samples.ndim == 2:
        samples = samples[:, 0]

    # Resample audio to 16khz if needed; audio already at 16 kHz is used as-is
    if sample_rate != _TARGET_SAMPLE_RATE and samples.shape[0] > 0:
        n_out = int(float(samples.shape[0]) / sample_rate * _TARGET_SAMPLE_RATE)
        samples = scipy.signal.resample(samples, n_out)

    return samples


def process_audio(audio, state=collections.defaultdict(partial(collections.deque, maxlen=60))):
    """Score one chunk of streamed audio and return an updated score plot.

    Args:
        audio: `(sample_rate, samples)` pair with `samples` as a numpy array
            (the interface below requests `type="numpy"` audio), or None
            when there is no audio to process.
        state: Mapping from model name to a deque holding that model's most
            recent scores. The default is kept as a mutable object on
            purpose. NOTE(review): Gradio's `Interface` appears to read this
            default as the initial value of the per-session "state" --
            confirm against the installed Gradio version before changing it.

    Returns:
        A `(plot, state)` tuple: the line plot update for the score history
        and the (mutated) state to carry over to the next call.
    """
    if state is None:
        state = collections.defaultdict(partial(collections.deque, maxlen=_HISTORY_FRAMES))

    if audio is not None:
        data = _prepare_audio(audio)

        # Get predictions. Only complete frames are scored; a trailing
        # partial frame is dropped.
        for start in range(0, data.shape[0] - _FRAME_SAMPLES + 1, _FRAME_SAMPLES):
            prediction = model.predict(data[start:start + _FRAME_SAMPLES])
            for key, score in prediction.items():
                history = state[key]
                # Fill deque with zeros if it's empty
                if not history:
                    history.extend(np.zeros(_HISTORY_FRAMES))

                # Add prediction
                history.append(score)

    # Nothing has been scored yet (e.g. the first chunk was shorter than one
    # frame): pd.concat would raise on an empty list, so leave the plot as is
    if not state:
        return gr.update(), state

    # Make line plot
    dfs = [
        pd.DataFrame({"x": np.arange(len(scores)), "y": scores, "Model": key})
        for key, scores in state.items()
    ]
    df = pd.concat(dfs)
    plot = gr.LinePlot().update(value=df, x='x', y='y', color="Model", y_lim=(0, 1), tooltip="Model",
                                width=600, height=300, x_title="Time (frames)", y_title="Model Score",
                                color_legend_position="bottom")

    # Manually adjust how the legend is displayed
    tmp = json.loads(plot["value"]["plot"])
    legend = tmp["layer"][0]['encoding']['color']['legend']
    legend["direction"] = "vertical"
    legend["columns"] = 4
    legend["labelFontSize"] = 12
    legend["titleFontSize"] = 14

    plot["value"]['plot'] = json.dumps(tmp)

    return plot, state

# Build the Gradio interface around process_audio and start serving it

# Markdown shown above the demo
desc = """
This is a demo of the pre-trained models included in the latest release
of the [openWakeWord](https://github.com/dscripka/openWakeWord) library.

Click on the "record from microphone" button below to start capturing.
The real-time scores from each model will be shown in the line plot. Hover over
each line to see the name of the corresponding model.

Different models will respond to different wake words/phrases (see [the model docs](https://github.com/dscripka/openWakeWord/tree/main/docs/models) for more details).
If everything is working properly,
you should see a spike in the score for a given model after speaking a related word/phrase. Below are some suggested phrases to try!

| Model Name | Word/Phrase |
| --- | --- |
| borah_da | "bora da" |
| max_en | "Macsen" |
| prin_hawn_da_max_en  | "Prynhawn da, Macsen"|


"""

# Streamed microphone audio plus the session state go in; the score plot and
# the updated state come out
gr_int = gr.Interface(
    fn=process_audio,
    inputs=[
        gr.Audio(source="microphone", type="numpy", streaming=True, show_label=False),
        "state",
    ],
    outputs=[
        gr.LinePlot(show_label=False),
        "state",
    ],
    title="openWakeWord Live Demo",
    description=desc,
    css=".flex {flex-direction: column} .gr-panel {width: 100%}",
    live=True,
)

gr_int.launch()