# Streaming-AST / app.py
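"""Audio-classification demo built on the Audio Spectrogram Transformer (AST).

Three Gradio interfaces are exposed in tabs: classification of an uploaded
audio file, live classification of streamed microphone audio (buffered in
roughly five-second windows), and one-shot classification of a microphone
recording, plus a tab with runnable example clips. Every classified clip is
also saved under ./audio/ with a timestamped filename.
"""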
import os
import time

import gradio as gr
import librosa
import numpy as np
import soundfile as sf
from transformers import pipeline

TARGET_SAMPLE_RATE = 16_000    # AST's feature extractor expects 16 kHz audio
AUDIO_SECONDS_THRESHOLD = 5    # classify once the streaming buffer holds ~5 s

# Audio Spectrogram Transformer fine-tuned on AudioSet.
pipe = pipeline("audio-classification", model="MIT/ast-finetuned-audioset-10-10-0.4593")

# Recordings are saved here; create the directory up front so sf.write succeeds.
os.makedirs("./audio", exist_ok=True)

# Placeholder shown in the label output until the first classification runs.
prediction = [{"score": 1, "label": "recording..."}]

def normalize_waveform(waveform, datatype=np.float32):
    """Convert int16 PCM samples (Gradio's numpy audio format) to floats in [-1, 1)."""
    waveform = waveform.astype(dtype=datatype)
    waveform /= 32768.0
    return waveform
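
# Quick sanity check of the scaling (values chosen for illustration):
#   normalize_waveform(np.array([0, 16384, -32768], dtype=np.int16))
#   -> array([ 0. ,  0.5, -1. ], dtype=float32)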

def streaming_recording_fn(stream, new_chunk):
    """Accumulate streamed microphone chunks; classify and save every ~5 seconds."""
    global prediction
    sr, y = new_chunk
    y = normalize_waveform(y)
    y = librosa.resample(y, orig_sr=sr, target_sr=TARGET_SAMPLE_RATE)
    if stream is not None:
        if (stream.shape[-1] / TARGET_SAMPLE_RATE) >= AUDIO_SECONDS_THRESHOLD:
            # Buffer is full: classify it, save it to disk, then start over.
            # Note that the incoming chunk is dropped while the buffer resets.
            prediction = pipe(stream)
            file_name = f'./audio/{time.strftime("%Y%m%d_%H%M%S", time.localtime())}.wav'
            sf.write(file_name, stream, TARGET_SAMPLE_RATE)
            print(f"SAVE AUDIO: {file_name}")
            print(f">>>>>>1\t{y.shape=}, {stream.shape=}\n\t{prediction[0]=}")
            stream = None
        else:
            # Keep accumulating chunks until the buffer reaches the threshold.
            stream = np.concatenate([stream, y], axis=-1)
            print(f">>>>>>2\t{y.shape=}, {stream.shape=}")
    else:
        # First chunk: start a new buffer.
        stream = y
        print(f">>>>>>3\t{y.shape=}, {stream.shape=}")
    return stream, {i['label']: i['score'] for i in prediction}

def microphone_fn(waveform):
    """Classify a one-shot microphone recording and save it to disk."""
    print('-' * 120)
    print(f"{waveform=}")
    sr, y = waveform
    y = normalize_waveform(y)
    y = librosa.resample(y, orig_sr=sr, target_sr=TARGET_SAMPLE_RATE)
    result = pipe(y)
    file_name = f'./audio/{time.strftime("%Y%m%d_%H%M%S", time.localtime())}.wav'
    sf.write(file_name, y, TARGET_SAMPLE_RATE)
    return {i['label']: i['score'] for i in result}

def file_fn(waveform):
    """Classify an uploaded audio file (same processing as microphone_fn)."""
    print('-' * 120)
    print(f"{waveform=}")
    sr, y = waveform
    y = normalize_waveform(y)
    y = librosa.resample(y, orig_sr=sr, target_sr=TARGET_SAMPLE_RATE)
    result = pipe(y)
    file_name = f'./audio/{time.strftime("%Y%m%d_%H%M%S", time.localtime())}.wav'
    sf.write(file_name, y, TARGET_SAMPLE_RATE)
    return {i['label']: i['score'] for i in result}

streaming_demo = gr.Interface(
    fn=streaming_recording_fn,
    inputs=["state", gr.Audio(sources=["microphone"], streaming=True)],
    outputs=["state", "label"],
    live=True,
)

microphone_demo = gr.Interface(
    fn=microphone_fn,
    inputs=[gr.Audio(sources=["microphone"], type="numpy")],
    outputs=["label"],
)

file_demo = gr.Interface(
    fn=file_fn,
    inputs=[gr.Audio(sources=["upload"], type="numpy")],
    outputs=["label"],
)

with gr.Blocks() as example:
    inputs = [gr.Audio(sources=["upload"], type="numpy")]
    output = gr.Label()
    examples = [
        ["audio/cantina.wav"],
        ["audio/cat.mp3"],
    ]
    ex = gr.Examples(
        examples,
        fn=file_fn, inputs=inputs, outputs=output,
        run_on_click=True,
    )

with gr.Blocks() as demo:
    gr.TabbedInterface([file_demo, streaming_demo, microphone_demo, example],
                       ["Audio file", "Streaming", "Microphone", "example"])

if __name__ == "__main__":
    demo.launch(share=True)
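
# A minimal sketch of exercising the pipeline outside the UI, assuming one of
# the bundled example clips is present locally:
#
#   y, _ = librosa.load("audio/cantina.wav", sr=TARGET_SAMPLE_RATE)
#   print(pipe(y)[0])  # top prediction, e.g. {"score": ..., "label": ...}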