# audio-streaming / app.py
# Hugging Face Hub page metadata (not part of the program):
#   last commit "try bytes" (bdf2bf2) by sanchit-gandhi
#   file size 2.49 kB, scan status: no virus
import gradio as gr
import math
import time
import numpy as np
from pydub import AudioSegment
import io
def numpy_to_mp3(audio_array, sampling_rate):
    """Encode a NumPy audio buffer as MP3 and return the encoded bytes.

    Args:
        audio_array: Audio samples as a NumPy array, shaped ``(samples,)``
            for mono or ``(samples, channels)`` for multi-channel audio.
            Floating-point input is peak-normalized and converted to 16-bit
            PCM; any other dtype is passed through unchanged, with its item
            size used as the sample width.
        sampling_rate: Sample rate of ``audio_array`` in Hz.

    Returns:
        The MP3-encoded audio as ``bytes``.
    """
    if np.issubdtype(audio_array.dtype, np.floating):
        # np.max raises on an empty array, so treat an empty chunk as peak 0.
        max_val = np.max(np.abs(audio_array)) if audio_array.size else 0.0
        # Only rescale when there is a non-zero peak: dividing an all-zero
        # (silent) chunk by 0 yields NaN, and casting NaN to int16 is
        # undefined. Silent input simply stays all zeros.
        if max_val > 0:
            audio_array = (audio_array / max_val) * 32767  # 16-bit range
        audio_array = audio_array.astype(np.int16)

    # A 2-D array is (samples, channels); its C-order bytes are interleaved
    # frames, which is the raw layout AudioSegment expects.
    channels = audio_array.shape[1] if audio_array.ndim == 2 else 1

    audio_segment = AudioSegment(
        audio_array.tobytes(),
        frame_rate=sampling_rate,
        sample_width=audio_array.dtype.itemsize,
        channels=channels,
    )

    # Encode into an in-memory buffer; the context manager closes it even if
    # the export fails.
    with io.BytesIO() as mp3_io:
        audio_segment.export(mp3_io, format="mp3")
        return mp3_io.getvalue()
def stream(audio, chunk_length_s):
    """Yield the input audio back chunk by chunk, as raw samples and as MP3.

    Before each chunk the generator sleeps for half the chunk's duration, so
    chunks are emitted faster than they take to play back.

    Args:
        audio: ``(sampling_rate, samples)`` tuple holding the full input
            audio, or ``None`` when no audio was provided.
        chunk_length_s: Length of each emitted chunk, in seconds.

    Yields:
        ``((sampling_rate, chunk), chunk_mp3, first_time, run_time)`` where
        ``chunk`` is the slice of samples, ``chunk_mp3`` is the same slice
        encoded as MP3 bytes, ``first_time`` is the elapsed time in seconds
        until the first chunk was ready, and ``run_time`` is the elapsed
        time in seconds until the current chunk was ready.

    Raises:
        gr.Error: If ``audio`` is ``None``.
    """
    if audio is None:
        # Unpacking None below would fail with an opaque TypeError; surface a
        # readable message in the UI instead.
        raise gr.Error("Please provide an audio file before streaming.")

    start_time = time.time()
    sampling_rate, array = audio
    chunk_length = int(chunk_length_s * sampling_rate)  # samples per chunk
    time_length = chunk_length_s / 2  # always stream outputs faster than it takes to process
    audio_length = len(array)
    num_batches = math.ceil(audio_length / chunk_length)

    for idx in range(num_batches):
        time.sleep(time_length)
        start_pos = idx * chunk_length
        # The last chunk may be shorter than chunk_length.
        end_pos = min((idx + 1) * chunk_length, audio_length)
        chunk = array[start_pos:end_pos]
        chunk_mp3 = numpy_to_mp3(chunk, sampling_rate=sampling_rate)

        if idx == 0:
            # Latency of the first chunk; re-yielded unchanged with every
            # later chunk.
            first_time = round(time.time() - start_time, 2)
        run_time = round(time.time() - start_time, 2)
        yield (sampling_rate, chunk), chunk_mp3, first_time, run_time
# Demo UI: inputs on the left, streamed outputs and timing read-outs on the
# right.
with gr.Blocks() as demo:
    with gr.Row():
        # Input column: source audio, chunk size and the trigger button.
        with gr.Column():
            audio_in = gr.Audio(
                value="librispeech.wav",
                sources=["upload"],
                type="numpy",
            )
            chunk_length = gr.Slider(
                minimum=2,
                maximum=10,
                value=2,
                step=2,
                label="Chunk length (s)",
            )
            run_button = gr.Button("Stream audio")
        # Output column: one streaming player per encoding, plus timings.
        with gr.Column():
            audio_out = gr.Audio(
                streaming=True,
                autoplay=True,
                label="wav",
            )
            audio_out_mp3 = gr.Audio(
                streaming=True,
                autoplay=True,
                format="mp3",
                label="mp3",
            )
            first_time = gr.Textbox(label="Time to first chunk (s)")
            run_time = gr.Textbox(label="Time to current chunk (s)")

    # Each tuple yielded by `stream` fills the four outputs in this order.
    run_button.click(
        fn=stream,
        inputs=[audio_in, chunk_length],
        outputs=[audio_out, audio_out_mp3, first_time, run_time],
    )

demo.launch()