import io
import logging
import pickle
import traceback

import av
import numpy as np
import pydub
import streamlit as st
from dotenv import load_dotenv
from streamlit_webrtc import WebRtcMode, webrtc_streamer

# Load environment variables before importing modules that read them.
load_dotenv()

from sample_utils.turn import get_ice_servers

logger = logging.getLogger(__name__)


class StreamingMP3ToFrames:
    """Incrementally decodes a stream of MP3 byte chunks into av.AudioFrame objects."""

    def __init__(self):
        self.append = False
        self.bytes_io = None

    def process_chunk(self, chunk):
        audio_frames = []
        try:
            if self.append:
                # The previous chunk ended mid-frame; append the new bytes and retry.
                self.bytes_io.write(chunk)
                self.append = False
                self.bytes_io.seek(0)
            else:
                self.bytes_io = io.BytesIO(chunk)

            container = av.open(self.bytes_io, "r", format="mp3")
            audio_stream = next(s for s in container.streams if s.type == "audio")

            for frame in container.decode(audio_stream):
                # Convert the decoded audio frame to a NumPy array,
                # then rebuild it as a planar-float mono AudioFrame.
                array = frame.to_ndarray()
                audio_frame = av.AudioFrame.from_ndarray(array, format="fltp", layout="mono")
                audio_frame.sample_rate = 44100
                audio_frames.append(audio_frame)

            return audio_frames
        except Exception as e:
            # Decoding failed (likely a truncated frame): keep the buffered bytes
            # and append the next chunk before decoding again.
            print(e)
            self.append = True
            self.bytes_io.seek(0, io.SEEK_END)
            return audio_frames


def video_frame_callback(frame: av.VideoFrame) -> av.VideoFrame:
    # Pass video frames through unchanged.
    return frame


streaming_mp3_to_frames = StreamingMP3ToFrames()

# Load pre-recorded MP3 chunks and decode them into frames for debugging.
with open("chunks.pkl", "rb") as f:
    debug_chunks = pickle.load(f)

debug_frames = []
debug_frame_idx = 0
for chunk in debug_chunks:
    new_frames = streaming_mp3_to_frames.process_chunk(chunk)
    debug_frames.extend(new_frames)


def dequeue_frame():
    """Return the next decoded debug frame, wrapping around at the end."""
    global debug_frame_idx, debug_frames
    enqueued_frame = debug_frames[debug_frame_idx]
    debug_frame_idx += 1
    if debug_frame_idx >= len(debug_frames):
        debug_frame_idx = 0
    return enqueued_frame


# Empty int16 sample buffer, carried over between audio callbacks.
sample_buffer = np.zeros((0,), dtype=np.int16)


def process_frame(old_frame):
    try:
        output_channels = 2
        output_sample_rate = 44100
        required_samples = old_frame.samples

        global sample_buffer
        while sample_buffer.shape[0] < required_samples:
            dequeued_frame = dequeue_frame()
            if dequeued_frame is None:
                break

            # Convert the dequeued frame to int16 at the incoming frame's sample rate.
            float_samples = dequeued_frame.to_ndarray()
            max_sample = np.max(np.abs(float_samples))
            min_sample = np.min(np.abs(float_samples))
            if max_sample > 1.0 or min_sample > 1.0:
                print(f"WARNING: max_sample: {max_sample}, min_sample: {min_sample}")

            int_samples = np.int16(float_samples * 32767)
            sound = pydub.AudioSegment(
                data=int_samples.tobytes(),
                sample_width=2,
                frame_rate=output_sample_rate,
                channels=len(dequeued_frame.layout.channels),
            )
            sound = sound.set_frame_rate(old_frame.sample_rate)
            samples = np.array(sound.get_array_of_samples(), dtype=np.int16)
            sample_buffer = np.append(sample_buffer, samples)

        # If we ran out of frames, pad with silence.
        if sample_buffer.shape[0] < required_samples:
            empty_samples = np.zeros(
                (required_samples - sample_buffer.shape[0],), dtype=np.int16
            )
            sample_buffer = np.append(sample_buffer, empty_samples)

        # Take the first required_samples samples from the buffer.
        samples = sample_buffer[:required_samples]
        sample_buffer = sample_buffer[required_samples:]

        # Duplicate the mono channel to produce interleaved stereo.
        if output_channels == 2:
            samples = np.vstack((samples, samples)).reshape((-1,), order="F")

        samples = samples.reshape(1, -1)

        layout = "stereo" if output_channels == 2 else "mono"
        new_frame = av.AudioFrame.from_ndarray(samples, format="s16", layout=layout)
        new_frame.sample_rate = old_frame.sample_rate
        new_frame.pts = old_frame.pts
        return new_frame
    except Exception as e:
        print(e)
        traceback.print_exc()
        raise


def audio_frame_callback(old_frame: av.AudioFrame) -> av.AudioFrame:
    new_frame = process_frame(old_frame)
    print(f"frame: {old_frame}, pts: {old_frame.pts}")
    print(f"new_frame: {new_frame}, pts: {new_frame.pts}")
    return new_frame


webrtc_streamer(
    key="delay",
    mode=WebRtcMode.SENDRECV,
    rtc_configuration={"iceServers": get_ice_servers()},
    video_frame_callback=video_frame_callback,
    audio_frame_callback=audio_frame_callback,
)