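# Streamlit + streamlit-webrtc demo: stream microphone audio from the browser
# over WebRTC and transcribe it locally with a vosk speech recognizer.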
from collections import deque
import json
import os
import threading
import time
from typing import List

import av
import numpy as np
import pydub
import streamlit as st
from dotenv import load_dotenv
from streamlit_webrtc import WebRtcMode, webrtc_streamer
from vosk import KaldiRecognizer, Model, SetLogLevel

from sample_utils.turn import get_ice_servers

SetLogLevel(-1)  # mute vosk's verbose logging
load_dotenv()
system_one = {
    # Sample rate in Hz fed to the vosk recognizer and the pydub resampler.
    # (Previously named "audio_bit_rate", but 16000 here is a sample rate.)
    "audio_sample_rate": 16000,
    # "audio_sample_rate": 32000,
    # "audio_sample_rate": 48000,
}
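# Vosk's small models typically expect 16 kHz mono input; if you swap in a
# different model, match this rate to what that model was trained on.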
playing = st.checkbox("Playing", value=True)
def load_vosk(model='small'):
    # Resolve the vosk model directory relative to this file.
    current_file_path = os.path.abspath(__file__)
    current_directory = os.path.dirname(current_file_path)
    _path = os.path.join(current_directory, 'models', 'vosk', model)
    model_voice = Model(_path)
    recognizer = KaldiRecognizer(model_voice, system_one['audio_sample_rate'])
    return recognizer
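# KaldiRecognizer consumes raw 16-bit little-endian mono PCM bytes via
# AcceptWaveform(), which returns True once it decides an utterance has ended.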
vosk_recognizer = load_vosk()
def handle_audio_frame(frame):
    # Unused placeholder; recognition happens in do_work() below.
    pass
def do_work(data: bytearray) -> tuple[str, bool]:
    text = ''
    speaker_finished = False
    if vosk_recognizer.AcceptWaveform(data):
        # End of utterance detected: a final result is available.
        result = vosk_recognizer.Result()
        result_json = json.loads(result)
        text = result_json['text']
        speaker_finished = True
    else:
        # Still mid-utterance: only a partial hypothesis is available.
        result = vosk_recognizer.PartialResult()
        result_json = json.loads(result)
        text = result_json['partial']
    return text, speaker_finished
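# The frame callbacks below run on streamlit-webrtc's worker thread, not on
# the Streamlit script thread, so the shared deques are guarded by locks.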
audio_frames_deque_lock = threading.Lock()
audio_frames_deque: deque = deque([])

video_frames_deque_lock = threading.Lock()
video_frames_deque: deque = deque([])
async def queued_video_frames_callback(
    frames: List[av.VideoFrame],
) -> List[av.VideoFrame]:
    with video_frames_deque_lock:
        video_frames_deque.extend(frames)
    return frames
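# Video frames are queued for symmetry with the audio path; the main loop
# below drains them but does not process them yet.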
async def queued_audio_frames_callback(
    frames: List[av.AudioFrame],
) -> List[av.AudioFrame]:
    with audio_frames_deque_lock:
        audio_frames_deque.extend(frames)

    # Return silent frames of the same shape so the browser hears nothing
    # while we listen.
    new_frames = []
    for frame in frames:
        input_array = frame.to_ndarray()
        new_frame = av.AudioFrame.from_ndarray(
            np.zeros(input_array.shape, dtype=input_array.dtype),
            layout=frame.layout.name,
        )
        new_frame.sample_rate = frame.sample_rate
        new_frames.append(new_frame)

    # TODO: replace with the audio we want to send to the other side.
    return new_frames
webrtc_ctx = webrtc_streamer(
    key="charles",
    desired_playing_state=playing,
    # audio_receiver_size=4096,
    queued_audio_frames_callback=queued_audio_frames_callback,
    queued_video_frames_callback=queued_video_frames_callback,
    mode=WebRtcMode.SENDRECV,
    rtc_configuration={"iceServers": get_ice_servers()},
    async_processing=True,
)
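# SENDRECV streams the browser's mic/camera to this app and sends our frames
# (currently silence) back; get_ice_servers() supplies the ICE server
# configuration needed to connect across NATs.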
system_one_audio_status = st.empty()

if not webrtc_ctx.state.playing:
    st.stop()  # bare `exit` was a no-op; st.stop() actually halts the script

system_one_audio_status.write("Initializing...")
system_one_audio_output = st.empty()
system_one_audio_history = []
system_one_audio_history_output = st.empty()
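# sound_chunk accumulates resampled audio across loop iterations until vosk
# reports the speaker has finished, at which point it is reset.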
sound_chunk = pydub.AudioSegment.empty()
while True:
    if webrtc_ctx.state.playing:
        # Drain any queued video frames.
        video_frames = []
        with video_frames_deque_lock:
            while len(video_frames_deque) > 0:
                frame = video_frames_deque.popleft()
                video_frames.append(frame)

        # Drain any queued audio frames.
        audio_frames = []
        with audio_frames_deque_lock:
            while len(audio_frames_deque) > 0:
                frame = audio_frames_deque.popleft()
                audio_frames.append(frame)

        if len(audio_frames) == 0:
            time.sleep(0.1)
            system_one_audio_status.write("No frame arrived.")
            continue
system_one_audio_status.write("Running. Say something!") | |
for audio_frame in audio_frames: | |
sound = pydub.AudioSegment( | |
data=audio_frame.to_ndarray().tobytes(), | |
sample_width=audio_frame.format.bytes, | |
frame_rate=audio_frame.sample_rate, | |
channels=len(audio_frame.layout.channels), | |
) | |
sound = sound.set_channels(1) | |
sound = sound.set_frame_rate(system_one['audio_bit_rate']) | |
sound_chunk += sound | |
        if len(sound_chunk) > 0:
            buffer = np.array(sound_chunk.get_array_of_samples())
            text, speaker_finished = do_work(buffer.tobytes())
            system_one_audio_output.markdown(f"**System 1 Audio:** {text}")

            if speaker_finished and len(text) > 0:
                system_one_audio_history.append(text)
                # Keep only the ten most recent utterances.
                if len(system_one_audio_history) > 10:
                    system_one_audio_history = system_one_audio_history[-10:]
                table_content = "| System 1 Audio History |\n| --- |\n"
                table_content += "\n".join([f"| {item} |" for item in reversed(system_one_audio_history)])
                system_one_audio_history_output.markdown(table_content)
                # Start a fresh buffer once the utterance is complete.
                sound_chunk = pydub.AudioSegment.empty()
    else:
        system_one_audio_status.write("Stopped.")
        break