# NOTE(review): removed non-code page-scrape artifacts that preceded this file
# (HuggingFace Spaces header, "Sleeping" status, file size, commit hashes, and
# a line-number gutter) — they were not part of the Python source.
import asyncio
from collections import deque
import os
import threading
import time
import traceback
import av
import numpy as np
import streamlit as st
from streamlit_webrtc import WebRtcMode, webrtc_streamer
import pydub
import torch
# import av
# import cv2
from sample_utils.turn import get_ice_servers
import json
from typing import List
from vosk import SetLogLevel, Model, KaldiRecognizer
SetLogLevel(-1) # mutes vosk verbosity
from dotenv import load_dotenv
load_dotenv()
webrtc_ctx = None
async def main():
system_one_audio_status = st.empty()
playing = st.checkbox("Playing", value=True)
system_one_audio_status.write("Initializing streaming")
system_one_audio_output = st.empty()
system_one_video_output = st.empty()
system_one_audio_history = []
system_one_audio_history_output = st.empty()
# Initialize resources if not already done
print("000")
system_one_audio_status.write("Initializing streaming")
if "streamlit_av_queue" not in st.session_state:
print("001")
from streamlit_av_queue import StreamlitAVQueue
st.session_state.streamlit_av_queue = StreamlitAVQueue()
if "speech_to_text_vosk" not in st.session_state:
print("002")
from speech_to_text_vosk import SpeechToTextVosk
st.session_state.speech_to_text_vosk = SpeechToTextVosk()
from chat_pipeline import ChatPipeline
if "chat_pipeline" not in st.session_state:
print("003")
# from chat_pipeline import ChatPipeline
# st.session_state.chat_pipeline = ChatPipeline()
# await st.session_state.chat_pipeline.start()
st.session_state.chat_pipeline = ChatPipeline()
await st.session_state.chat_pipeline.start()
if "debug_queue" not in st.session_state:
st.session_state.debug_queue = [
# "hello, how are you today?",
# "hmm, interesting, tell me more about that.",
]
system_one_audio_status.write("resources referecned")
print("010")
system_one_audio_status.write("Initializing webrtc_streamer")
webrtc_ctx = webrtc_streamer(
key="charles",
desired_playing_state=playing,
queued_audio_frames_callback=st.session_state.streamlit_av_queue.queued_audio_frames_callback,
queued_video_frames_callback=st.session_state.streamlit_av_queue.queued_video_frames_callback,
mode=WebRtcMode.SENDRECV,
rtc_configuration={"iceServers": get_ice_servers()},
async_processing=True,
)
if not webrtc_ctx.state.playing:
exit
system_one_audio_status.write("Initializing speech")
try:
while True:
if not webrtc_ctx.state.playing:
system_one_audio_status.write("Stopped.")
await asyncio.sleep(0.1)
continue
system_one_audio_status.write("Streaming.")
if len(st.session_state.debug_queue) > 0:
prompt = st.session_state.debug_queue.pop(0)
await st.session_state.chat_pipeline.enqueue(prompt)
sound_chunk = pydub.AudioSegment.empty()
audio_frames = st.session_state.streamlit_av_queue.get_audio_frames()
if len(audio_frames) > 0:
for audio_frame in audio_frames:
sound = pydub.AudioSegment(
data=audio_frame.to_ndarray().tobytes(),
sample_width=audio_frame.format.bytes,
frame_rate=audio_frame.sample_rate,
channels=len(audio_frame.layout.channels),
)
sound = sound.set_channels(1)
sound = sound.set_frame_rate(st.session_state.speech_to_text_vosk.get_audio_bit_rate())
sound_chunk += sound
buffer = np.array(sound_chunk.get_array_of_samples())
st.session_state.speech_to_text_vosk.add_speech_bytes(buffer.tobytes())
prompt, speaker_finished = st.session_state.speech_to_text_vosk.get_text()
if speaker_finished and len(prompt) > 0:
print(f"Prompt: {prompt}")
system_one_audio_history.append(prompt)
if len(system_one_audio_history) > 10:
system_one_audio_history = system_one_audio_history[-10:]
table_content = "| System 1 Audio History |\n| --- |\n"
table_content += "\n".join([f"| {item} |" for item in reversed(system_one_audio_history)])
system_one_audio_history_output.markdown(table_content)
await st.session_state.chat_pipeline.enqueue(prompt)
video_frames = st.session_state.streamlit_av_queue.get_video_frames()
if len(video_frames) > 0:
# for video_frame in video_frames:
# system_one_video_output.image(video_frame.to_ndarray())
pass
await asyncio.sleep(0.1)
# try:
# prompts = [
# "hello, how are you today?",
# "tell me about your shadow self?",
# "hmm, interesting, tell me more about that.",
# "wait, that is so interesting, what else?",
# ]
# for prompt in prompts:
# system_one_audio_history.append(prompt)
# if len(system_one_audio_history) > 10:
# system_one_audio_history = system_one_audio_history[-10:]
# table_content = "| System 1 Audio History |\n| --- |\n"
# table_content += "\n".join([f"| {item} |" for item in reversed(system_one_audio_history)])
# system_one_audio_history_output.markdown(table_content)
# await chat_pipeline.enqueue(prompt)
except Exception as e:
print(f"An error occurred: {e}")
traceback.print_exc()
raise e
# while True:
# if webrtc_ctx.state.playing:
# system_one_audio_status.write("Streaming.")
# else:
# system_one_audio_status.write("Stopped.")
# await asyncio.sleep(0.5)
if __name__ == "__main__":
try:
asyncio.run(main())
except Exception as e:
if webrtc_ctx is not None:
del webrtc_ctx
webrtc_ctx = None
if "streamlit_av_queue" in st.session_state:
del st.session_state.streamlit_av_queue
if "speech_to_text_vosk" in st.session_state:
del st.session_state.speech_to_text_vosk
if "chat_pipeline" in st.session_state:
del st.session_state.chat_pipeline
finally:
pass
# (removed: stray "|" page-scrape artifact at end of file)