Spaces:

sohojoe
/

project_charles

Sleeping

App Files Files Community

project_charles / app.py

sohojoe

get access to the image frames

30bac07 over 1 year ago

raw

history blame

5.24 kB

	from collections import deque
	import os
	import threading
	import time
	import av
	import numpy as np
	import streamlit as st
	from streamlit_webrtc import WebRtcMode, webrtc_streamer
	import pydub
	# import av
	# import cv2
	from sample_utils.turn import get_ice_servers
	import json
	from typing import List

	from vosk import SetLogLevel, Model, KaldiRecognizer
	SetLogLevel(-1) # mutes vosk verbosity

	from dotenv import load_dotenv
	load_dotenv()

	system_one = {
	"audio_bit_rate": 16000,
	# "audio_bit_rate": 32000,
	# "audio_bit_rate": 48000,
	}


	playing = st.checkbox("Playing", value=True)

	def load_vosk (model='small'):
	# load vosk model
	# get path of current file
	current_file_path = os.path.abspath(__file__)
	current_directory = os.path.dirname(current_file_path)
	_path = os.path.join(current_directory, 'models', 'vosk', model)
	model_voice = Model(_path)
	recognizer = KaldiRecognizer(model_voice, system_one['audio_bit_rate'])
	return recognizer

	vask = load_vosk()

	def handle_audio_frame(frame):
	# if self.vosk.AcceptWaveform(data):
	pass


	def do_work(data: bytearray) -> tuple[str, bool]:
	text = ''
	speaker_finished = False
	if vask.AcceptWaveform(data):
	result = vask.Result()
	result_json = json.loads(result)
	text = result_json['text']
	speaker_finished = True
	else:
	result = vask.PartialResult()
	result_json = json.loads(result)
	text = result_json['partial']
	return text, speaker_finished


	audio_frames_deque_lock = threading.Lock()
	audio_frames_deque: deque = deque([])

	video_frames_deque_lock = threading.Lock()
	video_frames_deque: deque = deque([])

	async def queued_video_frames_callback(
	frames: List[av.AudioFrame],
	) -> av.AudioFrame:
	with video_frames_deque_lock:
	video_frames_deque.extend(frames)
	return frames

	async def queued_audio_frames_callback(
	frames: List[av.AudioFrame],
	) -> av.AudioFrame:
	with audio_frames_deque_lock:
	audio_frames_deque.extend(frames)

	# create frames to be returned.
	new_frames = []
	for frame in frames:
	input_array = frame.to_ndarray()
	new_frame = av.AudioFrame.from_ndarray(
	np.zeros(input_array.shape, dtype=input_array.dtype),
	layout=frame.layout.name,
	)
	new_frame.sample_rate = frame.sample_rate
	new_frames.append(new_frame)

	# TODO: replace with the audio we want to send to the other side.

	return new_frames

	webrtc_ctx = webrtc_streamer(
	key="charles",
	desired_playing_state=playing,
	# audio_receiver_size=4096,
	queued_audio_frames_callback=queued_audio_frames_callback,
	queued_video_frames_callback=queued_video_frames_callback,
	mode=WebRtcMode.SENDRECV,
	rtc_configuration={"iceServers": get_ice_servers()},
	async_processing=True,
	)

	system_one_audio_status = st.empty()

	if not webrtc_ctx.state.playing:
	exit

	system_one_audio_status.write("Initializing...")
	system_one_audio_output = st.empty()
	system_one_audio_history = []
	system_one_audio_history_output = st.empty()


	sound_chunk = pydub.AudioSegment.empty()
	while True:
	if webrtc_ctx.state.playing:
	# handle video
	video_frames = []
	with video_frames_deque_lock:
	while len(video_frames_deque) > 0:
	frame = video_frames_deque.popleft()
	video_frames.append(frame)

	# handle audio
	audio_frames = []
	with audio_frames_deque_lock:
	while len(audio_frames_deque) > 0:
	frame = audio_frames_deque.popleft()
	audio_frames.append(frame)

	if len(audio_frames) == 0:
	time.sleep(0.1)
	system_one_audio_status.write("No frame arrived.")
	continue

	system_one_audio_status.write("Running. Say something!")

	for audio_frame in audio_frames:
	sound = pydub.AudioSegment(
	data=audio_frame.to_ndarray().tobytes(),
	sample_width=audio_frame.format.bytes,
	frame_rate=audio_frame.sample_rate,
	channels=len(audio_frame.layout.channels),
	)
	sound = sound.set_channels(1)
	sound = sound.set_frame_rate(system_one['audio_bit_rate'])
	sound_chunk += sound

	if len(sound_chunk) > 0:
	buffer = np.array(sound_chunk.get_array_of_samples())
	text, speaker_finished = do_work(buffer.tobytes())
	system_one_audio_output.markdown(f"System 1 Audio: {text}")
	if speaker_finished and len(text) > 0:
	system_one_audio_history.append(text)
	if len(system_one_audio_history) > 10:
	system_one_audio_history = system_one_audio_history[-10:]
	table_content = "\| System 1 Audio History \|\n\| --- \|\n"
	table_content += "\n".join([f"\| {item} \|" for item in reversed(system_one_audio_history)])
	system_one_audio_history_output.markdown(table_content)
	sound_chunk = pydub.AudioSegment.empty()

	else:
	system_one_audio_status.write("Stopped.")
	break