animate charles

- agent_response.py +1 -1
- charles_actor.py +20 -2
- charles_animator.py +128 -0
- ffmpeg_converter_actor.py +2 -1
- images/charles-open.png +3 -0
- images/charles.png +3 -0
- images/zoom-background.png +3 -0
- respond_to_prompt_actor.py +2 -1
- streamlit_av_queue.py +27 -9
- tests/test_image.py +192 -0
- tests/test_talking.py +65 -0
- text_to_speech_service.py +3 -2
- webrtc_av_queue_actor.py +17 -7

agent_response.py
CHANGED
@@ -9,7 +9,7 @@ class AgentResponse(dict):
         self['llm_sentence'] = ''
         self['llm_sentence_id'] = 0
         self['llm_sentences'] = []
-        self['
+        self['tts_raw_chunk_ref'] = None
         self['tts_raw_chunk_id'] = 0

     def make_copy(self):

charles_actor.py
CHANGED
@@ -33,7 +33,8 @@ class CharlesActor:
         self._state = "000 - creating StreamlitAVQueue"
         from streamlit_av_queue import StreamlitAVQueue
         self._streamlit_av_queue = StreamlitAVQueue()
-        self._out_audio_queue = self._streamlit_av_queue.get_out_audio_queue()
+        self._out_audio_queue = await self._streamlit_av_queue.get_out_audio_queue()
+        self._out_video_queue = await self._streamlit_av_queue.get_out_video_queue()

         print("001 - create RespondToPromptActor")
         self._state = "001 - creating RespondToPromptActor"
@@ -57,6 +58,12 @@ class CharlesActor:
         self._state = "003 - creating Prototypes"
         from prototypes import Prototypes
         self._prototypes = Prototypes()
+
+        print("004 - create animator")
+        self._state = "004 - creating animator"
+        from charles_animator import CharlesAnimator
+        self._animator = CharlesAnimator()
+
         print("010")
         self._needs_init = True
         self._state = "Initialized"
@@ -184,8 +191,19 @@ class CharlesActor:


             await asyncio.sleep(0.01)
+
+            # add observations to the environment state
+            count = len(self._out_audio_queue)
+            is_talking = bool(count > 0)
+            frame = self._animator.update(is_talking)
+            if self._out_video_queue.full():
+                evicted_item = await self._out_video_queue.get_async()
+                del evicted_item
+            frame_ref = ray.put(frame)
+            await self._out_video_queue.put_async(frame_ref)
+
             loops+=1
-            self._state = f"Processed {total_video_frames} video frames and {total_audio_frames} audio frames, loops: {loops}. loops per second: {loops/(time.time()-start_time):.2f}. {vector_debug}"
+            self._state = f"Processed {total_video_frames} video frames and {total_audio_frames} audio frames, loops: {loops}. loops per second: {loops/(time.time()-start_time):.2f}. Is speaking: {is_talking}({count}). {vector_debug}"

 def init_ray():
     try:
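
The run loop above keeps only a few pending animation frames: when the bounded out_video_queue is full, the oldest entry is evicted before the new frame reference is put. A minimal standalone sketch of that "latest frames win" pattern, assuming a plain ray.util.queue.Queue and a dummy RGB frame (queue size and payload are illustrative, not taken from the commit):

import numpy as np
import ray
from ray.util.queue import Queue

ray.init(ignore_reinit_error=True)

video_queue = Queue(maxsize=10)  # small bound so consumers never fall far behind

def publish_latest(frame: np.ndarray) -> None:
    # Drop the oldest frame when the queue is full, then enqueue a reference to the new one.
    if video_queue.full():
        _evicted = video_queue.get()
    video_queue.put(ray.put(frame))

publish_latest(np.zeros((480, 640, 3), dtype=np.uint8))  # e.g. one blank 640x480 frame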

charles_animator.py
ADDED
@@ -0,0 +1,128 @@
# Modifying the code to ensure the mouth is open when the character starts talking

import random
import time
import cv2
import av
import numpy as np

def resize_and_crop(image, dim=(640, 480)):
    h, w = image.shape[:2]
    aspect_ratio = w / h

    target_width, target_height = dim
    target_aspect = target_width / target_height

    if aspect_ratio > target_aspect:
        # Original aspect is wider than target, fit by height
        new_height = target_height
        new_width = int(target_height * aspect_ratio)
    else:
        # Original aspect is taller than target, fit by width
        new_width = target_width
        new_height = int(target_width / aspect_ratio)

    # Resize the image with new dimensions
    resized_image = cv2.resize(image, (new_width, new_height), interpolation=cv2.INTER_AREA)

    # Crop to target dimensions
    x_offset = (new_width - target_width) // 2
    y_offset = (new_height - target_height) // 2

    cropped_image = resized_image[y_offset:y_offset + target_height, x_offset:x_offset + target_width]

    return cropped_image

def overlay_images(background, overlay, x, y):
    """
    Overlay an image with transparency over another image.
    """
    # Check if overlay dimensions fit within the background at the given (x, y) position
    if y + overlay.shape[0] > background.shape[0] or x + overlay.shape[1] > background.shape[1]:
        raise ValueError("Overlay dimensions exceed background dimensions at the specified position.")

    # Extract the alpha channel from the overlay and create an inverse alpha channel
    alpha = overlay[:, :, 3] / 255.0
    inverse_alpha = 1.0 - alpha

    # Convert overlay to BGR if it's in RGB
    if overlay.shape[2] == 4:  # If it has an alpha channel
        overlay = cv2.cvtColor(overlay[:, :, :3], cv2.COLOR_RGB2BGR)
        overlay = np.concatenate([overlay, overlay[:, :, 3:]], axis=2)  # Add alpha channel back
    else:
        overlay = cv2.cvtColor(overlay, cv2.COLOR_RGB2BGR)

    # Overlay the images
    for c in range(0, 3):
        background[y:overlay.shape[0]+y, x:overlay.shape[1]+x, c] = (
            alpha * overlay[:, :, c] + inverse_alpha * background[y:overlay.shape[0]+y, x:overlay.shape[1]+x, c]
        )

    return background

def create_charles_frames(background, charles_frames):
    output_frames = []
    # Load background image
    background = cv2.imread(background, cv2.COLOR_BGR2RGB)
    background = cv2.cvtColor(background, cv2.COLOR_BGR2RGB)
    # resize background to match user image
    background = resize_and_crop(background, (640, 480))

    for bot_image_path in charles_frames:
        bot_image = cv2.imread(bot_image_path, cv2.IMREAD_UNCHANGED)

        # assert bot image is square
        assert bot_image.shape[0] == bot_image.shape[1]

        # resize bot image if it is larger than backgroun impage in any direction
        if bot_image.shape[0] > background.shape[0]:
            bot_image = cv2.resize(bot_image, (background.shape[0], background.shape[0]), interpolation=cv2.INTER_AREA)

        # Overlay bot image on the right-hand side
        x_bot = background.shape[1] - bot_image.shape[1]
        y_bot = background.shape[0] - bot_image.shape[0]
        background_with_bot = overlay_images(background.copy(), bot_image, x_bot, y_bot)

        output_frames.append(background_with_bot)

    return output_frames

class CharlesAnimator:
    def __init__(self):
        self.mouth_open = False
        self.last_change_time = 0
        self.next_change_in = 0
        self.was_talking = False
        # use static frames for pefromance
        self.static_frames = create_charles_frames("./images/zoom-background.png", [
            "./images/charles.png",
            "./images/charles-open.png"
        ])

    def update(self, is_talking):
        start_talking = True if is_talking and not self.was_talking else False
        self.was_talking = is_talking
        current_time = time.time()

        # Open the mouth when the character starts talking
        if start_talking:
            self.mouth_open = True
            self.next_change_in = current_time + random.uniform(0.1, 0.5)
            return self.mouth_open

        # Initialize the next change time if it's zero.
        if self.next_change_in == 0:
            self.next_change_in = current_time + random.uniform(0.1, 0.5)

        # Update the mouth state only if the character is talking.
        if is_talking:
            # Check if it's time to change the mouth state.
            if current_time >= self.next_change_in:
                self.mouth_open = not self.mouth_open
                self.next_change_in = current_time + random.uniform(0.1, 0.5)
        else:
            # Close the mouth if the character is not talking.
            self.mouth_open = False

        frame = self.static_frames[1] if self.mouth_open else self.static_frames[0]
        return frame
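
A quick way to exercise the new class locally (this driver is not part of the commit and assumes the three PNGs referenced in __init__ exist under ./images):

# Hypothetical smoke test for CharlesAnimator, not part of the commit.
import time
from charles_animator import CharlesAnimator

animator = CharlesAnimator()
for step in range(20):
    is_talking = step % 10 < 5             # alternate roughly 1s of "talking" with 1s of silence
    frame = animator.update(is_talking)    # usually a composited RGB ndarray
    # On the tick where talking starts, update() returns the mouth state (a bool) instead of a frame.
    print(step, is_talking, getattr(frame, "shape", frame))
    time.sleep(0.1)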

ffmpeg_converter_actor.py
CHANGED
@@ -19,7 +19,8 @@ class FFMpegConverterActor:
             while True:
                 chunk = await self.output_pipe.readexactly(self.buffer_size)
                 # print(f"FFMpegConverterActor: read {len(chunk)} bytes")
-
+                chunk_ref = ray.put(chunk)
+                await self.output_queue.put_async(chunk_ref)

     async def start_process(self):
         cmd = [

images/charles-open.png
ADDED
Git LFS Details

images/charles.png
ADDED
Git LFS Details

images/zoom-background.png
ADDED
Git LFS Details

respond_to_prompt_actor.py
CHANGED
@@ -130,7 +130,8 @@ class SpeechToConverterActor:
         self.ffmpeg_converter_actor.run.remote()
         while True:
             chunk_response = await self.input_queue.get_async()
-
+            audio_chunk_ref = chunk_response['tts_raw_chunk_ref']
+            audio_chunk = ray.get(audio_chunk_ref)
             await self.ffmpeg_converter_actor.push_chunk.remote(audio_chunk)

     async def cancel(self):

streamlit_av_queue.py
CHANGED
@@ -3,9 +3,11 @@ import av
 import asyncio
 from collections import deque
 import threading
+import cv2

 import numpy as np
 import ray
+from ray.util.queue import Queue
 from webrtc_av_queue_actor import WebRtcAVQueueActor
 import pydub
 import torch
@@ -20,7 +22,8 @@ class StreamlitAVQueue:
         self.queue_actor = WebRtcAVQueueActor.options(
             name="WebRtcAVQueueActor",
             get_if_exists=True,
-            ).remote()
+            ).remote()
+        self._out_video_frame = None

     def set_looking_listening(self, looking, listening: bool):
         with self._lock:
@@ -31,18 +34,33 @@ class StreamlitAVQueue:
         self,
         frames: List[av.VideoFrame],
     ) -> av.VideoFrame:
+        updated_frames = []
         try:
             with self._lock:
                 should_look = self._looking
-
-
-
-
+            next_out_video_frame = await self.queue_actor.get_out_video_frame.remote()
+            if next_out_video_frame is not None:
+                self._out_video_frame = next_out_video_frame
+            for i, frame in enumerate(frames):
+                user_image = frame.to_ndarray(format="rgb24")
+                if should_look:
+                    shared_tensor_ref = ray.put(user_image)
                     await self.queue_actor.enqueue_in_video_frame.remote(shared_tensor_ref)
+                if self._out_video_frame is not None:
+                    frame = self._out_video_frame
+                    # resize user image to 1/4 size
+                    user_frame = cv2.resize(user_image, (user_image.shape[1]//4, user_image.shape[0]//4), interpolation=cv2.INTER_AREA)
+                    x_user = 0
+                    y_user = frame.shape[0] - user_frame.shape[0]
+                    final_frame = frame.copy()
+                    final_frame[y_user:y_user+user_frame.shape[0], x_user:x_user+user_frame.shape[1]] = user_frame
+                    frame = av.VideoFrame.from_ndarray(final_frame, format="rgb24")
+
+                updated_frames.append(frame)
                 # print (f"tesnor len: {len(shared_tensor)}, tensor shape: {shared_tensor.shape}, tensor type:{shared_tensor.dtype} tensor ref: {shared_tensor_ref}")
         except Exception as e:
             print (e)
-        return
+        return updated_frames

     async def queued_audio_frames_callback(
         self,
@@ -103,8 +121,8 @@ class StreamlitAVQueue:
         shared_tensors = await self.queue_actor.get_in_video_frames.remote()
         return shared_tensors

-    def get_out_audio_queue(self):
+    def get_out_audio_queue(self)->Queue:
         return self.queue_actor.get_out_audio_queue.remote()

-
-
+    def get_out_video_queue(self)->Queue:
+        return self.queue_actor.get_out_video_queue.remote()
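
The callback above composites a picture-in-picture view: the animated bot frame becomes the base image and the user's camera image is shrunk to a quarter of its size and pasted into the bottom-left corner before being wrapped back into an av.VideoFrame. A minimal sketch of just that compositing step on plain numpy arrays (the 640x480 shapes are illustrative):

import av
import cv2
import numpy as np

def composite_pip(bot_frame: np.ndarray, user_image: np.ndarray) -> av.VideoFrame:
    # Shrink the user's camera image to 1/4 size and paste it into the bottom-left corner.
    small = cv2.resize(user_image, (user_image.shape[1] // 4, user_image.shape[0] // 4),
                       interpolation=cv2.INTER_AREA)
    out = bot_frame.copy()
    y = out.shape[0] - small.shape[0]
    out[y:y + small.shape[0], 0:small.shape[1]] = small
    return av.VideoFrame.from_ndarray(out, format="rgb24")

pip = composite_pip(np.zeros((480, 640, 3), np.uint8), np.full((480, 640, 3), 255, np.uint8))
print(pip.width, pip.height)  # 640 480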

tests/test_image.py
ADDED
@@ -0,0 +1,192 @@
import cv2
import av
import numpy as np

def resize_aspect_fit(image, dim=(640, 480)):
    h, w = image.shape[:2]
    aspect_ratio = w / h

    target_width, target_height = dim
    target_aspect = target_width / target_height

    if aspect_ratio > target_aspect:
        # Original aspect is wider than target
        new_width = target_width
        new_height = int(target_width / aspect_ratio)
    else:
        # Original aspect is taller than target
        new_height = target_height
        new_width = int(target_height * aspect_ratio)

    resized_image = cv2.resize(image, (new_width, new_height), interpolation=cv2.INTER_AREA)
    return resized_image

def resize_and_crop(image, dim=(640, 480)):
    h, w = image.shape[:2]
    aspect_ratio = w / h

    target_width, target_height = dim
    target_aspect = target_width / target_height

    if aspect_ratio > target_aspect:
        # Original aspect is wider than target, fit by height
        new_height = target_height
        new_width = int(target_height * aspect_ratio)
    else:
        # Original aspect is taller than target, fit by width
        new_width = target_width
        new_height = int(target_width / aspect_ratio)

    # Resize the image with new dimensions
    resized_image = cv2.resize(image, (new_width, new_height), interpolation=cv2.INTER_AREA)

    # Crop to target dimensions
    x_offset = (new_width - target_width) // 2
    y_offset = (new_height - target_height) // 2

    cropped_image = resized_image[y_offset:y_offset + target_height, x_offset:x_offset + target_width]

    return cropped_image

def overlay_images(background, overlay, x, y):
    """
    Overlay an image with transparency over another image.
    """
    # Check if overlay dimensions fit within the background at the given (x, y) position
    if y + overlay.shape[0] > background.shape[0] or x + overlay.shape[1] > background.shape[1]:
        raise ValueError("Overlay dimensions exceed background dimensions at the specified position.")

    # Extract the alpha channel from the overlay and create an inverse alpha channel
    alpha = overlay[:, :, 3] / 255.0
    inverse_alpha = 1.0 - alpha

    # Convert overlay to BGR if it's in RGB
    if overlay.shape[2] == 4:  # If it has an alpha channel
        overlay = cv2.cvtColor(overlay[:, :, :3], cv2.COLOR_RGB2BGR)
        overlay = np.concatenate([overlay, overlay[:, :, 3:]], axis=2)  # Add alpha channel back
    else:
        overlay = cv2.cvtColor(overlay, cv2.COLOR_RGB2BGR)

    # Overlay the images
    for c in range(0, 3):
        background[y:overlay.shape[0]+y, x:overlay.shape[1]+x, c] = (
            alpha * overlay[:, :, c] + inverse_alpha * background[y:overlay.shape[0]+y, x:overlay.shape[1]+x, c]
        )

    return background


def transform_frame(user_frame: av.VideoFrame) -> av.VideoFrame:
    # Convert av.VideoFrame to numpy array (OpenCV format)
    user_frame_np = np.frombuffer(user_frame.planes[0], np.uint8).reshape(user_frame.height, user_frame.width, -1)

    # Load background image
    background = cv2.imread("zoom-background.png")

    # Load bot image (assuming it has an alpha channel for transparency)
    bot_image = cv2.imread("bot-image.png", cv2.IMREAD_UNCHANGED)

    # Resize background to match the user frame dimensions
    aspect_ratio = background.shape[1] / background.shape[0]
    new_h = user_frame.height
    new_w = int(new_h * aspect_ratio)
    background_resized = cv2.resize(background, (new_w, new_h))

    # Crop the background if it exceeds the user frame width
    if new_w > user_frame.width:
        crop_x1 = (new_w - user_frame.width) // 2
        crop_x2 = crop_x1 + user_frame.width
        background_resized = background_resized[:, crop_x1:crop_x2, :3]

    # Overlay bot image on the right-hand side
    x_bot = background_resized.shape[1] - bot_image.shape[1]
    y_bot = 0
    background_resized = overlay_images(background_resized, bot_image, x_bot, y_bot)

    # Overlay user's video frame in the bottom-left corner
    x_user = 0
    y_user = background_resized.shape[0] - user_frame.height
    background_resized[y_user:user_frame.height+y_user, x_user:user_frame.width+x_user, :3] = user_frame_np

    # Convert the final frame back to av.VideoFrame
    output_frame = av.VideoFrame.from_ndarray(background_resized, format="bgr24")

    return output_frame

def create_charles_frames(background, charles_frames):
    output_frames = []
    # Load background image
    background = cv2.imread(background, cv2.COLOR_BGR2RGB)
    background = cv2.cvtColor(background, cv2.COLOR_BGR2RGB)
    # resize background to match user image
    background = resize_and_crop(background, (640, 480))

    for bot_image_path in charles_frames:
        bot_image = cv2.imread(bot_image_path, cv2.IMREAD_UNCHANGED)

        # assert bot image is square
        assert bot_image.shape[0] == bot_image.shape[1]

        # resize bot image if it is larger than backgroun impage in any direction
        if bot_image.shape[0] > background.shape[0]:
            bot_image = cv2.resize(bot_image, (background.shape[0], background.shape[0]), interpolation=cv2.INTER_AREA)

        # Overlay bot image on the right-hand side
        x_bot = background.shape[1] - bot_image.shape[1]
        y_bot = background.shape[0] - bot_image.shape[0]
        background_with_bot = overlay_images(background.copy(), bot_image, x_bot, y_bot)

        output_frames.append(background_with_bot)

    return output_frames


def test_create_bot_frames():
    frames = create_charles_frames("./images/zoom-background.png", ["./images/charles.png", "./images/charles-open.png"])
    index = 0
    for frame in frames:
        final_frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
        cv2.imwrite(f"./images/charles_frame_{index}.jpg", final_frame_bgr)
        index += 1

def test_overlay():
    # Load mock user image
    user_image = cv2.imread("./prototypes/person-016.jpg", cv2.COLOR_BGR2RGB)
    user_image = cv2.cvtColor(user_image, cv2.COLOR_BGR2RGB)
    # resize to 640x480, handle that this is smaller and can be cropped
    user_image = resize_and_crop(user_image, (640, 480))

    # Load background image
    background = cv2.imread("./images/zoom-background.png", cv2.COLOR_BGR2RGB)
    background = cv2.cvtColor(background, cv2.COLOR_BGR2RGB)
    # resize background to match user image
    background = resize_and_crop(background, (user_image.shape[:2][1], user_image.shape[:2][0]))

    # Load bot image (assuming it has an alpha channel for transparency)
    bot_image = cv2.imread("./images/charles-open.png", cv2.IMREAD_UNCHANGED)

    # resize bot image if it is larger than backgroun impage in any direction
    if bot_image.shape[0] > background.shape[0]:
        bot_image = cv2.resize(bot_image, (background.shape[0], background.shape[0]), interpolation=cv2.INTER_AREA)

    # Overlay bot image on the right-hand side
    x_bot = background.shape[1] - bot_image.shape[1]
    y_bot = background.shape[0] - bot_image.shape[0]
    background_with_bot = overlay_images(background.copy(), bot_image, x_bot, y_bot)

    # Overlay user's frame in the bottom-left corner (1/3 size)
    # resize user image to 1/4 size
    user_frame = cv2.resize(user_image, (user_image.shape[1]//4, user_image.shape[0]//4), interpolation=cv2.INTER_AREA)
    x_user = 0
    y_user = background.shape[0] - user_frame.shape[0]
    final_frame = background_with_bot.copy()
    # final_frame[y_user:user_frame.shape[0]+y_user, x_user:user_frame.shape[1]+x_user, :3] = user_frame
    final_frame[y_user:y_user+user_frame.shape[0], x_user:x_user+user_frame.shape[1]] = user_frame


    # Save the final frame as JPEG
    final_frame_bgr = cv2.cvtColor(final_frame, cv2.COLOR_RGB2BGR)
    cv2.imwrite("./images/final_frame.jpg", final_frame_bgr)

test_overlay()
test_create_bot_frames()

tests/test_talking.py
ADDED
@@ -0,0 +1,65 @@
# Modifying the code to ensure the mouth is open when the character starts talking

import random
import time


class CharacterFace:
    def __init__(self):
        self.mouth_open = False
        self.last_change_time = 0
        self.next_change_in = 0

    def update(self, is_talking, start_talking=False):
        current_time = time.time()

        # Open the mouth when the character starts talking
        if start_talking:
            self.mouth_open = True
            self.next_change_in = current_time + random.uniform(0.1, 0.5)
            return self.mouth_open

        # Initialize the next change time if it's zero.
        if self.next_change_in == 0:
            self.next_change_in = current_time + random.uniform(0.1, 0.5)

        # Update the mouth state only if the character is talking.
        if is_talking:
            # Check if it's time to change the mouth state.
            if current_time >= self.next_change_in:
                self.mouth_open = not self.mouth_open
                self.next_change_in = current_time + random.uniform(0.1, 0.5)
        else:
            # Close the mouth if the character is not talking.
            self.mouth_open = False

        return self.mouth_open


def _debug_test():
    # Example usage
    face = CharacterFace()
    output = []

    # Initialize variables to control talk and pause durations
    next_talk_time = 0
    next_pause_time = 0
    is_talking = False

    # Simulate the character talking and not talking with variable durations
    for _ in range(500):  # Increase the number of iterations for a longer simulation
        current_time = time.time()
        start_talking = False

        if is_talking and current_time >= next_talk_time:
            is_talking = False
            next_pause_time = current_time + random.uniform(0.5, 3.0)

        if not is_talking and current_time >= next_pause_time:
            is_talking = True
            start_talking = True  # Set flag to open mouth at the start of talking
            next_talk_time = current_time + random.uniform(1.0, 5.0)

        mouth_open = face.update(is_talking, start_talking)
        print(f"Is Talking: {is_talking}, Mouth Open: {mouth_open}")
        time.sleep(random.uniform(0.1, 0.5))

text_to_speech_service.py
CHANGED
@@ -5,7 +5,7 @@ from elevenlabs import generate, play
 from elevenlabs import set_api_key
 from elevenlabs import generate, stream
 from agent_response import AgentResponse
-
+import ray

 class TextToSpeechService:
     def __init__(self, voice_id="Bella", model_id="eleven_monolingual_v1"):
@@ -60,7 +60,8 @@ class TextToSpeechService:

             # Run next(stream) in a separate thread to avoid blocking the event loop
             chunk = await asyncio.to_thread(next, stream)
-
+            chunk_ref = ray.put(chunk)
+            sentence_response['tts_raw_chunk_ref'] = chunk_ref
             if cancel_event.is_set():
                 return
             yield sentence_response
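
Together with the respond_to_prompt_actor.py change above, this moves raw TTS audio through the Ray object store: the producer calls ray.put() and passes only the small ObjectRef inside the response dict, and the consumer calls ray.get() where it actually needs the bytes. A minimal sketch of that handoff (the byte string is a stand-in for a real audio chunk):

import ray

ray.init(ignore_reinit_error=True)

# Producer side: store the chunk once, hand around a lightweight reference.
chunk = b"\x00\x01" * 512
response = {'tts_raw_chunk_ref': ray.put(chunk)}

# Consumer side: resolve the reference only when the bytes are needed.
audio_chunk = ray.get(response['tts_raw_chunk_ref'])
assert audio_chunk == chunk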

webrtc_av_queue_actor.py
CHANGED
@@ -8,9 +8,10 @@ import numpy as np
 @ray.remote
 class WebRtcAVQueueActor:
     def __init__(self):
-        self.in_audio_queue = Queue(maxsize=
-        self.in_video_queue = Queue(maxsize=
-        self.out_audio_queue = Queue(maxsize=
+        self.in_audio_queue = Queue(maxsize=3000)  # Adjust the size as needed
+        self.in_video_queue = Queue(maxsize=10)  # Adjust the size as needed
+        self.out_audio_queue = Queue(maxsize=3000)  # Adjust the size as needed
+        self.out_video_queue = Queue(maxsize=10)  # Adjust the size as needed


     async def enqueue_in_video_frame(self, shared_tensor_ref):
@@ -25,7 +26,6 @@ class WebRtcAVQueueActor:
             del evicted_item
         await self.in_audio_queue.put_async(shared_buffer_ref)

-
     async def get_in_audio_frames(self):
         audio_frames = []
         if self.in_audio_queue.empty():
@@ -44,11 +44,21 @@ class WebRtcAVQueueActor:
         video_frames.append(shared_tensor_ref)
         return video_frames

-    def get_out_audio_queue(self):
+    def get_out_audio_queue(self)->Queue:
         return self.out_audio_queue

+    def get_out_video_queue(self)->Queue:
+        return self.out_video_queue
+
     async def get_out_audio_frame(self):
         if self.out_audio_queue.empty():
             return None
-
-        return
+        frame = await self.out_audio_queue.get_async()
+        return frame
+
+    async def get_out_video_frame(self):
+        if self.out_video_queue.empty():
+            return None
+        while not self.out_video_queue.empty():
+            frame = await self.out_video_queue.get_async()
+        return frame
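
get_out_video_frame drains the queue and returns only the last entry, so a slow reader always gets the most recent frame rather than a backlog. A hedged usage sketch from a driver process (the unwrapping of a possible ObjectRef is an assumption, not shown in the commit):

import ray
from webrtc_av_queue_actor import WebRtcAVQueueActor

ray.init(ignore_reinit_error=True)
actor = WebRtcAVQueueActor.options(name="WebRtcAVQueueActor", get_if_exists=True).remote()

maybe_frame = ray.get(actor.get_out_video_frame.remote())  # None if nothing was produced yet
if maybe_frame is not None:
    # The producer enqueues ray.put() references, so unwrap once more if needed.
    frame = ray.get(maybe_frame) if isinstance(maybe_frame, ray.ObjectRef) else maybe_frame
    print("latest bot frame:", getattr(frame, "shape", type(frame)))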