Spaces:
Runtime error
Runtime error
GordonChan365
committed on
Upload 2 files
Browse files- kitt.py +158 -0
- requirements.txt +6 -0
kitt.py
ADDED
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import asyncio
|
2 |
+
import copy
|
3 |
+
import logging
|
4 |
+
from collections import deque
|
5 |
+
from typing import Annotated, List
|
6 |
+
import os
|
7 |
+
import dotenv
|
8 |
+
from livekit import agents, rtc
|
9 |
+
from livekit.agents import JobContext, JobRequest, WorkerOptions, cli
|
10 |
+
from livekit.agents.llm import (
|
11 |
+
ChatContext,
|
12 |
+
ChatMessage,
|
13 |
+
ChatRole,
|
14 |
+
)
|
15 |
+
from livekit.agents.voice_assistant import AssistantContext, VoiceAssistant
|
16 |
+
from livekit.plugins import deepgram, openai, silero
|
17 |
+
|
18 |
+
# Pull configuration from a local .env file before anything reads the
# LIVEKIT_URL / TTS_* variables below.
dotenv.load_dotenv()
print(os.environ.get("LIVEKIT_URL"))

# NOTE(review): monkeypatch of the OpenAI TTS plugin's output sample rate —
# presumably to match the configured TTS endpoint; confirm against the plugin.
openai.TTS.sample_rate = 32000

# At most this many chat messages may carry an image attachment at once.
MAX_IMAGES = 3
# Spoken fallback used when vision is requested but no frame has arrived yet.
NO_IMAGE_MESSAGE_GENERIC = (
    "I'm sorry, I don't have an image to process. Are you publishing your video?"
)
|
25 |
+
|
26 |
+
|
27 |
+
class AssistantFnc(agents.llm.FunctionContext):
    """Function registry the LLM can call during a voice session.

    Exposes a single ``image`` function that the model invokes when a user
    request needs vision.  The function does not answer by itself: it only
    records the triggering message so the ``function_calls_finished``
    handler (see ``entrypoint``) can build an image-augmented reply.
    """

    @agents.llm.ai_callable(
        desc="Called when asked to evaluate something that would require vision capabilities."
    )
    async def image(
        self,
        user_msg: Annotated[
            str,
            agents.llm.TypeInfo(desc="The user message that triggered this function"),
        ],
    ) -> None:
        # Stash the user's message on the per-call assistant context; it is
        # read back via ctx.get_metadata("user_msg") after the call finishes.
        ctx = AssistantContext.get_current()
        ctx.store_metadata("user_msg", user_msg)
|
40 |
+
|
41 |
+
|
42 |
+
async def get_human_video_track(room: rtc.Room) -> rtc.RemoteVideoTrack:
    """Return the first remote video track in *room*, waiting if none exists yet.

    Scans tracks that are already subscribed and, if none is found, waits for
    the next ``track_subscribed`` event that delivers a video track.  The
    event handler is always detached before returning, even on cancellation.
    """
    track_future = asyncio.Future[rtc.RemoteVideoTrack]()

    def on_sub(track: rtc.Track, *_):
        # Guard with done(): a second subscription (or one racing the scan
        # below) must not call set_result() twice — that would raise
        # asyncio.InvalidStateError inside the room's event callback.
        if isinstance(track, rtc.RemoteVideoTrack) and not track_future.done():
            track_future.set_result(track)

    room.on("track_subscribed", on_sub)
    try:
        # A participant may already be publishing video; prefer an existing
        # track over waiting for a new subscription event.  isinstance()
        # rejects None, so no separate null check is needed.
        for participant in room.participants.values():
            for t_pub in participant.tracks.values():
                if isinstance(t_pub.track, rtc.RemoteVideoTrack):
                    if not track_future.done():
                        track_future.set_result(t_pub.track)
                    break
        return await track_future
    finally:
        # Detach the listener even if this coroutine is cancelled, so stale
        # callbacks don't accumulate on the room across retries.
        room.off("track_subscribed", on_sub)
|
65 |
+
|
66 |
+
|
67 |
+
async def entrypoint(ctx: JobContext):
    """Run one assistant session in the job's room.

    Wires up STT/LLM/TTS, answers text chat, attaches the latest video frame
    to vision requests, then loops pulling frames until the room disconnects.
    """
    # SIP (telephone) rooms can't publish video, so they get no vision
    # function context below — audio only.
    sip = ctx.room.name.startswith("sip")
    initial_ctx = ChatContext(
        messages=[
            ChatMessage(
                role=ChatRole.SYSTEM,
                text=(
                    "You are a funny bot created by LiveKit. Your interface with users will be voice. "
                    "You should use short and concise responses, and avoiding usage of unpronouncable punctuation."
                ),
            )
        ]
    )

    gpt = openai.LLM(
        model="gpt-3.5-turbo",
    )
    # Most recent frame from the human's video track; stays None until the
    # frame loop at the bottom yields its first frame.
    latest_image: rtc.VideoFrame | None = None
    # Messages currently carrying an image attachment, oldest first; bounded
    # by MAX_IMAGES in respond_to_image below.
    img_msg_queue: deque[agents.llm.ChatMessage] = deque()
    assistant = VoiceAssistant(
        vad=silero.VAD(),
        stt=deepgram.STT(),
        llm=gpt,
        tts=openai.TTS(
            base_url=os.environ.get("TTS_URL"),
            api_key=os.environ.get("TTS_API_KEY")
        ),
        # Vision function calls are disabled for SIP callers (no video).
        fnc_ctx=None if sip else AssistantFnc(),
        chat_ctx=initial_ctx,
    )

    chat = rtc.ChatManager(ctx.room)

    async def _answer_from_text(text: str):
        # Deep-copy so this text-chat turn doesn't mutate the assistant's
        # own conversation state.
        chat_ctx = copy.deepcopy(assistant.chat_context)
        chat_ctx.messages.append(ChatMessage(role=ChatRole.USER, text=text))

        stream = await gpt.chat(chat_ctx)
        await assistant.say(stream)

    @chat.on("message_received")
    def on_chat_received(msg: rtc.ChatMessage):
        # Ignore empty chat messages.
        if not msg.message:
            return

        # Event callbacks are sync; answer asynchronously.
        asyncio.create_task(_answer_from_text(msg.message))

    async def respond_to_image(user_msg: str):
        # Answer a vision request using the most recent video frame.
        nonlocal latest_image, img_msg_queue, initial_ctx
        if not latest_image:
            await assistant.say(NO_IMAGE_MESSAGE_GENERIC)
            return

        initial_ctx.messages.append(
            agents.llm.ChatMessage(
                role=agents.llm.ChatRole.USER,
                text=user_msg,
                images=[agents.llm.ChatImage(image=latest_image)],
            )
        )
        # Bound the number of image-bearing messages: once MAX_IMAGES is
        # reached, strip the attachment off the oldest one (the message's
        # text stays in the context).
        img_msg_queue.append(initial_ctx.messages[-1])
        if len(img_msg_queue) >= MAX_IMAGES:
            msg = img_msg_queue.popleft()
            msg.images = []

        stream = await gpt.chat(initial_ctx)
        await assistant.say(stream, allow_interruptions=True)

    @assistant.on("function_calls_finished")
    def _function_calls_done(ctx: AssistantContext):
        # Fired after the LLM invoked AssistantFnc.image; the stored
        # "user_msg" metadata tells us what to describe from the frame.
        user_msg = ctx.get_metadata("user_msg")
        if not user_msg:
            return
        asyncio.ensure_future(respond_to_image(user_msg))

    assistant.start(ctx.room)

    # Brief pause before greeting — presumably to let the audio track settle;
    # TODO(review): confirm whether the 0.5 s is still needed.
    await asyncio.sleep(0.5)
    await assistant.say("Hey, how can I help you today?", allow_interruptions=True)
    # Keep capturing the newest frame while connected; if the current video
    # stream ends, wait for the next human video track and resume.
    while ctx.room.connection_state == rtc.ConnectionState.CONN_CONNECTED:
        video_track = await get_human_video_track(ctx.room)
        async for event in rtc.VideoStream(video_track):
            latest_image = event.frame
|
150 |
+
|
151 |
+
|
152 |
+
async def request_fnc(req: JobRequest) -> None:
    """Accept every incoming job request, handing the room to ``entrypoint``."""
    logging.info("received request %s", req)
    await req.accept(entrypoint)
|
155 |
+
|
156 |
+
|
157 |
+
if __name__ == "__main__":
    # Start the agent worker; request_fnc decides how each job is handled.
    cli.run_app(WorkerOptions(request_fnc))
|
requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
livekit
|
2 |
+
livekit-agents
|
3 |
+
livekit-plugins-deepgram
|
4 |
+
livekit-plugins-silero
|
5 |
+
livekit-plugins-openai
|
6 |
+
python-dotenv
|