GordonChan365 committed on
Commit a63ed2c · verified · 1 Parent(s): c39ffe7

Upload 2 files

Files changed (2)
  1. kitt.py +158 -0
  2. requirements.txt +6 -0
kitt.py ADDED
@@ -0,0 +1,158 @@
+import asyncio
+import copy
+import logging
+import os
+from collections import deque
+from typing import Annotated, List
+
+import dotenv
+from livekit import agents, rtc
+from livekit.agents import JobContext, JobRequest, WorkerOptions, cli
+from livekit.agents.llm import (
+    ChatContext,
+    ChatMessage,
+    ChatRole,
+)
+from livekit.agents.voice_assistant import AssistantContext, VoiceAssistant
+from livekit.plugins import deepgram, openai, silero
+
+dotenv.load_dotenv()
+print(os.environ.get("LIVEKIT_URL"))  # sanity check that the .env file loaded
+# Monkey-patch the plugin's TTS sample rate to match the custom TTS endpoint.
+setattr(openai.TTS, "sample_rate", 32000)
+
+MAX_IMAGES = 3
+NO_IMAGE_MESSAGE_GENERIC = (
+    "I'm sorry, I don't have an image to process. Are you publishing your video?"
+)
+
+
+class AssistantFnc(agents.llm.FunctionContext):
+    @agents.llm.ai_callable(
+        desc="Called when asked to evaluate something that would require vision capabilities."
+    )
+    async def image(
+        self,
+        user_msg: Annotated[
+            str,
+            agents.llm.TypeInfo(desc="The user message that triggered this function"),
+        ],
+    ):
+        # Stash the triggering message; respond_to_image() picks it up after
+        # the function call finishes.
+        ctx = AssistantContext.get_current()
+        ctx.store_metadata("user_msg", user_msg)
+
+
+async def get_human_video_track(room: rtc.Room):
+    track_future = asyncio.Future[rtc.RemoteVideoTrack]()
+
+    def on_sub(track: rtc.Track, *_):
+        # Guard against resolving the future twice (a track may already have
+        # been picked up from the existing participants below).
+        if isinstance(track, rtc.RemoteVideoTrack) and not track_future.done():
+            track_future.set_result(track)
+
+    room.on("track_subscribed", on_sub)
+
+    # Also check tracks that were subscribed before the handler was attached.
+    remote_video_tracks: List[rtc.RemoteVideoTrack] = []
+    for _, p in room.participants.items():
+        for _, t_pub in p.tracks.items():
+            if t_pub.track is not None and isinstance(
+                t_pub.track, rtc.RemoteVideoTrack
+            ):
+                remote_video_tracks.append(t_pub.track)
+
+    if len(remote_video_tracks) > 0 and not track_future.done():
+        track_future.set_result(remote_video_tracks[0])
+
+    video_track = await track_future
+    room.off("track_subscribed", on_sub)
+    return video_track
+
+
+async def entrypoint(ctx: JobContext):
+    # SIP (phone) callers have no video, so skip the vision function context.
+    sip = ctx.room.name.startswith("sip")
+    initial_ctx = ChatContext(
+        messages=[
+            ChatMessage(
+                role=ChatRole.SYSTEM,
+                text=(
+                    "You are a funny bot created by LiveKit. Your interface with users will be voice. "
+                    "You should use short and concise responses, and avoid unpronounceable punctuation."
+                ),
+            )
+        ]
+    )
+
+    gpt = openai.LLM(
+        model="gpt-3.5-turbo",
+    )
+    latest_image: rtc.VideoFrame | None = None
+    img_msg_queue: deque[agents.llm.ChatMessage] = deque()
+    assistant = VoiceAssistant(
+        vad=silero.VAD(),
+        stt=deepgram.STT(),
+        llm=gpt,
+        tts=openai.TTS(
+            base_url=os.environ.get("TTS_URL"),
+            api_key=os.environ.get("TTS_API_KEY"),
+        ),
+        fnc_ctx=None if sip else AssistantFnc(),
+        chat_ctx=initial_ctx,
+    )
+
+    chat = rtc.ChatManager(ctx.room)
+
+    async def _answer_from_text(text: str):
+        chat_ctx = copy.deepcopy(assistant.chat_context)
+        chat_ctx.messages.append(ChatMessage(role=ChatRole.USER, text=text))
+
+        stream = await gpt.chat(chat_ctx)
+        await assistant.say(stream)
+
+    @chat.on("message_received")
+    def on_chat_received(msg: rtc.ChatMessage):
+        if not msg.message:
+            return
+
+        asyncio.create_task(_answer_from_text(msg.message))
+
+    async def respond_to_image(user_msg: str):
+        nonlocal latest_image
+        if not latest_image:
+            await assistant.say(NO_IMAGE_MESSAGE_GENERIC)
+            return
+
+        initial_ctx.messages.append(
+            agents.llm.ChatMessage(
+                role=agents.llm.ChatRole.USER,
+                text=user_msg,
+                images=[agents.llm.ChatImage(image=latest_image)],
+            )
+        )
+        # Keep at most MAX_IMAGES image-bearing messages in the context: once
+        # the window fills, the oldest message keeps its text but drops its
+        # images to bound token usage.
+        img_msg_queue.append(initial_ctx.messages[-1])
+        if len(img_msg_queue) >= MAX_IMAGES:
+            msg = img_msg_queue.popleft()
+            msg.images = []
+
+        stream = await gpt.chat(initial_ctx)
+        await assistant.say(stream, allow_interruptions=True)
+
+    @assistant.on("function_calls_finished")
+    def _function_calls_done(ctx: AssistantContext):
+        user_msg = ctx.get_metadata("user_msg")
+        if not user_msg:
+            return
+        asyncio.ensure_future(respond_to_image(user_msg))
+
+    assistant.start(ctx.room)
+
+    await asyncio.sleep(0.5)
+    await assistant.say("Hey, how can I help you today?", allow_interruptions=True)
+    # Keep latest_image pointed at the newest frame from the user's camera.
+    while ctx.room.connection_state == rtc.ConnectionState.CONN_CONNECTED:
+        video_track = await get_human_video_track(ctx.room)
+        async for event in rtc.VideoStream(video_track):
+            latest_image = event.frame
+
+
+async def request_fnc(req: JobRequest) -> None:
+    logging.info("received request %s", req)
+    await req.accept(entrypoint)
+
+
+if __name__ == "__main__":
+    cli.run_app(WorkerOptions(request_fnc))
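
The image-window pruning in respond_to_image is the easiest part of the file to miss, so here is a minimal, dependency-free sketch of the same sliding-window idea. The Message class is a hypothetical stand-in for agents.llm.ChatMessage, not the real type:

from collections import deque
from dataclasses import dataclass, field
from typing import Deque, List

MAX_IMAGES = 3  # same cap as kitt.py


@dataclass
class Message:
    # Hypothetical stand-in for agents.llm.ChatMessage.
    text: str
    images: List[str] = field(default_factory=list)


history: List[Message] = []
img_window: Deque[Message] = deque()


def add_image_message(text: str, image: str) -> None:
    # Append an image-bearing message, then drop the images (but keep the
    # text) of the oldest one once the window reaches the cap.
    history.append(Message(text=text, images=[image]))
    img_window.append(history[-1])
    if len(img_window) >= MAX_IMAGES:
        oldest = img_window.popleft()
        oldest.images = []


for i in range(5):
    add_image_message(f"what do you see? ({i})", f"frame-{i}")

print([len(m.images) for m in history])  # -> [0, 0, 0, 1, 1]

Note that because the check uses >=, at most MAX_IMAGES - 1 messages actually keep their images at any time; switching the comparison to > would retain one more.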
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ livekit
+ livekit-agents
+ livekit-plugins-deepgram
+ livekit-plugins-silero
+ livekit-plugins-openai
+ python-dotenv
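
Since kitt.py pulls its configuration from a .env file via python-dotenv, a small preflight check can save a confusing startup failure. A hedged sketch: only LIVEKIT_URL, TTS_URL, and TTS_API_KEY are read explicitly in kitt.py; the remaining names are assumptions based on the usual defaults of livekit-agents and the Deepgram/OpenAI plugins:

# preflight.py -- hypothetical helper, run before starting the worker.
import os
import sys

import dotenv

dotenv.load_dotenv()  # mirror kitt.py's config loading

REQUIRED = [
    "LIVEKIT_URL",         # read directly in kitt.py
    "LIVEKIT_API_KEY",     # assumed: standard livekit-agents worker auth
    "LIVEKIT_API_SECRET",  # assumed: standard livekit-agents worker auth
    "DEEPGRAM_API_KEY",    # assumed: default for livekit-plugins-deepgram
    "OPENAI_API_KEY",      # assumed: default for livekit-plugins-openai
    "TTS_URL",             # read directly in kitt.py (custom TTS endpoint)
    "TTS_API_KEY",         # read directly in kitt.py
]

missing = [name for name in REQUIRED if not os.environ.get(name)]
if missing:
    sys.exit("missing environment variables: " + ", ".join(missing))
print("environment looks complete")

With the dependencies installed (pip install -r requirements.txt) and the variables in place, the worker is launched through the livekit-agents CLI wired up at the bottom of kitt.py (typically python kitt.py start).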