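"""Perceptual Copilot - a Gradio + fastrtc demo.

Streams webcam frames to a vision agent and surfaces its responses in a chat
UI. One agent/memory instance is kept per WebRTC session.
"""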

from pathlib import Path
import os

import cv2
import gradio as gr
from fastrtc import (
    AdditionalOutputs,
    WebRTC,
    get_cloudflare_turn_credentials,
    get_current_context,
)

from app.agent import build_agent
from app.config import env
from app.memory import Memory, Message
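
# One Memory (agent + chat history) per session, keyed by the stream's webrtc_id.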
session_memories = {}


def get_session_memory(session_id: str | None = None) -> Memory:
    if session_id not in session_memories:
        session_memories[session_id] = Memory(build_agent())
        welcome_message = "Now I can see. Feel free to ask me about anything!"
        session_memories[session_id].chat.append(Message.assistant(welcome_message))
    return session_memories[session_id]
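

# Invoked by fastrtc for every incoming webcam frame. The frame is queued for
# the session's agent; any status update the agent emits is shown in the chat
# as a tool message, replacing the previous one while it is still pending.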
def video_handler(frame):
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    rtc_id = get_current_context().webrtc_id
    mem = get_session_memory(rtc_id)
    if (s := mem.enqueue(frame)):
        if mem.chat.history[-1].metadata.get('status') == 'pending':
            mem.chat.history[-1] = Message.tool(s.gr, title=s.sender, status=s.status)
        else:
            mem.chat.append(Message.tool(s.gr, title=s.sender, status=s.status))
    return frame, AdditionalOutputs(mem.chat.messages, rtc_id)
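

# Handles text submitted from the chat box. A message can only be processed
# once the camera stream has started, because the WebRTC session id doubles
# as the chat session key.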
def chat_handler(text, webrtc_state):
    if webrtc_state is None:
        return "", [{"role": "assistant", "content": "Please start your camera first to begin the conversation."}], webrtc_state
    mem = get_session_memory(webrtc_state)
    if not mem.is_running:
        mem.receive(text.strip())
    return "", mem.chat.messages, webrtc_state
if __name__ == "__main__":
    print("Starting Perceptual Copilot...")
    print(f"HF Spaces: {os.getenv('SPACE_ID') is not None}")
    print(f"Environment check - API_KEY: {'✅' if env.api_key else '❌'}")
    print(f"Environment check - END_LANG: {'✅' if env.end_lang else '❌'}")
    print(f"Environment check - OpenAI Client: {'✅' if env.client else '❌'}")

    with gr.Blocks(
        title="🤖 Perceptual Copilot - AI Vision Assistant",
        theme=gr.themes.Soft(
            primary_hue="blue",
            secondary_hue="orange",
            neutral_hue="slate",
            font=("system-ui", "sans-serif"),
        ),
        css=Path("styles.css").read_text(),
    ) as demo:
        # Header section with sleek styling
        gr.Markdown("""
        <div class="ultra-sleek-header">
            <h1 class="hero-title">
                <span class="title-primary">Perceptual</span>
                <span class="title-accent">Copilot</span>
            </h1>
            <p class="hero-subtitle">
                <span class="status-dot"></span>
                An experimental prototype that integrates OpenAI agents with visual tools to process real-time video streams.
            </p>
            <div class="feature-pills">
                <span class="pill">Real-time streaming</span>
                <span class="pill">Visual agent</span>
                <span class="pill">Large vision-language model</span>
                <span class="pill">Reasoning</span>
            </div>
        </div>
        """, elem_classes="ultra-sleek-header")
        state = gr.State(value=None)

        # Main interface with improved layout
        with gr.Row(equal_height=True):
            with gr.Column(scale=1, elem_classes="video-container"):
                video = WebRTC(
                    label="🎥 Camera Stream",
                    rtc_configuration=get_cloudflare_turn_credentials(hf_token=env.hf_token),
                    track_constraints={
                        "width": {"exact": 600},
                        "height": {"exact": 600},
                        "aspectRatio": {"exact": 1},
                    },
                    mode="send",
                    modality="video",
                    mirror_webcam=True,
                    width=600,
                    height=600,
                )
with gr.Column(scale=1, elem_classes="chat-container"):
gr.Markdown("### π¬ Chat")
chatbot = gr.Chatbot(
type="messages",
height=450,
label="π€ AI Assistant",
placeholder="Chat history will appear here...",
show_label=False,
)
with gr.Row(elem_classes="items-center"):
textbox = gr.Textbox(
placeholder="π Question goes here, press ENTER to send",
lines=1,
show_label=False,
)

        # Event handlers
        video.stream(
            fn=video_handler,
            inputs=[video],
            outputs=[video],
            concurrency_limit=10,
        )
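
        # Relay the AdditionalOutputs emitted by video_handler (chat messages
        # and the webrtc_id) into the chatbot and the session-id state.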
        video.on_additional_outputs(
            fn=lambda messages, webrtc_id: (messages, webrtc_id),
            outputs=[chatbot, state],
        )

        # Chat handler for textbox
        textbox.submit(
            chat_handler,
            inputs=[textbox, state],
            outputs=[textbox, chatbot, state],
        )

        # Enhanced instructions section
        with gr.Column(elem_classes="instructions-container"):
            gr.Markdown("""
            ## Get Started

            **Quick Reminder:**
            1. Allow camera access when prompted
            2. Wait for the camera to initialize and the first message to appear
            3. 💡 **Tip:** If you find it hard to see the interface, turn off night mode for better visibility
            """)
            with gr.Row():
                with gr.Column():
                    gr.Markdown("""
                    ### 💡 Example Prompts

                    **General Vision:**
                    - *"What do you see in front of me?"*
                    - *"What's the overall environment like?"*

                    **Text & Documents:**
                    - *"Read the text in this document"*
                    - *"Extract the code snippet from this image"*

                    **Object Recognition:**
                    - *"What objects are visible?"*
                    - *"Help me identify this item"*
                    """)
                with gr.Column():
                    gr.Markdown("""
                    ### 🔧 Current Capabilities

                    **Available Features:**
                    - **OCR** - Text extraction and reading
                    - **Q&A** - Visual question answering
                    - **Caption** - Scene description and analysis
                    - **Localization** - Object detection and positioning
                    - **Time** - Current time and temporal context

                    **More Coming Soon:**
                    We're continuously adding new capabilities to enhance your visual AI experience.

                    **⚠️ Important Note:**
                    All models are self-hosted. Please avoid abusing the system.
                    """)
    demo.queue(default_concurrency_limit=None)
    demo.launch()