# PierreBrunelle's picture
# Update app.py
# dd1bdb6 verified
import gradio as gr
import pixeltable as pxt
from pixeltable.iterators import DocumentSplitter, FrameIterator, StringSplitter
from pixeltable.functions.huggingface import sentence_transformer, clip_image, clip_text
from pixeltable.functions.video import extract_audio
from pixeltable.functions.audio import get_metadata
from pixeltable.functions import openai
import numpy as np
import PIL.Image
import os
import getpass
import requests
import tempfile
from datetime import datetime
# Configuration
# Directory passed to Gradio's `allowed_paths` when the app is launched.
PIXELTABLE_MEDIA_DIR: str = os.path.expanduser("~/.pixeltable/media")
# NOTE(review): MAX_TOKENS_DEFAULT and TEMPERATURE_DEFAULT are not referenced in
# the visible code -- presumably intended as LLM generation defaults; confirm.
MAX_TOKENS_DEFAULT: int = 300
TEMPERATURE_DEFAULT: float = 0.7
# Initial value of the "Chunk Size" slider in the Document Chat tab.
CHUNK_SIZE_DEFAULT: int = 300
# Initialize API keys
def init_api_keys():
    """Ensure ``OPENAI_API_KEY`` is present in the process environment.

    When the variable is already set nothing happens; otherwise the key is
    read from the console with ``getpass`` and stored in ``os.environ``.
    """
    if 'OPENAI_API_KEY' in os.environ:
        return
    os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API key:')
# Embedding Functions
@pxt.expr_udf
def e5_embed(text: str) -> np.ndarray:
    """Embed ``text`` with the ``intfloat/e5-large-v2`` sentence-transformer model.

    Passed as ``string_embed`` to the text embedding indexes in this file.
    """
    embedding = sentence_transformer(text, model_id='intfloat/e5-large-v2')
    return embedding
@pxt.expr_udf
def embed_image(img: PIL.Image.Image):
    """Embed an image with the ``openai/clip-vit-base-patch32`` CLIP model.

    Passed as ``image_embed`` to the frame embedding index in this file.
    """
    embedding = clip_image(img, model_id='openai/clip-vit-base-patch32')
    return embedding
@pxt.expr_udf
def str_embed(s: str):
    """Embed a text string with the ``openai/clip-vit-base-patch32`` CLIP model.

    Uses the same model id as ``embed_image``; passed as ``string_embed`` to
    the frame embedding index in this file.
    """
    embedding = clip_text(s, model_id='openai/clip-vit-base-patch32')
    return embedding
# Common Utilities
def initialize_pixeltable(dir_name='unified_app'):
    """Reset the Pixeltable directory used by the app.

    Drops ``dir_name`` (with ``force=True``) and then creates it again, so
    callers start from an empty directory.

    Args:
        dir_name: Name of the Pixeltable directory to recreate.
    """
    # Drop first, then recreate: the two calls must stay in this order.
    pxt.drop_dir(dir_name, force=True)
    pxt.create_dir(dir_name)
@pxt.udf
def create_prompt(top_k_list: list[dict], question: str) -> str:
    """Format retrieved passages and a question into one prompt string.

    The ``'text'`` value of every entry in ``top_k_list`` is taken in reverse
    list order and the passages are separated by blank lines. They appear
    under a ``PASSAGES:`` heading, followed by ``QUESTION:`` and the question.
    """
    passages = [entry['text'] for entry in reversed(top_k_list)]
    concat_top_k = '\n\n'.join(passages)
    return f'\nPASSAGES:\n{concat_top_k}\nQUESTION:\n{question}'
@pxt.udf(return_type=pxt.AudioType())
def generate_audio(script: str, voice: str, api_key: str):
    """Generate speech audio for ``script`` using OpenAI's ``tts-1`` model.

    Args:
        script: Text to synthesize.
        voice: Voice name sent as the ``voice`` field of the request.
        api_key: OpenAI API key, sent as a bearer token.

    Returns:
        Path of the MP3 file written under ``<cwd>/temp``, or ``None`` when
        an input is empty, the API answers with a non-200 status, or any
        error occurs (the error is printed).
    """
    if not script or not voice:
        return None
    try:
        response = requests.post(
            "https://api.openai.com/v1/audio/speech",
            headers={"Authorization": f"Bearer {api_key}"},
            json={"model": "tts-1", "input": script, "voice": voice},
            # requests waits indefinitely by default; bound the connect and
            # read waits so a stalled connection cannot hang the caller.
            timeout=(10, 120),
        )
        if response.status_code != 200:
            # Surface API failures instead of returning None without a trace.
            print(
                f"Error in audio synthesis: HTTP {response.status_code}: "
                f"{response.text[:200]}"
            )
            return None
        temp_dir = os.path.join(os.getcwd(), "temp")
        os.makedirs(temp_dir, exist_ok=True)
        # Random suffix keeps concurrent calls from overwriting each other.
        temp_file = os.path.join(temp_dir, f"audio_{os.urandom(8).hex()}.mp3")
        with open(temp_file, 'wb') as f:
            f.write(response.content)
        return temp_file
    except Exception as e:
        print(f"Error in audio synthesis: {e}")
    return None
# Document Processing
class DocumentProcessor:
    """Document ingestion and question answering over the indexed chunks."""

    @staticmethod
    def process_documents(pdf_files, chunk_limit, chunk_separator):
        """Process uploaded documents for chatbot functionality.

        Keeps the uploads whose name ends in ``.pdf`` (case-insensitive),
        resets the Pixeltable directory, stores the documents, splits them
        into chunks and builds an embedding index on the chunk text.

        Args:
            pdf_files: Uploaded file objects exposing a ``name`` path
                attribute; may be ``None`` or empty.
            chunk_limit: Chunk size; only forwarded to the splitter when
                ``chunk_separator`` is ``"token_limit"`` or ``"char_limit"``.
            chunk_separator: Separator mode passed to ``DocumentSplitter``.

        Returns:
            A status message: success, "no PDF found", or an error description.
        """
        try:
            # Validate before touching Pixeltable: initialize_pixeltable()
            # drops the existing directory, so an empty upload must not
            # wipe previously processed data.
            pdf_paths = [
                file.name
                for file in (pdf_files or [])
                if file.name.lower().endswith('.pdf')
            ]
            if not pdf_paths:
                return "No PDF documents found. Please upload at least one .pdf file."

            initialize_pixeltable()
            docs = pxt.create_table(
                'unified_app.documents',
                {'document': pxt.DocumentType(nullable=True)}
            )
            docs.insert([{'document': path} for path in pdf_paths])

            uses_limit = chunk_separator in ("token_limit", "char_limit")
            chunks = pxt.create_view(
                'unified_app.chunks',
                docs,
                iterator=DocumentSplitter.create(
                    document=docs.document,
                    separators=chunk_separator,
                    # The UI slider may deliver a float; the limit is a count.
                    limit=int(chunk_limit) if uses_limit else None
                )
            )
            chunks.add_embedding_index('text', string_embed=e5_embed)
            return "Documents processed successfully. You can start asking questions."
        except Exception as e:
            # Report failures as a status string, like the other processors.
            return f"Error processing documents: {str(e)}"

    @staticmethod
    def get_document_answer(question):
        """Get answer from processed documents.

        Retrieves the five chunks most similar to ``question``, joins them
        into a context string and asks ``gpt-4o-mini-2024-07-18`` to answer
        from that context only, using a scratch table for the completion.

        Args:
            question: The user's question.

        Returns:
            The model's answer, a prompt to enter a question when
            ``question`` is blank, or ``"Error: ..."`` on failure.
        """
        if not question or not question.strip():
            return "Please enter a question."
        try:
            chunks = pxt.get_table('unified_app.chunks')
            sim = chunks.text.similarity(question)
            relevant_chunks = chunks.order_by(sim, asc=False).limit(5).select(chunks.text).collect()
            context = "\n\n".join(chunk['text'] for chunk in relevant_chunks)

            temp_table = pxt.create_table(
                'unified_app.temp_response',
                {
                    'question': pxt.StringType(),
                    'context': pxt.StringType()
                }
            )
            try:
                temp_table.insert([{'question': question, 'context': context}])
                temp_table['response'] = openai.chat_completions(
                    messages=[
                        {
                            'role': 'system',
                            'content': 'Answer the question based only on the provided context. If the context doesn\'t contain enough information, say so.'
                        },
                        {
                            'role': 'user',
                            'content': f"Context:\n{context}\n\nQuestion: {question}"
                        }
                    ],
                    model='gpt-4o-mini-2024-07-18'
                )
                answer = temp_table.select(
                    answer=temp_table.response.choices[0].message.content
                ).tail(1)['answer'][0]
            finally:
                # Always remove the scratch table, even when the completion
                # fails; a leftover table would keep the name occupied for
                # the next question's create_table call.
                pxt.drop_table('unified_app.temp_response', force=True)
            return answer
        except Exception as e:
            return f"Error: {str(e)}"
# Call Analysis
class CallAnalyzer:
    """Transcription and insight extraction for recorded calls."""

    @staticmethod
    def process_call(video_file):
        """Process and analyze call recordings.

        Resets the Pixeltable directory and builds a ``calls`` table whose
        computed columns extract MP3 audio, transcribe it with ``whisper-1``
        and ask ``gpt-4o-mini-2024-07-18`` for insights. The video is then
        inserted and the computed values are read back.

        Args:
            video_file: The uploaded video, inserted as-is into the ``video``
                column; may be ``None`` or empty when nothing was uploaded.

        Returns:
            ``(text, audio, insights)`` for the inserted row, or
            ``(error message, None, None)`` on failure or missing input.
        """
        # Bail out before initialize_pixeltable(): it drops the existing
        # directory, which must not happen for an empty submission.
        if not video_file:
            return "Error processing call: no video file provided.", None, None
        try:
            initialize_pixeltable()
            calls = pxt.create_table(
                'unified_app.calls',
                {"video": pxt.VideoType(nullable=True)}
            )

            # Computed columns: each one is derived from the previous.
            calls['audio'] = extract_audio(calls.video, format='mp3')
            calls['transcription'] = openai.transcriptions(audio=calls.audio, model='whisper-1')
            calls['text'] = calls.transcription.text

            # Sentence-level view of the transcript with an embedding index.
            sentences = pxt.create_view(
                'unified_app.sentences',
                calls,
                iterator=StringSplitter.create(text=calls.text, separators='sentence')
            )
            sentences.add_embedding_index('text', string_embed=e5_embed)

            @pxt.udf
            def generate_insights(text: str) -> list[dict]:
                """Build the chat messages asking for insights on ``text``."""
                return [
                    {'role': 'system', 'content': 'Analyze this call transcript and provide key insights:'},
                    {'role': 'user', 'content': text}
                ]

            calls['insights_prompt'] = generate_insights(calls.text)
            calls['insights'] = openai.chat_completions(
                messages=calls.insights_prompt,
                model='gpt-4o-mini-2024-07-18'
            ).choices[0].message.content

            # The insert happens after all computed columns are defined.
            calls.insert([{"video": video_file}])
            result = calls.select(calls.text, calls.audio, calls.insights).tail(1)
            return result['text'][0], result['audio'][0], result['insights'][0]
        except Exception as e:
            return f"Error processing call: {str(e)}", None, None
# Video Search
class VideoSearcher:
    """Frame-level indexing and similarity search over an uploaded video."""

    @staticmethod
    def process_video(video_file):
        """Process video for searching.

        Resets the Pixeltable directory, creates a ``videos`` table and a
        ``frames`` view sampled at 1 fps, adds a CLIP embedding index on the
        frames and inserts the uploaded video.

        Args:
            video_file: Uploaded file object exposing a ``name`` path
                attribute; ``None`` when nothing was uploaded.

        Returns:
            A status message describing success or the error.
        """
        # Check the upload first so an empty submission neither resets the
        # directory nor fails later on ``video_file.name``.
        if video_file is None:
            return "Error processing video: no video file provided."
        try:
            initialize_pixeltable()
            videos = pxt.create_table('unified_app.videos', {'video': pxt.VideoType()})
            frames = pxt.create_view(
                'unified_app.frames',
                videos,
                iterator=FrameIterator.create(video=videos.video, fps=1)
            )
            frames.add_embedding_index('frame', string_embed=str_embed, image_embed=embed_image)
            videos.insert([{'video': video_file.name}])
            return "Video processed and indexed for search."
        except Exception as e:
            return f"Error processing video: {str(e)}"

    @staticmethod
    def search_video(search_type, text_query=None, image_query=None):
        """Search processed video frames.

        Args:
            search_type: ``"Text"`` or ``"Image"``; selects which query is used.
            text_query: Query string for text search.
            image_query: Query image for image search.

        Returns:
            Up to five frames ordered by descending similarity, or an empty
            list when no usable query was given or the search fails.
        """
        # Pick the query before touching Pixeltable; without one there is
        # nothing to look up.
        if search_type == "Text" and text_query:
            query = text_query
        elif search_type == "Image" and image_query is not None:
            query = image_query
        else:
            return []
        try:
            frames = pxt.get_table('unified_app.frames')
            sim = frames.frame.similarity(query)
            results = frames.order_by(sim, asc=False).limit(5).select(frames.frame).collect()
            return [row['frame'] for row in results]
        except Exception as e:
            print(f"Search error: {str(e)}")
            return []
# Gradio Interface
# Static markup for the interface. It lives at module level so the literal
# text does not depend on the indentation of the builder function below.
_HEADER_LOGO_HTML = """
<div style="text-align: left; margin-bottom: 1rem;">
<img src="https://raw.githubusercontent.com/pixeltable/pixeltable/main/docs/source/data/pixeltable-logo-large.png" alt="Pixeltable" style="max-width: 150px;" />
</div>
"""

_TITLE_MD = """
# Multimodal Powerhouse
"""

_INTRO_HTML = """
<p>
<a href="https://github.com/pixeltable/pixeltable" target="_blank" style="color: #F25022; text-decoration: none; font-weight: bold;">Pixeltable</a>
is a declarative interface for working with text, images, embeddings, and video, enabling you to store, transform, index, and iterate on data.
</p>
<div style="background-color: #E5DDD4; border: 1px solid #e9ecef; border-radius: 8px; padding: 15px; margin: 15px 0;">
<strong>⚠️ Note:</strong> This app runs best with GPU. For optimal performance, consider
<a href="https://huggingface.co/spaces/Pixeltable/Multimodal-Processing-Suite?duplicate=true" target="_blank" style="color: #F25022; text-decoration: none; font-weight: bold;">duplicating this space</a>
to run locally or with better computing resources.
</div>
"""

_WHAT_IT_DOES_MD = """
1. 📚 **Document Processing**
* Chat with your documents using RAG
* Process multiple document formats
* Extract key insights
2. 🎥 **Video Analysis**
* Text and image-based video search
* Frame extraction and indexing
* Visual content discovery
3. 🎙️ **Call Analysis**
* Automatic transcription
* Key insight extraction
* Audio processing
"""

_HOW_IT_WORKS_MD = """
1. 🔄 **Data Processing**
* Chunking and indexing documents
* Embedding generation for search
* Multi-modal data handling
2. 🤖 **AI Integration**
* LLM-powered analysis
* Speech-to-text conversion
* Semantic search capabilities
3. 📊 **Storage & Retrieval**
* Efficient data organization
* Quick content retrieval
* Structured data management
"""

_DOCS_APPS_HTML = """
<div style="border: 1px solid #ddd; padding: 15px; border-radius: 8px; margin-bottom: 10px;">
<h3>📚 Document & Text Processing</h3>
<ul style="list-style-type: none; padding-left: 0;">
<li style="margin-bottom: 10px;">
<a href="https://huggingface.co/spaces/Pixeltable/Multi-LLM-RAG-with-Groundtruth-Comparison" target="_blank" style="color: #F25022; text-decoration: none;">
🤖 Multi-LLM RAG Comparison
</a>
</li>
<li style="margin-bottom: 10px;">
<a href="https://huggingface.co/spaces/Pixeltable/Document-to-Audio-Synthesis" target="_blank" style="color: #F25022; text-decoration: none;">
🔊 Document to Audio Synthesis
</a>
</li>
<li style="margin-bottom: 10px;">
<a href="https://huggingface.co/spaces/Pixeltable/Prompt-Engineering-and-LLM-Studio" target="_blank" style="color: #F25022; text-decoration: none;">
💡 Prompt Engineering Studio
</a>
</li>
</ul>
</div>
"""

_MEDIA_APPS_HTML = """
<div style="border: 1px solid #ddd; padding: 15px; border-radius: 8px; margin-bottom: 10px;">
<h3>🎥 Video & Audio Processing</h3>
<ul style="list-style-type: none; padding-left: 0;">
<li style="margin-bottom: 10px;">
<a href="https://huggingface.co/spaces/Pixeltable/AI-Video-Analyzer-GTP4-Vision-TTS-Narration" target="_blank" style="color: #F25022; text-decoration: none;">
🎥 Video GPT Vision & TTS Narration
</a>
</li>
<li style="margin-bottom: 10px;">
<a href="https://huggingface.co/spaces/Pixeltable/Call-Analysis-AI-Tool" target="_blank" style="color: #F25022; text-decoration: none;">
🎙️ Call Analysis Tool
</a>
</li>
<li style="margin-bottom: 10px;">
<a href="https://huggingface.co/spaces/Pixeltable/object-detection-in-videos-with-yolox" target="_blank" style="color: #F25022; text-decoration: none;">
🔍 Video Object Detection
</a>
</li>
</ul>
</div>
"""

_INTERACTIVE_APPS_HTML = """
<div style="border: 1px solid #ddd; padding: 15px; border-radius: 8px; margin-bottom: 10px;">
<h3>🎮 Interactive Applications</h3>
<ul style="list-style-type: none; padding-left: 0;">
<li style="margin-bottom: 10px;">
<a href="https://huggingface.co/spaces/Pixeltable/AI-RPG-Adventure" target="_blank" style="color: #F25022; text-decoration: none;">
🎲 AI RPG Adventure
</a>
</li>
<li style="margin-bottom: 10px;">
<a href="https://huggingface.co/spaces/Pixeltable/AI-Financial-Analysis-Platform" target="_blank" style="color: #F25022; text-decoration: none;">
📈 Financial Analysis Platform
</a>
</li>
<li style="margin-bottom: 10px;">
<a href="https://huggingface.co/spaces/Pixeltable/video-to-social-media-post-generator" target="_blank" style="color: #F25022; text-decoration: none;">
📱 Social Media Post Generator
</a>
</li>
</ul>
</div>
"""

_FOOTER_HTML = """
<div style="margin-top: 2rem; padding-top: 1rem; border-top: 1px solid #e5e7eb;">
<div style="display: flex; justify-content: space-between; align-items: center; flex-wrap: wrap; gap: 1rem;">
<div style="flex: 1;">
<h4 style="margin: 0; color: #374151;">🚀 Built with Pixeltable</h4>
<p style="margin: 0.5rem 0; color: #6b7280;">
Open Source AI Data infrastructure.
</p>
</div>
<div style="flex: 1;">
<h4 style="margin: 0; color: #374151;">🔗 Resources</h4>
<div style="display: flex; gap: 1.5rem; margin-top: 0.5rem;">
<a href="https://github.com/pixeltable/pixeltable" target="_blank" style="color: #4F46E5; text-decoration: none;">
💻 GitHub
</a>
<a href="https://docs.pixeltable.com" target="_blank" style="color: #4F46E5; text-decoration: none;">
📚 Documentation
</a>
<a href="https://huggingface.co/Pixeltable" target="_blank" style="color: #4F46E5; text-decoration: none;">
🤗 Hugging Face
</a>
</div>
</div>
</div>
<p style="margin: 1rem 0 0; text-align: center; color: #9CA3AF; font-size: 0.875rem;">
© 2024 Pixeltable | Apache License 2.0
</p>
</div>
"""


def create_interface():
    """Build the Gradio Blocks UI and wire its event handlers.

    The layout is a header and introduction, two collapsible documentation
    panels, three tabs (Document Chat, Call Analysis, Video Search), a
    section linking to related Pixeltable apps, and a footer. Buttons are
    connected to ``DocumentProcessor``, ``CallAnalyzer`` and
    ``VideoSearcher``.

    Returns:
        The assembled ``gr.Blocks`` app; the caller is responsible for
        launching it.
    """
    with gr.Blocks(theme=gr.themes.Base()) as demo:
        # Header
        gr.HTML(_HEADER_LOGO_HTML)
        gr.Markdown(_TITLE_MD)
        gr.HTML(_INTRO_HTML)

        # Documentation Sections
        with gr.Row():
            with gr.Column():
                with gr.Accordion("🎯 What This App Does", open=False):
                    gr.Markdown(_WHAT_IT_DOES_MD)
            with gr.Column():
                with gr.Accordion("⚙️ How It Works", open=False):
                    gr.Markdown(_HOW_IT_WORKS_MD)

        with gr.Tabs():
            # Document Chat Tab
            with gr.TabItem("📚 Document Chat"):
                with gr.Row():
                    with gr.Column():
                        doc_files = gr.File(label="Upload Documents", file_count="multiple")
                        chunk_size = gr.Slider(
                            minimum=100,
                            maximum=500,
                            value=CHUNK_SIZE_DEFAULT,
                            label="Chunk Size"
                        )
                        chunk_type = gr.Dropdown(
                            choices=["token_limit", "char_limit", "sentence", "paragraph"],
                            value="token_limit",
                            label="Chunking Method"
                        )
                        process_docs_btn = gr.Button("Process Documents")
                        process_status = gr.Textbox(label="Status")
                    with gr.Column():
                        chatbot = gr.Chatbot(label="Document Chat")
                        msg = gr.Textbox(label="Ask a question")
                        send_btn = gr.Button("Send")

            # Call Analysis Tab
            with gr.TabItem("🎙️ Call Analysis"):
                with gr.Row():
                    with gr.Column():
                        call_upload = gr.Video(label="Upload Call Recording")
                        analyze_btn = gr.Button("Analyze Call")
                    with gr.Column():
                        with gr.Tabs():
                            with gr.TabItem("📝 Transcript"):
                                transcript = gr.Textbox(label="Transcript", lines=10)
                            with gr.TabItem("💡 Insights"):
                                insights = gr.Textbox(label="Key Insights", lines=10)
                            with gr.TabItem("🔊 Audio"):
                                audio_output = gr.Audio(label="Extracted Audio")

            # Video Search Tab
            with gr.TabItem("🎥 Video Search"):
                with gr.Row():
                    with gr.Column():
                        video_upload = gr.File(label="Upload Video")
                        process_video_btn = gr.Button("Process Video")
                        video_status = gr.Textbox(label="Processing Status")
                        search_type = gr.Radio(
                            choices=["Text", "Image"],
                            label="Search Type",
                            value="Text"
                        )
                        text_input = gr.Textbox(label="Text Query")
                        # Hidden until the "Image" search type is selected.
                        image_input = gr.Image(label="Image Query", type="pil", visible=False)
                        search_btn = gr.Button("Search")
                    with gr.Column():
                        results_gallery = gr.Gallery(label="Search Results")

        # Event Handlers
        def document_chat(message, chat_history):
            """Answer ``message``, append the exchange to the history and clear the input box."""
            bot_message = DocumentProcessor.get_document_answer(message)
            chat_history.append((message, bot_message))
            return "", chat_history

        def update_search_type(choice):
            """Show only the query input that matches the selected search type."""
            return {
                text_input: gr.update(visible=choice == "Text"),
                image_input: gr.update(visible=choice == "Image")
            }

        # Connect Events
        process_docs_btn.click(
            DocumentProcessor.process_documents,
            inputs=[doc_files, chunk_size, chunk_type],
            outputs=[process_status]
        )
        send_btn.click(
            document_chat,
            inputs=[msg, chatbot],
            outputs=[msg, chatbot]
        )
        # Output order matches process_call's return order: text, audio, insights.
        analyze_btn.click(
            CallAnalyzer.process_call,
            inputs=[call_upload],
            outputs=[transcript, audio_output, insights]
        )
        process_video_btn.click(
            VideoSearcher.process_video,
            inputs=[video_upload],
            outputs=[video_status]
        )
        search_type.change(
            update_search_type,
            search_type,
            [text_input, image_input]
        )
        search_btn.click(
            VideoSearcher.search_video,
            inputs=[search_type, text_input, image_input],
            outputs=[results_gallery]
        )

        # Related Pixeltable Spaces
        gr.Markdown("## 🌟 Explore More Pixeltable Apps")
        with gr.Row():
            with gr.Column():
                gr.HTML(_DOCS_APPS_HTML)
            with gr.Column():
                gr.HTML(_MEDIA_APPS_HTML)
            with gr.Column():
                gr.HTML(_INTERACTIVE_APPS_HTML)

        gr.HTML(_FOOTER_HTML)

    return demo
if __name__ == "__main__":
    # Make sure the OpenAI key is in the environment before the UI is built.
    init_api_keys()
    demo = create_interface()
    # The Pixeltable media directory is passed as an allowed path and the
    # API page is turned off.
    launch_options = {
        "allowed_paths": [PIXELTABLE_MEDIA_DIR],
        "show_api": False,
    }
    demo.launch(**launch_options)