# Hugging Face Space: Pixeltable / Multimodal-Powerhouse (status: Sleeping)
#
# File size: 23,105 Bytes

import gradio as gr
import pixeltable as pxt
from pixeltable.iterators import DocumentSplitter, FrameIterator, StringSplitter
from pixeltable.functions.huggingface import sentence_transformer, clip_image, clip_text
from pixeltable.functions.video import extract_audio
from pixeltable.functions.audio import get_metadata
from pixeltable.functions import openai
import numpy as np
import PIL.Image
import os
import getpass
import requests
import tempfile
from datetime import datetime

# Configuration
# Media directory under the user's home; handed to Gradio's `allowed_paths`
# when the app is launched at the bottom of this file.
PIXELTABLE_MEDIA_DIR = os.path.expanduser("~/.pixeltable/media")
# NOTE(review): the next two constants are not referenced in the visible part
# of this file — presumably defaults for LLM generation; confirm before removing.
MAX_TOKENS_DEFAULT = 300
TEMPERATURE_DEFAULT = 0.7
# Initial value of the "Chunk Size" slider in the Document Chat tab.
CHUNK_SIZE_DEFAULT = 300

# Initialize API keys
def init_api_keys():
    """Make sure ``OPENAI_API_KEY`` is present in the process environment.

    Nothing happens when the variable is already set. Otherwise the key is
    requested interactively through ``getpass`` and stored in ``os.environ``.
    """
    if 'OPENAI_API_KEY' in os.environ:
        return
    os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API key:')

# Embedding Functions
@pxt.expr_udf
def e5_embed(text: str) -> np.ndarray:
    """Embed ``text`` with the ``intfloat/e5-large-v2`` sentence-transformer model."""
    model_id = 'intfloat/e5-large-v2'
    return sentence_transformer(text, model_id=model_id)

@pxt.expr_udf
def embed_image(img: PIL.Image.Image):
    """Embed an image with the ``openai/clip-vit-base-patch32`` CLIP model."""
    model_id = 'openai/clip-vit-base-patch32'
    return clip_image(img, model_id=model_id)

@pxt.expr_udf
def str_embed(s: str):
    """Embed a string with the ``openai/clip-vit-base-patch32`` CLIP text encoder."""
    model_id = 'openai/clip-vit-base-patch32'
    return clip_text(s, model_id=model_id)

# Common Utilities
def initialize_pixeltable(dir_name='unified_app'):
    """Initialize Pixeltable directory

    Drops the Pixeltable directory ``dir_name`` (with ``force=True``) and then
    creates it again, so callers start from a freshly created directory.

    Args:
        dir_name: Name of the Pixeltable directory to reset.

    NOTE(review): every ``process_*`` entry point in this file calls this
    function, so presumably processing one kind of media discards tables
    created for another — confirm that this is intended.
    """
    # Order matters: the old directory has to be gone before it is recreated.
    pxt.drop_dir(dir_name, force=True)
    pxt.create_dir(dir_name)

@pxt.udf
def create_prompt(top_k_list: list[dict], question: str) -> str:
    """Build the passages-plus-question prompt text.

    The ``'text'`` value of every entry in ``top_k_list`` is taken in reverse
    list order and the texts are joined with blank lines. The result is placed
    under a ``PASSAGES:`` heading, followed by ``question`` under a
    ``QUESTION:`` heading.
    """
    passages = [entry['text'] for entry in top_k_list[::-1]]
    concat_top_k = '\n\n'.join(passages)
    return f'''
    PASSAGES:
    {concat_top_k}
    QUESTION:
    {question}'''

@pxt.udf(return_type=pxt.AudioType())
def generate_audio(script: str, voice: str, api_key: str):
    """Generate audio from text using OpenAI's API

    Posts ``script`` and ``voice`` to the ``/v1/audio/speech`` endpoint with
    the ``tts-1`` model and writes the response body to an MP3 file inside a
    ``temp`` directory under the current working directory.

    Args:
        script: Text to synthesize.
        voice: Voice name forwarded to the API.
        api_key: Token sent as ``Bearer`` credential in the request header.

    Returns:
        Path of the written MP3 file, or ``None`` when ``script`` or ``voice``
        is empty, the API answers with a non-200 status, or an error occurs.
        Failures are printed and never raised.
    """
    if not script or not voice:
        return None

    try:
        response = requests.post(
            "https://api.openai.com/v1/audio/speech",
            headers={"Authorization": f"Bearer {api_key}"},
            json={"model": "tts-1", "input": script, "voice": voice},
            # requests applies no timeout by default; without one a stalled
            # connection would block the caller indefinitely.
            timeout=(10, 120),
        )

        if response.status_code != 200:
            # Report the failure instead of dropping it silently.
            print(
                f"Error in audio synthesis: HTTP {response.status_code}: "
                f"{response.text[:200]}"
            )
            return None

        temp_dir = os.path.join(os.getcwd(), "temp")
        os.makedirs(temp_dir, exist_ok=True)
        # A random suffix keeps separate calls from overwriting each other.
        temp_file = os.path.join(temp_dir, f"audio_{os.urandom(8).hex()}.mp3")

        with open(temp_file, 'wb') as f:
            f.write(response.content)
        return temp_file
    except Exception as e:
        # Best effort by design: a failed synthesis yields None, not a crash.
        print(f"Error in audio synthesis: {e}")
    return None

# Document Processing
class DocumentProcessor:
    """Document ingestion and question answering for the Document Chat tab."""

    @staticmethod
    def process_documents(pdf_files, chunk_limit, chunk_separator):
        """Process uploaded documents for chatbot functionality

        Keeps only the PDF uploads, resets the Pixeltable directory, stores
        the PDFs in ``unified_app.documents``, splits them into the
        ``unified_app.chunks`` view and adds an embedding index on the chunk
        text.

        Args:
            pdf_files: Uploaded files, each either an object with a ``name``
                attribute holding its path or a plain path string. May be
                ``None`` or empty when nothing was uploaded.
            chunk_limit: Chunk size; only forwarded to the splitter when
                ``chunk_separator`` is ``"token_limit"`` or ``"char_limit"``.
            chunk_separator: Separator name passed to ``DocumentSplitter``.

        Returns:
            A status message for the UI.
        """
        # Validate before touching Pixeltable: the reset below is destructive,
        # so it must not run when there is nothing to ingest.
        candidate_paths = [getattr(file, 'name', file) for file in (pdf_files or [])]
        pdf_paths = [
            path for path in candidate_paths
            if isinstance(path, str) and path.lower().endswith('.pdf')
        ]
        if not pdf_paths:
            return "No PDF documents to process. Please upload at least one PDF file."

        initialize_pixeltable()

        docs = pxt.create_table(
            'unified_app.documents',
            {'document': pxt.DocumentType(nullable=True)}
        )

        # A concrete list (rather than a generator) is passed to insert().
        docs.insert([{'document': path} for path in pdf_paths])

        uses_limit = chunk_separator in ("token_limit", "char_limit")
        chunks = pxt.create_view(
            'unified_app.chunks',
            docs,
            iterator=DocumentSplitter.create(
                document=docs.document,
                separators=chunk_separator,
                # Slider values may arrive as floats; the limit is a count.
                limit=int(chunk_limit) if uses_limit else None
            )
        )

        chunks.add_embedding_index('text', string_embed=e5_embed)
        return "Documents processed successfully. You can start asking questions."

    @staticmethod
    def get_document_answer(question):
        """Get answer from processed documents

        Retrieves the five chunks most similar to ``question`` from
        ``unified_app.chunks``, joins them into a context string and asks the
        chat model to answer from that context only. A temporary table
        ``unified_app.temp_response`` is used to run the completion.

        Args:
            question: The user's question.

        Returns:
            The model's answer, a prompt to enter a question when ``question``
            is empty, or ``"Error: ..."`` when anything fails.
        """
        if not question or not question.strip():
            return "Please enter a question."

        try:
            chunks = pxt.get_table('unified_app.chunks')
            sim = chunks.text.similarity(question)
            relevant_chunks = chunks.order_by(sim, asc=False).limit(5).select(chunks.text).collect()
            context = "\n\n".join(chunk['text'] for chunk in relevant_chunks)

            temp_table = pxt.create_table(
                'unified_app.temp_response',
                {
                    'question': pxt.StringType(),
                    'context': pxt.StringType()
                }
            )

            try:
                temp_table.insert([{'question': question, 'context': context}])

                temp_table['response'] = openai.chat_completions(
                    messages=[
                        {
                            'role': 'system',
                            'content': 'Answer the question based only on the provided context. If the context doesn\'t contain enough information, say so.'
                        },
                        {
                            'role': 'user',
                            'content': f"Context:\n{context}\n\nQuestion: {question}"
                        }
                    ],
                    model='gpt-4o-mini-2024-07-18'
                )

                return temp_table.select(
                    answer=temp_table.response.choices[0].message.content
                ).tail(1)['answer'][0]
            finally:
                # Always remove the scratch table; if it survived a failed
                # call, create_table above would fail on the next question.
                pxt.drop_table('unified_app.temp_response', force=True)

        except Exception as e:
            return f"Error: {str(e)}"

# Call Analysis
class CallAnalyzer:
    """Transcription and insight extraction for the Call Analysis tab."""

    @staticmethod
    def process_call(video_file):
        """Process and analyze call recordings

        Resets the Pixeltable directory and builds ``unified_app.calls`` with
        computed columns for the extracted MP3 audio, the Whisper
        transcription, its text and LLM-generated insights. A sentence-level
        view with an embedding index is created as well. The recording is
        inserted last, after all computed columns have been declared.

        Args:
            video_file: The call recording to insert into the ``video`` column.

        Returns:
            A ``(transcript, audio, insights)`` tuple. When no recording is
            given or processing fails, the first element carries a message and
            the other two are ``None``.
        """
        # Check first: the reset below is destructive and the pipeline would
        # otherwise run on an empty row.
        if not video_file:
            return "Please upload a call recording to analyze.", None, None

        try:
            initialize_pixeltable()
            calls = pxt.create_table(
                'unified_app.calls',
                {"video": pxt.VideoType(nullable=True)}
            )

            calls['audio'] = extract_audio(calls.video, format='mp3')
            calls['transcription'] = openai.transcriptions(audio=calls.audio, model='whisper-1')
            calls['text'] = calls.transcription.text

            sentences = pxt.create_view(
                'unified_app.sentences',
                calls,
                iterator=StringSplitter.create(text=calls.text, separators='sentence')
            )

            sentences.add_embedding_index('text', string_embed=e5_embed)

            @pxt.udf
            def generate_insights(text: str) -> list[dict]:
                """Wrap the transcript in the chat messages sent to the LLM."""
                return [
                    {'role': 'system', 'content': 'Analyze this call transcript and provide key insights:'},
                    {'role': 'user', 'content': text}
                ]

            calls['insights_prompt'] = generate_insights(calls.text)
            calls['insights'] = openai.chat_completions(
                messages=calls.insights_prompt,
                model='gpt-4o-mini-2024-07-18'
            ).choices[0].message.content

            calls.insert([{"video": video_file}])

            result = calls.select(calls.text, calls.audio, calls.insights).tail(1)
            return result['text'][0], result['audio'][0], result['insights'][0]

        except Exception as e:
            return f"Error processing call: {str(e)}", None, None

# Video Search
class VideoSearcher:
    """Frame indexing and similarity search for the Video Search tab."""

    @staticmethod
    def process_video(video_file):
        """Process video for searching

        Resets the Pixeltable directory, creates ``unified_app.videos`` and a
        ``unified_app.frames`` view sampled at 1 fps, adds an embedding index
        on the frames and inserts the video.

        Args:
            video_file: The uploaded file, either an object with a ``name``
                attribute holding its path or a plain path string.

        Returns:
            A status message for the UI.
        """
        # Check first so that an empty upload does not wipe the existing index.
        video_path = getattr(video_file, 'name', video_file)
        if not video_path:
            return "Please upload a video to process."

        try:
            initialize_pixeltable()
            videos = pxt.create_table('unified_app.videos', {'video': pxt.VideoType()})

            frames = pxt.create_view(
                'unified_app.frames',
                videos,
                iterator=FrameIterator.create(video=videos.video, fps=1)
            )

            frames.add_embedding_index('frame', string_embed=str_embed, image_embed=embed_image)
            videos.insert([{'video': video_path}])

            return "Video processed and indexed for search."
        except Exception as e:
            return f"Error processing video: {str(e)}"

    @staticmethod
    def search_video(search_type, text_query=None, image_query=None):
        """Search processed video frames

        Args:
            search_type: ``"Text"`` or ``"Image"``; selects which query is used.
            text_query: Query string for text search.
            image_query: Query image for image search.

        Returns:
            Up to five frames ordered by decreasing similarity, or an empty
            list when the query is missing, ``search_type`` is unknown or the
            search fails (the error is printed).
        """
        # Pick the query up front so that an invalid request never needs the
        # frames table to exist.
        if search_type == "Text" and text_query:
            query = text_query
        elif search_type == "Image" and image_query is not None:
            query = image_query
        else:
            return []

        try:
            frames = pxt.get_table('unified_app.frames')
            sim = frames.frame.similarity(query)
            results = frames.order_by(sim, asc=False).limit(5).select(frames.frame).collect()
            return [row['frame'] for row in results]
        except Exception as e:
            print(f"Search error: {str(e)}")
            return []

# Gradio Interface
def create_interface():
    """Build the Gradio Blocks UI and return it without launching it.

    The layout consists of a header and introduction, two collapsed
    documentation accordions, three tabs (Document Chat, Call Analysis,
    Video Search), the event wiring that connects the widgets to
    ``DocumentProcessor``, ``CallAnalyzer`` and ``VideoSearcher``, a section
    linking to related Pixeltable Spaces, and a footer.

    Returns:
        The ``gr.Blocks`` instance.
    """
    with gr.Blocks(theme=gr.themes.Base()) as demo:
        # Header
        gr.HTML(
            """
            <div style="text-align: left; margin-bottom: 1rem;">
                <img src="https://raw.githubusercontent.com/pixeltable/pixeltable/main/docs/source/data/pixeltable-logo-large.png" alt="Pixeltable" style="max-width: 150px;" />
            </div>
            """
        )

        gr.Markdown(
            """
            # Multimodal Powerhouse
            """
        )

        gr.HTML(
            """
            <p>
                <a href="https://github.com/pixeltable/pixeltable" target="_blank" style="color: #F25022; text-decoration: none; font-weight: bold;">Pixeltable</a> 
                is a declarative interface for working with text, images, embeddings, and video, enabling you to store, transform, index, and iterate on data.
            </p>
            
            <div style="background-color: #E5DDD4; border: 1px solid #e9ecef; border-radius: 8px; padding: 15px; margin: 15px 0;">
                <strong>⚠️ Note:</strong> This app runs best with GPU. For optimal performance, consider 
                <a href="https://huggingface.co/spaces/Pixeltable/Multimodal-Processing-Suite?duplicate=true" target="_blank" style="color: #F25022; text-decoration: none; font-weight: bold;">duplicating this space</a> 
                to run locally or with better computing resources.
            </div>
            """
        )

        # Documentation Sections
        with gr.Row():
            with gr.Column():
                with gr.Accordion("🎯 What This App Does", open=False):
                    gr.Markdown("""
                    1. 📚 **Document Processing**
                       * Chat with your documents using RAG
                       * Process multiple document formats
                       * Extract key insights
                    
                    2. 🎥 **Video Analysis**
                       * Text and image-based video search
                       * Frame extraction and indexing
                       * Visual content discovery
                    
                    3. 🎙️ **Call Analysis**
                       * Automatic transcription
                       * Key insight extraction
                       * Audio processing
                    """)

            with gr.Column():
                with gr.Accordion("⚙️ How It Works", open=False):
                    gr.Markdown("""
                    1. 🔄 **Data Processing**
                       * Chunking and indexing documents
                       * Embedding generation for search
                       * Multi-modal data handling
                    
                    2. 🤖 **AI Integration**
                       * LLM-powered analysis
                       * Speech-to-text conversion
                       * Semantic search capabilities
                    
                    3. 📊 **Storage & Retrieval**
                       * Efficient data organization
                       * Quick content retrieval
                       * Structured data management
                    """)

        with gr.Tabs():
            # Document Chat Tab
            with gr.TabItem("📚 Document Chat"):
                with gr.Row():
                    with gr.Column():
                        doc_files = gr.File(label="Upload Documents", file_count="multiple")
                        chunk_size = gr.Slider(
                            minimum=100,
                            maximum=500,
                            value=CHUNK_SIZE_DEFAULT,
                            label="Chunk Size"
                        )
                        chunk_type = gr.Dropdown(
                            choices=["token_limit", "char_limit", "sentence", "paragraph"],
                            value="token_limit",
                            label="Chunking Method"
                        )
                        process_docs_btn = gr.Button("Process Documents")
                        process_status = gr.Textbox(label="Status")
                    with gr.Column():
                        chatbot = gr.Chatbot(label="Document Chat")
                        msg = gr.Textbox(label="Ask a question")
                        send_btn = gr.Button("Send")

            # Call Analysis Tab
            with gr.TabItem("🎙️ Call Analysis"):
                with gr.Row():
                    with gr.Column():
                        call_upload = gr.Video(label="Upload Call Recording")
                        analyze_btn = gr.Button("Analyze Call")
                    with gr.Column():
                        with gr.Tabs():
                            with gr.TabItem("📝 Transcript"):
                                transcript = gr.Textbox(label="Transcript", lines=10)
                            with gr.TabItem("💡 Insights"):
                                insights = gr.Textbox(label="Key Insights", lines=10)
                            with gr.TabItem("🔊 Audio"):
                                audio_output = gr.Audio(label="Extracted Audio")

            # Video Search Tab
            with gr.TabItem("🎥 Video Search"):
                with gr.Row():
                    with gr.Column():
                        video_upload = gr.File(label="Upload Video")
                        process_video_btn = gr.Button("Process Video")
                        video_status = gr.Textbox(label="Processing Status")
                        search_type = gr.Radio(
                            choices=["Text", "Image"],
                            label="Search Type",
                            value="Text"
                        )
                        text_input = gr.Textbox(label="Text Query")
                        # Hidden initially; update_search_type below toggles it.
                        image_input = gr.Image(label="Image Query", type="pil", visible=False)
                        search_btn = gr.Button("Search")
                    with gr.Column():
                        results_gallery = gr.Gallery(label="Search Results")

        # Event Handlers
        def document_chat(message, chat_history):
            """Answer ``message`` and append the exchange to ``chat_history``.

            Returns an empty string (wired to the question textbox below, which
            clears it) together with the updated history.
            """
            bot_message = DocumentProcessor.get_document_answer(message)
            chat_history.append((message, bot_message))
            return "", chat_history

        def update_search_type(choice):
            """Show the text query box for "Text" and the image box for "Image"."""
            return {
                text_input: gr.update(visible=choice=="Text"),
                image_input: gr.update(visible=choice=="Image")
            }

        # Connect Events
        process_docs_btn.click(
            DocumentProcessor.process_documents,
            inputs=[doc_files, chunk_size, chunk_type],
            outputs=[process_status]
        )

        send_btn.click(
            document_chat,
            inputs=[msg, chatbot],
            outputs=[msg, chatbot]
        )

        # Output order matches process_call's (text, audio, insights) return.
        analyze_btn.click(
            CallAnalyzer.process_call,
            inputs=[call_upload],
            outputs=[transcript, audio_output, insights]
        )

        process_video_btn.click(
            VideoSearcher.process_video,
            inputs=[video_upload],
            outputs=[video_status]
        )

        search_type.change(
            update_search_type,
            search_type,
            [text_input, image_input]
        )

        search_btn.click(
            VideoSearcher.search_video,
            inputs=[search_type, text_input, image_input],
            outputs=[results_gallery]
        )

        # Related Pixeltable Spaces
        gr.Markdown("## 🌟 Explore More Pixeltable Apps")

        with gr.Row():
            with gr.Column():
                gr.HTML(
                    """
                    <div style="border: 1px solid #ddd; padding: 15px; border-radius: 8px; margin-bottom: 10px;">
                        <h3>📚 Document & Text Processing</h3>
                        <ul style="list-style-type: none; padding-left: 0;">
                            <li style="margin-bottom: 10px;">
                                <a href="https://huggingface.co/spaces/Pixeltable/Multi-LLM-RAG-with-Groundtruth-Comparison" target="_blank" style="color: #F25022; text-decoration: none;">
                                    🤖 Multi-LLM RAG Comparison
                                </a>
                            </li>
                            <li style="margin-bottom: 10px;">
                                <a href="https://huggingface.co/spaces/Pixeltable/Document-to-Audio-Synthesis" target="_blank" style="color: #F25022; text-decoration: none;">
                                    🔊 Document to Audio Synthesis
                                </a>
                            </li>
                            <li style="margin-bottom: 10px;">
                                <a href="https://huggingface.co/spaces/Pixeltable/Prompt-Engineering-and-LLM-Studio" target="_blank" style="color: #F25022; text-decoration: none;">
                                    💡 Prompt Engineering Studio
                                </a>
                            </li>
                        </ul>
                    </div>
                    """
                )

            with gr.Column():
                gr.HTML(
                    """
                    <div style="border: 1px solid #ddd; padding: 15px; border-radius: 8px; margin-bottom: 10px;">
                        <h3>🎥 Video & Audio Processing</h3>
                        <ul style="list-style-type: none; padding-left: 0;">
                            <li style="margin-bottom: 10px;">
                                <a href="https://huggingface.co/spaces/Pixeltable/AI-Video-Analyzer-GTP4-Vision-TTS-Narration" target="_blank" style="color: #F25022; text-decoration: none;">
                                    🎥 Video GPT Vision & TTS Narration
                                </a>
                            </li>
                            <li style="margin-bottom: 10px;">
                                <a href="https://huggingface.co/spaces/Pixeltable/Call-Analysis-AI-Tool" target="_blank" style="color: #F25022; text-decoration: none;">
                                    🎙️ Call Analysis Tool
                                </a>
                            </li>
                            <li style="margin-bottom: 10px;">
                                <a href="https://huggingface.co/spaces/Pixeltable/object-detection-in-videos-with-yolox" target="_blank" style="color: #F25022; text-decoration: none;">
                                    🔍 Video Object Detection
                                </a>
                            </li>
                        </ul>
                    </div>
                    """
                )

            with gr.Column():
                gr.HTML(
                    """
                    <div style="border: 1px solid #ddd; padding: 15px; border-radius: 8px; margin-bottom: 10px;">
                        <h3>🎮 Interactive Applications</h3>
                        <ul style="list-style-type: none; padding-left: 0;">
                            <li style="margin-bottom: 10px;">
                                <a href="https://huggingface.co/spaces/Pixeltable/AI-RPG-Adventure" target="_blank" style="color: #F25022; text-decoration: none;">
                                    🎲 AI RPG Adventure
                                </a>
                            </li>
                            <li style="margin-bottom: 10px;">
                                <a href="https://huggingface.co/spaces/Pixeltable/AI-Financial-Analysis-Platform" target="_blank" style="color: #F25022; text-decoration: none;">
                                    📈 Financial Analysis Platform
                                </a>
                            </li>
                            <li style="margin-bottom: 10px;">
                                <a href="https://huggingface.co/spaces/Pixeltable/video-to-social-media-post-generator" target="_blank" style="color: #F25022; text-decoration: none;">
                                    📱 Social Media Post Generator
                                </a>
                            </li>
                        </ul>
                    </div>
                    """
                )

        # Footer
        gr.HTML(
                """
                <div style="margin-top: 2rem; padding-top: 1rem; border-top: 1px solid #e5e7eb;">
                    <div style="display: flex; justify-content: space-between; align-items: center; flex-wrap: wrap; gap: 1rem;">
                        <div style="flex: 1;">
                            <h4 style="margin: 0; color: #374151;">🚀 Built with Pixeltable</h4>
                            <p style="margin: 0.5rem 0; color: #6b7280;">
                                Open Source AI Data infrastructure.
                            </p>
                        </div>
                        <div style="flex: 1;">
                            <h4 style="margin: 0; color: #374151;">🔗 Resources</h4>
                            <div style="display: flex; gap: 1.5rem; margin-top: 0.5rem;">
                                <a href="https://github.com/pixeltable/pixeltable" target="_blank" style="color: #4F46E5; text-decoration: none;">
                                    💻 GitHub
                                </a>
                                <a href="https://docs.pixeltable.com" target="_blank" style="color: #4F46E5; text-decoration: none;">
                                    📚 Documentation
                                </a>
                                <a href="https://huggingface.co/Pixeltable" target="_blank" style="color: #4F46E5; text-decoration: none;">
                                    🤗 Hugging Face
                                </a>
                            </div>
                        </div>
                    </div>
                    <p style="margin: 1rem 0 0; text-align: center; color: #9CA3AF; font-size: 0.875rem;">
                        © 2024 Pixeltable | Apache License 2.0
                    </p>
                </div>
                """
            )

    return demo

if __name__ == "__main__":
    # Script entry point: make sure the OpenAI key is set, build the UI, serve it.
    init_api_keys()
    demo = create_interface()
    # The Pixeltable media directory is passed to Gradio as an allowed path.
    launch_options = {"allowed_paths": [PIXELTABLE_MEDIA_DIR], "show_api": False}
    demo.launch(**launch_options)