|
import gradio as gr |
|
import pixeltable as pxt |
|
from pixeltable.iterators import DocumentSplitter, FrameIterator, StringSplitter |
|
from pixeltable.functions.huggingface import sentence_transformer, clip_image, clip_text |
|
from pixeltable.functions.video import extract_audio |
|
from pixeltable.functions.audio import get_metadata |
|
from pixeltable.functions import openai |
|
import numpy as np |
|
import PIL.Image |
|
import os |
|
import getpass |
|
import requests |
|
import tempfile |
|
from datetime import datetime |
|
|
|
|
|
# Path handed to `demo.launch(allowed_paths=...)` in the `__main__` guard so
# Gradio is allowed to serve files located under it.
# NOTE(review): presumably the directory Pixeltable writes media to - confirm.
PIXELTABLE_MEDIA_DIR = os.path.expanduser("~/.pixeltable/media")

# Generation defaults. Neither value is referenced in the visible part of
# this file.
MAX_TOKENS_DEFAULT = 300
TEMPERATURE_DEFAULT = 0.7

# Initial value of the "Chunk Size" slider on the Document Chat tab.
CHUNK_SIZE_DEFAULT = 300
|
|
|
|
|
def init_api_keys():
    """Ensure `OPENAI_API_KEY` is present in the process environment.

    When the variable is already set nothing happens; otherwise the key is
    read interactively (without echo) and stored in `os.environ`.
    """
    if 'OPENAI_API_KEY' in os.environ:
        return
    os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API key:')
|
|
|
|
|
@pxt.expr_udf
def e5_embed(text: str) -> np.ndarray:
    """Embed `text` with the `intfloat/e5-large-v2` sentence-transformer.

    Used as the `string_embed` function of the text embedding indexes
    created on the document chunks and call-transcript sentences.
    """
    embedding = sentence_transformer(text, model_id='intfloat/e5-large-v2')
    return embedding
|
|
|
# The image and text encoders below back the same embedding index (the
# video-frame index registers one as `image_embed` and the other as
# `string_embed`), so they must use the same CLIP checkpoint. Keeping the id
# in one place prevents the two from drifting apart.
_CLIP_MODEL_ID = 'openai/clip-vit-base-patch32'


@pxt.expr_udf
def embed_image(img: PIL.Image.Image):
    """Embed an image with the shared CLIP checkpoint."""
    return clip_image(img, model_id=_CLIP_MODEL_ID)


@pxt.expr_udf
def str_embed(s: str):
    """Embed a text query with the shared CLIP checkpoint."""
    return clip_text(s, model_id=_CLIP_MODEL_ID)
|
|
|
|
|
def initialize_pixeltable(dir_name='unified_app'):
    """Reset the Pixeltable directory `dir_name`.

    The directory is dropped with `force=True` and then created again, so
    callers always start from a fresh directory.
    """
    # Drop first: the directory is recreated immediately afterwards.
    pxt.drop_dir(dir_name, force=True)
    pxt.create_dir(dir_name)
|
|
|
@pxt.udf
def create_prompt(top_k_list: list[dict], question: str) -> str:
    """Assemble the PASSAGES / QUESTION prompt for a retrieval query.

    Args:
        top_k_list: Retrieved rows; each dict must carry a `'text'` entry.
        question: The user question appended after the passages.

    Returns:
        The prompt text: a leading newline, the `PASSAGES:` header, the
        passages joined by blank lines, the `QUESTION:` header and the
        question.
    """
    # Passages are emitted in reverse order of `top_k_list`.
    passages = [elt['text'] for elt in reversed(top_k_list)]
    concat_top_k = '\n\n'.join(passages)
    return f'\nPASSAGES:\n{concat_top_k}\nQUESTION:\n{question}'
|
|
|
@pxt.udf(return_type=pxt.AudioType())
def generate_audio(script: str, voice: str, api_key: str):
    """Synthesize `script` to an MP3 file via OpenAI's text-to-speech endpoint.

    Args:
        script: Text to speak. An empty value short-circuits to `None`.
        voice: Voice name sent to the API. An empty value short-circuits
            to `None`.
        api_key: Bearer token used for the request.

    Returns:
        Path of the written MP3 file (under `<cwd>/temp`), or `None` when
        the inputs are empty, the API answers with a non-200 status, or any
        error occurs. This function never raises; failures are printed.
    """
    if not script or not voice:
        return None

    try:
        response = requests.post(
            "https://api.openai.com/v1/audio/speech",
            headers={"Authorization": f"Bearer {api_key}"},
            json={"model": "tts-1", "input": script, "voice": voice},
            # Without a timeout a stalled connection would block forever.
            # (connect, read) seconds.
            timeout=(10, 120),
        )

        if response.status_code != 200:
            # Surface the failure instead of silently returning None.
            print(
                "Error in audio synthesis: "
                f"HTTP {response.status_code}: {response.text[:200]}"
            )
            return None

        temp_dir = os.path.join(os.getcwd(), "temp")
        os.makedirs(temp_dir, exist_ok=True)
        # Random suffix keeps concurrent/generated files from colliding.
        temp_file = os.path.join(temp_dir, f"audio_{os.urandom(8).hex()}.mp3")

        with open(temp_file, 'wb') as f:
            f.write(response.content)
        return temp_file
    except Exception as e:
        # Deliberate best-effort boundary: report and fall back to None.
        print(f"Error in audio synthesis: {e}")
        return None
|
|
|
|
|
class DocumentProcessor:
    """Document ingestion and question answering for the Document Chat tab."""

    @staticmethod
    def process_documents(pdf_files, chunk_limit, chunk_separator):
        """Index the uploaded PDF files so questions can be asked about them.

        Args:
            pdf_files: Uploaded files (objects exposing a `.name` path, or
                plain path strings). May be `None` or empty.
            chunk_limit: Chunk size; only used for the `token_limit` and
                `char_limit` separators.
            chunk_separator: Separator name passed to `DocumentSplitter`.

        Returns:
            A status message describing success or the failure reason.
        """
        # Collect the PDF paths up front so that nothing is reset or created
        # when there is nothing to index. The extension check is
        # case-insensitive so that "REPORT.PDF" is accepted as well.
        pdf_paths = []
        for file in pdf_files or []:
            path = getattr(file, 'name', file)
            if str(path).lower().endswith('.pdf'):
                pdf_paths.append(path)
        if not pdf_paths:
            return "No PDF files found. Please upload at least one PDF document."

        uses_limit = chunk_separator in ["token_limit", "char_limit"]

        try:
            # NOTE: this drops and recreates the whole 'unified_app'
            # directory, including tables created by the other tabs.
            initialize_pixeltable()

            docs = pxt.create_table(
                'unified_app.documents',
                {'document': pxt.DocumentType(nullable=True)}
            )
            docs.insert([{'document': path} for path in pdf_paths])

            chunks = pxt.create_view(
                'unified_app.chunks',
                docs,
                iterator=DocumentSplitter.create(
                    document=docs.document,
                    separators=chunk_separator,
                    # The slider may deliver a float; the limit is a count.
                    limit=int(chunk_limit) if uses_limit else None
                )
            )

            chunks.add_embedding_index('text', string_embed=e5_embed)
            return "Documents processed successfully. You can start asking questions."
        except Exception as e:
            # Same reporting style as the other processors: return the
            # error as the status text instead of raising into the UI.
            return f"Error processing documents: {str(e)}"

    @staticmethod
    def get_document_answer(question):
        """Answer `question` from the five most similar indexed chunks.

        Args:
            question: The user's question text.

        Returns:
            The model's answer, or a message starting with `Error:` when
            anything fails (for example when no documents were processed).
        """
        if not question or not question.strip():
            return "Please enter a question."

        temp_table_created = False
        try:
            chunks = pxt.get_table('unified_app.chunks')
            sim = chunks.text.similarity(question)
            relevant_chunks = (
                chunks.order_by(sim, asc=False)
                .limit(5)
                .select(chunks.text)
                .collect()
            )
            context = "\n\n".join(chunk['text'] for chunk in relevant_chunks)

            temp_table = pxt.create_table(
                'unified_app.temp_response',
                {
                    'question': pxt.StringType(),
                    'context': pxt.StringType()
                }
            )
            temp_table_created = True

            temp_table.insert([{'question': question, 'context': context}])

            temp_table['response'] = openai.chat_completions(
                messages=[
                    {
                        'role': 'system',
                        'content': 'Answer the question based only on the provided context. If the context doesn\'t contain enough information, say so.'
                    },
                    {
                        'role': 'user',
                        'content': f"Context:\n{context}\n\nQuestion: {question}"
                    }
                ],
                model='gpt-4o-mini-2024-07-18'
            )

            answer = temp_table.select(
                answer=temp_table.response.choices[0].message.content
            ).tail(1)['answer'][0]
            return answer

        except Exception as e:
            return f"Error: {str(e)}"

        finally:
            # Always remove the scratch table. If it were left behind after
            # a failure, the next question would fail when creating it.
            if temp_table_created:
                try:
                    pxt.drop_table('unified_app.temp_response', force=True)
                except Exception as cleanup_error:
                    print(f"Could not drop temp_response table: {cleanup_error}")
|
|
|
|
|
class CallAnalyzer:
    """Transcription and insight extraction for the Call Analysis tab."""

    @staticmethod
    def process_call(video_file):
        """Transcribe a call recording and generate insights about it.

        Args:
            video_file: Path of the uploaded recording, or `None` when
                nothing was uploaded.

        Returns:
            A `(transcript, audio, insights)` tuple. On failure the first
            element is an error message and the other two are `None`.
        """
        if not video_file:
            return "Error processing call: no recording was uploaded.", None, None

        try:
            # Start from a fresh directory, like the other processors do.
            # Without this, the directory may not exist yet and a second
            # analysis would try to create 'unified_app.calls' again.
            # NOTE: this also drops tables created by the other tabs.
            initialize_pixeltable()

            calls = pxt.create_table(
                'unified_app.calls',
                {"video": pxt.VideoType(nullable=True)}
            )

            # Computed columns: video -> audio -> transcription -> text.
            calls['audio'] = extract_audio(calls.video, format='mp3')
            calls['transcription'] = openai.transcriptions(audio=calls.audio, model='whisper-1')
            calls['text'] = calls.transcription.text

            sentences = pxt.create_view(
                'unified_app.sentences',
                calls,
                iterator=StringSplitter.create(text=calls.text, separators='sentence')
            )

            sentences.add_embedding_index('text', string_embed=e5_embed)

            @pxt.udf
            def generate_insights(text: str) -> list[dict]:
                """Build the chat messages asking for insights on `text`."""
                return [
                    {'role': 'system', 'content': 'Analyze this call transcript and provide key insights:'},
                    {'role': 'user', 'content': text}
                ]

            calls['insights_prompt'] = generate_insights(calls.text)
            calls['insights'] = openai.chat_completions(
                messages=calls.insights_prompt,
                model='gpt-4o-mini-2024-07-18'
            ).choices[0].message.content

            # The insert comes after all computed columns are defined.
            calls.insert([{"video": video_file}])

            result = calls.select(calls.text, calls.audio, calls.insights).tail(1)
            return result['text'][0], result['audio'][0], result['insights'][0]

        except Exception as e:
            return f"Error processing call: {str(e)}", None, None
|
|
|
|
|
class VideoSearcher:
    """Frame indexing and similarity search for the Video Search tab."""

    @staticmethod
    def process_video(video_file):
        """Extract frames from a video and index them for search.

        Args:
            video_file: Uploaded file (object exposing a `.name` path, or a
                plain path string), or `None` when nothing was uploaded.

        Returns:
            A status message describing success or the failure reason.
        """
        # Check the upload before resetting anything: previously a missing
        # file wiped the workspace and then failed with an AttributeError.
        if video_file is None:
            return "Error processing video: no video was uploaded."
        video_path = getattr(video_file, 'name', video_file)

        try:
            # NOTE: this drops and recreates the whole 'unified_app'
            # directory, including tables created by the other tabs.
            initialize_pixeltable()
            videos = pxt.create_table('unified_app.videos', {'video': pxt.VideoType()})

            frames = pxt.create_view(
                'unified_app.frames',
                videos,
                iterator=FrameIterator.create(video=videos.video, fps=1)
            )

            # One index serves both text and image queries.
            frames.add_embedding_index('frame', string_embed=str_embed, image_embed=embed_image)
            videos.insert([{'video': video_path}])

            return "Video processed and indexed for search."
        except Exception as e:
            return f"Error processing video: {str(e)}"

    @staticmethod
    def search_video(search_type, text_query=None, image_query=None):
        """Return up to five frames most similar to the query.

        Args:
            search_type: `"Text"` or `"Image"`; selects which query is used.
            text_query: Query text, used when `search_type` is `"Text"`.
            image_query: Query image, used when `search_type` is `"Image"`.

        Returns:
            A list of matching frames. The list is empty when the query is
            missing, the search type is unknown, or the search fails.
        """
        # Resolve the query first so that invalid input never touches
        # Pixeltable.
        if search_type == "Text" and text_query:
            query = text_query
        elif search_type == "Image" and image_query is not None:
            query = image_query
        else:
            return []

        try:
            frames = pxt.get_table('unified_app.frames')
            sim = frames.frame.similarity(query)
            results = frames.order_by(sim, asc=False).limit(5).select(frames.frame).collect()
            return [row['frame'] for row in results]
        except Exception as e:
            print(f"Search error: {str(e)}")
            return []
|
|
|
|
|
def create_interface():
    """Build the Gradio UI for the three tools and wire up its events.

    The page contains an introduction, three tabs (Document Chat, Call
    Analysis, Video Search), a list of related apps and a footer. Buttons
    are connected to `DocumentProcessor`, `CallAnalyzer` and `VideoSearcher`.

    Returns:
        The assembled `gr.Blocks` application; it is not launched here.
    """
    # NOTE(review): emoji in the labels below were restored from mis-encoded
    # text. Where the garbled form was ambiguous, an emoji matching the label
    # was chosen - confirm against the intended design.
    with gr.Blocks(theme=gr.themes.Base()) as demo:

        # ---- Header and introduction -----------------------------------
        gr.HTML(
            """
            <div style="text-align: left; margin-bottom: 1rem;">
                <img src="https://raw.githubusercontent.com/pixeltable/pixeltable/main/docs/source/data/pixeltable-logo-large.png" alt="Pixeltable" style="max-width: 150px;" />
            </div>
            """
        )

        gr.Markdown(
            """
            # Multimodal Powerhouse
            """
        )

        gr.HTML(
            """
            <p>
                <a href="https://github.com/pixeltable/pixeltable" target="_blank" style="color: #F25022; text-decoration: none; font-weight: bold;">Pixeltable</a>
                is a declarative interface for working with text, images, embeddings, and video, enabling you to store, transform, index, and iterate on data.
            </p>

            <div style="background-color: #E5DDD4; border: 1px solid #e9ecef; border-radius: 8px; padding: 15px; margin: 15px 0;">
                <strong>⚠️ Note:</strong> This app runs best with GPU. For optimal performance, consider
                <a href="https://huggingface.co/spaces/Pixeltable/Multimodal-Processing-Suite?duplicate=true" target="_blank" style="color: #F25022; text-decoration: none; font-weight: bold;">duplicating this space</a>
                to run locally or with better computing resources.
            </div>
            """
        )

        with gr.Row():
            with gr.Column():
                with gr.Accordion("🎯 What This App Does", open=False):
                    gr.Markdown("""
                    1. 📚 **Document Processing**
                       * Chat with your documents using RAG
                       * Process multiple document formats
                       * Extract key insights

                    2. 🎥 **Video Analysis**
                       * Text and image-based video search
                       * Frame extraction and indexing
                       * Visual content discovery

                    3. 🎙️ **Call Analysis**
                       * Automatic transcription
                       * Key insight extraction
                       * Audio processing
                    """)

            with gr.Column():
                with gr.Accordion("⚙️ How It Works", open=False):
                    gr.Markdown("""
                    1. 🔄 **Data Processing**
                       * Chunking and indexing documents
                       * Embedding generation for search
                       * Multi-modal data handling

                    2. 🤖 **AI Integration**
                       * LLM-powered analysis
                       * Speech-to-text conversion
                       * Semantic search capabilities

                    3. 📊 **Storage & Retrieval**
                       * Efficient data organization
                       * Quick content retrieval
                       * Structured data management
                    """)

        # ---- Tool tabs ---------------------------------------------------
        with gr.Tabs():

            with gr.TabItem("📚 Document Chat"):
                with gr.Row():
                    with gr.Column():
                        doc_files = gr.File(label="Upload Documents", file_count="multiple")
                        chunk_size = gr.Slider(
                            minimum=100,
                            maximum=500,
                            value=CHUNK_SIZE_DEFAULT,
                            label="Chunk Size"
                        )
                        chunk_type = gr.Dropdown(
                            choices=["token_limit", "char_limit", "sentence", "paragraph"],
                            value="token_limit",
                            label="Chunking Method"
                        )
                        process_docs_btn = gr.Button("Process Documents")
                        process_status = gr.Textbox(label="Status")
                    with gr.Column():
                        chatbot = gr.Chatbot(label="Document Chat")
                        msg = gr.Textbox(label="Ask a question")
                        send_btn = gr.Button("Send")

            with gr.TabItem("🎙️ Call Analysis"):
                with gr.Row():
                    with gr.Column():
                        call_upload = gr.Video(label="Upload Call Recording")
                        analyze_btn = gr.Button("Analyze Call")
                    with gr.Column():
                        with gr.Tabs():
                            with gr.TabItem("📝 Transcript"):
                                transcript = gr.Textbox(label="Transcript", lines=10)
                            with gr.TabItem("💡 Insights"):
                                insights = gr.Textbox(label="Key Insights", lines=10)
                            with gr.TabItem("🔊 Audio"):
                                audio_output = gr.Audio(label="Extracted Audio")

            with gr.TabItem("🎥 Video Search"):
                with gr.Row():
                    with gr.Column():
                        video_upload = gr.File(label="Upload Video")
                        process_video_btn = gr.Button("Process Video")
                        video_status = gr.Textbox(label="Processing Status")
                        search_type = gr.Radio(
                            choices=["Text", "Image"],
                            label="Search Type",
                            value="Text"
                        )
                        text_input = gr.Textbox(label="Text Query")
                        image_input = gr.Image(label="Image Query", type="pil", visible=False)
                        search_btn = gr.Button("Search")
                    with gr.Column():
                        results_gallery = gr.Gallery(label="Search Results")

        # ---- Event handlers ------------------------------------------------
        def document_chat(message, chat_history):
            """Append the answer for `message` to the chat and clear the input."""
            # The chatbot value can be None before the first exchange.
            chat_history = chat_history or []
            # Do not spend an LLM call on an empty message.
            if not message or not message.strip():
                return "", chat_history
            bot_message = DocumentProcessor.get_document_answer(message)
            chat_history.append((message, bot_message))
            return "", chat_history

        def update_search_type(choice):
            """Show only the query input that matches the selected search type."""
            return {
                text_input: gr.update(visible=choice == "Text"),
                image_input: gr.update(visible=choice == "Image")
            }

        process_docs_btn.click(
            DocumentProcessor.process_documents,
            inputs=[doc_files, chunk_size, chunk_type],
            outputs=[process_status]
        )

        send_btn.click(
            document_chat,
            inputs=[msg, chatbot],
            outputs=[msg, chatbot]
        )

        # Output order matches process_call's (text, audio, insights) tuple.
        analyze_btn.click(
            CallAnalyzer.process_call,
            inputs=[call_upload],
            outputs=[transcript, audio_output, insights]
        )

        process_video_btn.click(
            VideoSearcher.process_video,
            inputs=[video_upload],
            outputs=[video_status]
        )

        search_type.change(
            update_search_type,
            search_type,
            [text_input, image_input]
        )

        search_btn.click(
            VideoSearcher.search_video,
            inputs=[search_type, text_input, image_input],
            outputs=[results_gallery]
        )

        # ---- Related apps --------------------------------------------------
        gr.Markdown("## 🚀 Explore More Pixeltable Apps")

        with gr.Row():
            with gr.Column():
                gr.HTML(
                    """
                    <div style="border: 1px solid #ddd; padding: 15px; border-radius: 8px; margin-bottom: 10px;">
                        <h3>📚 Document & Text Processing</h3>
                        <ul style="list-style-type: none; padding-left: 0;">
                            <li style="margin-bottom: 10px;">
                                <a href="https://huggingface.co/spaces/Pixeltable/Multi-LLM-RAG-with-Groundtruth-Comparison" target="_blank" style="color: #F25022; text-decoration: none;">
                                    🤖 Multi-LLM RAG Comparison
                                </a>
                            </li>
                            <li style="margin-bottom: 10px;">
                                <a href="https://huggingface.co/spaces/Pixeltable/Document-to-Audio-Synthesis" target="_blank" style="color: #F25022; text-decoration: none;">
                                    🔊 Document to Audio Synthesis
                                </a>
                            </li>
                            <li style="margin-bottom: 10px;">
                                <a href="https://huggingface.co/spaces/Pixeltable/Prompt-Engineering-and-LLM-Studio" target="_blank" style="color: #F25022; text-decoration: none;">
                                    💡 Prompt Engineering Studio
                                </a>
                            </li>
                        </ul>
                    </div>
                    """
                )

            with gr.Column():
                gr.HTML(
                    """
                    <div style="border: 1px solid #ddd; padding: 15px; border-radius: 8px; margin-bottom: 10px;">
                        <h3>🎥 Video & Audio Processing</h3>
                        <ul style="list-style-type: none; padding-left: 0;">
                            <li style="margin-bottom: 10px;">
                                <a href="https://huggingface.co/spaces/Pixeltable/video-to-social-media-post-generator" target="_blank" style="color: #F25022; text-decoration: none;">
                                    📱 Social Media Post Generator
                                </a>
                            </li>
                            <li style="margin-bottom: 10px;">
                                <a href="https://huggingface.co/spaces/Pixeltable/Call-Analysis-AI-Tool" target="_blank" style="color: #F25022; text-decoration: none;">
                                    🎙️ Call Analysis Tool
                                </a>
                            </li>
                            <li style="margin-bottom: 10px;">
                                <a href="https://huggingface.co/spaces/Pixeltable/object-detection-in-videos-with-yolox" target="_blank" style="color: #F25022; text-decoration: none;">
                                    🔍 Video Object Detection
                                </a>
                            </li>
                        </ul>
                    </div>
                    """
                )

            with gr.Column():
                gr.HTML(
                    """
                    <div style="border: 1px solid #ddd; padding: 15px; border-radius: 8px; margin-bottom: 10px;">
                        <h3>🎮 Interactive Applications</h3>
                        <ul style="list-style-type: none; padding-left: 0;">
                            <li style="margin-bottom: 10px;">
                                <a href="https://huggingface.co/spaces/Pixeltable/AI-RPG-Adventure" target="_blank" style="color: #F25022; text-decoration: none;">
                                    🎲 AI RPG Adventure
                                </a>
                            </li>
                            <li style="margin-bottom: 10px;">
                                <a href="https://huggingface.co/spaces/Pixeltable/AI-Financial-Analysis-Platform" target="_blank" style="color: #F25022; text-decoration: none;">
                                    📈 Financial Analysis Platform
                                </a>
                            </li>
                        </ul>
                    </div>
                    """
                )

        # ---- Footer --------------------------------------------------------
        gr.HTML(
            """
            <div style="margin-top: 2rem; padding-top: 1rem; border-top: 1px solid #e5e7eb;">
                <div style="display: flex; justify-content: space-between; align-items: center; flex-wrap: wrap; gap: 1rem;">
                    <div style="flex: 1;">
                        <h4 style="margin: 0; color: #374151;">🚀 Built with Pixeltable</h4>
                        <p style="margin: 0.5rem 0; color: #6b7280;">
                            Open Source AI Data infrastructure.
                        </p>
                    </div>
                    <div style="flex: 1;">
                        <h4 style="margin: 0; color: #374151;">🔗 Resources</h4>
                        <div style="display: flex; gap: 1.5rem; margin-top: 0.5rem;">
                            <a href="https://github.com/pixeltable/pixeltable" target="_blank" style="color: #4F46E5; text-decoration: none;">
                                💻 GitHub
                            </a>
                            <a href="https://docs.pixeltable.com" target="_blank" style="color: #4F46E5; text-decoration: none;">
                                📚 Documentation
                            </a>
                            <a href="https://huggingface.co/Pixeltable" target="_blank" style="color: #4F46E5; text-decoration: none;">
                                🤗 Hugging Face
                            </a>
                        </div>
                    </div>
                </div>
                <p style="margin: 1rem 0 0; text-align: center; color: #9CA3AF; font-size: 0.875rem;">
                    © 2024 Pixeltable | Apache License 2.0
                </p>
            </div>
            """
        )

    return demo
|
|
|
if __name__ == "__main__":
    # Ask for the OpenAI key (if it is not set) before the UI is built.
    init_api_keys()

    demo = create_interface()
    # `allowed_paths` lets Gradio serve files under the media directory;
    # `show_api=False` is passed through to `launch` unchanged.
    launch_options = {
        "allowed_paths": [PIXELTABLE_MEDIA_DIR],
        "show_api": False,
    }
    demo.launch(**launch_options)