import gradio as gr
import pandas as pd
import io
import base64
import uuid
import pixeltable as pxt
from pixeltable.iterators import DocumentSplitter
import numpy as np
from pixeltable.functions.huggingface import sentence_transformer
from pixeltable.functions import openai
from pixeltable.functions.fireworks import chat_completions as f_chat_completions
from pixeltable.functions.mistralai import chat_completions
from gradio.themes import Monochrome
import os
import getpass

"""## Store API Keys"""

if 'OPENAI_API_KEY' not in os.environ:
    os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API key:')

if 'FIREWORKS_API_KEY' not in os.environ:
    os.environ['FIREWORKS_API_KEY'] = getpass.getpass('Fireworks API Key:')

if 'MISTRAL_API_KEY' not in os.environ:
    os.environ['MISTRAL_API_KEY'] = getpass.getpass('Mistral AI API Key:')

"""Gradio Application"""

def process_files(ground_truth_file, pdf_files, chunk_limit, chunk_separator,
                  show_question, show_correct_answer, show_gpt4omini,
                  show_llamav3p23b, show_mistralsmall, progress=gr.Progress()):
    # Ensure a clean slate for the demo by removing and recreating the 'rag_demo' directory
    progress(0, desc="Initializing...")
    pxt.drop_dir('rag_demo', force=True)
    pxt.create_dir('rag_demo')

    # Process the ground truth file, which contains questions and correct answers.
    # Import as CSV or Excel depending on the file extension.
    if ground_truth_file.name.endswith('.csv'):
        queries_t = pxt.io.import_csv('rag_demo.queries', ground_truth_file.name)
    else:
        queries_t = pxt.io.import_excel('rag_demo.queries', ground_truth_file.name)

    progress(0.2, desc="Processing documents...")

    # Create a table to store the uploaded PDF documents
    documents_t = pxt.create_table(
        'rag_demo.documents',
        {'document': pxt.Document}
    )

    # Insert the PDF files into the documents table
    documents_t.insert(
        {'document': file.name} for file in pdf_files if file.name.endswith('.pdf')
    )

    # Create a view that splits the documents into smaller chunks
    chunks_t = pxt.create_view(
        'rag_demo.chunks',
        documents_t,
        iterator=DocumentSplitter.create(
            document=documents_t.document,
            separators=chunk_separator,
            limit=chunk_limit if chunk_separator in ["token_limit", "char_limit"] else None
        )
    )

    progress(0.4, desc="Generating embeddings...")

    # Index the chunk text so questions can be matched to passages by similarity
    chunks_t.add_embedding_index(
        'text',
        idx_name='minilm_idx',
        string_embed=sentence_transformer.using(model_id='sentence-transformers/all-MiniLM-L12-v2')
    )

    # Assemble the retrieved passages and the question into a single prompt string
    @pxt.udf
    def create_prompt(top_k_list: list[dict], question: str) -> str:
        if not top_k_list:
            return f"QUESTION:\n{question}"
        concat_top_k = '\n\n'.join(
            elt['text'] for elt in reversed(top_k_list) if elt and 'text' in elt
        )
        return f'''PASSAGES:\n{concat_top_k}\n\nQUESTION:\n{question}'''

    # Define a query function to retrieve the top-k most similar chunks for a given question
    @chunks_t.query
    def top_k(query_text: str):
        sim = chunks_t.text.similarity(query_text)
        return (
            chunks_t.order_by(sim, asc=False)
            .select(chunks_t.text, sim=sim)
            .limit(5)
        )
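    # A minimal sketch, assuming the index above is in place, of how retrieval
    # can be sanity-checked outside the pipeline (the question string is
    # illustrative only):
    #
    #   sim = chunks_t.text.similarity('What were the total liabilities?')
    #   print(chunks_t.order_by(sim, asc=False).select(chunks_t.text, sim=sim).limit(5).collect())
    #
    # Each retrieved row carries 'text' and 'sim' keys, which is the shape
    # `create_prompt` above unpacks via elt['text'].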
    # Wrap the prompt in the chat-message structure the LLM APIs expect
    @pxt.udf
    def create_messages(prompt: str) -> list[dict]:
        return [
            {
                'role': 'system',
                'content': 'Read the following passages and answer the question based on their contents.'
            },
            {
                'role': 'user',
                'content': prompt
            }
        ]

    # First add the context and prompt columns
    queries_t.add_computed_column(question_context=chunks_t.queries.top_k(queries_t.question))
    queries_t.add_computed_column(prompt=create_prompt(
        queries_t.question_context,
        queries_t.question
    ))

    # Add the messages column
    queries_t.add_computed_column(messages=create_messages(queries_t.prompt))

    # Then add one response column per model, all driven by the same messages
    queries_t.add_computed_column(response=openai.chat_completions(
        model='gpt-4o-mini-2024-07-18',
        messages=queries_t.messages,
        max_tokens=300,
        top_p=0.9,
        temperature=0.7
    ))
    queries_t.add_computed_column(response_2=f_chat_completions(
        messages=queries_t.messages,
        model='accounts/fireworks/models/llama-v3p2-3b-instruct',
        max_tokens=300,
        top_p=0.9,
        temperature=0.7
    ))
    queries_t.add_computed_column(response_3=chat_completions(
        messages=queries_t.messages,
        model='mistral-small-latest',
        max_tokens=300,
        top_p=0.9,
        temperature=0.7
    ))

    # Extract the answer text from each API response
    queries_t.add_computed_column(gpt4omini=queries_t.response.choices[0].message.content)
    queries_t.add_computed_column(llamav3p23b=queries_t.response_2.choices[0].message.content)
    queries_t.add_computed_column(mistralsmall=queries_t.response_3.choices[0].message.content)

    # Prepare the output dataframe with the selected columns
    columns_to_show = []
    if show_question:
        columns_to_show.append(queries_t.question)
    if show_correct_answer:
        columns_to_show.append(queries_t.correct_answer)
    if show_gpt4omini:
        columns_to_show.append(queries_t.gpt4omini)
    if show_llamav3p23b:
        columns_to_show.append(queries_t.llamav3p23b)
    if show_mistralsmall:
        columns_to_show.append(queries_t.mistralsmall)

    try:
        # Return the output dataframe for display
        return queries_t.select(*columns_to_show).collect().to_pandas()
    except Exception as e:
        return f"An error occurred: {str(e)}", None


def save_dataframe_as_csv(data):
    print(f"Type of data: {type(data)}")
    if isinstance(data, pd.DataFrame):
        print(f"Shape of DataFrame: {data.shape}")
    if isinstance(data, pd.DataFrame) and not data.empty:
        # Write to a uniquely named file under tmp/ and return the path for Gradio
        filename = f"results_{uuid.uuid4().hex[:8]}.csv"
        filepath = os.path.join('tmp', filename)
        os.makedirs('tmp', exist_ok=True)
        data.to_csv(filepath, index=False)
        return filepath
    return None
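# A minimal usage sketch for the helper above, assuming a download control is
# wired up inside the Blocks layout below (these component names are
# illustrative, not the app's actual ones):
#
#   results = gr.Dataframe(label="Results")
#   csv_file = gr.File(label="Download CSV")
#   download_btn = gr.Button("Export to CSV")
#   download_btn.click(fn=save_dataframe_as_csv, inputs=[results], outputs=[csv_file])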
# Gradio interface
with gr.Blocks(theme=Monochrome()) as demo:
    gr.Markdown(
        """
Pixeltable is a declarative interface for working with text, images, embeddings, and even video, enabling you to store, transform, index, and iterate on data.
""" ) # Add the disclaimer gr.HTML( """