# NOTE(review): the lines that were here ("Spaces:", "Build error", file size,
# commit hash, and a line-number gutter) were page-scraping artifacts, not
# Python — commented out / removed so the module parses.
import os
import gradio as gr
import pandas as pd
import numpy as np
import chromadb
from chromadb.config import Settings
from io import StringIO
from sentence_transformers import SentenceTransformer
import openai
import plotly.express as px
from sklearn.manifold import TSNE
# Initialize Chroma client with DuckDB and Parquet for persistence
# NOTE(review): `chroma_db_impl="duckdb+parquet"` is a legacy (pre-0.4) Chroma
# setting; newer releases use chromadb.PersistentClient(path=...) — confirm the
# pinned chromadb version before upgrading.
chroma_client = chromadb.Client(Settings(
    chroma_db_impl="duckdb+parquet",
    persist_directory="./chroma_db"  # on-disk directory for persisted collections
))
# Model Configuration for Dynamic Dropdown
# Maps each selectable model name to its Azure OpenAI deployment endpoint and
# the API key read from the environment (keys may be None if the corresponding
# environment variable is unset — the request will then fail at call time).
model_config = {
    "gpt-4": {
        "endpoint": "https://roger-m38jr9pd-eastus2.openai.azure.com/openai/deployments/gpt-4/chat/completions?api-version=2024-08-01-preview",
        "api_key": os.getenv("GPT4_API_KEY")
    },
    "gpt-4o": {
        "endpoint": "https://roger-m38jr9pd-eastus2.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-08-01-preview",
        "api_key": os.getenv("GPT4O_API_KEY")
    },
    "gpt-35-turbo": {
        "endpoint": "https://rogerkoranteng.openai.azure.com/openai/deployments/gpt-35-turbo/chat/completions?api-version=2024-08-01-preview",
        "api_key": os.getenv("GPT35_TURBO_API_KEY")
    },
    "gpt-4-32k": {
        "endpoint": "https://roger-m38orjxq-australiaeast.openai.azure.com/openai/deployments/gpt-4-32k/chat/completions?api-version=2024-08-01-preview",
        "api_key": os.getenv("GPT4_32K_API_KEY")
    }
}
# Function to process uploaded CSV
def process_csv_text(temp_file):
    """Load an uploaded CSV into a DataFrame.

    Accepts either the raw CSV text (str) or a Gradio temp-file object with a
    `.name` path. Returns the DataFrame and a Dropdown update whose choices are
    the frame's column names.
    """
    if isinstance(temp_file, str):
        frame = pd.read_csv(StringIO(temp_file))
    else:
        frame = pd.read_csv(temp_file.name, header='infer', sep=',')
    column_names = list(frame.columns)
    return frame, gr.Dropdown.update(choices=column_names)
# Insert or update ChromaDB with embeddings
def insert_or_update_chroma(col, table, model_name, similarity_metric, client=chroma_client):
    """Embed the values of `table[col]` and (re)build the 'my_collection' collection.

    Parameters:
        col: column of `table` whose text values are embedded.
        table: pandas DataFrame holding the uploaded CSV.
        model_name: SentenceTransformer model identifier.
        similarity_metric: 'cosine' or 'l2', stored as the HNSW space.
        client: Chroma client (defaults to the module-level persistent client).

    Returns a human-readable status string for the UI textbox.
    """
    # Build the embedder once and reuse it for both creation attempts
    # (the original constructed two identical SentenceTransformer instances).
    embedder = SentenceTransformer(model_name)
    try:
        collection = client.create_collection(
            name="my_collection",
            embedding_function=embedder,
            metadata={"hnsw:space": similarity_metric})
    except Exception:
        # create_collection raises when the collection already exists;
        # drop and recreate so the metric/model choices take effect.
        print("Collection exists, deleting it")
        client.delete_collection(name='my_collection')
        collection = client.create_collection(
            name="my_collection",
            embedding_function=embedder,
            metadata={"hnsw:space": similarity_metric})
    # `collection` is guaranteed bound here (either branch assigned it or the
    # exception propagated) — the original's `if collection:` guard could hit a
    # NameError and silently return None on double failure.
    try:
        collection.add(
            documents=list(table[col]),
            metadatas=[{"source": i} for i in range(len(table))],
            ids=[str(i + 1) for i in range(len(table))]  # Chroma ids must be strings
        )
        return "Embedding calculations and insertions successful"
    except Exception as e:
        # Surface the failure reason instead of swallowing it.
        return f"Error in embedding calculations: {e}"
# Show plot with embeddings using t-SNE
def show_fig():
    """Fetch all stored embeddings, project them to 2-D with t-SNE, and plot.

    Returns the plotly scatter figure and the (n, 2) coordinate array, which
    the UI stores in the `tsne` State.
    """
    collection = chroma_client.get_collection(name="my_collection")
    payload = collection.get(include=['embeddings', 'documents'])
    frame = pd.DataFrame({
        'text': payload['documents'],
        'embedding': payload['embeddings'],
    })
    matrix = np.array(frame['embedding'].tolist())
    reducer = TSNE(n_components=2, random_state=42)  # fixed seed for reproducible layout
    coords = reducer.fit_transform(matrix)
    frame['tsne_x'] = coords[:, 0]
    frame['tsne_y'] = coords[:, 1]
    fig = px.scatter(frame, x='tsne_x', y='tsne_y', hover_name='text')
    return fig, coords
# Show test string figure
def show_test_string_fig(test_string, tsne, model_name, similarity_metric):
    """Embed `test_string`, add it to the collection, and plot it among the rest.

    Fix: the original called `tsne.transform(...)`, but scikit-learn's TSNE has
    no `transform` method — and the `tsne` State actually holds the 2-D
    coordinate ARRAY returned by show_fig, not an estimator — so that call
    always raised. We re-fit t-SNE on the full embedding set (test string
    included) instead; `tsne` is passed through unchanged to keep the UI
    State wiring intact.
    """
    collection = chroma_client.get_collection(
        name="my_collection",
        embedding_function=SentenceTransformer(model_name))
    # Upsert the probe string under a fixed id so repeated clicks don't pile up.
    collection.add(
        documents=[test_string],
        metadatas=[{"source": 'test'}],
        ids=['test_sample']
    )
    embeddings = collection.get(include=['embeddings', 'documents'])
    df = pd.DataFrame({
        'text': embeddings['documents'],
        'embedding': embeddings['embeddings'],
        'set': ['orig' if document != test_string else 'test_string'
                for document in embeddings["documents"]]
    })
    embeddings_np = np.array(df['embedding'].tolist())
    # Re-fit because TSNE is non-parametric; same seed as show_fig for stability.
    transformed = TSNE(n_components=2, random_state=42).fit_transform(embeddings_np)
    df['tsne_x'] = transformed[:, 0]
    df['tsne_y'] = transformed[:, 1]
    fig = px.scatter(df, x='tsne_x', y='tsne_y', hover_name='text', color='set')
    return fig, tsne
# Function to interact with OpenAI's Azure API
def ask_gpt(message, messages_history, embedding_model, system_prompt, temperature, max_tokens, chatgpt_model):
    """Send `message` (augmented with similar docs from Chroma) to the chosen model.

    Parameters mirror the Gradio controls; returns the assistant's reply text
    and the updated message history.

    Fixes vs. original:
      * the `headers` dict with the api-key was built but never passed to the
        SDK, so the per-model key was never applied — it is now set on the
        openai module before the call;
      * the assistant reply was never appended to `messages_history`, so every
        follow-up turn lost the model's previous answers.
    """
    if len(messages_history) < 1:
        messages_history = [{"role": "system", "content": system_prompt}]
    model_info = model_config[chatgpt_model]
    # Apply the per-deployment credentials.
    # NOTE(review): full Azure routing also needs api_type/api_base/api_version
    # configured from model_info["endpoint"] — confirm against the pinned
    # openai SDK version before relying on multi-deployment dispatch.
    openai.api_key = model_info["api_key"]
    message = retrieve_similar(message, embedding_model)
    messages_history.append({"role": "user", "content": message})
    response = openai.ChatCompletion.create(
        model=chatgpt_model,
        messages=messages_history,
        temperature=temperature,
        max_tokens=max_tokens
    )
    reply = response['choices'][0]['message']['content']
    # Keep the assistant turn so subsequent calls retain full context.
    messages_history.append({"role": "assistant", "content": reply})
    return reply, messages_history
# Function to retrieve similar questions from ChromaDB
def retrieve_similar(prompt, embedding_model, client=chroma_client):
    """Prepend the 10 most similar stored documents to `prompt` as context.

    Queries 'my_collection' with the prompt and returns a string of the form
    "Information: \\n1. doc\\n2. doc...\\nQuestion: <prompt>". If no documents
    come back, only "\\nQuestion: <prompt>" is returned (matching the original
    accumulator behavior).
    """
    collection = client.get_collection(
        name="my_collection",
        embedding_function=SentenceTransformer(model_name=embedding_model))
    results = collection.query(query_texts=prompt, n_results=10)
    documents = results['documents'][0]
    if documents:
        numbered = '\n'.join(f'{idx}. {doc}' for idx, doc in enumerate(documents, start=1))
        additional_context = 'Information: \n' + numbered
    else:
        additional_context = ''
    return additional_context + '\nQuestion: ' + prompt
# Gradio App Setup
with gr.Blocks() as demo:
    # Tab 1: Upload CSV and Display Data
    with gr.Tab("Upload data"):
        upload_button = gr.UploadButton(label="Upload csv", file_types=['.csv'], file_count="single")
        # NOTE(review): `max_rows` / `overflow_row_behaviour` are Gradio 3.x-era
        # Dataframe arguments removed in later releases — confirm the pinned version.
        table = gr.Dataframe(type="pandas", max_rows='20', overflow_row_behaviour='paginate', interactive=True)
        cols = gr.Dropdown(choices=[], label='Dataframe columns')
        upload_button.upload(fn=process_csv_text, inputs=upload_button, outputs=[table, cols], api_name="upload_csv")
    # Tab 2: ChromaDB, Embeddings, and Plotting
    with gr.Tab("Select Column and insert embeddings to ChromaDb"):
        with gr.Row():
            gr.Markdown("<br>")
        with gr.Row():
            # NOTE(review): this re-binds the name `cols`, shadowing the Tab 1
            # dropdown — the embedding click handler below therefore reads this
            # placeholder dropdown, not the one populated by the CSV upload.
            # Verify which component is intended.
            cols = gr.Dropdown(choices=['text_column_1_placeholder'], label='Dataframe columns')
        with gr.Row():
            embedding_model = gr.Dropdown(value='all-MiniLM-L6-v2', choices=['all-MiniLM-L6-v2', 'intfloat/e5-small-v2', 'intfloat/e5-base-v2', 'intfloat/e5-large-v2','paraphrase-multilingual-MiniLM-L12-v2'], label='Embedding model to use')
            similarity_metric = gr.Dropdown(value='cosine', choices=['cosine', 'l2'], label='Similarity metric to use')
        with gr.Row():
            embedding_button = gr.Button(value="Insert or update rows from selected column to embeddings db")
            text = gr.Textbox(label='Process status for Chroma', placeholder='This will be updated once you click "Process status for Chroma"')
        with gr.Row():
            show_embeddings_button = gr.Button(value="Calculate 2d values from embeddings and show scatter plot")
            embeddings_plot = gr.Plot()
        with gr.Row():
            # Holds the 2-D coordinate array returned by show_fig (not a fitted
            # TSNE estimator) — see show_test_string_fig.
            tsne = gr.State(value=None)
            test_string = gr.Textbox(label='test string to try to embed', value="Insert test string here")
        with gr.Row():
            calculate_2d_repr_button = gr.Button(value="See where text string is in 2d")
            embeddings_plot_with_text_string = gr.Plot()
        embedding_button.click(insert_or_update_chroma, inputs=[cols, table, embedding_model, similarity_metric], outputs=[text])
        show_embeddings_button.click(show_fig, inputs=[], outputs=[embeddings_plot, tsne])
        calculate_2d_repr_button.click(show_test_string_fig, inputs=[test_string, tsne, embedding_model, similarity_metric], outputs=[embeddings_plot_with_text_string, tsne])
    # Tab 3: Chat with GPT Models
    with gr.Tab("Chat"):
        system_prompt = gr.Textbox(value="You are a helpful assistant.", label="System Message")
        chatgpt_model = gr.Dropdown(value="gpt-4", choices=list(model_config.keys()), label="ChatGPT Model to Use")
        temperature = gr.Slider(minimum=0, maximum=2, step=0.1, value=0.7, label="Temperature")
        max_tokens = gr.Slider(minimum=50, maximum=2000, step=50, value=300, label="Max Tokens")
        chatbot = gr.Chatbot(label="ChatGPT Chat")
        clear_button = gr.Button("Clear Chat History")
        msg = gr.Textbox()
        msg_log = gr.Textbox("Message history will be visible here", label='Message history')
        # NOTE(review): ask_gpt takes 7 inputs but only [msg, chatbot] are wired
        # here, so this handler will fail with missing arguments at runtime —
        # the full input list (embedding_model, system_prompt, temperature,
        # max_tokens, chatgpt_model) needs to be supplied. Verify intended wiring.
        msg.submit(ask_gpt, [msg, chatbot], [msg, chatbot])
        # NOTE(review): gr.Chatbot has no `submit` event in standard Gradio
        # releases; this line is expected to raise when the Blocks graph is
        # built — confirm or remove.
        chatbot.submit(ask_gpt, [chatbot, system_prompt, embedding_model, temperature, max_tokens, chatgpt_model], [chatbot, system_prompt])
        clear_button.click(fn=lambda: None, inputs=None, outputs=[chatbot])
# Launch Gradio interface
demo.launch()
# (trailing page-scraping artifact removed)