File size: 9,413 Bytes
3506b46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
import os
import gradio as gr
import pandas as pd
import numpy as np
import chromadb
from chromadb.config import Settings
from io import StringIO
from sentence_transformers import SentenceTransformer
import openai
import plotly.express as px
from sklearn.manifold import TSNE

# Shared Chroma client: DuckDB + Parquet backend, persisted under ./chroma_db
_chroma_settings = Settings(
    persist_directory="./chroma_db",
    chroma_db_impl="duckdb+parquet",
)
chroma_client = chromadb.Client(_chroma_settings)

# Chat model registry. The keys are offered in the chat-model dropdown; each
# entry pairs a chat-completions endpoint URL with an API key that is read
# from the environment (None when the variable is not set).
_MODEL_ENDPOINTS = (
    (
        "gpt-4",
        "https://roger-m38jr9pd-eastus2.openai.azure.com/openai/deployments/gpt-4/chat/completions?api-version=2024-08-01-preview",
        "GPT4_API_KEY",
    ),
    (
        "gpt-4o",
        "https://roger-m38jr9pd-eastus2.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-08-01-preview",
        "GPT4O_API_KEY",
    ),
    (
        "gpt-35-turbo",
        "https://rogerkoranteng.openai.azure.com/openai/deployments/gpt-35-turbo/chat/completions?api-version=2024-08-01-preview",
        "GPT35_TURBO_API_KEY",
    ),
    (
        "gpt-4-32k",
        "https://roger-m38orjxq-australiaeast.openai.azure.com/openai/deployments/gpt-4-32k/chat/completions?api-version=2024-08-01-preview",
        "GPT4_32K_API_KEY",
    ),
)
model_config = {
    deployment: {"endpoint": endpoint_url, "api_key": os.getenv(key_variable)}
    for deployment, endpoint_url, key_variable in _MODEL_ENDPOINTS
}

# Load an uploaded CSV and derive the dropdown choices from its columns
def process_csv_text(temp_file):
    """Parse an uploaded CSV into a DataFrame.

    ``temp_file`` is either raw CSV text (a ``str``) or an object whose
    ``.name`` attribute is the path of the file to read.

    Returns a ``(DataFrame, dropdown update)`` pair; the update sets the
    dropdown choices to the DataFrame's column names.
    """
    if not isinstance(temp_file, str):
        frame = pd.read_csv(temp_file.name, header='infer', sep=',')
    else:
        frame = pd.read_csv(StringIO(temp_file))
    column_names = list(frame.columns)
    return frame, gr.Dropdown.update(choices=column_names)

# Insert or update ChromaDB with embeddings
def insert_or_update_chroma(col, table, model_name, similarity_metric, client=chroma_client):
    """Rebuild the ``my_collection`` Chroma collection from one table column.

    Args:
        col: Name of the column in ``table`` whose values are embedded.
        table: pandas DataFrame holding the uploaded data.
        model_name: sentence-transformers model used to embed the documents.
        similarity_metric: Value stored as the collection's ``hnsw:space``
            metadata (the UI offers ``cosine`` and ``l2``).
        client: Chroma client to operate on; defaults to the shared client.

    Returns:
        A status string for the UI: the success message, or a message
        starting with "Error in embedding calculations" when the column is
        invalid/empty or the insertion fails.

    Raises:
        Whatever the embedding model loader or Chroma raises when the
        collection cannot be (re)created.
    """
    # Local import keeps this function self-contained; chromadb is already a
    # dependency of this file.
    from chromadb.utils import embedding_functions

    if table is None or not col or col not in table.columns:
        return "Error in embedding calculations: choose a column that exists in the uploaded table"

    # Chroma documents must be strings; stringify so numeric columns work too.
    documents = [str(value) for value in table[col]]
    if not documents:
        return "Error in embedding calculations: the selected column is empty"

    # Chroma calls the embedding function with a list of texts. A raw
    # SentenceTransformer model does not accept that (its __call__ is the
    # torch forward pass), so wrap the model in Chroma's adapter. Build it
    # once so the model is not loaded twice on the recreate path.
    embedder = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=model_name)

    def _create():
        return client.create_collection(
            name="my_collection",
            embedding_function=embedder,
            metadata={"hnsw:space": similarity_metric},
        )

    try:
        collection = _create()
    except Exception as exc:
        # Creation normally fails because the collection already exists:
        # drop it and start from scratch. Any other failure resurfaces from
        # the delete/create calls below.
        print(f"Collection exists, deleting it ({exc})")
        client.delete_collection(name='my_collection')
        collection = _create()

    row_count = len(documents)
    try:
        collection.add(
            documents=documents,
            metadatas=[{"source": index} for index in range(row_count)],
            ids=[str(index + 1) for index in range(row_count)],
        )
    except Exception as exc:
        # Keep the best-effort status contract, but surface the cause.
        print(f"Error in embedding calculations: {exc!r}")
        return f"Error in embedding calculations: {exc}"
    return "Embedding calculations and insertions successful"

# Show plot with embeddings using t-SNE
def show_fig():
    """Project every embedding stored in ``my_collection`` to 2-D with t-SNE.

    Returns:
        ``(fig, transformed)``: a Plotly scatter figure whose hover text is
        the document, and the ``(n_documents, 2)`` array of t-SNE
        coordinates. Note that the second item is the projected points, not
        a fitted model.

    Raises:
        gr.Error: If the collection holds fewer than two documents, since a
            t-SNE layout needs at least two points.
    """
    collection = chroma_client.get_collection(name="my_collection")
    stored = collection.get(include=['embeddings', 'documents'])

    documents = list(stored['documents'])
    sample_count = len(documents)
    if sample_count < 2:
        raise gr.Error("At least two embedded rows are needed to draw the t-SNE plot")

    embeddings_np = np.asarray(stored['embeddings'], dtype=float)

    # scikit-learn rejects perplexity >= n_samples, so small tables need a
    # perplexity below the library default of 30.
    perplexity = min(30.0, float(sample_count - 1))
    tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
    transformed = tsne.fit_transform(embeddings_np)

    df = pd.DataFrame({
        'text': documents,
        'tsne_x': transformed[:, 0],
        'tsne_y': transformed[:, 1],
    })

    fig = px.scatter(df, x='tsne_x', y='tsne_y', hover_name='text')
    return fig, transformed

# Show test string figure
def show_test_string_fig(test_string, tsne, model_name, similarity_metric):
    """Plot a test string together with the stored documents in 2-D.

    scikit-learn's ``TSNE`` cannot project new points into an existing
    layout (it has no ``transform`` method), so the layout is recomputed
    over the stored embeddings plus the test string. Point positions can
    therefore differ from the plot produced by ``show_fig``.

    The test string is embedded directly and is NOT written to the
    collection, so it never shows up in later similarity searches and the
    function can be called repeatedly.

    Args:
        test_string: Text to embed and highlight in the plot.
        tsne: Opaque UI state; passed through unchanged.
        model_name: sentence-transformers model used to embed ``test_string``;
            it must be the model the collection was built with.
        similarity_metric: Unused; kept for interface compatibility.

    Returns:
        ``(fig, tsne)``: the Plotly figure coloured by ``set`` ("orig" or
        "test_string") and the unchanged ``tsne`` state.

    Raises:
        gr.Error: If the collection is empty or the embedding dimensions of
            the test string and the stored documents differ.
    """
    # Local import keeps this function self-contained; chromadb is already a
    # dependency of this file.
    from chromadb.utils import embedding_functions

    collection = chroma_client.get_collection(name="my_collection")
    stored = collection.get(include=['embeddings', 'documents'])
    documents = list(stored['documents'])
    if not documents:
        raise gr.Error("Insert embeddings into the collection before plotting a test string")

    embedder = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=model_name)
    test_embedding = np.asarray(embedder([test_string])[0], dtype=float)
    stored_embeddings = np.asarray(stored['embeddings'], dtype=float)

    if stored_embeddings.ndim != 2 or stored_embeddings.shape[1] != test_embedding.shape[0]:
        raise gr.Error(
            "The selected embedding model does not match the one used to build the collection"
        )

    embeddings_np = np.vstack([stored_embeddings, test_embedding])

    # scikit-learn rejects perplexity >= n_samples; clamp for small tables.
    perplexity = min(30.0, float(len(embeddings_np) - 1))
    projector = TSNE(n_components=2, random_state=42, perplexity=perplexity)
    transformed = projector.fit_transform(embeddings_np)

    df = pd.DataFrame({
        'text': documents + [test_string],
        # Label by position, not by text equality, so an original row that
        # happens to equal the test string is still marked as original.
        'set': ['orig'] * len(documents) + ['test_string'],
        'tsne_x': transformed[:, 0],
        'tsne_y': transformed[:, 1],
    })

    fig = px.scatter(df, x='tsne_x', y='tsne_y', hover_name='text', color='set')
    return fig, tsne

# Function to interact with OpenAI's Azure API
def ask_gpt(message, messages_history, embedding_model, system_prompt, temperature, max_tokens, chatgpt_model):
    """Answer ``message`` with the selected Azure OpenAI chat deployment.

    The user message is first augmented with similar documents from ChromaDB
    (see ``retrieve_similar``), then the whole conversation is POSTed to the
    endpoint configured for ``chatgpt_model`` in ``model_config``,
    authenticated with that entry's API key.

    Args:
        message: The user's question.
        messages_history: Prior conversation as a list of
            ``{"role", "content"}`` dicts; empty or ``None`` starts a new
            conversation seeded with ``system_prompt``.
        embedding_model: sentence-transformers model name used for retrieval.
        system_prompt: System message for a new conversation.
        temperature: Sampling temperature sent to the API.
        max_tokens: Completion token limit sent to the API.
        chatgpt_model: Key into ``model_config``.

    Returns:
        ``(reply_text, updated_history)``. The updated history is a new list
        (the argument is not mutated) that ends with the augmented user
        message followed by the assistant reply.

    Raises:
        KeyError: If ``chatgpt_model`` is not in ``model_config``.
        RuntimeError: If no API key is configured or the service returns an
            HTTP error.
    """
    import json
    import urllib.error
    import urllib.request

    model_info = model_config[chatgpt_model]
    if not model_info["api_key"]:
        raise RuntimeError(
            f"No API key configured for model '{chatgpt_model}'; "
            "set the corresponding environment variable"
        )
    headers = {
        "Content-Type": "application/json",
        "api-key": model_info["api_key"]
    }

    # Copy so the caller's list is never modified in place.
    if messages_history:
        history = list(messages_history)
    else:
        history = [{"role": "system", "content": system_prompt}]

    history.append({"role": "user", "content": retrieve_similar(message, embedding_model)})

    payload = json.dumps({
        "messages": history,
        "temperature": temperature,
        "max_tokens": int(max_tokens),
    }).encode("utf-8")
    request = urllib.request.Request(
        model_info["endpoint"], data=payload, headers=headers, method="POST"
    )
    try:
        with urllib.request.urlopen(request, timeout=60) as response:
            body = json.load(response)
    except urllib.error.HTTPError as err:
        detail = err.read().decode("utf-8", errors="replace")
        raise RuntimeError(
            f"Azure OpenAI request failed with HTTP {err.code}: {detail}"
        ) from err

    reply = body['choices'][0]['message']['content']
    # Record the answer so follow-up questions keep the full conversation.
    history.append({"role": "assistant", "content": reply})
    return reply, history

# Function to retrieve similar questions from ChromaDB
def retrieve_similar(prompt, embedding_model, client=chroma_client):
    """Prefix ``prompt`` with the most similar documents from ``my_collection``.

    Args:
        prompt: The user's question.
        embedding_model: sentence-transformers model name used to embed the
            query; it should be the model the collection was built with.
        client: Chroma client to query; defaults to the shared client.

    Returns:
        A string of the form::

            Information: 
            1. <doc>
            2. <doc>
            Question: <prompt>

        containing up to 10 documents. When the collection is empty the
        result is just ``"\\nQuestion: <prompt>"``.
    """
    # Local import keeps this function self-contained; chromadb is already a
    # dependency of this file.
    from chromadb.utils import embedding_functions

    # Chroma needs a callable that maps texts to vectors; its
    # SentenceTransformerEmbeddingFunction adapter provides that (the raw
    # SentenceTransformer class does not accept a ``model_name`` keyword).
    collection = client.get_collection(
        name="my_collection",
        embedding_function=embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name=embedding_model
        ),
    )

    # Never request more neighbours than the collection holds.
    available = collection.count()
    documents = []
    if available > 0:
        results = collection.query(query_texts=[prompt], n_results=min(10, available))
        documents = results['documents'][0]

    numbered = [f"{rank}. {document}" for rank, document in enumerate(documents, start=1)]
    additional_context = ('Information: \n' + '\n'.join(numbered)) if numbered else ''

    return additional_context + '\nQuestion: ' + prompt

# Gradio App Setup
def _load_csv_for_dropdowns(temp_file):
    """Parse the uploaded CSV and refresh both column dropdowns."""
    frame, upload_tab_update = process_csv_text(temp_file)
    # A separate update object per dropdown avoids sharing one mutable dict
    # between two outputs.
    embedding_tab_update = gr.Dropdown.update(choices=list(frame.columns))
    return frame, upload_tab_update, embedding_tab_update


def _format_history(history):
    """Render the OpenAI-style message list as one ``role: content`` line per message."""
    return "\n".join(f"{entry['role']}: {entry['content']}" for entry in history)


def _chat_respond(message, chat_pairs, history, embedding_model, system_prompt,
                  temperature, max_tokens, chatgpt_model):
    """Adapt ``ask_gpt`` to the chat widgets.

    Returns the cleared input box, the chatbot's (user, assistant) pairs, the
    OpenAI-style history kept in session state, and the history log text.
    """
    reply, history = ask_gpt(message, history, embedding_model, system_prompt,
                             temperature, max_tokens, chatgpt_model)
    chat_pairs = list(chat_pairs or []) + [[message, reply]]
    return "", chat_pairs, history, _format_history(history)


with gr.Blocks() as demo:
    # Tab 1: Upload CSV and Display Data
    with gr.Tab("Upload data"):
        upload_button = gr.UploadButton(label="Upload csv", file_types=['.csv'], file_count="single")
        table = gr.Dataframe(type="pandas", max_rows=20, overflow_row_behaviour='paginate', interactive=True)
        uploaded_cols = gr.Dropdown(choices=[], label='Dataframe columns')

    # Tab 2: ChromaDB, Embeddings, and Plotting
    with gr.Tab("Select Column and insert embeddings to ChromaDb"):
        with gr.Row():
            gr.Markdown("<br>")
        with gr.Row():
            cols = gr.Dropdown(choices=['text_column_1_placeholder'], label='Dataframe columns')

        with gr.Row():
            embedding_model = gr.Dropdown(value='all-MiniLM-L6-v2', choices=['all-MiniLM-L6-v2', 'intfloat/e5-small-v2', 'intfloat/e5-base-v2', 'intfloat/e5-large-v2','paraphrase-multilingual-MiniLM-L12-v2'], label='Embedding model to use')
            similarity_metric = gr.Dropdown(value='cosine', choices=['cosine', 'l2'], label='Similarity metric to use')

        with gr.Row():
            embedding_button = gr.Button(value="Insert or update rows from selected column to embeddings db")
            text = gr.Textbox(label='Process status for Chroma', placeholder='This will be updated once you click "Process status for Chroma"')

        with gr.Row():
            show_embeddings_button = gr.Button(value="Calculate 2d values from embeddings and show scatter plot")
            embeddings_plot = gr.Plot()

        with gr.Row():
            tsne = gr.State(value=None)
            test_string = gr.Textbox(label='test string to try to embed', value="Insert test string here")

        with gr.Row():
            calculate_2d_repr_button = gr.Button(value="See where text string is in 2d")
            embeddings_plot_with_text_string = gr.Plot()

        embedding_button.click(insert_or_update_chroma, inputs=[cols, table, embedding_model, similarity_metric], outputs=[text])
        show_embeddings_button.click(show_fig, inputs=[], outputs=[embeddings_plot, tsne])
        calculate_2d_repr_button.click(show_test_string_fig, inputs=[test_string, tsne, embedding_model, similarity_metric], outputs=[embeddings_plot_with_text_string, tsne])

    # Registered only after both dropdowns exist: the upload must also fill
    # the tab-2 dropdown, which is the one that feeds insert_or_update_chroma.
    upload_button.upload(
        fn=_load_csv_for_dropdowns,
        inputs=upload_button,
        outputs=[table, uploaded_cols, cols],
        api_name="upload_csv",
    )

    # Tab 3: Chat with GPT Models
    with gr.Tab("Chat"):
        system_prompt = gr.Textbox(value="You are a helpful assistant.", label="System Message")
        chatgpt_model = gr.Dropdown(value="gpt-4", choices=list(model_config.keys()), label="ChatGPT Model to Use")
        temperature = gr.Slider(minimum=0, maximum=2, step=0.1, value=0.7, label="Temperature")
        max_tokens = gr.Slider(minimum=50, maximum=2000, step=50, value=300, label="Max Tokens")
        chatbot = gr.Chatbot(label="ChatGPT Chat")
        clear_button = gr.Button("Clear Chat History")

        msg = gr.Textbox()
        msg_log = gr.Textbox("Message history will be visible here", label='Message history')

        # The chatbot widget shows (user, assistant) pairs, while ask_gpt
        # works on OpenAI-style role/content dicts; keep the latter in
        # per-session state.
        chat_history = gr.State(value=[])

        # Only the textbox has a submit event; the adapter supplies all seven
        # ask_gpt arguments in signature order.
        msg.submit(
            _chat_respond,
            [msg, chatbot, chat_history, embedding_model, system_prompt, temperature, max_tokens, chatgpt_model],
            [msg, chatbot, chat_history, msg_log],
        )
        # Clearing must also reset the stored history, otherwise the next
        # question would silently continue the old conversation.
        clear_button.click(
            fn=lambda: (None, [], "Message history will be visible here"),
            inputs=None,
            outputs=[chatbot, chat_history, msg_log],
        )

# Launch Gradio interface
if __name__ == "__main__":
    demo.launch()