|
import os |
|
import openai |
|
from openai.embeddings_utils import get_embedding, cosine_similarity |
|
from sklearn.manifold import TSNE |
|
import streamlit as st |
|
from matplotlib import cm |
|
import pandas as pd |
|
import numpy as np |
|
from ast import literal_eval |
|
import nomic |
|
from nomic import atlas |
|
import matplotlib.pyplot as plt |
|
import matplotlib |
|
import numpy as np |
|
|
|
from dotenv import load_dotenv |
|
# OpenAI embedding model identifier.
MODEL = "text-embedding-ada-002"

# Load variables from a local .env file (if any) into the environment so the
# os.getenv() lookups for the API credentials can find them.
load_dotenv()

# Page-level Streamlit configuration.
st.set_page_config(
    page_title="Visual Embeddings and Similarity",
    page_icon="🤖",
    layout="wide",
)
|
|
|
def main():
    """Render the embedding-search page.

    The page shows the review dataset read from ``food_review.csv`` and a
    search form. When the form is submitted with both credentials present,
    it:

    1. embeds the search sentence with the OpenAI embedding model,
    2. computes the Euclidean distance and cosine similarity between the
       sentence and every stored review embedding,
    3. plots a 2-D t-SNE projection of the review embeddings coloured by
       normalized distance next to the 20 most similar reviews, and
    4. uploads the embeddings to Nomic Atlas and embeds the resulting map.

    Credentials are taken from the sidebar inputs, which are pre-filled from
    the ``OPENAI_API_KEY`` and ``NOMIC_TOKEN`` environment variables.
    """
    st.sidebar.title("Credentials")
    st.sidebar.write("OpenAI API Key")
    # Default to "" (not None) so the widget always yields a string, and mask
    # the value so the secret is not displayed in clear text.
    openai_api_key = st.sidebar.text_input(
        "Enter your OpenAI API Key",
        value=os.getenv("OPENAI_API_KEY", ""),
        type="password",
    )
    st.sidebar.write("Nomic Token")
    nomic_token = st.sidebar.text_input(
        "Enter your Nomic Token",
        value=os.getenv("NOMIC_TOKEN", ""),
        type="password",
    )

    datafile_path = "food_review.csv"
    df = pd.read_csv(datafile_path, usecols=[0, 1, 3, 5, 7, 8])

    st.title("Visual Embeddings and Similarity")
    st.write("Amazon food reviews dataset")
    st.write(df)

    st.write("Search similarity")
    form = st.form('Embeddings')
    question = form.text_input(
        "Enter a sentence to search for semantic similarity",
        value="I love this soup",
    )
    btn = form.form_submit_button("Run")

    if not btn:
        return

    # An empty or whitespace-only field counts as "not provided".
    openai_api_key = (openai_api_key or "").strip()
    nomic_token = (nomic_token or "").strip()
    if not (openai_api_key and nomic_token):
        st.write("Please enter your OpenAI API Key and Nomic Token in the sidebar")
        return

    # Use what the user entered in the sidebar rather than re-reading the
    # environment, and only authenticate once a search is actually requested.
    openai.api_key = openai_api_key
    nomic.login(nomic_token)

    with st.spinner("Loading"):
        search_term_vector = np.array(get_embedding(question, engine=MODEL))

        # Parse the stringified embeddings once; the resulting matrix is
        # reused for distances, similarities, t-SNE and the Atlas upload.
        matrix = _parse_embedding_column(df["embedding"])

        distances = np.linalg.norm(matrix - search_term_vector, axis=1)
        df['distance_to_search_term'] = distances
        df['normalized_distance'] = _min_max_normalize(distances)

        tsne = TSNE(
            n_components=2,
            perplexity=15,
            random_state=42,
            init='random',
            learning_rate=200,
        )
        vis_dims = tsne.fit_transform(matrix)

        colors = cm.rainbow(df['normalized_distance'].to_numpy())

        # Draw on an explicit figure instead of pyplot's global state so
        # points from earlier reruns never leak into this plot.
        fig, ax = plt.subplots()
        ax.scatter(vis_dims[:, 0], vis_dims[:, 1], color=colors, alpha=0.3)
        ax.set_title("Similarity to search term visualized in language using t-SNE")

        df["similarities"] = [
            cosine_similarity(row, search_term_vector) for row in matrix
        ]

        st.title("Visual embedding of the search term and the 20 most similar sentences")
        col1, col2 = st.columns(2)
        col1.pyplot(fig)
        plt.close(fig)  # release the figure once Streamlit has rendered it
        col2.write(
            df[['similarities', 'Text']]
            .sort_values("similarities", ascending=False)
            .head(20)
        )

        st.title("Nomic mappping embeddings")
        # The raw embedding strings are not uploaded as metadata; the first
        # CSV column (read by pandas as 'Unnamed: 0') serves as the record id.
        # NOTE(review): assumes the CSV's first column has no header - confirm.
        records = (
            df.drop('embedding', axis=1)
            .rename(columns={'Unnamed: 0': 'id'})
            .to_dict('records')
        )
        project = atlas.map_embeddings(
            embeddings=matrix,
            data=records,
            id_field='id',
            colorable_fields=['Score'],
        )

        project_str = str(project)
        st.text(project_str)

        project_link = _extract_project_link(project_str)
        if project_link:
            st.markdown(
                f'<iframe src="{project_link}" width="100%" height="600px"></iframe>',
                unsafe_allow_html=True,
            )


def _parse_embedding_column(column):
    """Turn a Series of stringified vectors into a 2-D NumPy array."""
    return np.array(column.apply(literal_eval).to_list())


def _min_max_normalize(values):
    """Scale *values* to [0, 1]; return all zeros when every value is equal."""
    lowest = values.min()
    span = values.max() - lowest
    if span == 0:
        # Avoid 0/0 -> NaN, which would leave every point without a colour.
        return np.zeros_like(values, dtype=float)
    return (values - lowest) / span


def _extract_project_link(project_str):
    """Return the text after the first ':' in *project_str*, or '' if none."""
    _, separator, remainder = project_str.partition(':')
    return remainder.strip() if separator else ""
|
# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()