import os
from ast import literal_eval

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm
from sklearn.manifold import TSNE

import openai
from openai.embeddings_utils import get_embedding, cosine_similarity
import nomic
from nomic import atlas

import streamlit as st
from dotenv import load_dotenv

load_dotenv()

MODEL = "text-embedding-ada-002"

st.set_page_config(page_title="Visual Embeddings and Similarity", page_icon="šŸ¤–", layout="wide")

# Sidebar with the OpenAI API key and the Nomic token
st.sidebar.title("Credentials")
st.sidebar.write("OpenAI API Key")
openai_api_key = st.sidebar.text_input("Enter your OpenAI API Key", value=os.getenv("OPENAI_API_KEY", ""), type="password")
st.sidebar.write("Nomic Token")
nomic_token = st.sidebar.text_input("Enter your Nomic Token", value=os.getenv("NOMIC_TOKEN", ""), type="password")

# Load the data, keeping only the columns ProductId, Score, Summary, Text, n_tokens, embedding
datafile_path = "food_review.csv"
df = pd.read_csv(datafile_path, usecols=[0, 1, 3, 5, 7, 8])

st.title("Visual Embeddings and Similarity")
st.write("Amazon food reviews dataset")
st.write(df)

st.write("Search similarity")
form = st.form('Embeddings')
question = form.text_input("Enter a sentence to search for semantic similarity", value="I love this soup")
btn = form.form_submit_button("Run")

if btn:
    # Proceed only if both the OpenAI API key and the Nomic token were provided
    if openai_api_key and nomic_token:
        # Use the credentials from the sidebar (they default to the environment variables)
        openai.api_key = openai_api_key
        nomic.login(nomic_token)

        with st.spinner("Loading"):
            # Embed the search term with the same model used for the dataset embeddings
            search_term_vector = np.array(get_embedding(question, engine=MODEL))

            # Parse the stored embeddings (saved as strings in the CSV) into a 2-D matrix
            matrix = np.array(df.embedding.apply(literal_eval).to_list())

            # Compute Euclidean distances to the search_term_vector
            df['distance_to_search_term'] = np.linalg.norm(matrix - search_term_vector, axis=1)

            # Normalize the distances to the range 0-1 for coloring
            df['normalized_distance'] = (
                df['distance_to_search_term'] - df['distance_to_search_term'].min()
            ) / (df['distance_to_search_term'].max() - df['distance_to_search_term'].min())

            # 2D visualization: create a t-SNE model and transform the data
            tsne = TSNE(n_components=2, perplexity=15, random_state=42, init='random', learning_rate=200)
            vis_dims = tsne.fit_transform(matrix)

            colors = cm.rainbow(df['normalized_distance'])
            x, y = vis_dims[:, 0], vis_dims[:, 1]

            # Plot points with colors corresponding to their distance from the search term
            fig, ax = plt.subplots()
            ax.scatter(x, y, color=colors, alpha=0.3)
            ax.set_title("Similarity to the search term visualized with t-SNE")

            # Store the parsed embeddings back in the dataframe and rank by cosine similarity
            df['embedding'] = list(matrix)
            df['similarities'] = df['embedding'].apply(lambda e: cosine_similarity(e, search_term_vector))

            st.title("Visual embedding of the search term and the 20 most similar sentences")
            # Two columns: the t-SNE plot on the left, the 20 most similar reviews on the right
            col1, col2 = st.columns(2)
            col1.pyplot(fig)
            col2.write(df[['similarities', 'Text']].sort_values("similarities", ascending=False).head(20))

            # Map the embeddings with Nomic Atlas
            st.title("Nomic mapping of the embeddings")
            embeddings = np.array(df.embedding.to_list())
            df = df.drop('embedding', axis=1)
            df = df.rename(columns={'Unnamed: 0': 'id'})
            data = df.to_dict('records')
            project = atlas.map_embeddings(embeddings=embeddings, data=data, id_field='id',
                                           colorable_fields=['Score'])

            # Convert the project to a string before extracting the link information
            project_str = str(project)
            st.text(project_str)

            # Split the project string at the first colon, take the second part, and trim whitespace
            project_link = project_str.split(':', 1)[1].strip()

            # Create an iframe with the project URL and display it with Streamlit
            # (the original markup was lost; this iframe and its size are a reasonable reconstruction)
            st.markdown(f'<iframe src="{project_link}" width="100%" height="600"></iframe>', unsafe_allow_html=True)
    else:
        st.write("Please enter your OpenAI API Key and Nomic Token in the sidebar")