davila7's picture
Update app.py
c284122
raw
history blame contribute delete
No virus
4.88 kB
import os
import openai
from openai.embeddings_utils import get_embedding, cosine_similarity
from sklearn.manifold import TSNE
import streamlit as st
from matplotlib import cm
import pandas as pd
import numpy as np
from ast import literal_eval
import nomic
from nomic import atlas
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
from dotenv import load_dotenv
load_dotenv()
MODEL = "text-embedding-ada-002"
st.set_page_config(page_title="Visual Embeddings and Similarity", page_icon="🤖", layout="wide")
# sidebar with openai api key and nomic token
st.sidebar.title("Credentials")
st.sidebar.write("OpenAI API Key")
openai_api_key = st.sidebar.text_input("Enter your OpenAI API Key", value=os.getenv("OPENAI_API_KEY"))
st.sidebar.write("Nomic Token")
nomic_token = st.sidebar.text_input("Enter your Nomic Token", value=os.getenv("NOMIC_TOKEN"))
openai.api_key = os.getenv("OPENAI_API_KEY")
nomic.login(os.getenv("NOMIC_TOKEN"))
# get data
datafile_path = "food_review.csv"
# show only columns ProductId, Score, Summary, Text, n_tokens, embedding
df = pd.read_csv(datafile_path, usecols=[0,1,3, 5, 7, 8])
st.title("Visual Embeddings and Similarity")
st.write("Amazon food reviews dataset")
st.write(df)
st.write("Search similarity")
form = st.form('Embeddings')
question = form.text_input("Enter a sentence to search for semantic similarity", value="I love this soup")
btn = form.form_submit_button("Run")
if btn:
# si openai api key no es none y nomic token no es none
if openai_api_key is not None and nomic_token is not None:
with st.spinner("Loading"):
search_term_vector = get_embedding(question, engine="text-embedding-ada-002")
search_term_vector = np.array(search_term_vector)
matrix = np.array(df.embedding.apply(literal_eval).to_list())
# Compute distances to the search_term_vector
distances = np.linalg.norm(matrix - search_term_vector, axis=1)
df['distance_to_search_term'] = distances
# Normalize the distances to range 0-1 for coloring
df['normalized_distance'] = (df['distance_to_search_term'] - df['distance_to_search_term'].min()) / (df['distance_to_search_term'].max() - df['distance_to_search_term'].min())
# 2D visualization
# Create a t-SNE model and transform the data
tsne = TSNE(n_components=2, perplexity=15, random_state=42, init='random', learning_rate=200)
vis_dims = tsne.fit_transform(matrix)
colors = cm.rainbow(df['normalized_distance'])
x = [x for x,y in vis_dims]
y = [y for x,y in vis_dims]
# Plot points with colors corresponding to their distance from search_term_vector
plt.scatter(x, y, color=colors, alpha=0.3)
# Set title and plot
plt.title("Similarity to search term visualized in language using t-SNE")
# Convert 'embedding' column to numpy arrays
df['embedding'] = df['embedding'].apply(lambda x: np.array(literal_eval(x)))
df["similarities"] = df['embedding'].apply(lambda x: cosine_similarity(x, search_term_vector))
st.title("Visual embedding of the search term and the 20 most similar sentences")
#create two columns
col1, col2 = st.columns(2)
#col1
#show st.plot in col1
col1.pyplot(plt)
#col2
#show df in col2, but only the columns, text and similarities
col2.write(df[['similarities','Text']].sort_values("similarities", ascending=False).head(20))
# Convert to a list of lists of floats
st.title("Nomic mappping embeddings")
embeddings = np.array(df.embedding.to_list())
df = df.drop('embedding', axis=1)
df = df.rename(columns={'Unnamed: 0': 'id'})
data = df.to_dict('records')
project = atlas.map_embeddings(embeddings=embeddings, data=data,
id_field='id',
colorable_fields=['Score'])
# Convert project to a string before getting link information
project_str = str(project)
st.text(project_str)
# Split the project string at the colon and take the second part (index 1)
project_link = project_str.split(':', 1)[1]
# Trim any leading or trailing whitespace
project_link = project_link.strip()
# Crea un iframe con la URL y muéstralo con Streamlit
st.markdown(f'<iframe src="{project_link}" width="100%" height="600px"></iframe>', unsafe_allow_html=True)
else:
st.write("Please enter your OpenAI API Key and Nomic Token in the sidebar")