Spaces:
Runtime error
Runtime error
Upload 3 files
Browse files
- .gitattributes +1 -0
- app.py +117 -0
- food_review.csv +3 -0
- requirements.txt +5 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
food_review.csv filter=lfs diff=lfs merge=lfs -text
|
app.py
ADDED
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import openai
|
3 |
+
from openai.embeddings_utils import get_embedding, cosine_similarity
|
4 |
+
from sklearn.manifold import TSNE
|
5 |
+
import streamlit as st
|
6 |
+
from matplotlib import cm
|
7 |
+
import pandas as pd
|
8 |
+
import numpy as np
|
9 |
+
from ast import literal_eval
|
10 |
+
import nomic
|
11 |
+
from nomic import atlas
|
12 |
+
import matplotlib.pyplot as plt
|
13 |
+
import matplotlib
|
14 |
+
import numpy as np
|
15 |
+
|
16 |
+
from dotenv import load_dotenv
|
17 |
+
# Load variables from a .env file (if any) before anything reads os.getenv.
load_dotenv()

# Identifier of the OpenAI embedding model.
MODEL = "text-embedding-ada-002"

# Page-level Streamlit settings: browser tab title, icon and wide layout.
st.set_page_config(
    layout="wide",
    page_icon="🤖",
    page_title="Visual Embeddings and Similarity",
)
|
20 |
+
|
21 |
+
def main():
    """Render the Streamlit page for embedding search and visualisation.

    The page shows, in order:

    1. A sidebar asking for an OpenAI API key and a Nomic token.  A blank
       field falls back to the ``OPENAI_API_KEY`` / ``NOMIC_TOKEN``
       environment variables.
    2. The reviews table read from ``food_review.csv``.
    3. A form with a search sentence.  When submitted, the sentence is
       embedded, compared with every stored embedding, plotted with t-SNE
       (points coloured by normalised Euclidean distance to the query), and
       the 20 rows with the highest cosine similarity are listed.
    4. A Nomic Atlas map of the stored embeddings, embedded as an iframe.

    Nothing is returned; all output goes through Streamlit calls.
    """
    # Sidebar with the OpenAI API key and the Nomic token.
    # The environment values are deliberately NOT used as the widget's
    # default: a pre-filled widget value is sent to every visitor's browser,
    # which would leak the deployment's own secrets.
    st.sidebar.title("Credentials")
    st.sidebar.write("OpenAI API Key")
    typed_openai_key = st.sidebar.text_input("Enter your OpenAI API Key", type="password")
    st.sidebar.write("Nomic Token")
    typed_nomic_token = st.sidebar.text_input("Enter your Nomic Token", type="password")

    # What the user typed wins; otherwise fall back to the environment.
    openai_api_key = (typed_openai_key or "").strip() or os.getenv("OPENAI_API_KEY")
    nomic_token = (typed_nomic_token or "").strip() or os.getenv("NOMIC_TOKEN")

    # Load the data.  Only six columns are read, selected by position; the
    # code below relies on 'Unnamed: 0', 'Score', 'Text' and 'embedding'
    # being among them (the original note lists ProductId, Score, Summary,
    # Text, n_tokens, embedding -- TODO confirm against the CSV header).
    datafile_path = "food_review.csv"
    df = pd.read_csv(datafile_path, usecols=[0, 1, 3, 5, 7, 8])
    st.title("Visual Embeddings and Similarity")
    st.write("Amazon food reviews dataset")
    st.write(df)

    st.write("Search similarity")
    form = st.form('Embeddings')
    question = form.text_input("Enter a sentence to search for semantic similarity", value="I love this soup")
    btn = form.form_submit_button("Run")

    if not btn:
        return

    # Both credentials must be non-empty (not merely "not None") to proceed.
    if not (openai_api_key and nomic_token):
        st.write("Please enter your OpenAI API Key and Nomic Token in the sidebar")
        return

    # Authenticate only once a search is actually requested, and with the
    # credentials resolved above rather than the environment alone.
    openai.api_key = openai_api_key
    nomic.login(nomic_token)

    with st.spinner("Loading"):
        search_term_vector = np.array(get_embedding(question, engine=MODEL))

        # Parse the stringified embeddings once; reused for distances,
        # t-SNE, cosine similarity and the Atlas upload.
        matrix = np.array(df["embedding"].apply(literal_eval).to_list())

        # Euclidean distance of every row to the query vector.
        distances = np.linalg.norm(matrix - search_term_vector, axis=1)
        df['distance_to_search_term'] = distances

        # Distances rescaled to 0-1, used only for colouring the plot.
        df['normalized_distance'] = _normalize_to_unit_range(distances)

        # 2D visualisation: project the embeddings with t-SNE.
        tsne = TSNE(n_components=2, perplexity=15, random_state=42, init='random', learning_rate=200)
        vis_dims = tsne.fit_transform(matrix)

        colors = cm.rainbow(df['normalized_distance'].to_numpy())

        # Draw on a dedicated figure instead of pyplot's global state so
        # points from earlier reruns do not pile up on the same axes.
        fig, ax = plt.subplots()
        ax.scatter(vis_dims[:, 0], vis_dims[:, 1], color=colors, alpha=0.3)
        ax.set_title("Similarity to search term visualized in language using t-SNE")

        df["similarities"] = [cosine_similarity(row, search_term_vector) for row in matrix]

        st.title("Visual embedding of the search term and the 20 most similar sentences")
        col1, col2 = st.columns(2)
        # Left column: the scatter plot.
        col1.pyplot(fig)
        plt.close(fig)  # release the figure now that Streamlit has rendered it
        # Right column: text and similarity of the 20 closest rows.
        col2.write(df[['similarities', 'Text']].sort_values("similarities", ascending=False).head(20))

        st.title("Nomic mappping embeddings")
        # Atlas receives the vectors separately, so the raw embedding column
        # is dropped from the per-row metadata; the CSV's unnamed first
        # column serves as the record id.
        atlas_df = df.drop('embedding', axis=1).rename(columns={'Unnamed: 0': 'id'})
        data = atlas_df.to_dict('records')
        project = atlas.map_embeddings(embeddings=matrix, data=data,
                                       id_field='id',
                                       colorable_fields=['Score'])

        # The link is taken from the project's string form, which is
        # expected to look like "<name>: <url>".
        project_str = str(project)
        st.text(project_str)
        project_link = _project_link_from_text(project_str)

        if project_link:
            # Build an iframe for the URL and show it with Streamlit.
            st.markdown(f'<iframe src="{project_link}" width="100%" height="600px"></iframe>', unsafe_allow_html=True)
        else:
            st.warning("Could not find a map link in the Atlas response.")


def _normalize_to_unit_range(values):
    """Min-max scale *values* to [0, 1]; all zeros when every value is equal."""
    values = np.asarray(values, dtype=float)
    if values.size == 0:
        return values
    lowest = values.min()
    span = values.max() - lowest
    if span == 0:
        # Avoid 0/0 -> NaN when there is a single row or identical distances.
        return np.zeros_like(values)
    return (values - lowest) / span


def _project_link_from_text(project_text):
    """Return the text after the first ':' in *project_text*, or '' if none."""
    _, separator, link = project_text.partition(':')
    return link.strip() if separator else ""
|
116 |
+
# Entry point: build the page only when this file is run as the main script.
if __name__ == "__main__":
    main()
|
food_review.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0acc913f3deda7b91fcfb73e86a8780d490a54e33f2d2b9b6343078c45f0501b
|
3 |
+
size 35254390
|
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
openai
|
2 |
+
streamlit
|
3 |
+
pandas
|
4 |
+
numpy
|
5 |
+
nomic
|