davila7 committed on
Commit
8eb90ef
1 Parent(s): 8102077

Upload 3 files

Browse files
Files changed (4) hide show
  1. .gitattributes +1 -0
  2. app.py +117 -0
  3. food_review.csv +3 -0
  4. requirements.txt +5 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ food_review.csv filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import openai
3
+ from openai.embeddings_utils import get_embedding, cosine_similarity
4
+ from sklearn.manifold import TSNE
5
+ import streamlit as st
6
+ from matplotlib import cm
7
+ import pandas as pd
8
+ import numpy as np
9
+ from ast import literal_eval
10
+ import nomic
11
+ from nomic import atlas
12
+ import matplotlib.pyplot as plt
13
+ import matplotlib
14
+ import numpy as np
15
+
16
+ from dotenv import load_dotenv
17
# Load environment variables via python-dotenv before anything reads them;
# main() looks up OPENAI_API_KEY and NOMIC_TOKEN with os.getenv.
load_dotenv()

# Identifier of the OpenAI embedding model used for the search sentence.
MODEL = "text-embedding-ada-002"

# Page-level Streamlit settings (browser tab title, icon, wide layout).
st.set_page_config(
    layout="wide",
    page_icon="🤖",
    page_title="Visual Embeddings and Similarity",
)
20
+
21
def main():
    """Render the app: dataset preview, similarity search, t-SNE plot and Atlas map.

    The sidebar collects an OpenAI API key and a Nomic token (pre-filled from
    the OPENAI_API_KEY / NOMIC_TOKEN environment variables when set).  When
    the search form is submitted, the query sentence is embedded, compared
    against the pre-computed embeddings stored in ``food_review.csv``, and
    the results are shown as a t-SNE scatter plot, a table of the 20 most
    similar reviews and an embedded Nomic Atlas map.
    """
    # Local import: only needed to escape the Atlas link placed in the iframe.
    import html

    # Sidebar with the OpenAI API key and the Nomic token.  ``or ""`` keeps
    # the widget default a string when the environment variable is unset, and
    # ``type="password"`` avoids echoing the secrets on screen.
    st.sidebar.title("Credentials")
    st.sidebar.write("OpenAI API Key")
    openai_api_key = st.sidebar.text_input(
        "Enter your OpenAI API Key",
        value=os.getenv("OPENAI_API_KEY") or "",
        type="password",
    )
    st.sidebar.write("Nomic Token")
    nomic_token = st.sidebar.text_input(
        "Enter your Nomic Token",
        value=os.getenv("NOMIC_TOKEN") or "",
        type="password",
    )
    openai_api_key = (openai_api_key or "").strip()
    nomic_token = (nomic_token or "").strip()

    # Load the dataset, keeping only the columns
    # ProductId, Score, Summary, Text, n_tokens, embedding (plus the index column).
    datafile_path = "food_review.csv"
    df = pd.read_csv(datafile_path, usecols=[0, 1, 3, 5, 7, 8])
    st.title("Visual Embeddings and Similarity")
    st.write("Amazon food reviews dataset")
    st.write(df)

    st.write("Search similarity")
    form = st.form('Embeddings')
    question = form.text_input(
        "Enter a sentence to search for semantic similarity",
        value="I love this soup",
    )
    btn = form.form_submit_button("Run")

    if not btn:
        return

    # Both credentials must be non-empty; an empty text box yields "" (not
    # None), so a plain ``is not None`` test would let it through.
    if not (openai_api_key and nomic_token):
        st.write("Please enter your OpenAI API Key and Nomic Token in the sidebar")
        return

    # Use the credentials the user actually entered (they default to the
    # environment values), and only log in once they are known to be present.
    openai.api_key = openai_api_key
    nomic.login(nomic_token)

    with st.spinner("Loading"):
        search_term_vector = np.array(get_embedding(question, engine=MODEL))

        # Parse the stringified embeddings once and reuse the matrix below.
        embeddings = np.array(df["embedding"].apply(literal_eval).to_list())

        # Euclidean distance from every review embedding to the search term.
        distances = np.linalg.norm(embeddings - search_term_vector, axis=1)
        df['distance_to_search_term'] = distances

        # Normalize the distances to the range 0-1 for coloring; when every
        # distance is identical, fall back to zeros instead of dividing by 0.
        lowest = distances.min()
        spread = distances.max() - lowest
        if spread > 0:
            normalized = (distances - lowest) / spread
        else:
            normalized = np.zeros_like(distances)
        df['normalized_distance'] = normalized

        # 2D visualization: project the embeddings with t-SNE.
        tsne = TSNE(n_components=2, perplexity=15, random_state=42, init='random', learning_rate=200)
        vis_dims = tsne.fit_transform(embeddings)

        # Plot on a dedicated figure rather than the global pyplot state, so
        # points from earlier reruns cannot carry over into this plot.
        colors = cm.rainbow(normalized)
        fig, ax = plt.subplots()
        ax.scatter(vis_dims[:, 0], vis_dims[:, 1], color=colors, alpha=0.3)
        ax.set_title("Similarity to search term visualized in language using t-SNE")

        # Cosine similarity of each review embedding to the search term.
        df["similarities"] = [
            cosine_similarity(row, search_term_vector) for row in embeddings
        ]

        st.title("Visual embedding of the search term and the 20 most similar sentences")
        col1, col2 = st.columns(2)
        # Left column: the t-SNE scatter plot.
        col1.pyplot(fig)
        plt.close(fig)  # release the figure once it has been rendered

        # Right column: the 20 most similar reviews (similarity and text only).
        col2.write(df[['similarities', 'Text']].sort_values("similarities", ascending=False).head(20))

        st.title("Nomic mappping embeddings")
        # The raw embeddings are passed separately, so drop them from the
        # per-row metadata and expose the CSV index column as 'id'.
        df = df.drop('embedding', axis=1)
        df = df.rename(columns={'Unnamed: 0': 'id'})

        data = df.to_dict('records')
        project = atlas.map_embeddings(embeddings=embeddings, data=data,
                                       id_field='id',
                                       colorable_fields=['Score'])

        # The link is taken from the project's string form: everything after
        # the first colon, with surrounding whitespace trimmed.
        project_str = str(project)
        st.text(project_str)
        _, colon, project_link = project_str.partition(':')
        project_link = project_link.strip()
        if not colon or not project_link:
            # No "<label>: <link>" shape to parse; skip the iframe rather
            # than crash with an IndexError.
            st.write("Could not find a map link in the Nomic project description")
            return

        # Embed the map in an iframe; escape the link because it is
        # interpolated into raw HTML.
        safe_link = html.escape(project_link, quote=True)
        st.markdown(f'<iframe src="{safe_link}" width="100%" height="600px"></iframe>', unsafe_allow_html=True)


# Run the app only when this module is executed as the main program.
if __name__ == "__main__":
    main()
food_review.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0acc913f3deda7b91fcfb73e86a8780d490a54e33f2d2b9b6343078c45f0501b
3
+ size 35254390
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ openai
2
+ streamlit
3
+ pandas
4
+ numpy
5
+ nomic