davila7 committed on
Commit
c284122
1 Parent(s): d7fa900

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +94 -97
app.py CHANGED
@@ -18,100 +18,97 @@ load_dotenv()
18
  MODEL = "text-embedding-ada-002"
19
  st.set_page_config(page_title="Visual Embeddings and Similarity", page_icon="🤖", layout="wide")
20
 
21
- def main():
22
- # sidebar with openai api key and nomic token
23
- st.sidebar.title("Credentials")
24
- st.sidebar.write("OpenAI API Key")
25
- openai_api_key = st.sidebar.text_input("Enter your OpenAI API Key", value=os.getenv("OPENAI_API_KEY"))
26
- st.sidebar.write("Nomic Token")
27
- nomic_token = st.sidebar.text_input("Enter your Nomic Token", value=os.getenv("NOMIC_TOKEN"))
28
-
29
- openai.api_key = os.getenv("OPENAI_API_KEY")
30
- nomic.login(os.getenv("NOMIC_TOKEN"))
31
-
32
- # get data
33
- datafile_path = "food_review.csv"
34
- # show only columns ProductId, Score, Summary, Text, n_tokens, embedding
35
- df = pd.read_csv(datafile_path, usecols=[0,1,3, 5, 7, 8])
36
- st.title("Visual Embeddings and Similarity")
37
- st.write("Amazon food reviews dataset")
38
- st.write(df)
39
-
40
- st.write("Search similarity")
41
- form = st.form('Embeddings')
42
- question = form.text_input("Enter a sentence to search for semantic similarity", value="I love this soup")
43
- btn = form.form_submit_button("Run")
44
-
45
- if btn:
46
- # si openai api key no es none y nomic token no es none
47
- if openai_api_key is not None and nomic_token is not None:
48
- with st.spinner("Loading"):
49
- search_term_vector = get_embedding(question, engine="text-embedding-ada-002")
50
- search_term_vector = np.array(search_term_vector)
51
-
52
- matrix = np.array(df.embedding.apply(literal_eval).to_list())
53
-
54
- # Compute distances to the search_term_vector
55
- distances = np.linalg.norm(matrix - search_term_vector, axis=1)
56
- df['distance_to_search_term'] = distances
57
-
58
- # Normalize the distances to range 0-1 for coloring
59
- df['normalized_distance'] = (df['distance_to_search_term'] - df['distance_to_search_term'].min()) / (df['distance_to_search_term'].max() - df['distance_to_search_term'].min())
60
-
61
- # 2D visualization
62
- # Create a t-SNE model and transform the data
63
- tsne = TSNE(n_components=2, perplexity=15, random_state=42, init='random', learning_rate=200)
64
- vis_dims = tsne.fit_transform(matrix)
65
-
66
- colors = cm.rainbow(df['normalized_distance'])
67
- x = [x for x,y in vis_dims]
68
- y = [y for x,y in vis_dims]
69
-
70
- # Plot points with colors corresponding to their distance from search_term_vector
71
- plt.scatter(x, y, color=colors, alpha=0.3)
72
-
73
- # Set title and plot
74
- plt.title("Similarity to search term visualized in language using t-SNE")
75
-
76
-
77
- # Convert 'embedding' column to numpy arrays
78
- df['embedding'] = df['embedding'].apply(lambda x: np.array(literal_eval(x)))
79
- df["similarities"] = df['embedding'].apply(lambda x: cosine_similarity(x, search_term_vector))
80
-
81
- st.title("Visual embedding of the search term and the 20 most similar sentences")
82
- #create two columns
83
- col1, col2 = st.columns(2)
84
- #col1
85
- #show st.plot in col1
86
- col1.pyplot(plt)
87
-
88
- #col2
89
- #show df in col2, but only the columns, text and similarities
90
- col2.write(df[['similarities','Text']].sort_values("similarities", ascending=False).head(20))
91
-
92
- # Convert to a list of lists of floats
93
- st.title("Nomic mappping embeddings")
94
- embeddings = np.array(df.embedding.to_list())
95
- df = df.drop('embedding', axis=1)
96
- df = df.rename(columns={'Unnamed: 0': 'id'})
97
-
98
- data = df.to_dict('records')
99
- project = atlas.map_embeddings(embeddings=embeddings, data=data,
100
- id_field='id',
101
- colorable_fields=['Score'])
102
- # Convert project to a string before getting link information
103
- project_str = str(project)
104
-
105
- st.text(project_str)
106
- # Split the project string at the colon and take the second part (index 1)
107
- project_link = project_str.split(':', 1)[1]
108
-
109
- # Trim any leading or trailing whitespace
110
- project_link = project_link.strip()
111
-
112
- # Crea un iframe con la URL y muéstralo con Streamlit
113
- st.markdown(f'<iframe src="{project_link}" width="100%" height="600px"></iframe>', unsafe_allow_html=True)
114
- else:
115
- st.write("Please enter your OpenAI API Key and Nomic Token in the sidebar")
116
- if __name__ == "__main__":
117
- main()
 
18
  MODEL = "text-embedding-ada-002"
19
  st.set_page_config(page_title="Visual Embeddings and Similarity", page_icon="🤖", layout="wide")
20
 
21
+ # sidebar with openai api key and nomic token
22
+ st.sidebar.title("Credentials")
23
+ st.sidebar.write("OpenAI API Key")
24
+ openai_api_key = st.sidebar.text_input("Enter your OpenAI API Key", value=os.getenv("OPENAI_API_KEY"))
25
+ st.sidebar.write("Nomic Token")
26
+ nomic_token = st.sidebar.text_input("Enter your Nomic Token", value=os.getenv("NOMIC_TOKEN"))
27
+
28
+ openai.api_key = os.getenv("OPENAI_API_KEY")
29
+ nomic.login(os.getenv("NOMIC_TOKEN"))
30
+
31
+ # get data
32
+ datafile_path = "food_review.csv"
33
+ # show only columns ProductId, Score, Summary, Text, n_tokens, embedding
34
+ df = pd.read_csv(datafile_path, usecols=[0,1,3, 5, 7, 8])
35
+ st.title("Visual Embeddings and Similarity")
36
+ st.write("Amazon food reviews dataset")
37
+ st.write(df)
38
+
39
+ st.write("Search similarity")
40
+ form = st.form('Embeddings')
41
+ question = form.text_input("Enter a sentence to search for semantic similarity", value="I love this soup")
42
+ btn = form.form_submit_button("Run")
43
+
44
+ if btn:
45
+ # si openai api key no es none y nomic token no es none
46
+ if openai_api_key is not None and nomic_token is not None:
47
+ with st.spinner("Loading"):
48
+ search_term_vector = get_embedding(question, engine="text-embedding-ada-002")
49
+ search_term_vector = np.array(search_term_vector)
50
+
51
+ matrix = np.array(df.embedding.apply(literal_eval).to_list())
52
+
53
+ # Compute distances to the search_term_vector
54
+ distances = np.linalg.norm(matrix - search_term_vector, axis=1)
55
+ df['distance_to_search_term'] = distances
56
+
57
+ # Normalize the distances to range 0-1 for coloring
58
+ df['normalized_distance'] = (df['distance_to_search_term'] - df['distance_to_search_term'].min()) / (df['distance_to_search_term'].max() - df['distance_to_search_term'].min())
59
+
60
+ # 2D visualization
61
+ # Create a t-SNE model and transform the data
62
+ tsne = TSNE(n_components=2, perplexity=15, random_state=42, init='random', learning_rate=200)
63
+ vis_dims = tsne.fit_transform(matrix)
64
+
65
+ colors = cm.rainbow(df['normalized_distance'])
66
+ x = [x for x,y in vis_dims]
67
+ y = [y for x,y in vis_dims]
68
+
69
+ # Plot points with colors corresponding to their distance from search_term_vector
70
+ plt.scatter(x, y, color=colors, alpha=0.3)
71
+
72
+ # Set title and plot
73
+ plt.title("Similarity to search term visualized in language using t-SNE")
74
+
75
+
76
+ # Convert 'embedding' column to numpy arrays
77
+ df['embedding'] = df['embedding'].apply(lambda x: np.array(literal_eval(x)))
78
+ df["similarities"] = df['embedding'].apply(lambda x: cosine_similarity(x, search_term_vector))
79
+
80
+ st.title("Visual embedding of the search term and the 20 most similar sentences")
81
+ #create two columns
82
+ col1, col2 = st.columns(2)
83
+ #col1
84
+ #show st.plot in col1
85
+ col1.pyplot(plt)
86
+
87
+ #col2
88
+ #show df in col2, but only the columns, text and similarities
89
+ col2.write(df[['similarities','Text']].sort_values("similarities", ascending=False).head(20))
90
+
91
+ # Convert to a list of lists of floats
92
+ st.title("Nomic mappping embeddings")
93
+ embeddings = np.array(df.embedding.to_list())
94
+ df = df.drop('embedding', axis=1)
95
+ df = df.rename(columns={'Unnamed: 0': 'id'})
96
+
97
+ data = df.to_dict('records')
98
+ project = atlas.map_embeddings(embeddings=embeddings, data=data,
99
+ id_field='id',
100
+ colorable_fields=['Score'])
101
+ # Convert project to a string before getting link information
102
+ project_str = str(project)
103
+
104
+ st.text(project_str)
105
+ # Split the project string at the colon and take the second part (index 1)
106
+ project_link = project_str.split(':', 1)[1]
107
+
108
+ # Trim any leading or trailing whitespace
109
+ project_link = project_link.strip()
110
+
111
+ # Crea un iframe con la URL y muéstralo con Streamlit
112
+ st.markdown(f'<iframe src="{project_link}" width="100%" height="600px"></iframe>', unsafe_allow_html=True)
113
+ else:
114
+ st.write("Please enter your OpenAI API Key and Nomic Token in the sidebar")