Commit: 92b778c
Parent: a6a7ec1

add vector database 3d space visualisation

Files changed (1)
  1. app.py +84 -12
app.py CHANGED
@@ -548,6 +548,7 @@ st.plotly_chart(fig, use_container_width=True)
  with st.expander("Python Code:"):
      st.code(f"""\
  import openai
+ import numpy as np

  EMBEDDING_MODEL = 'text-embedding-ada-002'

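The snippet in this expander only shows its imports and the model name; the rest of the displayed code is not part of this hunk. For reference, the kind of helper the page builds around (get_embeddings, used further down) might look like the sketch below. The app's actual get_embeddings() body is not in this diff, so treat this as an assumption based on the pre-1.0 openai client:

    import openai

    EMBEDDING_MODEL = 'text-embedding-ada-002'

    def get_embeddings(text):
        # Return the 1536-dimensional ada-002 embedding for `text`
        # (pre-1.0 openai client, matching the `import openai` style shown above).
        response = openai.Embedding.create(input=[text], model=EMBEDDING_MODEL)
        return response['data'][0]['embedding']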
 
@@ -582,7 +583,84 @@ fig.update_layout(coloraxis_showscale=False)
  fig.update_layout(width=6000)
  st.plotly_chart(fig, use_container_width=True)

- st.subheader(":green[Try Yourself:]")
+ st.subheader("Vector Databases")
+ st.write("""\
+ In a vector database, each item (e.g., a document) is represented as a point in a multidimensional
+ space. Each point is a vector that represents the features of the item. The goal is to place similar items close to
+ each other and dissimilar items farther apart. In the case of documents, the features could be derived from the words
+ in the document, and the similarity might be based on the overlapping words or concepts between the documents.
+
+ The retrieval of documents based on search terms involves two main steps:
+
+ Vectorization of the search query: The search query is converted into a vector using the same process used to vectorize the documents in the database.
+
+ Vector similarity search: The vector database then identifies the vectors that are closest to the query vector.
+ This is typically done using a distance metric like Euclidean distance or cosine similarity. The documents
+ corresponding to these vectors are returned as the search results.
+
+ As you correctly assumed, we leverage embedding algorithms to vectorise documents. Let's generate a 3D
+ visualization of the document vectors and a search query. For simplicity, let's assume we have a vector database
+ of documents that has been reduced to 3 dimensions, and we'll also have a 3D vector for a search query.
+
+ """)
+ with st.expander("The Euclidean distance between two points in 3D space is calculated as:"):
+     st.latex("""\\text{Distance}(A(x_1, y_1, z_1), B(x_2, y_2, z_2)) = \sqrt{(x_2 - x_1)^2 + (y_2 - y_1)^2 + (z_2 - z_1)^2}""")
+ st.write("""\
+ The document that corresponds to the vector with the smallest distance to the query vector is
+ considered the most relevant document. The 3D plot above now shows lines from the query vector (in red) to each
+ document vector (in blue). Each line represents the Euclidean distance from the query vector to a document vector.
+ """)
+ embeddings = st.text_input("vector space:", value="king queen prince princess counselor minister teacher")
+ embeddings = embeddings.split()
+ embeddings_query = st.text_input(label="search term", value='woman')
+
+ import numpy as np
+ import plotly.express as px
+ import plotly.graph_objects as go
+ from sklearn.manifold import TSNE
+
+ embeddings = {word: get_embeddings(word) for word in embeddings}
+ embeddings[embeddings_query] = get_embeddings(embeddings_query)
+
+ tsne = TSNE(n_components=3, perplexity=3, random_state=0)
+ embedding_matrix = np.array(list(embeddings.values()))
+ reduced_embeddings = tsne.fit_transform(embedding_matrix)
+
+ df = pd.DataFrame(reduced_embeddings, columns=["X", "Y", "Z"])
+ df["Word"] = list(embeddings.keys())
+ fig = px.scatter_3d(df, x="X", y="Y", z="Z", text="Word", title="Vector Space", width=800, height=800)
+
+ docs = reduced_embeddings[:-1]
+ query = reduced_embeddings[-1]
+ distances = np.linalg.norm(docs - query, axis=1)
+ closest_doc_index = np.argmin(distances)
+ closest_doc = docs[closest_doc_index]
+
+ for doc in docs:
+     fig.add_trace(go.Scatter3d(x=[query[0], doc[0]], y=[query[1], doc[1]], z=[query[2], doc[2]], mode='lines', line=dict(color='purple', width=2, dash='dash')))
+ fig.add_trace(go.Scatter3d(x=[query[0], closest_doc[0]], y=[query[1], closest_doc[1]], z=[query[2], closest_doc[2]], name='closest', mode='lines', line=dict(color='purple', width=2)))
+
+ st.plotly_chart(fig, use_container_width=True)
+
+ st.write("""\
+ This visualization represents the core concept of a vector database search. The database converts the
+ search query into a vector, then finds the document vectors that are closest to the query vector. Those documents are
+ considered the most relevant to the search query.
+
+ It's important to note that in a real-world application, the vectors would likely exist in much higher dimensional
+ space. However, the same principles apply: the search algorithm finds the document vectors that are nearest to the
+ query vector based on some distance metric.
+ """)
+ st.subheader(":green[Try Yourself]")
+
+ st.write("""\
+ *There is a vector database containing two words (documents): 'king' and 'queen'. Your task is to pinpoint search
+ terms that would yield either of these words. To facilitate this, use the previously presented similarity matrix to
+ seek out words that give a higher correlation with the word in question. For instance, you might want to explore
+ terms such as 'king', 'queen', 'dog', 'prince', 'man', 'minister', 'boy'.*
+ """)
+ embeddings_query = st.text_input(label="search term")

  from langchain.embeddings.openai import OpenAIEmbeddings
  from langchain.vectorstores import FAISS
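The added section describes retrieval as two steps: embed the query, then find the nearest document vectors under a distance metric. Stripped of the Streamlit and Plotly plumbing, that nearest-neighbour step reduces to the following; the toy 3-D vectors and the query below are illustrative stand-ins, not values from the app:

    import numpy as np

    # Toy document vectors, standing in for embeddings already reduced to 3-D.
    doc_vectors = np.array([
        [0.9, 0.1, 0.2],   # "king"
        [0.8, 0.3, 0.1],   # "queen"
        [0.1, 0.9, 0.7],   # "teacher"
    ])
    query_vector = np.array([0.85, 0.2, 0.15])  # the vectorised search term

    # Euclidean distance from the query to every document vector.
    distances = np.linalg.norm(doc_vectors - query_vector, axis=1)

    # The document with the smallest distance is returned as the top result.
    closest_index = int(np.argmin(distances))
    print(closest_index, distances.round(3))

Cosine similarity works the same way, just with np.argmax over normalised dot products instead of np.argmin over distances.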
@@ -599,13 +677,6 @@ def search_vector_database(term):
      docs = db.similarity_search_by_vector(embedding_vector)
      return docs

- st.write("""\
- *There is a vector database containing two words: 'king' and 'queen'. Your task is to pinpoint search
- terms that would yield either of these words. To facilitate this, use the previously presented similarity matrix to
- seek out words that give a higher correlation with the word in question. For instance, you might want to explore
- terms such as 'king', 'queen', 'dog', 'prince', 'man', 'minister', 'boy'.*
- """)
- embeddings_query = st.text_input(label="search term")
  if embeddings_query is not None and embeddings_query != '':
      docs = search_vector_database(embeddings_query)
      st.warning(docs[0].page_content)
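This hunk shows only the tail of search_vector_database() plus the LangChain imports above it. One plausible wiring of the surrounding pieces, using those same imports, is sketched here; the OpenAIEmbeddings instance and the FAISS.from_texts setup are assumptions (only similarity_search_by_vector() appears in the file itself), and running it needs OPENAI_API_KEY set:

    from langchain.embeddings.openai import OpenAIEmbeddings
    from langchain.vectorstores import FAISS

    embeddings = OpenAIEmbeddings()                        # requires OPENAI_API_KEY
    db = FAISS.from_texts(["king", "queen"], embeddings)   # the two "documents" from the exercise

    def search_vector_database(term):
        # Embed the search term with the same model used for the documents,
        # then return the stored documents closest to that vector.
        embedding_vector = embeddings.embed_query(term)
        docs = db.similarity_search_by_vector(embedding_vector)
        return docs

    print(search_vector_database("prince")[0].page_content)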
@@ -623,7 +694,7 @@ if embeddings_query is not None and embeddings_query != '':
  """)

  divider()
- st.caption("Conclusion")
+ st.subheader("Conclusion")
  st.write("""\
  As embedding algorithms are trained on a vast corpus of data, they inherently encapsulate a rich
  tapestry of information about our language and even the world at large. Therefore, they can be used for:
@@ -643,10 +714,11 @@ with st.expander("References:"):
  - https://platform.openai.com/docs/guides/embeddings/use-cases
  """)

+
+ # *********************************************
  divider()
  st.header("Dimensionality Reduction (optional)")

-
  st.write("""\
  As was mentioned above, embedding vectors are learned in such a way that words with similar meanings
  are located close to each other in the space. However, this is an abstract concept that might be difficult to
@@ -728,7 +800,7 @@ elif dimensionality_name == 'PCA':
  """)
  embedding_dim = 1536
  embeddings = st.text_input("words to explore:",
-                            value="king queen man woman prince prince princess counselor minister teacher")
+                            value="king queen man woman prince princess counselor minister teacher")
  embeddings = embeddings.split()
  embeddings = {word: get_embeddings(word) for word in embeddings}

@@ -787,7 +859,7 @@ elif dimensionality_name == 't-SNE':
  """)
  embedding_dim = 1536
  embeddings = st.text_input("words to explore:",
-                            value="king queen man woman prince prince princess counselor minister teacher")
+                            value="king queen man woman prince princess counselor minister teacher")
  embeddings = embeddings.split()
  embeddings = {word: get_embeddings(word) for word in embeddings}

 