Commit e9755d9 • 1 Parent(s): 4bb4754

add embeddings explanation and dimensionality reduction explanation

Files changed (2):
  1. app.py +282 -23
  2. requirements.txt +2 -1
app.py CHANGED
@@ -1,4 +1,5 @@
  import streamlit as st
 
  # TODO: move to 'utils'
  mystyle = '''
@@ -15,6 +16,11 @@ def divider():
  _, c, _ = st.columns(3)
  c.divider()
 
  st.title("Transformers: Tokenisers and Embeddings")
 
  preface_image, preface_text, = st.columns(2)
@@ -288,7 +294,7 @@ elif tokeniser_name == 'Unigram':
  according to their probabilities.
  """)
 
- st.subheader("Try Yourself:")
  st.write(f"""\
  *Using the text area field below, try to find or create a comprehensive vocabulary (training dataset) for Tokenisation, which can enhance the
  efficiency of the process. This approach helps to eliminate unknown tokens, thereby making the token sequence
@@ -358,7 +364,7 @@ elif tokeniser_name == 'WordPiece':
  it.
  """)
 
- st.subheader("Try Yourself:")
  st.write(f"""\
  *Using the text area field below, try to find or create a comprehensive vocabulary (training dataset) for Tokenisation, which can enhance the
  efficiency of the process. This approach helps to eliminate unknown tokens, thereby making the token sequence
@@ -472,11 +478,17 @@ st.write("""\
  characteristics using numbers, not words.
  """)
 
- # TODO: cache
 
- col1, col2 = st.columns(2)
- token_king = col1.text_input("Choose a word to compare embeddings:", value="king")
- token_queen = col2.text_input("Choose a word to compare embeddings:", value="queen")
 
  from torch import nn
  from transformers import AutoConfig
@@ -502,28 +514,61 @@ openai.api_key = st.secrets["OPENAI_API_KEY"]
  EMBEDDING_MODEL = 'text-embedding-ada-002'
  EMBEDDING_CTX_LENGTH = 8191
  EMBEDDING_ENCODING = 'cl100k_base'
- king = openai.Embedding.create(input=token_king, model=EMBEDDING_MODEL)["data"][0]["embedding"]
- queen = openai.Embedding.create(input=token_queen, model=EMBEDDING_MODEL)["data"][0]["embedding"]
 
- df = pd.DataFrame({f'"{token_king}" embeddings': king_emb_np[:50], f'"{token_queen}" embeddings': queen_emb_np[:50]})
- fig = px.line(df, title="Google's 'bert-base-uncased' model embeddings")
  fig.update_layout(legend=dict(orientation="h"))
  st.plotly_chart(fig, use_container_width=True)
 
- df = pd.DataFrame({f'"{token_king}" embeddings': king[:50], f'"{token_queen}" embeddings': queen[:50]})
- fig = px.line(df, title="OpenAI's 'text-embedding-ada-002' model embeddings")
  fig.update_layout(legend=dict(orientation="h"))
  st.plotly_chart(fig, use_container_width=True)
 
- import numpy as np
 
- sentence = st.text_input(label="words to explore embeddings", value="a the king queen space sit eat from on")
- sentence = sentence.split()
 
- def get_embeddings(text):
- return np.array(openai.Embedding.create(input=text, model=EMBEDDING_MODEL)["data"][0]["embedding"])
 
  input = {word: get_embeddings(word) for word in sentence}
 
@@ -534,24 +579,238 @@ for i, word_i in enumerate(sentence):
 
  fig = px.imshow(scores_matrix, x=sentence, y=sentence, color_continuous_scale="hot_r")
  fig.update_layout(coloraxis_showscale=False)
- fig.update_layout(width=6000, title_text='Similar words have similar embeddings')
  st.plotly_chart(fig, use_container_width=True)
 
  from langchain.embeddings.openai import OpenAIEmbeddings
  from langchain.vectorstores import FAISS
  from langchain.schema.document import Document
- db = FAISS.from_documents([Document(page_content="king"), Document(page_content="queen")], OpenAIEmbeddings(model=EMBEDDING_MODEL))
 
  embeddings_query = st.text_input(label="search term")
  if embeddings_query is not None and embeddings_query != '':
- embedding_vector = OpenAIEmbeddings(model=EMBEDDING_MODEL).embed_query(embeddings_query)
- docs = db.similarity_search_by_vector(embedding_vector)
- st.write(docs[0].page_content)
 
- st.caption("PCA explanation (optional materials) TBD...")
 
  with st.expander("References:"):
  st.write("""\
  - https://huggingface.co/blog/getting-started-with-embeddings
  - https://huggingface.co/blog/1b-sentence-embeddings
  """)

app.py (after changes):

  import streamlit as st
+ import numpy as np
 
  # TODO: move to 'utils'
  mystyle = '''
 
  _, c, _ = st.columns(3)
  c.divider()
 
+ @st.cache_data
+ def get_embeddings(text):
+ return np.array(openai.Embedding.create(input=text, model=EMBEDDING_MODEL)["data"][0]["embedding"])
+
+
  st.title("Transformers: Tokenisers and Embeddings")
 
  preface_image, preface_text, = st.columns(2)
 
  according to their probabilities.
  """)
 
+ st.subheader(":green[Try Yourself:]")
  st.write(f"""\
  *Using the text area field below, try to find or create a comprehensive vocabulary (training dataset) for Tokenisation, which can enhance the
  efficiency of the process. This approach helps to eliminate unknown tokens, thereby making the token sequence
 
  it.
  """)
 
+ st.subheader(":green[Try Yourself:]")
  st.write(f"""\
  *Using the text area field below, try to find or create a comprehensive vocabulary (training dataset) for Tokenisation, which can enhance the
  efficiency of the process. This approach helps to eliminate unknown tokens, thereby making the token sequence
 
  characteristics using numbers, not words.
  """)
 
+ st.write("""\
+ Let's explore embeddings in more detail. We can take an experimental approach by encoding two specific
+ words and examining the corresponding embedding vectors they generate. To make our exploration more accessible,
+ we'll visualise a portion of these vectors, thereby unveiling the underlying structure of embeddings. Pay attention
+ to common patterns and peaks, and try to find two words that yield differing embeddings.
+ """)
+ col1, col2, col3 = st.columns(3)
+ token_king = col1.text_input("Choose a word:", value="king")
+ token_queen = col2.text_input("Choose a word:", value="queen")
+ token_dots = col3.number_input("Number of dots:", value=50, min_value=0, max_value=1536)
 
  from torch import nn
  from transformers import AutoConfig
 
  EMBEDDING_MODEL = 'text-embedding-ada-002'
  EMBEDDING_CTX_LENGTH = 8191
  EMBEDDING_ENCODING = 'cl100k_base'
+ king = get_embeddings(token_king)
+ queen = get_embeddings(token_queen)
 
+
+ df = pd.DataFrame({f'"{token_king}" embeddings': king_emb_np, f'"{token_queen}" embeddings': queen_emb_np})
+ fig = px.line(df[:token_dots], title=f"Google's 'bert-base-uncased' model embeddings, embedding vector size: {len(queen_emb_np)}")
  fig.update_layout(legend=dict(orientation="h"))
  st.plotly_chart(fig, use_container_width=True)
 
+ with st.expander("Python Code:"):
+ st.code(f"""\
+ from torch import nn
+ from transformers import AutoConfig, AutoTokenizer
+
+ model_ckpt = 'bert-base-uncased'
+ tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
+ king_id = tokenizer("{token_king}", return_tensors="pt", add_special_tokens=False)
+ queen_id = tokenizer("{token_queen}", return_tensors="pt", add_special_tokens=False)
+
+ config = AutoConfig.from_pretrained(model_ckpt)
+ token_emb = nn.Embedding(config.vocab_size, config.hidden_size)
+ king_embeddings = token_emb(king_id.input_ids)
+ queen_embeddings = token_emb(queen_id.input_ids)
+ """)
 
+ df = pd.DataFrame({f'"{token_king}" embeddings': king, f'"{token_queen}" embeddings': queen})
+ fig = px.line(df[:token_dots], title=f"OpenAI's 'text-embedding-ada-002' model embeddings, embedding vector size: {len(queen)}")
  fig.update_layout(legend=dict(orientation="h"))
  st.plotly_chart(fig, use_container_width=True)
 
 
+ with st.expander("Python Code:"):
+ st.code(f"""\
+ import openai
+
+ EMBEDDING_MODEL = 'text-embedding-ada-002'
+
+ king_embeddings = np.array(openai.Embedding.create(input="{token_king}", model=EMBEDDING_MODEL)["data"][0]["embedding"])
+ queen_embeddings = np.array(openai.Embedding.create(input="{token_queen}", model=EMBEDDING_MODEL)["data"][0]["embedding"])
+ """)
 
+ st.write("""\
+ The similarity can be represented as a similarity score. Identical words naturally have the highest
+ score (black colours), while unrelated terms have lower scores (white colours). To compute this score,
+ we construct a matrix from our embedding vectors. Each row in this matrix corresponds to a unique word in the
+ sentence, while each column aligns with another word. The value at the intersection of row i and column j represents
+ the score between word i and word j. For a clearer understanding, let's visualise this matrix using a heatmap. Each
+ cell in the grid corresponds to a pair of words, and the colour of the cell indicates the similarity (correlation)
+ score between those two words. The intensity of the colour directly corresponds to the magnitude of the score: the
+ darker the hue, the higher the score.
+ """)
+
+ st.write("""Here is a heatmap of the score matrix for the sentence:""")
+ sentence = st.text_input(label="*words to explore embeddings*", value="a the king queen space sit eat from on")
+ sentence = sentence.split()
 
  input = {word: get_embeddings(word) for word in sentence}
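The code that actually fills scores_matrix (the loop over enumerate(sentence) visible in the hunk header above) is unchanged by this commit, so the diff hides it. As a rough sketch of how such a pairwise score matrix is typically built, assuming plain dot-product similarity over the get_embeddings vectors; the vectorised form below is illustrative rather than the app's actual loop:

    import numpy as np

    # Pairwise similarity of every word in the sentence against every other word.
    # ada-002 embeddings are unit length, so the dot product behaves like cosine similarity.
    words = list(input.keys())
    vectors = np.array([input[word] for word in words])
    scores_matrix = vectors @ vectors.T  # scores_matrix[i, j] = similarity of word i and word j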
 
 
  fig = px.imshow(scores_matrix, x=sentence, y=sentence, color_continuous_scale="hot_r")
  fig.update_layout(coloraxis_showscale=False)
+ fig.update_layout(width=6000)
  st.plotly_chart(fig, use_container_width=True)
 
+ st.subheader(":green[Try Yourself:]")
+
  from langchain.embeddings.openai import OpenAIEmbeddings
  from langchain.vectorstores import FAISS
  from langchain.schema.document import Document
 
+ @st.cache_resource
+ def create_vector_database():
+ return FAISS.from_documents([Document(page_content="king"), Document(page_content="queen")], OpenAIEmbeddings(model=EMBEDDING_MODEL))
+ db = create_vector_database()
+
+ @st.cache_data
+ def search_vector_database(term):
+ embedding_vector = OpenAIEmbeddings(model=EMBEDDING_MODEL).embed_query(term)
+ docs = db.similarity_search_by_vector(embedding_vector)
+ return docs
+
+ st.write("""\
+ *There is a vector database containing two words: 'king' and 'queen'. Your task is to pinpoint search
+ terms that would yield either of these words. To facilitate this, use the previously presented similarity matrix to
+ seek out words that give a higher correlation with the word in question. For instance, you might want to explore
+ terms such as 'king', 'queen', 'dog', 'prince', 'man', 'minister', 'boy'.*
+ """)
  embeddings_query = st.text_input(label="search term")
  if embeddings_query is not None and embeddings_query != '':
+ docs = search_vector_database(embeddings_query)
+ st.warning(docs[0].page_content)
 
 
+ with st.expander("Python Code:"):
+ st.code(f"""\
+ from langchain.embeddings.openai import OpenAIEmbeddings
+ from langchain.vectorstores import FAISS
+ from langchain.schema.document import Document
+
+
+ db = FAISS.from_documents([Document(page_content="king"), Document(page_content="queen")], OpenAIEmbeddings(model=EMBEDDING_MODEL))
+ embedding_vector = OpenAIEmbeddings(model=EMBEDDING_MODEL).embed_query("{embeddings_query}")
+ docs = db.similarity_search_by_vector(embedding_vector)
+ """)
+
+ divider()
+ st.caption("Conclusion")
+ st.write("""\
+ As embedding models are trained on a vast corpus of data, they inherently encapsulate a rich
+ tapestry of information about our language and even the world at large. Therefore, they can be used for:
+
+ - Search (where results are ranked by relevance to a query string)
+ - Clustering (where text strings are grouped by similarity)
+ - Recommendations (where items with related text strings are recommended)
+ - Anomaly detection (where outliers with little relatedness are identified)
+ - Diversity measurement (where similarity distributions are analyzed)
+ - Classification (where text strings are classified by their most similar label)
+ """)
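The first use case in the list above, search ranked by relevance, follows directly from what this file already does: embed the query, embed each candidate, and sort by similarity. A minimal sketch reusing the get_embeddings helper defined earlier in app.py; rank_by_relevance and the sample strings are illustrative and not part of this commit:

    import numpy as np

    def rank_by_relevance(query, candidates):
        # Score each candidate by cosine similarity to the query embedding, highest first.
        query_vector = get_embeddings(query)
        scored = []
        for text in candidates:
            vector = get_embeddings(text)
            score = float(np.dot(query_vector, vector) /
                          (np.linalg.norm(query_vector) * np.linalg.norm(vector)))
            scored.append((score, text))
        return sorted(scored, reverse=True)

    # e.g. rank_by_relevance("royalty", ["king", "queen", "teacher"]) should rank "king" and "queen" above "teacher".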
 
  with st.expander("References:"):
  st.write("""\
  - https://huggingface.co/blog/getting-started-with-embeddings
  - https://huggingface.co/blog/1b-sentence-embeddings
+ - https://platform.openai.com/docs/guides/embeddings/use-cases
+ """)
+
+ divider()
+ st.header("Dimensionality Reduction (optional)")
+
+
+ st.write("""\
+ As was mentioned above, embedding vectors are learned in such a way that words with similar meanings
+ are located close to each other in the space. However, this is an abstract concept that might be difficult to
+ explore, understand and visualise, because word embeddings typically have hundreds of dimensions. To
+ solve this, we can use techniques like Principal Component Analysis (PCA) or t-SNE to reduce the dimensionality of
+ the vectors to 2D and plot them.
+ """)
+ st.write("""But first, let's talk about the meaning of dimensionality reduction using a simplified use case:""")
+
+ dimensionality_name = st.selectbox(label="Choose your example", options=["Simplified", "PCA", 't-SNE'])
+ if dimensionality_name == 'Simplified':
+ _, col2, _ = st.columns(3)
+ col2.image("assets/img.png")
+ st.write("""\
+ **Step 1: The context**\n
+ We have a 3D object (your hand) and a light source that's casting a 2D shadow of your hand onto a
+ wall. The shadow is a simpler, lower-dimensional representation of your hand.
+
+ **Step 2: Identifying the dimensions**\n
+ In this case, the dimensions are the different aspects of your hand that can be
+ observed: the length of your fingers, the width of your palm, the height (or depth) of your hand, the scars,
+ the colour of the skin, etc. However, we have a problem: we can't easily visualise or understand all these dimensions
+ at once, just as it's hard to imagine a 6-dimensional space.
+
+ **Step 3: Deciding on important dimensions**\n
+ Let's say you want to compare the number of fingers of different hands. In
+ this case, you don't need to know about the depth of the hand, the width of the palm, or other details like freckles,
+ scars, or skin colour. You just need a shadow that clearly shows the fingers. So, you decide to focus on the length
+ of the fingers, which can be easily shown in the shadow.
+
+ **Step 4: Reducing dimensions**\n
+ This is where you actually perform dimensionality reduction. You orient your hand in such
+ a way (giving the wall a high-five) that the shadow clearly shows the fingers. You've effectively reduced the
+ dimensions from 3D to 2D. Your hand is still a 3D object, but its shadow (the simplified representation you're using
+ for your comparison) is 2D.
+
+ **Step 5: Interpretation**\n
+ This hand and shadow example shows how dimensionality reduction simplifies a complex object
+ (the 3D hand) into a lower-dimensional representation (the 2D shadow) that retains the most important information (the
+ number of fingers) while discarding the less important details (like the depth of the hand, skin colour, etc.). It's
+ a process of prioritisation and simplification that makes it easier for us to understand and analyse the data (or the
+ hands, in this case).
+ """)
+ elif dimensionality_name == 'PCA':
+ st.write("""\
+ **Step 1: Understanding PCA**\n
+ PCA is a popular method for dimensionality reduction. It identifies the
+ axes in the feature space along which the original data varies the most. These axes are known as the principal
+ components, and they are orthogonal (perpendicular) to each other.
+
+ **Step 2: Projecting the Data**\n
+ Imagine that instead of just casting a shadow on the wall, you can cast your hand's
+ shadow onto a number of walls arranged at different angles around your hand. Each shadow is a different projection of
+ your hand. In PCA, these different walls represent different principal components, and the shadow on each wall is a
+ projection of your hand onto that principal component.
+
+ **Step 3: Choosing the Best Projection**\n
+ Now, consider the shadow that most accurately portrays the number of fingers on
+ your hand. This shadow corresponds to the principal component that captures the most variance in the data. In PCA,
+ this would be the first principal component.
+
+ **Step 4: Secondary Features**\n
+ Next, consider the shadow that, while not as accurate as the first, still gives a
+ reasonable representation of your hand, such as showing the width of your palm. This shadow represents the second
+ principal component, which captures the second highest amount of variance in the data.
+
+ **Step 5: Reduction of Dimensions**\n
+ In the process of reducing dimensions, we select the top few principal components
+ (shadows) that capture the most variance. The other dimensions (shadows) are discarded. So, instead of having to
+ consider the complex 3D structure of your hand, you can simply look at one or two shadows that give you the most
+ information about the hand.
+
+ **Step 6: Transformation**\n
+ Finally, we transform the original data into the reduced dimensional space defined by the
+ selected principal components. This is analogous to replacing each hand with the selected shadows for further analysis.
+ By using PCA, we can reduce the complexity of the data (from a 3D hand to a 2D or even 1D shadow), while still
+ retaining the most important information (like the number of fingers or the width of the palm). This makes the data
+ easier to visualize, understand, and work with.
+ """)
+ embedding_dim = 1536
+ embeddings = st.text_input("words to explore:",
+ value="king queen man woman prince princess counselor minister teacher")
+ embeddings = embeddings.split()
+ embeddings = {word: get_embeddings(word) for word in embeddings}
+
+ from sklearn.decomposition import PCA
+
+ pca = PCA(n_components=2)
+ embedding_matrix = np.array(list(embeddings.values()))
+ reduced_embeddings = pca.fit_transform(embedding_matrix)
+
+ df = pd.DataFrame(reduced_embeddings, columns=["X", "Y"])
+ df["Word"] = list(embeddings.keys())
+ fig = px.scatter(df, x="X", y="Y", text="Word", title="Word Embeddings", width=800, height=800)
+ st.plotly_chart(fig, use_container_width=True)
+
+ st.code(f"""\
+ from sklearn.decomposition import PCA
+
+ pca = PCA(n_components=2)
+ embedding_matrix = np.array(list(embeddings.values()))
+ reduced_embeddings = pca.fit_transform(embedding_matrix)
+ """, language='python')
+
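Because the PCA explanation above leans on "the shadow that captures the most variance", it can help to show how much variance the two selected components actually keep. A small, hedged addition to the PCA branch; it assumes the pca object fitted above, and the st.caption call is illustrative rather than part of this commit:

    # Fraction of the total variance retained by each of the two principal components.
    explained = pca.explained_variance_ratio_
    st.caption(f"Variance kept by the 2D projection: {explained.sum():.1%} "
               f"(PC1: {explained[0]:.1%}, PC2: {explained[1]:.1%})")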
+ elif dimensionality_name == 't-SNE':
+ st.write("""\
+ **Step 1: Understanding t-SNE**\n
+ t-SNE is a technique for dimensionality reduction that is particularly
+ well-suited for the visualization of high-dimensional datasets. Unlike PCA, which is a linear technique,
+ t-SNE is a non-linear technique, making it better at capturing complex relationships between variables.
+
+ **Step 2: Measuring Similarities**\n
+ Imagine that instead of just one hand, you have many hands casting shadows. Each hand
+ is different: some hands might have longer fingers, some might have a wider palm, and so on. Each hand has its own
+ "neighborhood" of similar hands. In t-SNE, these neighborhoods are represented mathematically by a probability
+ distribution. Hands that are very similar to each other have a high probability of being "neighbors", while hands
+ that are very different have a low probability.
+
+ **Step 3: Creating a Map**\n
+ t-SNE creates a map (or a projection) where hands that were close in the high-dimensional
+ space (similar hands) are still close in the low-dimensional space (in their shadows), and hands that were far apart
+ in the high-dimensional space (different hands) are still far apart in the low-dimensional space. This map is created
+ in such a way that it minimizes the difference between the distances in the high-dimensional space and the distances
+ in the low-dimensional space.
+
+ **Step 4: Reducing Dimensions**\n
+ The process of reducing dimensions in t-SNE involves optimizing the locations of each
+ hand's shadow in the low-dimensional space such that the overall configuration of shadows best represents the
+ similarities between the hands in the high-dimensional space.
+
+ **Step 5: Interpretation**\n
+ The result of t-SNE is a map where similar hands are located close together and dissimilar
+ hands are located far apart. This makes it easier to visualize clusters or groups of similar hands.
+ t-SNE, therefore, helps us to project high-dimensional data into a lower-dimensional space in a way that preserves
+ the structure of the data as much as possible, making it easier to visualize and understand the relationships in the
+ data.
+ """)
+ embedding_dim = 1536
+ embeddings = st.text_input("words to explore:",
+ value="king queen man woman prince princess counselor minister teacher")
+ embeddings = embeddings.split()
+ embeddings = {word: get_embeddings(word) for word in embeddings}
+
+ from sklearn.manifold import TSNE
+
+ tsne = TSNE(n_components=2, perplexity=2, random_state=0)
+ embedding_matrix = np.array(list(embeddings.values()))
+ reduced_embeddings = tsne.fit_transform(embedding_matrix)
+
+ df = pd.DataFrame(reduced_embeddings, columns=["X", "Y"])
+ df["Word"] = list(embeddings.keys())
+ fig = px.scatter(df, x="X", y="Y", text="Word", title="Word Embeddings", width=800, height=800)
+ st.plotly_chart(fig, use_container_width=True)
+
+ st.code(f"""\
+ from sklearn.manifold import TSNE
+
+ tsne = TSNE(n_components=2, perplexity=2, random_state=0)
+ embedding_matrix = np.array(list(embeddings.values()))
+ reduced_embeddings = tsne.fit_transform(embedding_matrix)
+ """, language='python')
+
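One caveat about perplexity=2 in the snippet above: recent scikit-learn releases (including the ~1.3 pin added to requirements.txt below) reject a perplexity that is not smaller than the number of samples, so a very short word list makes fit_transform raise an error. A hedged guard; safe_tsne is an illustrative helper, not part of this commit:

    from sklearn.manifold import TSNE

    def safe_tsne(embedding_matrix, desired_perplexity=2.0):
        # Clamp perplexity below n_samples so tiny word lists still work.
        perplexity = min(desired_perplexity, max(1.0, embedding_matrix.shape[0] - 1))
        tsne = TSNE(n_components=2, perplexity=perplexity, random_state=0)
        return tsne.fit_transform(embedding_matrix)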
+ with st.expander("References:"):
+ st.write("""\
+ - https://hex.tech/blog/dimensionality-reduction/
+ - https://github.com/openai/openai-cookbook/blob/main/examples/Visualizing_embeddings_in_2D.ipynb
  """)
requirements.txt CHANGED
@@ -6,4 +6,5 @@ openai~=0.27.8
  plotly~=5.15.0
  langchain~=0.0.242
  faiss-cpu~=1.7.4
- tiktoken~=0.4.0

requirements.txt (after changes):

  plotly~=5.15.0
  langchain~=0.0.242
  faiss-cpu~=1.7.4
+ tiktoken~=0.4.0
+ scikit-learn~=1.3.0