Commit 84c1553
1 Parent(s): 120ad45

add chapter about Embeddings and vector comparison charts

Files changed (3)
  1. .gitignore +2 -1
  2. app.py +75 -1
  3. requirements.txt +3 -1
.gitignore CHANGED
@@ -162,4 +162,5 @@ cython_debug/
  # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
  # and can be added to the global gitignore or merged into this file. For a more nuclear
  # option (not recommended) you can uncomment the following to ignore the entire idea folder.
- .idea/
+ .idea/
+ .streamlit/secrets.toml
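
The new ignore rule keeps the Streamlit secrets file out of version control; later in this commit, app.py reads the OpenAI key from it via st.secrets. A minimal sketch of that lookup (the key name matches the one used in app.py; the example file contents in the comment are an assumption, not part of the commit):

import streamlit as st

# .streamlit/secrets.toml (now gitignored) is expected to contain a line such as:
#   OPENAI_API_KEY = "sk-..."
# Streamlit loads the file at startup and exposes it as a dict-like object.
openai_api_key = st.secrets["OPENAI_API_KEY"]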
app.py CHANGED
@@ -213,7 +213,7 @@ if tokeniser_name == 'BPE':
  """)
 
 
- st.subheader("Try Yourself:")
+ st.subheader(":green[Try Yourself:]")
  st.write(f"""\
  *Using text area field below try to find or create a comprehensive vocabulary (training dataset) for Tokenisation, which can enhance the
  efficiency of the process. This approach helps to eliminate unknown tokens, thereby making the token sequence
@@ -445,3 +445,77 @@ with st.expander("References:"):
  divider()
  st.header("Embeddings")
  st.caption("TBD...")
+
+ st.write("""\
+ Following tokenization, each token is transformed into a vector of numeric characteristics, a process
+ known as 'embedding.' In this context, 'embedding' refers to the mapping of the discrete, categorical space of words
+ or tokens into a continuous, numeric space, which the model can manipulate more effectively.
+
+ Each dimension in this high-dimensional space can encapsulate a different facet of the token's meaning. For instance,
+ one dimension might capture the tense of a token if it's a verb, while another dimension might capture the degree of
+ positivity or negativity if the token is an adjective expressing sentiment. For example:
+ """)
+ st.code("""\
+ "I" -> [noun, person]
+ "love" -> [verb, feeling]
+ "machine" -> [noun, automation]
+ "learn" -> [verb, knowledge]
+ "##ing" -> [gerund, continues]
+ """)
+
+ st.write("""\
+ The actual embeddings in a typical NLP model would be in a much higher-dimensional space (often several hundred dimensions), but the idea is the same.
+ Embeddings are dynamically learned from the data, with the model adjusting these embeddings during
+ training to minimize the discrepancy between the predicted and actual outputs for a set of training examples.
+ Consequently, tokens with similar meanings often end up with similar embeddings.
+
+ In the context of Transformers, these embeddings are the inputs that the model uses. Once again, we represent all the
+ characteristics using numbers, not words.
+ """)
+
+ col1, col2 = st.columns(2)
+ token_king = col1.text_input("Choose words to compare embeddings:", value="king")
+ token_queen = col2.text_input("Choose words to compare embeddings:", value="queen")
+
+ from torch import nn
+ from transformers import AutoConfig
+ from transformers import AutoTokenizer
+ import pandas as pd
+ import openai
+
+ model_ckpt = 'bert-base-uncased'
+ tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
+ king_id = tokenizer(token_king, return_tensors="pt", add_special_tokens=False)
+ queen_id = tokenizer(token_queen, return_tensors="pt", add_special_tokens=False)
+
+ config = AutoConfig.from_pretrained(model_ckpt)
+ token_emb = nn.Embedding(config.vocab_size, config.hidden_size)
+ king_embeddings = token_emb(king_id.input_ids)
+ queen_embeddings = token_emb(queen_id.input_ids)
+ king_emb_np = king_embeddings.reshape(-1).detach().numpy()
+ queen_emb_np = queen_embeddings.reshape(-1).detach().numpy()
+
+
+ openai.api_key = st.secrets["OPENAI_API_KEY"]
+ EMBEDDING_MODEL = 'text-embedding-ada-002'
+ EMBEDDING_CTX_LENGTH = 8191
+ EMBEDDING_ENCODING = 'cl100k_base'
+ king = openai.Embedding.create(input=token_king, model=EMBEDDING_MODEL)["data"][0]["embedding"]
+ queen = openai.Embedding.create(input=token_queen, model=EMBEDDING_MODEL)["data"][0]["embedding"]
+
+ st.write("Google's 'bert-base-uncased' model embeddings:")
+ df = pd.DataFrame({f'"{token_king}" embeddings': king_emb_np[:50], f'"{token_queen}" embeddings': queen_emb_np[:50]})
+ st.line_chart(df)
+
+
+ st.write("OpenAI's 'text-embedding-ada-002' model embeddings:")
+ df = pd.DataFrame({f'"{token_king}" embeddings': king[:50], f'"{token_queen}" embeddings': queen[:50]})
+ st.line_chart(df)
+
+
+
+ with st.expander("References:"):
+     st.write("""\
+     - https://huggingface.co/blog/getting-started-with-embeddings
+     - https://huggingface.co/blog/1b-sentence-embeddings
+     """)
requirements.txt CHANGED
@@ -1,3 +1,5 @@
  streamlit~=1.21.0
  tokenizers~=0.13.3
- transformers~=4.31.0
+ transformers~=4.31.0
+ torch~=2.0.1
+ openai~=0.27.8