Commit 84c1553
1 Parent(s): 120ad45

add chapter about Embeddings and vector comparison charts

Files changed (3)
  1. .gitignore +2 -1
  2. app.py +75 -1
  3. requirements.txt +3 -1
.gitignore CHANGED
@@ -162,4 +162,5 @@ cython_debug/
  # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
  # and can be added to the global gitignore or merged into this file. For a more nuclear
  # option (not recommended) you can uncomment the following to ignore the entire idea folder.
- .idea/
+ .idea/
+ .streamlit/secrets.toml
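
The new ignore rule keeps the Streamlit secrets file out of version control; later in this commit, app.py reads the OpenAI key from it via st.secrets. A minimal sketch of that lookup (the key name matches the one used in app.py; the example file contents in the comment are an assumption, not part of the commit):

import streamlit as st

# .streamlit/secrets.toml (now gitignored) is expected to contain a line such as:
#   OPENAI_API_KEY = "sk-..."
# Streamlit loads the file at startup and exposes it as a dict-like object.
openai_api_key = st.secrets["OPENAI_API_KEY"]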
app.py CHANGED
@@ -213,7 +213,7 @@ if tokeniser_name == 'BPE':
  """)
 
 
- st.subheader("Try Yourself:")
+ st.subheader(":green[Try Yourself:]")
  st.write(f"""\
  *Using text area field below try to find or create a comprehensive vocabulary (training dataset) for Tokenisation, which can enhance the
  efficiency of the process. This approach helps to eliminate unknown tokens, thereby making the token sequence
@@ -445,3 +445,77 @@ with st.expander("References:"):
  divider()
  st.header("Embeddings")
  st.caption("TBD...")
+
+ st.write("""\
+ Following tokenization, each token is transformed into a vector of numeric characteristics, a process
+ known as 'embedding.' In this context, 'embedding' refers to the mapping of the discrete, categorical space of words
+ or tokens into a continuous, numeric space, which the model can manipulate more effectively.
+
+ Each dimension in this high-dimensional space can encapsulate a different facet of the token's meaning. For instance,
+ one dimension might capture the tense of a token if it's a verb, while another dimension might capture the degree of
+ positivity or negativity if the token is an adjective expressing sentiment. For example:
+ """)
+ st.code("""\
+ "I" -> [noun, person]
+ "love" -> [verb, feeling]
+ "machine" -> [noun, automation]
+ "learn" -> [verb, knowledge]
+ "##ing" -> [gerund, continues]
+ """)
+
+ st.write("""\
+ The actual embeddings in a typical NLP model would be in a much higher-dimensional space (often several hundred dimensions), but the idea is the same.
+ Embeddings are dynamically learned from the data, with the model adjusting these embeddings during
+ training to minimize the discrepancy between the predicted and actual outputs for a set of training examples.
+ Consequently, tokens with similar meanings often end up with similar embeddings.
+
+ In the context of Transformers, these embeddings are the inputs that the model uses. Once again, we represent all the
+ characteristics using numbers, not words.
+ """)
+
+ col1, col2 = st.columns(2)
+ token_king = col1.text_input("Choose words to compare embeddings:", value="king")
+ token_queen = col2.text_input("Choose words to compare embeddings:", value="queen")
+
+ from torch import nn
+ from transformers import AutoConfig
+ from transformers import AutoTokenizer
+ import pandas as pd
+ import openai
+
+ model_ckpt = 'bert-base-uncased'
+ tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
+ king_id = tokenizer(token_king, return_tensors="pt", add_special_tokens=False)
+ queen_id = tokenizer(token_queen, return_tensors="pt", add_special_tokens=False)
+
+ config = AutoConfig.from_pretrained(model_ckpt)
+ token_emb = nn.Embedding(config.vocab_size, config.hidden_size)
+ king_embeddings = token_emb(king_id.input_ids)
+ queen_embeddings = token_emb(queen_id.input_ids)
+ king_emb_np = king_embeddings.reshape(-1).detach().numpy()
+ queen_emb_np = queen_embeddings.reshape(-1).detach().numpy()
+
+
+ openai.api_key = st.secrets["OPENAI_API_KEY"]
+ EMBEDDING_MODEL = 'text-embedding-ada-002'
+ EMBEDDING_CTX_LENGTH = 8191
+ EMBEDDING_ENCODING = 'cl100k_base'
+ king = openai.Embedding.create(input=token_king, model=EMBEDDING_MODEL)["data"][0]["embedding"]
+ queen = openai.Embedding.create(input=token_queen, model=EMBEDDING_MODEL)["data"][0]["embedding"]
+
+ st.write("Google's 'bert-base-uncased' model embeddings:")
+ df = pd.DataFrame({f'"{token_king}" embeddings': king_emb_np[:50], f'"{token_queen}" embeddings': queen_emb_np[:50]})
+ st.line_chart(df)
+
+
+ st.write("OpenAI's 'text-embedding-ada-002' model embeddings:")
+ df = pd.DataFrame({f'"{token_king}" embeddings': king[:50], f'"{token_queen}" embeddings': queen[:50]})
+ st.line_chart(df)
+
+
+
+ with st.expander("References:"):
+     st.write("""\
+     - https://huggingface.co/blog/getting-started-with-embeddings
+     - https://huggingface.co/blog/1b-sentence-embeddings
+     """)
requirements.txt CHANGED
@@ -1,3 +1,5 @@
  streamlit~=1.21.0
  tokenizers~=0.13.3
- transformers~=4.31.0
+ transformers~=4.31.0
+ torch~=2.0.1
+ openai~=0.27.8