im commited on
Commit
4bb4754
1 Parent(s): 348c0ea

add heatmap and vector search examples

Browse files
Files changed (2) hide show
  1. app.py +34 -3
  2. requirements.txt +4 -1
app.py CHANGED
@@ -444,7 +444,6 @@ with st.expander("References:"):
444
 
445
  divider()
446
  st.header("Embeddings")
447
- st.caption("TBD...")
448
 
449
  st.write("""\
450
  Following tokenization, each token is transformed into a vector of numeric characteristics, a process
@@ -473,9 +472,11 @@ st.write("""\
473
  characteristics using numbers, not words.
474
  """)
475
 
 
 
476
  col1, col2 = st.columns(2)
477
- token_king = col1.text_input("Choose words to compare embeddings:", value="king")
478
- token_queen = col2.text_input("Choose words to compare embeddings:", value="queen")
479
 
480
  from torch import nn
481
  from transformers import AutoConfig
@@ -516,8 +517,38 @@ fig.update_layout(legend=dict(orientation="h"))
516
  st.plotly_chart(fig, use_container_width=True)
517
 
518
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
519
 
 
 
 
 
 
520
 
 
521
 
522
  with st.expander("References:"):
523
  st.write("""\
 
444
 
445
  divider()
446
  st.header("Embeddings")
 
447
 
448
  st.write("""\
449
  Following tokenization, each token is transformed into a vector of numeric characteristics, a process
 
472
  characteristics using numbers, not words.
473
  """)
474
 
475
+ # TODO: cache
476
+
477
  col1, col2 = st.columns(2)
478
+ token_king = col1.text_input("Choose a word to compare embeddings:", value="king")
479
+ token_queen = col2.text_input("Choose a word to compare embeddings:", value="queen")
480
 
481
  from torch import nn
482
  from transformers import AutoConfig
 
517
  st.plotly_chart(fig, use_container_width=True)
518
 
519
 
520
+ import numpy as np
521
+
522
+ sentence = st.text_input(label="words to explore embeddings", value="a the king queen space sit eat from on")
523
+ sentence = sentence.split()
524
+
525
+ def get_embeddings(text):
526
+ return np.array(openai.Embedding.create(input=text, model=EMBEDDING_MODEL)["data"][0]["embedding"])
527
+
528
+ input = {word: get_embeddings(word) for word in sentence}
529
+
530
+ scores_matrix = np.zeros((len(sentence), len(sentence)))
531
+ for i, word_i in enumerate(sentence):
532
+ for j, word_j in enumerate(sentence):
533
+ scores_matrix[i, j] = np.dot(input[word_i], input[word_j])
534
+
535
+ fig = px.imshow(scores_matrix, x=sentence, y=sentence, color_continuous_scale="hot_r")
536
+ fig.update_layout(coloraxis_showscale=False)
537
+ fig.update_layout(width=6000, title_text='Similar words have similar embeddings')
538
+ st.plotly_chart(fig, use_container_width=True)
539
+
540
+ from langchain.embeddings.openai import OpenAIEmbeddings
541
+ from langchain.vectorstores import FAISS
542
+ from langchain.schema.document import Document
543
+ db = FAISS.from_documents([Document(page_content="king"), Document(page_content="queen")], OpenAIEmbeddings(model=EMBEDDING_MODEL))
544
 
545
+ embeddings_query = st.text_input(label="search term")
546
+ if embeddings_query is not None and embeddings_query != '':
547
+ embedding_vector = OpenAIEmbeddings(model=EMBEDDING_MODEL).embed_query(embeddings_query)
548
+ docs = db.similarity_search_by_vector(embedding_vector)
549
+ st.write(docs[0].page_content)
550
 
551
+ st.caption("PCA explanation (optional materials) TBD...")
552
 
553
  with st.expander("References:"):
554
  st.write("""\
requirements.txt CHANGED
@@ -3,4 +3,7 @@ tokenizers~=0.13.3
3
  transformers~=4.31.0
4
  torch~=2.0.1
5
  openai~=0.27.8
6
- plotly~=5.15.0
 
 
 
 
3
  transformers~=4.31.0
4
  torch~=2.0.1
5
  openai~=0.27.8
6
+ plotly~=5.15.0
7
+ langchain~=0.0.242
8
+ faiss-cpu~=1.7.4
9
+ tiktoken~=0.4.0