Commit e9755d9 • 1 Parent(s): 4bb4754

add embeddings explanation and dimensionality reduction explanation

Files changed (2):
  1. app.py +282 -23
  2. requirements.txt +2 -1
app.py CHANGED
@@ -1,4 +1,5 @@
  import streamlit as st
 
  # TODO: move to 'utils'
  mystyle = '''
@@ -15,6 +16,11 @@ def divider():
  _, c, _ = st.columns(3)
  c.divider()
 
  st.title("Transformers: Tokenisers and Embeddings")
 
  preface_image, preface_text, = st.columns(2)
@@ -288,7 +294,7 @@ elif tokeniser_name == 'Unigram':
  according to their probabilities.
  """)
 
- st.subheader("Try Yourself:")
  st.write(f"""\
  *Using the text area field below, try to find or create a comprehensive vocabulary (training dataset) for Tokenisation, which can enhance the
  efficiency of the process. This approach helps to eliminate unknown tokens, thereby making the token sequence
@@ -358,7 +364,7 @@ elif tokeniser_name == 'WordPiece':
  it.
  """)
 
- st.subheader("Try Yourself:")
  st.write(f"""\
  *Using the text area field below, try to find or create a comprehensive vocabulary (training dataset) for Tokenisation, which can enhance the
  efficiency of the process. This approach helps to eliminate unknown tokens, thereby making the token sequence
@@ -472,11 +478,17 @@ st.write("""\
  characteristics using numbers, not words.
  """)
 
- # TODO: cache
 
- col1, col2 = st.columns(2)
- token_king = col1.text_input("Choose a word to compare embeddings:", value="king")
- token_queen = col2.text_input("Choose a word to compare embeddings:", value="queen")
 
  from torch import nn
  from transformers import AutoConfig
@@ -502,28 +514,61 @@ openai.api_key = st.secrets["OPENAI_API_KEY"]
  EMBEDDING_MODEL = 'text-embedding-ada-002'
  EMBEDDING_CTX_LENGTH = 8191
  EMBEDDING_ENCODING = 'cl100k_base'
- king = openai.Embedding.create(input=token_king, model=EMBEDDING_MODEL)["data"][0]["embedding"]
- queen = openai.Embedding.create(input=token_queen, model=EMBEDDING_MODEL)["data"][0]["embedding"]
 
- df = pd.DataFrame({f'"{token_king}" embeddings': king_emb_np[:50], f'"{token_queen}" embeddings': queen_emb_np[:50]})
- fig = px.line(df, title="Google's 'bert-base-uncased' model embeddings")
  fig.update_layout(legend=dict(orientation="h"))
  st.plotly_chart(fig, use_container_width=True)
 
- df = pd.DataFrame({f'"{token_king}" embeddings': king[:50], f'"{token_queen}" embeddings': queen[:50]})
- fig = px.line(df, title="OpenAI's 'text-embedding-ada-002' model embeddings")
  fig.update_layout(legend=dict(orientation="h"))
  st.plotly_chart(fig, use_container_width=True)
 
- import numpy as np
 
- sentence = st.text_input(label="words to explore embeddings", value="a the king queen space sit eat from on")
- sentence = sentence.split()
 
- def get_embeddings(text):
- return np.array(openai.Embedding.create(input=text, model=EMBEDDING_MODEL)["data"][0]["embedding"])
 
  input = {word: get_embeddings(word) for word in sentence}
 
@@ -534,24 +579,238 @@ for i, word_i in enumerate(sentence):
 
  fig = px.imshow(scores_matrix, x=sentence, y=sentence, color_continuous_scale="hot_r")
  fig.update_layout(coloraxis_showscale=False)
- fig.update_layout(width=6000, title_text='Similar words have similar embeddings')
  st.plotly_chart(fig, use_container_width=True)
 
  from langchain.embeddings.openai import OpenAIEmbeddings
  from langchain.vectorstores import FAISS
  from langchain.schema.document import Document
- db = FAISS.from_documents([Document(page_content="king"), Document(page_content="queen")], OpenAIEmbeddings(model=EMBEDDING_MODEL))
 
  embeddings_query = st.text_input(label="search term")
  if embeddings_query is not None and embeddings_query != '':
- embedding_vector = OpenAIEmbeddings(model=EMBEDDING_MODEL).embed_query(embeddings_query)
- docs = db.similarity_search_by_vector(embedding_vector)
- st.write(docs[0].page_content)
 
- st.caption("PCA explanation (optional materials) TBD...")
 
  with st.expander("References:"):
  st.write("""\
  - https://huggingface.co/blog/getting-started-with-embeddings
  - https://huggingface.co/blog/1b-sentence-embeddings
  """)

app.py (after changes):

  import streamlit as st
+ import numpy as np
 
  # TODO: move to 'utils'
  mystyle = '''
 
  _, c, _ = st.columns(3)
  c.divider()
 
+ @st.cache_data
+ def get_embeddings(text):
+ return np.array(openai.Embedding.create(input=text, model=EMBEDDING_MODEL)["data"][0]["embedding"])
+
+
  st.title("Transformers: Tokenisers and Embeddings")
 
  preface_image, preface_text, = st.columns(2)
 
  according to their probabilities.
  """)
 
+ st.subheader(":green[Try Yourself:]")
  st.write(f"""\
  *Using the text area field below, try to find or create a comprehensive vocabulary (training dataset) for Tokenisation, which can enhance the
  efficiency of the process. This approach helps to eliminate unknown tokens, thereby making the token sequence
 
  it.
  """)
 
+ st.subheader(":green[Try Yourself:]")
  st.write(f"""\
  *Using the text area field below, try to find or create a comprehensive vocabulary (training dataset) for Tokenisation, which can enhance the
  efficiency of the process. This approach helps to eliminate unknown tokens, thereby making the token sequence
 
  characteristics using numbers, not words.
  """)
 
+ st.write("""\
+ Let's explore embeddings in more detail. We can take an experimental approach by encoding two specific
+ words and examining the corresponding embedding vectors they generate. To make our exploration more accessible,
+ we'll visualise a portion of these vectors, thereby unveiling the underlying structure of embeddings. Pay attention
+ to common patterns and peaks, and try to find two words that yield differing embeddings.
+ """)
+ col1, col2, col3 = st.columns(3)
+ token_king = col1.text_input("Choose a word:", value="king")
+ token_queen = col2.text_input("Choose a word:", value="queen")
+ token_dots = col3.number_input("Number of dots:", value=50, min_value=0, max_value=1536)
 
  from torch import nn
  from transformers import AutoConfig
 
  EMBEDDING_MODEL = 'text-embedding-ada-002'
  EMBEDDING_CTX_LENGTH = 8191
  EMBEDDING_ENCODING = 'cl100k_base'
+ king = get_embeddings(token_king)
+ queen = get_embeddings(token_queen)
 
+
+ df = pd.DataFrame({f'"{token_king}" embeddings': king_emb_np, f'"{token_queen}" embeddings': queen_emb_np})
+ fig = px.line(df[:token_dots], title=f"Google's 'bert-base-uncased' model embeddings, embedding vector size: {len(queen_emb_np)}")
  fig.update_layout(legend=dict(orientation="h"))
  st.plotly_chart(fig, use_container_width=True)
 
+ with st.expander("Python Code:"):
+ st.code(f"""\
+ from torch import nn
+ from transformers import AutoConfig, AutoTokenizer
+
+ model_ckpt = 'bert-base-uncased'
+ tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
+ king_id = tokenizer("{token_king}", return_tensors="pt", add_special_tokens=False)
+ queen_id = tokenizer("{token_queen}", return_tensors="pt", add_special_tokens=False)
+
+ config = AutoConfig.from_pretrained(model_ckpt)
+ token_emb = nn.Embedding(config.vocab_size, config.hidden_size)
+ king_embeddings = token_emb(king_id.input_ids)
+ queen_embeddings = token_emb(queen_id.input_ids)
+ """)
 
+ df = pd.DataFrame({f'"{token_king}" embeddings': king, f'"{token_queen}" embeddings': queen})
+ fig = px.line(df[:token_dots], title=f"OpenAI's 'text-embedding-ada-002' model embeddings, embedding vector size: {len(queen)}")
  fig.update_layout(legend=dict(orientation="h"))
  st.plotly_chart(fig, use_container_width=True)
 
 
+ with st.expander("Python Code:"):
+ st.code(f"""\
+ import openai
+
+ EMBEDDING_MODEL = 'text-embedding-ada-002'
+
+ king_embeddings = np.array(openai.Embedding.create(input="{token_king}", model=EMBEDDING_MODEL)["data"][0]["embedding"])
+ queen_embeddings = np.array(openai.Embedding.create(input="{token_queen}", model=EMBEDDING_MODEL)["data"][0]["embedding"])
+ """)
 
+ st.write("""\
+ The similarity can be represented as a similarity score. Identical words naturally have the highest
+ score (black colours), while unrelated terms have lower scores (white colours). To compute this score,
+ we construct a matrix from our embedding vectors. Each row in this matrix corresponds to a unique word in the
+ sentence, while each column aligns with another word. The value at the intersection of row i and column j represents
+ the score between word i and word j. For a clearer understanding, let's visualise this matrix using a heatmap. Each
+ cell in the grid corresponds to a pair of words, and the colour of the cell indicates the similarity (correlation)
+ score between those two words. The intensity of the colour directly corresponds to the magnitude of the score: the
+ darker the hue, the higher the score.
+ """)
+
+ st.write("""Here is a heatmap of the score matrix for the sentence:""")
+ sentence = st.text_input(label="*words to explore embeddings*", value="a the king queen space sit eat from on")
+ sentence = sentence.split()
 
  input = {word: get_embeddings(word) for word in sentence}
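The code that actually fills scores_matrix (the loop over enumerate(sentence) visible in the hunk header above) is unchanged by this commit, so the diff hides it. As a rough sketch of how such a pairwise score matrix is typically built, assuming plain dot-product similarity over the get_embeddings vectors; the vectorised form below is illustrative rather than the app's actual loop:

    import numpy as np

    # Pairwise similarity of every word in the sentence against every other word.
    # ada-002 embeddings are unit length, so the dot product behaves like cosine similarity.
    words = list(input.keys())
    vectors = np.array([input[word] for word in words])
    scores_matrix = vectors @ vectors.T  # scores_matrix[i, j] = similarity of word i and word j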
 
 
  fig = px.imshow(scores_matrix, x=sentence, y=sentence, color_continuous_scale="hot_r")
  fig.update_layout(coloraxis_showscale=False)
+ fig.update_layout(width=6000)
  st.plotly_chart(fig, use_container_width=True)
 
+ st.subheader(":green[Try Yourself:]")
+
  from langchain.embeddings.openai import OpenAIEmbeddings
  from langchain.vectorstores import FAISS
  from langchain.schema.document import Document
 
+ @st.cache_resource
+ def create_vector_database():
+ return FAISS.from_documents([Document(page_content="king"), Document(page_content="queen")], OpenAIEmbeddings(model=EMBEDDING_MODEL))
+ db = create_vector_database()
+
+ @st.cache_data
+ def search_vector_database(term):
+ embedding_vector = OpenAIEmbeddings(model=EMBEDDING_MODEL).embed_query(term)
+ docs = db.similarity_search_by_vector(embedding_vector)
+ return docs
+
+ st.write("""\
+ *There is a vector database containing two words: 'king' and 'queen'. Your task is to pinpoint search
+ terms that would yield either of these words. To facilitate this, use the previously presented similarity matrix to
+ seek out words that give a higher correlation with the word in question. For instance, you might want to explore
+ terms such as 'king', 'queen', 'dog', 'prince', 'man', 'minister', 'boy'.*
+ """)
  embeddings_query = st.text_input(label="search term")
  if embeddings_query is not None and embeddings_query != '':
+ docs = search_vector_database(embeddings_query)
+ st.warning(docs[0].page_content)
 
 
+ with st.expander("Python Code:"):
+ st.code(f"""\
+ from langchain.embeddings.openai import OpenAIEmbeddings
+ from langchain.vectorstores import FAISS
+ from langchain.schema.document import Document
+
+
+ db = FAISS.from_documents([Document(page_content="king"), Document(page_content="queen")], OpenAIEmbeddings(model=EMBEDDING_MODEL))
+ embedding_vector = OpenAIEmbeddings(model=EMBEDDING_MODEL).embed_query("{embeddings_query}")
+ docs = db.similarity_search_by_vector(embedding_vector)
+ """)
+
+ divider()
+ st.caption("Conclusion")
+ st.write("""\
+ As embedding models are trained on a vast corpus of data, they inherently encapsulate a rich
+ tapestry of information about our language and even the world at large. Therefore, they can be used for:
+
+ - Search (where results are ranked by relevance to a query string)
+ - Clustering (where text strings are grouped by similarity)
+ - Recommendations (where items with related text strings are recommended)
+ - Anomaly detection (where outliers with little relatedness are identified)
+ - Diversity measurement (where similarity distributions are analyzed)
+ - Classification (where text strings are classified by their most similar label)
+ """)
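The first use case in the list above, search ranked by relevance, follows directly from what this file already does: embed the query, embed each candidate, and sort by similarity. A minimal sketch reusing the get_embeddings helper defined earlier in app.py; rank_by_relevance and the sample strings are illustrative and not part of this commit:

    import numpy as np

    def rank_by_relevance(query, candidates):
        # Score each candidate by cosine similarity to the query embedding, highest first.
        query_vector = get_embeddings(query)
        scored = []
        for text in candidates:
            vector = get_embeddings(text)
            score = float(np.dot(query_vector, vector) /
                          (np.linalg.norm(query_vector) * np.linalg.norm(vector)))
            scored.append((score, text))
        return sorted(scored, reverse=True)

    # e.g. rank_by_relevance("royalty", ["king", "queen", "teacher"]) should rank "king" and "queen" above "teacher".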
 
  with st.expander("References:"):
  st.write("""\
  - https://huggingface.co/blog/getting-started-with-embeddings
  - https://huggingface.co/blog/1b-sentence-embeddings
+ - https://platform.openai.com/docs/guides/embeddings/use-cases
+ """)
+
+ divider()
+ st.header("Dimensionality Reduction (optional)")
+
+
+ st.write("""\
+ As was mentioned above, embedding vectors are learned in such a way that words with similar meanings
+ are located close to each other in the space. However, this is an abstract concept that might be difficult to
+ explore, understand and visualise, because word embeddings typically have hundreds of dimensions. To
+ solve this, we can use techniques like Principal Component Analysis (PCA) or t-SNE to reduce the dimensionality of
+ the vectors to 2D and plot them.
+ """)
+ st.write("""But first, let's talk about the meaning of dimensionality reduction using a simplified use case:""")
+
+ dimensionality_name = st.selectbox(label="Choose your example", options=["Simplified", "PCA", 't-SNE'])
+ if dimensionality_name == 'Simplified':
+ _, col2, _ = st.columns(3)
+ col2.image("assets/img.png")
+ st.write("""\
+ **Step 1: The context**\n
+ We have a 3D object (your hand) and a light source that's casting a 2D shadow of your hand onto a
+ wall. The shadow is a simpler, lower-dimensional representation of your hand.
+
+ **Step 2: Identifying the dimensions**\n
+ In this case, the dimensions are the different aspects of your hand that can be
+ observed: the length of your fingers, the width of your palm, the height (or depth) of your hand, the scars,
+ the colour of the skin, etc. However, we have a problem: we can't easily visualise or understand all these dimensions
+ at once, just as it's hard to imagine a 6-dimensional space.
+
+ **Step 3: Deciding on important dimensions**\n
+ Let's say you want to compare the number of fingers of different hands. In
+ this case, you don't need to know about the depth of the hand, the width of the palm, or other details like freckles,
+ scars, or skin colour. You just need a shadow that clearly shows the fingers. So, you decide to focus on the length
+ of the fingers, which can be easily shown in the shadow.
+
+ **Step 4: Reducing dimensions**\n
+ This is where you actually perform dimensionality reduction. You orient your hand in such
+ a way (giving the wall a high-five) that the shadow clearly shows the fingers. You've effectively reduced the
+ dimensions from 3D to 2D. Your hand is still a 3D object, but its shadow (the simplified representation you're using
+ for your comparison) is 2D.
+
+ **Step 5: Interpretation**\n
+ This hand and shadow example shows how dimensionality reduction simplifies a complex object
+ (the 3D hand) into a lower-dimensional representation (the 2D shadow) that retains the most important information (the
+ number of fingers) while discarding the less important details (like the depth of the hand, skin colour, etc.). It's
+ a process of prioritisation and simplification that makes it easier for us to understand and analyse the data (or the
+ hands, in this case).
+ """)
+ elif dimensionality_name == 'PCA':
+ st.write("""\
+ **Step 1: Understanding PCA**\n
+ PCA is a popular method for dimensionality reduction. It identifies the
+ axes in the feature space along which the original data varies the most. These axes are known as the principal
+ components, and they are orthogonal (perpendicular) to each other.
+
+ **Step 2: Projecting the Data**\n
+ Imagine that instead of just casting a shadow on the wall, you can cast your hand's
+ shadow onto a number of walls arranged at different angles around your hand. Each shadow is a different projection of
+ your hand. In PCA, these different walls represent different principal components, and the shadow on each wall is a
+ projection of your hand onto that principal component.
+
+ **Step 3: Choosing the Best Projection**\n
+ Now, consider the shadow that most accurately portrays the number of fingers on
+ your hand. This shadow corresponds to the principal component that captures the most variance in the data. In PCA,
+ this would be the first principal component.
+
+ **Step 4: Secondary Features**\n
+ Next, consider the shadow that, while not as accurate as the first, still gives a
+ reasonable representation of your hand, such as showing the width of your palm. This shadow represents the second
+ principal component, which captures the second highest amount of variance in the data.
+
+ **Step 5: Reduction of Dimensions**\n
+ In the process of reducing dimensions, we select the top few principal components
+ (shadows) that capture the most variance. The other dimensions (shadows) are discarded. So, instead of having to
+ consider the complex 3D structure of your hand, you can simply look at one or two shadows that give you the most
+ information about the hand.
+
+ **Step 6: Transformation**\n
+ Finally, we transform the original data into the reduced dimensional space defined by the
+ selected principal components. This is analogous to replacing each hand with the selected shadows for further analysis.
+ By using PCA, we can reduce the complexity of the data (from a 3D hand to a 2D or even 1D shadow), while still
+ retaining the most important information (like the number of fingers or the width of the palm). This makes the data
+ easier to visualize, understand, and work with.
+ """)
+ embedding_dim = 1536
+ embeddings = st.text_input("words to explore:",
+ value="king queen man woman prince princess counselor minister teacher")
+ embeddings = embeddings.split()
+ embeddings = {word: get_embeddings(word) for word in embeddings}
+
+ from sklearn.decomposition import PCA
+
+ pca = PCA(n_components=2)
+ embedding_matrix = np.array(list(embeddings.values()))
+ reduced_embeddings = pca.fit_transform(embedding_matrix)
+
+ df = pd.DataFrame(reduced_embeddings, columns=["X", "Y"])
+ df["Word"] = list(embeddings.keys())
+ fig = px.scatter(df, x="X", y="Y", text="Word", title="Word Embeddings", width=800, height=800)
+ st.plotly_chart(fig, use_container_width=True)
+
+ st.code(f"""\
+ from sklearn.decomposition import PCA
+
+ pca = PCA(n_components=2)
+ embedding_matrix = np.array(list(embeddings.values()))
+ reduced_embeddings = pca.fit_transform(embedding_matrix)
+ """, language='python')
+
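Because the PCA explanation above leans on "the shadow that captures the most variance", it can help to show how much variance the two selected components actually keep. A small, hedged addition to the PCA branch; it assumes the pca object fitted above, and the st.caption call is illustrative rather than part of this commit:

    # Fraction of the total variance retained by each of the two principal components.
    explained = pca.explained_variance_ratio_
    st.caption(f"Variance kept by the 2D projection: {explained.sum():.1%} "
               f"(PC1: {explained[0]:.1%}, PC2: {explained[1]:.1%})")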
+ elif dimensionality_name == 't-SNE':
+ st.write("""\
+ **Step 1: Understanding t-SNE**\n
+ t-SNE is a technique for dimensionality reduction that is particularly
+ well-suited for the visualization of high-dimensional datasets. Unlike PCA, which is a linear technique,
+ t-SNE is a non-linear technique, making it better at capturing complex relationships between variables.
+
+ **Step 2: Measuring Similarities**\n
+ Imagine that instead of just one hand, you have many hands casting shadows. Each hand
+ is different: some hands might have longer fingers, some might have a wider palm, and so on. Each hand has its own
+ "neighborhood" of similar hands. In t-SNE, these neighborhoods are represented mathematically by a probability
+ distribution. Hands that are very similar to each other have a high probability of being "neighbors", while hands
+ that are very different have a low probability.
+
+ **Step 3: Creating a Map**\n
+ t-SNE creates a map (or a projection) where hands that were close in the high-dimensional
+ space (similar hands) are still close in the low-dimensional space (in their shadows), and hands that were far apart
+ in the high-dimensional space (different hands) are still far apart in the low-dimensional space. This map is created
+ in such a way that it minimizes the difference between the distances in the high-dimensional space and the distances
+ in the low-dimensional space.
+
+ **Step 4: Reducing Dimensions**\n
+ The process of reducing dimensions in t-SNE involves optimizing the locations of each
+ hand's shadow in the low-dimensional space such that the overall configuration of shadows best represents the
+ similarities between the hands in the high-dimensional space.
+
+ **Step 5: Interpretation**\n
+ The result of t-SNE is a map where similar hands are located close together and dissimilar
+ hands are located far apart. This makes it easier to visualize clusters or groups of similar hands.
+ t-SNE, therefore, helps us to project high-dimensional data into a lower-dimensional space in a way that preserves
+ the structure of the data as much as possible, making it easier to visualize and understand the relationships in the
+ data.
+ """)
+ embedding_dim = 1536
+ embeddings = st.text_input("words to explore:",
+ value="king queen man woman prince princess counselor minister teacher")
+ embeddings = embeddings.split()
+ embeddings = {word: get_embeddings(word) for word in embeddings}
+
+ from sklearn.manifold import TSNE
+
+ tsne = TSNE(n_components=2, perplexity=2, random_state=0)
+ embedding_matrix = np.array(list(embeddings.values()))
+ reduced_embeddings = tsne.fit_transform(embedding_matrix)
+
+ df = pd.DataFrame(reduced_embeddings, columns=["X", "Y"])
+ df["Word"] = list(embeddings.keys())
+ fig = px.scatter(df, x="X", y="Y", text="Word", title="Word Embeddings", width=800, height=800)
+ st.plotly_chart(fig, use_container_width=True)
+
+ st.code(f"""\
+ from sklearn.manifold import TSNE
+
+ tsne = TSNE(n_components=2, perplexity=2, random_state=0)
+ embedding_matrix = np.array(list(embeddings.values()))
+ reduced_embeddings = tsne.fit_transform(embedding_matrix)
+ """, language='python')
+
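One caveat about perplexity=2 in the snippet above: recent scikit-learn releases (including the ~1.3 pin added to requirements.txt below) reject a perplexity that is not smaller than the number of samples, so a very short word list makes fit_transform raise an error. A hedged guard; safe_tsne is an illustrative helper, not part of this commit:

    from sklearn.manifold import TSNE

    def safe_tsne(embedding_matrix, desired_perplexity=2.0):
        # Clamp perplexity below n_samples so tiny word lists still work.
        perplexity = min(desired_perplexity, max(1.0, embedding_matrix.shape[0] - 1))
        tsne = TSNE(n_components=2, perplexity=perplexity, random_state=0)
        return tsne.fit_transform(embedding_matrix)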
+ with st.expander("References:"):
+ st.write("""\
+ - https://hex.tech/blog/dimensionality-reduction/
+ - https://github.com/openai/openai-cookbook/blob/main/examples/Visualizing_embeddings_in_2D.ipynb
  """)
requirements.txt CHANGED
@@ -6,4 +6,5 @@ openai~=0.27.8
  plotly~=5.15.0
  langchain~=0.0.242
  faiss-cpu~=1.7.4
- tiktoken~=0.4.0

requirements.txt (after changes):

  plotly~=5.15.0
  langchain~=0.0.242
  faiss-cpu~=1.7.4
+ tiktoken~=0.4.0
+ scikit-learn~=1.3.0