import streamlit as st
import numpy as np

# TODO: move to 'utils'
mystyle = '''
    <style>
        p {
            text-align: justify;
        }
    </style>
    '''
st.markdown(mystyle, unsafe_allow_html=True)


def divider():
    _, c, _ = st.columns(3)
    c.divider()

@st.cache_data
def get_embeddings(text):
    # Returns the OpenAI embedding vector for `text`; relies on `openai` and `EMBEDDING_MODEL`, defined further below.
    return np.array(openai.Embedding.create(input=text, model=EMBEDDING_MODEL)["data"][0]["embedding"])


st.title("Transformers: Tokenisers and Embeddings")

preface_image, preface_text = st.columns(2)
preface_text.write("""\
"*I think I can safely say that nobody understands quantum mechanics.*" R. Feynman
""")

divider()

st.write("""\
    Did you know that the leading AI models powering speech recognition, language translation, 
    and even your email auto-responses owe their capabilities to a single, revolutionary concept: the Transformer 
    architecture?

    Artificial Intelligence (AI) has seen remarkable progress in the last decade, and a significant part of that is due 
    to advancements in Natural Language Processing (NLP). NLP, a subset of AI, involves the interaction between computers 
    and human language, making it possible for AI to understand, interpret, and generate human language in a valuable 
    way. Within this realm of NLP, a game-changer has emerged: the Transformer model. With its innovative architecture 
    and remarkable performance, the Transformer model has revolutionised how machines understand and generate human 
    language.

    However, the complexity of Transformer models can be daunting, making them seem inaccessible to those without 
    extensive technical expertise. This creates a barrier to understanding, utilising, and improving upon these powerful 
    tools.

    That's why I'm embarking on this series of articles, breaking down the key components of Transformer models into 
    digestible, easy-to-understand concepts. I have chosen to dedicate the first article in this series solely to 
    Tokenisers and Embeddings. The article has the following structure:

    - [Tokenisers](#tokenisers)
    - [Embeddings](#embeddings)
    - [Vector Databases](#vector-databases)
    - [Dimensionality Reduction](#dimensionality-reduction)

    Understanding these foundational concepts is crucial to comprehending the overall structure and function of the 
    Transformer model. They are the building blocks from which the rest of the model is constructed, and their roles 
    within the architecture are essential to the model's ability to process and generate language. In my view, 
    a comprehensive and simple explanation may give a reader a significant advantage in using LLMs.

    Are you ready to take a deep dive into the world of Transformers? I promise that by the end of this series, 
    you'll have a clearer understanding of how these complex models work and how they contribute to the remarkable 
    capabilities of modern AI.

""")

with st.expander("Copernicus Museum in Warsaw"):
    st.write("""\
    Have you ever visited the Copernicus Museum in Warsaw? It's an engaging interactive hub that allows 
    you to familiarise yourself with various scientific topics. The experience is both entertaining and educational, 
    providing the opportunity to explore different concepts firsthand. **They even feature a small neural network that 
    illustrates the neuron activation process during the recognition of handwritten digits!**
    
    I encourage you not to hesitate in modifying parameters or experimenting with different models in the provided 
    examples. This hands-on exploration can significantly enhance your learning experience. So, let's begin our journey 
    through this virtual, interactive museum of AI. Enjoy the exploration!
""")
    st.image("https://i.pinimg.com/originals/04/11/2c/04112c791a859d07a01001ac4f436e59.jpg")

st.write("""\
    Note: *HuggingFace provides an exceptional [tutorial on Transformer models](https://huggingface.co/docs/transformers/index). 
    That tutorial is particularly beneficial for readers willing to dive into advanced topics.*
""")

divider()


st.header("Tokenisers")

st.write("""\
    Tokenisation is the initial step in the data preprocessing pipeline for natural language processing (NLP) 
    models. It involves breaking down a piece of text—whether a sentence, paragraph, or document—into smaller units, 
    known as "tokens". In English and many other languages, a token often corresponds to a word, but it can also be a 
    subword, character, or n-gram. The choice of token size depends on various factors, including the task at hand and 
    the language of the text.
""")

from transformers import AutoTokenizer

sentence = st.text_input("Consider the sentence: (you can change it):", value="Tokenising text is a fundamental step for NLP models.")
sentence_split = sentence.split()
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
sentence_tokenise_bert = tokenizer.tokenize(sentence)
sentence_encode_bert = tokenizer.encode(sentence)
sentence_encode_bert = list(zip(sentence_tokenise_bert, sentence_encode_bert))

st.write(f"""\
    A basic word-level tokenisation, which splits a text by spaces, would produce next tokens:
""")
st.code(f"""
{sentence_split}
""")


st.write(f"""\
    However, we notice that the punctuation may attached to the words. It is disadvantageous, how the tokenization dealt with the word "Don't". 
    "Don't" stands for "do not", so it would be better tokenized as ["Do", "n't"]. (Hint: try another sentence: "I musn't tell lies. Don't do this.") This is where things start getting complicated, 
    and part of the reason each model has its own tokenizer type. Depending on the rules we apply for tokenizing a text, 
    a different tokenized output is generated for the same text. 
    A more sophisticated algorithm, with several optimizations, might generate a different set of tokens: 
""")
st.code(f"""
{sentence_tokenise_bert}
""")

with st.expander("click here to look at the Python code:"):
    st.code(f"""\
        from transformers import AutoTokenizer
        
        sentence = "{sentence}"
        sentence_split = sentence.split()
        tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
        sentence_tokenise_bert = tokenizer.tokenize(sentence)
        sentence_encode_bert = tokenizer.encode(sentence)
        sentence_encode_bert = list(zip(sentence_tokenise_bert, sentence_encode_bert))
    """, language='python')


st.write("""
As machine learning models, including Transformers, work with numbers rather than words, each vocabulary 
entry is assigned a corresponding numerical value. Here is a potential key-value, vocabulary-based representation of 
the input (so-called 'token ids'):
"""
)

st.code(f"""
{sentence_encode_bert}
""")


st.write("""
What distinguishes subword Tokenisation is its reliance on statistical rules and algorithms, learned from 
the pretraining corpus. The resulting Tokeniser creates a vocabulary, which usually represents the most frequently 
used words and subwords. For example, Byte Pair Encoding (BPE) first encodes the most frequent words as single 
tokens, while less frequent words are represented by multiple tokens, each representing a word part.

There are numerous different Tokenisers available, including spaCy, Moses, Byte-Pair Encoding (BPE), 
Byte-level BPE, WordPiece, Unigram, and SentencePiece. It's crucial to choose a specific Tokeniser and stick with it. 
Changing the Tokeniser is akin to altering the model's language on the fly—imagine studying physics in English and 
then taking the exam in French or Spanish. You might get lucky, but it's a considerable risk.
""")

training_dataset = """\
   Beautiful is better than ugly.
   Explicit is better than implicit.
   Simple is better than complex.
   Complex is better than complicated.
   Flat is better than nested.
   Sparse is better than dense.
   Readability counts.
   """

tokeniser_name = st.selectbox(label="Choose your tokeniser", options=["BPE", 'Unigram', 'WordPiece'])
if tokeniser_name == 'BPE':
    st.subheader("Byte-Pair Encoding (BPE)")
    st.write("""\
        Byte-Pair Encoding (BPE) was introduced in [Neural Machine Translation of Rare Words with Subword 
        Units (Sennrich et al., 2015)](https://arxiv.org/abs/1508.07909). BPE relies on a pre-tokenizer that splits the 
        training data into words. Pre-tokenization can be as simple as space tokenization, e.g. GPT-2, Roberta. More 
        advanced pre-tokenization includes rule-based tokenization, e.g. XLM, FlauBERT which uses Moses for most 
        languages, or GPT which uses Spacy and ftfy, to count the frequency of each word in the training corpus.
        
        After pre-tokenization, a set of unique words has been created and the frequency with which each word occurred in the 
        training data has been determined. Next, BPE creates a base vocabulary consisting of all symbols that occur in the 
        set of unique words and learns merge rules to form a new symbol from two symbols of the base vocabulary. It does so 
        until the vocabulary has attained the desired vocabulary size. Note that the desired vocabulary size is a 
        hyperparameter to define before training the tokenizer.
        
        As an example, let’s assume that after pre-tokenization, the following set of words including their frequency has 
        been determined:
    """)
    st.code(""" ("hug", 10), ("pug", 5), ("pun", 12), ("bun", 4), ("hugs", 5) """)
    st.write("""\
        Consequently, the base vocabulary is ["b", "g", "h", "n", "p", "s", "u"]. Splitting all words into symbols of the base vocabulary, we obtain:
    """)
    st.code(""" ("h" "u" "g", 10), ("p" "u" "g", 5), ("p" "u" "n", 12), ("b" "u" "n", 4), ("h" "u" "g" "s", 5) """)
    st.write("""\
        BPE then counts the frequency of each possible symbol pair and picks the symbol pair that occurs 
        most frequently. In the example above "h" followed by "u" is present 10 + 5 = 15 times (10 times in the 10 
        occurrences of "hug", 5 times in the 5 occurrences of "hugs"). However, the most frequent symbol pair is "u" 
        followed by "g", occurring 10 + 5 + 5 = 20 times in total. Thus, the first merge rule the tokenizer learns is to 
        group all "u" symbols followed by a "g" symbol together. Next, "ug" is added to the vocabulary. The set of words 
        then becomes
    """)
    st.code(""" ("h" "ug", 10), ("p" "ug", 5), ("p" "u" "n", 12), ("b" "u" "n", 4), ("h" "ug" "s", 5) """)
    st.write("""\
        BPE then identifies the next most common symbol pair. It’s "u" followed by "n", which occurs 16 
        times. "u", "n" is merged to "un" and added to the vocabulary. The next most frequent symbol pair is "h" followed 
        by "ug", occurring 15 times. Again the pair is merged and "hug" can be added to the vocabulary.
        
        At this stage, the vocabulary is ["b", "g", "h", "n", "p", "s", "u", "ug", "un", "hug"] and our set of unique words is represented as
    """)
    st.code(""" ("hug", 10), ("p" "ug", 5), ("p" "un", 12), ("b" "un", 4), ("hug" "s", 5) """)
    st.write("""\
        Assuming, that the Byte-Pair Encoding training would stop at this point, the learned merge rules 
        would then be applied to new words (as long as those new words do not include symbols that were not in the base 
        vocabulary). For instance, the word "bug" would be tokenized to ["b", "ug"] but "mug" would be tokenized as [
        "[unk]", "ug"] since the symbol "m" is not in the base vocabulary. In general, single letters such as "m" are not 
        replaced by the "[unk]" symbol because the training data usually includes at least one occurrence of each letter, 
        but it is likely to happen for very special characters like emojis.
        
        As mentioned earlier, the vocabulary size, i.e. the base vocabulary size + the number of merges, is a hyperparameter 
        to choose. For instance GPT has a vocabulary size of 40,478 since they have 478 base characters and chose to stop 
        training after 40,000 merges. 
    """)


    st.subheader(":green[Try Yourself:]")
    st.write(f"""\
        *Using text area field below try to find or create a comprehensive vocabulary (training dataset) for Tokenisation, which can enhance the 
        efficiency of the process. This approach helps to eliminate unknown tokens, thereby making the token sequence 
        more understandable and containing less tokens (ids)* 
      """)

    training_dataset = st.text_area("*Training Dataset - Vocabulary:*", value=training_dataset, height=200)
    training_dataset = training_dataset.split('\n')
    vocabulary_size = st.number_input("Vocabulary Size:", value=100000)
    sentence = st.text_input(label="*Text to tokenise:*",
                             value="[CLS]  Tokenising text is a fundamental step for NLP models. [SEP] [PAD] [PAD] [PAD]")


    from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, trainers
    tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
    tokenizer.decoder = decoders.ByteLevel()
    trainer = trainers.BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"], vocab_size=vocabulary_size)
    tokenizer.train_from_iterator(training_dataset, trainer=trainer)
    output = tokenizer.encode(sentence)

    st.write("*Tokens:*")
    st.code(f"""{output.tokens}""")
    st.code(f"""\
    ids: {output.ids}
    attention_mask: {output.attention_mask}
    """)

    st.write(""" *well done if you get ids like these: [1, 57, 49, 28, 10, 58, 55, 52, 31, 54, 5, 2, 3, 3, 3]!*""")

    with st.expander("Python code:"):
        st.code(f"""
            from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, trainers
            
            tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
            tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
            tokenizer.decoder = decoders.ByteLevel()
            trainer = trainers.BpeTrainer(
                special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"], 
                vocab_size={vocabulary_size})
            training_dataset = {training_dataset}
            tokenizer.train_from_iterator(training_dataset, trainer=trainer)
            output = tokenizer.encode("{sentence}")
                """, language='python')
elif tokeniser_name == 'Unigram':
    st.subheader("""Unigram""")
    st.write("""\
        Unigram is a subword tokenization algorithm introduced in [Subword Regularization: Improving Neural 
        Network Translation Models with Multiple Subword Candidates (Kudo, 2018)](https://arxiv.org/pdf/1804.10959.pdf). 
        In contrast to BPE or WordPiece, Unigram initializes its base vocabulary to a large number of symbols and 
        progressively trims down each symbol to obtain a smaller vocabulary. The base vocabulary could for instance 
        correspond to all pre-tokenized words and the most common substrings. Unigram is not used directly for any of the 
        models in the transformers library, but it’s used in conjunction with SentencePiece.
        
        At each training step, the Unigram algorithm defines a loss (often defined as the log-likelihood) over the training 
        data given the current vocabulary and a unigram language model. Then, for each symbol in the vocabulary, 
        the algorithm computes how much the overall loss would increase if the symbol was to be removed from the vocabulary. 
        Unigram then removes p (with p usually being 10% or 20%) percent of the symbols whose loss increase is the lowest, 
        i.e. those symbols that least affect the overall loss over the training data. This process is repeated until the 
        vocabulary has reached the desired size. The Unigram algorithm always keeps the base characters so that any word can 
        be tokenized.
        
        Because Unigram is not based on merge rules (in contrast to BPE and WordPiece), the algorithm has several ways of 
        tokenizing new text after training. As an example, if a trained Unigram tokenizer exhibits the vocabulary:
    """)
    st.code(""" ["b", "g", "h", "n", "p", "s", "u", "ug", "un", "hug"] """)
    st.write("""\
        "hugs" could be tokenized both as ["hug", "s"], ["h", "ug", "s"] or ["h", "u", "g", "s"]. So which 
        one to choose? Unigram saves the probability of each token in the training corpus on top of saving the vocabulary 
        so that the probability of each possible tokenization can be computed after training. The algorithm simply picks 
        the most likely tokenization in practice, but also offers the possibility to sample a possible tokenization 
        according to their probabilities.
    """)

    st.subheader(":green[Try Yourself:]")
    st.write(f"""\
        *Using text area field below try to find or create a comprehensive vocabulary (training dataset) for Tokenisation, which can enhance the 
        efficiency of the process. This approach helps to eliminate unknown tokens, thereby making the token sequence 
        more understandable and containing less tokens (ids)* 
          """)

    training_dataset = st.text_area("*Training Dataset - Vocabulary (change it and look at the resulting tokens):*", value=training_dataset, height=200)
    training_dataset = training_dataset.split('\n')
    vocabulary_size = st.number_input("Vocabulary Size:", value=100000)
    sentence = st.text_input(label="*Text to tokenise:*",
                             value="[CLS]  Tokenising text is a fundamental step for NLP models. [SEP] [PAD] [PAD] [PAD]")

    from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, trainers

    tokenizer = Tokenizer(models.Unigram())
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
    tokenizer.decoder = decoders.ByteLevel()
    trainer = trainers.UnigramTrainer(
        vocab_size=vocabulary_size,
        unk_token="[UNK]",
        # initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    )
    tokenizer.train_from_iterator(training_dataset, trainer=trainer)
    output = tokenizer.encode(sentence)

    # TODO: make it more visible, container with a different color or something
    st.write("*Tokens:*")
    st.code(f"""{output.tokens}""")
    st.code(f"""\
        ids: {output.ids}
        attention_mask: {output.attention_mask}
        """)

    st.write(""" *well done if you get ids like these: [1, 57, 49, 28, 10, 58, 55, 52, 31, 54, 5, 2, 3, 3, 3]!*""")
    with st.expander("Python code:"):
        st.code(f"""\
            from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, trainers
            
            tokenizer = Tokenizer(models.Unigram())
            tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
            tokenizer.decoder = decoders.ByteLevel()
            trainer = trainers.UnigramTrainer(
                vocab_size={vocabulary_size},
                special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
            )
            training_dataset = {training_dataset}
            tokenizer.train_from_iterator(training_dataset, trainer=trainer)
            output = tokenizer.encode("{sentence}") 
    """, language='python')
elif tokeniser_name == 'WordPiece':
    st.subheader("""WordPiece""")
    st.write("""\
        WordPiece is the subword tokenization algorithm used for BERT, DistilBERT, and Electra. The 
        algorithm was outlined in [Japanese and Korean Voice Search (Schuster et al., 
        2012)](https://static.googleusercontent.com/media/research.google.com/ja//pubs/archive/37842.pdf) and is very 
        similar to BPE. WordPiece first initializes the vocabulary to include every character present in the training 
        data and progressively learns a given number of merge rules. In contrast to BPE, WordPiece does not choose the 
        most frequent symbol pair, but the one that maximizes the likelihood of the training data once added to the 
        vocabulary.
        
        So what does this mean exactly? Referring to the example from the BPE tokeniser, maximizing the likelihood of the 
        training data is equivalent to finding the symbol pair whose probability, divided by the product of the probabilities 
        of its first and second symbols, is the greatest among all symbol pairs. E.g. "u" followed by "g" would only have 
        been merged if the probability of "ug" divided by the probabilities of "u" and "g" had been greater than for any 
        other symbol pair. Intuitively, WordPiece differs slightly from BPE in that it evaluates what it loses by merging 
        two symbols, to make sure the merge is worth it. 
    """)

    st.subheader(":green[Try Yourself:]")
    st.write(f"""\
        *Using text area field below try to find or create a comprehensive vocabulary (training dataset) for Tokenisation, which can enhance the 
        efficiency of the process. This approach helps to eliminate unknown tokens, thereby making the token sequence 
        more understandable and containing less tokens (ids)* 
    """)

    training_dataset = st.text_area("*Training Dataset - Vocabulary (change it and look at the resulting tokens):*",
                                    value=training_dataset, height=200)
    training_dataset = training_dataset.split('\n')
    vocabulary_size = st.number_input("Vocabulary Size:", value=100000)
    sentence = st.text_input(label="*Text to tokenise:*",
                             value="[CLS]  Tokenising text is a fundamental step for NLP models. [SEP] [PAD] [PAD] [PAD]")

    from tokenizers import Tokenizer, decoders, models, pre_tokenizers, trainers

    tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
    tokenizer.decoder = decoders.ByteLevel()
    trainer = trainers.WordPieceTrainer(
        vocab_size=vocabulary_size,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    )
    tokenizer.train_from_iterator(training_dataset, trainer=trainer)
    output = tokenizer.encode(sentence)

    # TODO: make it more visible, container with a different color or something
    st.write("*Tokens:*")
    st.code(f"""{output.tokens}""")
    st.code(f"""\
            ids: {output.ids}
            attention_mask: {output.attention_mask}
            """)

    st.write(""" *well done if you get ids like these: [1, 76, 72, 50, 10, 77, 71, 68, 66, 78, 5, 2, 3, 3, 3]!*""")
    with st.expander("Python code:"):
        st.code(f"""\
            from tokenizers import Tokenizer, decoders, models, pre_tokenizers, trainers
            
            tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
            trainer = trainers.WordPieceTrainer(
                vocab_size={vocabulary_size},
                special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
            )
            training_dataset = {training_dataset}
            tokenizer.train_from_iterator(training_dataset, trainer=trainer)
            output = tokenizer.encode("{sentence}") 
        """, language='python')


with st.expander("Special tokens meaning:"):
    st.write("""\
        \\#\\# prefix: It means that the token is a continuation of the previous one (i.e. it is not preceded by 
        whitespace); any token with this prefix should be merged with the previous token when you convert the tokens 
        back to a string.
        
        [UNK]: Stands for "unknown". This token is used to represent any word that is not in the model's vocabulary. Since 
        most models have a fixed-size vocabulary, it's not possible to have a unique token for every possible word. The [UNK] 
        token is used as a catch-all for any words the model hasn't seen before. E.g. in our example we 'decided' that the 
        Large Language (LL) abbreviation is not part of the model's vocabulary.
        
        [CLS]: Stands for "classification". In models like BERT, this token is added at the beginning of every input 
        sequence. The representation (embedding) of this token is used as the aggregate sequence representation for 
        classification tasks. In other words, the model is trained to encode the meaning of the entire sequence into this token.
        
        [SEP]: Stands for "separator". This token is used to separate different sequences when the model needs to take more 
        than one input sequence. For example, in question-answering tasks, the model takes two inputs: a question and a 
        passage that contains the answer. The two inputs are separated by a [SEP] token.
        
        [MASK]: This token is specific to models like BERT, which are trained with a masked language modelling objective. 
        During training, some percentage of the input tokens are replaced with the [MASK] token, and the model's goal is to 
        predict the original value of the masked tokens.
        
        [PAD]: Stands for "padding". This token is used to fill in the extra spaces when batching sequences of different 
        lengths together. Since models require input sequences to be the same length, shorter sequences are extended with 
        [PAD] tokens. In our example, we extended the length of the input sequence to 16 tokens.
""")


with st.expander("References:"):
    st.write("""\
    - https://huggingface.co/docs/transformers/tokenizer_summary
    - https://huggingface.co/docs/tokenizers/training_from_memory
    """)

divider()
st.header("Embeddings")

st.write("""\
    Following tokenization, each token is transformed into a vector of numeric characteristics, a process 
    known as 'embedding.' In this context, 'embedding' refers to the mapping of the discrete, categorical space of words 
    or tokens into a continuous, numeric space, which the model can manipulate more effectively.
    
    Each dimension in this high-dimensional space can encapsulate a different facet of the token's meaning. For instance, 
    one dimension might capture the tense of a token if it's a verb, while another dimension might capture the degree of 
    positivity or negativity if the token is an adjective expressing sentiment. For example: 
""")
st.code("""\
    "I" -> [noun, person]
    "love" -> [verb, feeling]
    "machine" -> [noun, automation]
    "learn" -> [verb, knowledge]
    "##ing" -> [gerund, continues]
""")

st.write("""\
    The actual embeddings in a typical NLP model would be in a much higher-dimensional space (often several hundred dimensions), but the idea is the same.
    Embeddings are dynamically learned from the data, with the model adjusting these embeddings during 
    training to minimize the discrepancy between the predicted and actual outputs for a set of training examples. 
    Consequently, tokens with similar meanings often end up with similar embeddings.

    In the context of Transformers, these embeddings are the inputs that the model uses. Once again, we represent all the 
    characteristics using numbers, not words.
""")

st.write("""\
    Let's explore embeddings in more detail. We can take an experimental approach by encoding two specific 
    words and examining the corresponding embedding vectors they generate. To make our exploration more accessible, 
    we'll visualise a portion of these vectors, thereby unveiling the underlying structure of embeddings. Pay attention 
    to common patterns and peaks, and try to find two words that yield differing embeddings.
""")
col1, col2, col3 = st.columns(3)
token_king = col1.text_input("Choose a word:", value="king")
token_queen = col2.text_input("Choose a word:", value="queen")
token_dots = col3.number_input("Number of dots:", value=50, min_value=0, max_value=1536)


from torch import nn
from transformers import AutoConfig
from transformers import AutoTokenizer
import pandas as pd
import openai
import plotly.express as px

model_ckpt = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
king_id = tokenizer(token_king, return_tensors="pt", add_special_tokens=False)
queen_id = tokenizer(token_queen, return_tensors="pt", add_special_tokens=False)

config = AutoConfig.from_pretrained(model_ckpt)
# Note: nn.Embedding creates a freshly (randomly) initialised embedding layer with BERT's dimensions;
# it illustrates the structure of token embeddings rather than BERT's trained weights.
token_emb = nn.Embedding(config.vocab_size, config.hidden_size)
king_embeddings = token_emb(king_id.input_ids)
queen_embeddings = token_emb(queen_id.input_ids)
king_emb_np = king_embeddings.reshape(-1).detach().numpy()
queen_emb_np = queen_embeddings.reshape(-1).detach().numpy()


openai.api_key = st.secrets["OPENAI_API_KEY"]
EMBEDDING_MODEL = 'text-embedding-ada-002'
EMBEDDING_CTX_LENGTH = 8191
EMBEDDING_ENCODING = 'cl100k_base'
king = get_embeddings(token_king)
queen = get_embeddings(token_queen)


df = pd.DataFrame({f'"{token_king}" embeddings': king_emb_np, f'"{token_queen}" embeddings': queen_emb_np})
fig = px.line(df[:token_dots], title=f"Google's 'bert-base-uncased' model embeddings, embedding vector size: {len(queen_emb_np)}")
fig.update_layout(legend=dict(orientation="h"))
st.plotly_chart(fig, use_container_width=True)

with st.expander("Python Code:"):
    st.code(f"""\
        from torch import nn
        from transformers import AutoConfig

        model_ckpt = 'bert-base-uncased'
        tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
        king_id = tokenizer("{token_king}", return_tensors="pt", add_special_tokens=False)
        queen_id = tokenizer("{token_queen}", return_tensors="pt", add_special_tokens=False)

        config = AutoConfig.from_pretrained(model_ckpt)
        token_emb = nn.Embedding(config.vocab_size, config.hidden_size)
        king_embeddings = token_emb(king_id.input_ids)
        queen_embeddings = token_emb(queen_id.input_ids)
    """)

df = pd.DataFrame({f'"{token_king}" embeddings': king, f'"{token_queen}" embeddings': queen})
fig = px.line(df[:token_dots], title=f"OpenAI's 'text-embedding-ada-002' model embeddings, embedding vector size: {len(queen)}")
fig.update_layout(legend=dict(orientation="h"))
st.plotly_chart(fig, use_container_width=True)


with st.expander("Python Code:"):
    st.code(f"""\
        import openai
        import numpy as np
        
        EMBEDDING_MODEL = 'text-embedding-ada-002'

        king_embeddings = np.array(openai.Embedding.create(input="{token_king}", model=EMBEDDING_MODEL)["data"][0]["embedding"])
        queen_embeddings = np.array(openai.Embedding.create(input="{token_queen}", model=EMBEDDING_MODEL)["data"][0]["embedding"])
    """)

st.write("""\
    The similarity can be represented as a similarity score. Identical words naturally have the highest 
    score (black colours), while unrelated terms have lower scores (white colours). To compute this score, 
    we construct a matrix infused with our embedding vectors. Each row in this matrix corresponds to a unique word in the 
    sentence, while each column aligns with another word. The value at the intersection of row i and column j represents 
    the score between word i and word j. For a clearer understanding, let's visualise this matrix using a heatmap. Each 
    cell in the grid corresponds to a pair of words, and the colour of the cell indicates the similarity (correlation) 
    score between those two words. The intensity of the colour directly corresponds to the magnitude of the score - the 
    darker the hue, the higher the score.
""")

st.write("""Here is a heatmap of the score matrix for the sentence:""")
sentence = st.text_input(label="*words to explore embeddings*", value="a the king queen space sit eat from on")
sentence = sentence.split()

input = {word: get_embeddings(word) for word in sentence}

scores_matrix = np.zeros((len(sentence), len(sentence)))
for i, word_i in enumerate(sentence):
    for j, word_j in enumerate(sentence):
        scores_matrix[i, j] = np.dot(input[word_i], input[word_j])

fig = px.imshow(scores_matrix, x=sentence, y=sentence, color_continuous_scale="hot_r")
fig.update_layout(coloraxis_showscale=False)
fig.update_layout(width=6000)
st.plotly_chart(fig, use_container_width=True)
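
with st.expander("Python Code:"):
    # the same computation as above, shown to the reader for reference
    st.code("""\
        import numpy as np

        input = {word: get_embeddings(word) for word in sentence}

        scores_matrix = np.zeros((len(sentence), len(sentence)))
        for i, word_i in enumerate(sentence):
            for j, word_j in enumerate(sentence):
                scores_matrix[i, j] = np.dot(input[word_i], input[word_j])

        fig = px.imshow(scores_matrix, x=sentence, y=sentence, color_continuous_scale="hot_r")
    """, language='python')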

st.subheader("Vector Databases")
st.write("""\
    In a vector database, each item (e.g., a document) is represented as a point in a multidimensional 
    space. Each point is a vector that represents the features of the item. The goal is to place similar items close to 
    each other and dissimilar items farther apart. In the case of documents, the features could be derived from the words 
    in the document, and the similarity might be based on the overlapping words or concepts between the documents.
    
    The retrieval of documents based on search terms involves two main steps:

    - **Vectorization of the search query**: The search query is converted into a vector using the same process used to vectorize the documents in the database.

    - **Vector similarity search**: The vector database then identifies the vectors that are closest to the query vector. 
    This is typically done using a distance metric like Euclidean distance or cosine similarity. The documents 
    corresponding to these vectors are returned as the search results.
    
    As you may have guessed, we leverage embedding algorithms to vectorise documents. Let's generate a 3D 
    visualization of the document vectors and a search query. For simplicity, let's assume we have a vector database 
    of documents that has been reduced to 3 dimensions, and we'll also have a 3D vector for a search query. 

""")
with st.expander("The Euclidean distance between two points in 3D space is calculated as:"):
    st.latex("""\\text{Distance}(A(x_1, y_1, z_1), B(x_2, y_2, z_2)) = \sqrt{(x_2 - x_1)^2 + (y_2 - y_1)^2 + (z_2 - z_1)^2}""")
st.write("""\
    The document that corresponds to the vector with the smallest distance to the query vector is 
    considered the most relevant document. The 3D plot above now shows lines from the query vector (in red) to each 
    document vector (in blue). Each line represents the Euclidean distance from the query vector to a document vector.
""")
embeddings = st.text_input("vector space:", value="king queen prince princess counselor minister teacher")
embeddings = embeddings.split()
embeddings_query = st.text_input(label="search term", value='woman')

import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.manifold import TSNE

embeddings = {word: get_embeddings(word) for word in embeddings}
embeddings[embeddings_query] = get_embeddings(embeddings_query)

tsne = TSNE(n_components=3, perplexity=3, random_state=0)
embedding_matrix = np.array(list(embeddings.values()))
reduced_embeddings = tsne.fit_transform(embedding_matrix)

df = pd.DataFrame(reduced_embeddings, columns=["X", "Y", "Z"])
df["Word"] = list(embeddings.keys())
fig = px.scatter_3d(df, x="X", y="Y", z="Z", text="Word", title="Vector Space", width=800, height=800)


docs = reduced_embeddings[:-1]
query = reduced_embeddings[-1]
distances = np.linalg.norm(docs - query, axis=1)
closest_doc_index = np.argmin(distances)
closest_doc = docs[closest_doc_index]

for doc in docs:
    fig.add_trace(go.Scatter3d(x=[query[0], doc[0]], y=[query[1], doc[1]], z=[query[2], doc[2]], mode='lines', line=dict(color='purple', width=2, dash='dash')))
fig.add_trace(go.Scatter3d(x=[query[0], closest_doc[0]], y=[query[1], closest_doc[1]], z=[query[2], closest_doc[2]], name='closest', mode='lines', line=dict(color='purple', width=2)))
fig.update_layout(legend=dict(orientation="h"))
st.plotly_chart(fig, use_container_width=True)

st.write("""\
    This visualization represents the core concept of a vector database search. The database converts the 
    search query into a vector, then finds the document vectors that are closest to the query vector. Those documents are 
    considered the most relevant to the search query.
    
    It's important to note that in a real-world application, the vectors would likely exist in much higher dimensional 
    space. However, the same principles apply: the search algorithm finds the document vectors that are nearest to the 
    query vector based on some distance metric.
""")
st.subheader(":green[Try Yourself]")

st.write("""\
    *There is a vector database containing two words (documents): 'king' and 'queen'. Your task is to pinpoint search 
    terms that would yield either of these words. To facilitate this, use the previously presented similarity matrix to 
    seek out words that give a higher correlation with the word in question. For instance, you might want to explore 
    terms such as 'king', 'queen', 'dog', 'prince', 'man', 'minister', 'boy'.* 
""")
embeddings_query = st.text_input(label="search term")

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema.document import Document

@st.cache_resource
def create_vector_database():
    return FAISS.from_documents([Document(page_content="king"), Document(page_content="queen")], OpenAIEmbeddings(model=EMBEDDING_MODEL))
db = create_vector_database()

@st.cache_data
def search_vector_database(term):
    embedding_vector = OpenAIEmbeddings(model=EMBEDDING_MODEL).embed_query(term)
    docs = db.similarity_search_by_vector(embedding_vector)
    return docs

if embeddings_query is not None and embeddings_query != '':
    docs = search_vector_database(embeddings_query)
    st.warning(docs[0].page_content)

    with st.expander("Python Code:"):
        st.code(f"""\
            from langchain.embeddings.openai import OpenAIEmbeddings
            from langchain.vectorstores import FAISS
            from langchain.schema.document import Document
    
    
            db = FAISS.from_documents([Document(page_content="king"), Document(page_content="queen")], OpenAIEmbeddings(model=EMBEDDING_MODEL))
            embedding_vector = OpenAIEmbeddings(model=EMBEDDING_MODEL).embed_query("{embeddings_query}")
            docs = db.similarity_search_by_vector(embedding_vector)
        """)

divider()
st.subheader("Conclusion")
st.write("""\
    As embedding algorithms are trained on a vast corpus of data, they inherently encapsulate a rich 
    tapestry of information about our language and even the world at large. Therefore, they can be used for:

    - Search (where results are ranked by relevance to a query string)
    - Clustering (where text strings are grouped by similarity)
    - Recommendations (where items with related text strings are recommended)
    - Anomaly detection (where outliers with little relatedness are identified)
    - Diversity measurement (where similarity distributions are analyzed)
    - Classification (where text strings are classified by their most similar label)
""")

with st.expander("References:"):
    st.write("""\
        - https://huggingface.co/blog/getting-started-with-embeddings
        - https://huggingface.co/blog/1b-sentence-embeddings
        - https://platform.openai.com/docs/guides/embeddings/use-cases
    """)


# *********************************************
divider()
st.header("Dimensionality Reduction")

st.write("""\
    As was mentioned above, embedding vectors are learned in such a way that words with similar meanings 
    are located close to each other in the space.  However, this is an abstract concept that might be difficult to 
    explore, understand and visualise in a 2D space because word embeddings typically have hundreds of dimensions. To 
    solve this, we can use techniques like Principal Component Analysis (PCA) or t-SNE to reduce the dimensionality of 
    the vectors and plot them.
""")
st.write("""But first, let's talk about the meaning of dimensionality reduction using simplified use-case:""")

dimensionality_name = st.selectbox(label="Choose your example", options=["Simplified", "PCA", 't-SNE'])
if dimensionality_name == 'Simplified':
    _, col2, _ = st.columns(3)
    col2.image("assets/img.png")
    st.write("""\
        **Step 1: The context**\n
        We have a 3D object (your hand) and a light source that's casting a 2D shadow of your hand onto a 
        wall. The shadow is a simpler, lower-dimensional representation of your hand.

        **Step 2: Identifying the dimensions**\n
        In this case, the dimensions are the different aspects of your hand that can be 
        observed: the length of your fingers, the width of your palm, the height (or depth) of your hand, the scars, 
        the colour of the skin, etc. However, we have a problem: we can't easily visualise or understand all these dimensions 
        at once, just as it's hard to imagine a 6-dimensional space.

        **Step 3: Deciding on important dimensions**\n
        Let's say you want to compare the number of fingers of different hands. In 
        this case, you don't need to know about the depth of the hand, the width of the palm, or other details like freckles, 
        scars, or skin colour. You just need a shadow that clearly shows the fingers. So, you decide to focus on the length 
        of the fingers, which can be easily shown in the shadow.

        **Step 4: Reducing dimensions**\n
        This is where you actually perform dimensionality reduction. You orient your hand in such 
        a way (giving the wall a high-five) that the shadow clearly shows the fingers. You've effectively reduced the 
        dimensions from 3D to 2D. Your hand is still a 3D object, but its shadow — the simplified representation you're using 
        for your comparison — is 2D.

        **Step 5: Interpretation**\n
        This hand and shadow example shows how dimensionality reduction simplifies a complex object (
        the 3D hand) into a lower-dimensional representation (the 2D shadow) that retains the most important information (the 
        number of fingers) while discarding the less important details (like the depth of the hand, skin colour, etc.). It's 
        a process of prioritisation and simplification that makes it easier for us to understand and analyse the data (or the 
        hands, in this case).
    """)
elif dimensionality_name == 'PCA':
    st.write("""\
        **Step 1: Understanding PCA**\n
        PCA is a popular method for dimensionality reduction. It identifies the 
        axes in the feature space along which the original data varies the most. These axes are known as the principal 
        components, and they are orthogonal (perpendicular) to each other.
    
        **Step 2: Projecting the Data**\n
        Imagine that instead of just casting a shadow on the wall, you can cast your hand's 
        shadow onto a number of walls arranged at different angles around your hand. Each shadow is a different projection of 
        your hand. In PCA, these different walls represent different principal components, and the shadow on each wall is a 
        projection of your hand onto that principal component.
        
        **Step 3: Choosing the Best Projection**\n 
        Now, consider the shadow that most accurately portrays the number of fingers on 
        your hand. This shadow corresponds to the principal component that captures the most variance in the data. In PCA, 
        this would be the first principal component.
        
        **Step 4: Secondary Features**\n 
        Next, consider the shadow that, while not as accurate as the first, still gives a 
        reasonable representation of your hand, such as showing the width of your palm. This shadow represents the second 
        principal component, which captures the second highest amount of variance in the data.
        
        **Step 5: Reduction of Dimensions**\n 
        In the process of reducing dimensions, we select the top few principal components (
        shadows) that capture the most variance. The other dimensions (shadows) are discarded. So, instead of having to 
        consider the complex 3D structure of your hand, you can simply look at one or two shadows that give you the most 
        information about the hand.
        
        **Step 6: Transformation**\n 
        Finally, we transform the original data into the reduced dimensional space defined by the 
        selected principal components. This is analogous to replacing each hand with the selected shadows for further analysis.
        By using PCA, we can reduce the complexity of the data (from a 3D hand to a 2D or even 1D shadow), while still 
        retaining the most important information (like the number of fingers or the width of the palm). This makes the data 
        easier to visualize, understand, and work with. 
    """)
    embedding_dim = 1536
    embeddings = st.text_input("words to explore:",
                               value="king queen man woman prince princess counselor minister teacher")
    embeddings = embeddings.split()
    embeddings = {word: get_embeddings(word) for word in embeddings}

    from sklearn.decomposition import PCA

    pca = PCA(n_components=2)
    embedding_matrix = np.array(list(embeddings.values()))
    reduced_embeddings = pca.fit_transform(embedding_matrix)

    df = pd.DataFrame(reduced_embeddings, columns=["X", "Y"])
    df["Word"] = list(embeddings.keys())
    fig = px.scatter(df, x="X", y="Y", text="Word", title="Word Embeddings", width=800, height=800)
    st.plotly_chart(fig, use_container_width=True)

    st.code(f"""\
       from sklearn.decomposition import PCA
       import numpy as np

        pca = PCA(n_components=2)
        embedding_matrix = np.array(list(embeddings.values()))
        reduced_embeddings = pca.fit_transform(embedding_matrix)
       """, language='python')

elif dimensionality_name == 't-SNE':
    st.write("""\
        **Step 1: Understanding t-SNE**\n 
        t-SNE is a technique for dimensionality reduction that is particularly 
        well-suited for the visualization of high-dimensional datasets. Unlike PCA, which is a linear technique, 
        t-SNE is a non-linear technique, making it better at capturing complex non-linear relationships between variables.
    
        **Step 2: Measuring Similarities**\n 
        Imagine that instead of just one hand, you have many hands casting shadows. Each hand 
        is different - some hands might have longer fingers, some might have a wider palm, and so on. Each hand has its own 
        "neighborhood" of similar hands. In t-SNE, these neighborhoods are represented mathematically by a probability 
        distribution. Hands that are very similar to each other have a high probability of being "neighbors", while hands 
        that are very different have a low probability.
        
        **Step 3: Creating a Map**\n 
        t-SNE creates a map (or a projection) where hands that were close in the high-dimensional 
        space (similar hands) are still close in the low-dimensional space (in their shadows), and hands that were far apart 
        in the high-dimensional space (different hands) are still far apart in the low-dimensional space. This map is created 
        in such a way that it minimizes the difference between the distances in the high-dimensional space and the distances 
        in the low-dimensional space.
        
        **Step 4: Reducing Dimensions**\n 
        The process of reducing dimensions in t-SNE involves optimizing the locations of each 
        hand's shadow in the low-dimensional space such that the overall configuration of shadows best represents the 
        similarities between the hands in the high-dimensional space.
        
        **Step 5: Interpretation**\n 
        The result of t-SNE is a map where similar hands are located close together and dissimilar 
        hands are located far apart. This makes it easier to visualize clusters or groups of similar hands.
        t-SNE, therefore, helps us to project high-dimensional data into a lower-dimensional space in a way that preserves 
        the structure of the data as much as possible, making it easier to visualize and understand the relationships in the 
        data. 
    """)
    embedding_dim = 1536
    embeddings = st.text_input("words to explore:",
                               value="king queen man woman prince princess counselor minister teacher")
    embeddings = embeddings.split()
    embeddings = {word: get_embeddings(word) for word in embeddings}

    from sklearn.manifold import TSNE

    tsne = TSNE(n_components=2, perplexity=2, random_state=0)
    embedding_matrix = np.array(list(embeddings.values()))
    reduced_embeddings = tsne.fit_transform(embedding_matrix)

    df = pd.DataFrame(reduced_embeddings, columns=["X", "Y"])
    df["Word"] = list(embeddings.keys())
    fig = px.scatter(df, x="X", y="Y", text="Word", title="Word Embeddings", width=800, height=800)
    st.plotly_chart(fig, use_container_width=True)

    st.code(f"""\
        from sklearn.manifold import TSNE
        import numpy as np

        tsne = TSNE(n_components=2, perplexity=2, random_state=0)
        embedding_matrix = np.array(list(embeddings.values()))
        reduced_embeddings = tsne.fit_transform(embedding_matrix)
    """, language='python')

with st.expander("References:"):
    st.write("""\
        - https://hex.tech/blog/dimensionality-reduction/
        - https://github.com/openai/openai-cookbook/blob/main/examples/Visualizing_embeddings_in_2D.ipynb
    """)