I committed the following:
Commit ab03e32 (1 parent: 74ab428)
Files changed (4)
  1. .gitignore +165 -0
  2. .streamlit/config.toml +3 -0
  3. app.py +246 -0
  4. requirements.txt +3 -0
.gitignore ADDED
@@ -0,0 +1,165 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # flask
+ flask_session
+ *.log
+ datasets/
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/#use-with-ide
+ .pdm.toml
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ .idea/
.streamlit/config.toml ADDED
@@ -0,0 +1,3 @@
+ [theme]
+ base="dark"
+ font="sans serif"
app.py ADDED
@@ -0,0 +1,246 @@
+ import streamlit as st
+
+ # TODO: move to 'utils'
+ mystyle = '''
+ <style>
+ p {
+     text-align: justify;
+ }
+ </style>
+ '''
+ st.markdown(mystyle, unsafe_allow_html=True)
+
+
+ def divider():
+     _, c, _ = st.columns(3)
+     c.divider()
+
+ st.title("Transformers: Tokenisers and Embeddings")
+
+ preface_image, preface_text = st.columns(2)
+ # preface_image.image("https://static.streamlit.io/examples/dice.jpg")
+ # preface_image.image("""https://assets.digitalocean.com/articles/alligator/boo.svg""")
+ preface_text.write("""*Transformers represent a revolutionary class of machine learning architectures that have sparked
+ immense interest. While numerous insightful tutorials are available, the evolution of transformer architectures over
+ the last few years has led to significant simplifications. These advancements have made it increasingly
+ straightforward to understand their inner workings. In this series of articles, I aim to provide a direct, clear explanation of
+ how and why modern transformers function, unburdened by the historical complexities associated with their inception.*
+ """)
+
+ divider()
+
+ st.write("""In order to understand the recent success in AI we need to understand the Transformer architecture. Its
+ rise in the field of Natural Language Processing (NLP) is largely attributed to a combination of several key
+ advancements:
+
+ - Tokenisers and Embeddings
+ - Attention and Self-Attention
+ - Encoder-Decoder architecture
+
+ Understanding these foundational concepts is crucial to comprehending the overall structure and function of the
+ Transformer model. They are the building blocks from which the rest of the model is constructed, and their roles
+ within the architecture are essential to the model's ability to process and generate language.
+
+ Given the importance and complexity of these concepts, I have chosen to dedicate the first article in this series
+ solely to Tokenisation and embeddings. The decision to separate the topics into individual articles is driven by a
+ desire to provide a thorough and in-depth understanding of each component of the Transformer model.
+
+
+ """)
+
+ with st.expander("Copernicus Museum in Warsaw"):
+     st.write("""
+ Have you ever visited the Copernicus Museum in Warsaw? It's an engaging interactive hub that allows
+ you to familiarize yourself with various scientific topics. The experience is both entertaining and educational,
+ providing the opportunity to explore different concepts firsthand. **They even feature a small neural network that
+ illustrates the neuron activation process during the recognition of handwritten digits!**
+
+ Taking inspiration from this approach, we'll embark on our journey into the world of Transformer models by first
+ establishing a firm understanding of Tokenisation and embeddings. This foundation will equip us with the knowledge
+ needed to delve into the more complex aspects of these models later on.
+
+ I encourage you not to hesitate in modifying parameters or experimenting with different models in the provided
+ examples. This hands-on exploration can significantly enhance your learning experience. So, let's begin our journey
+ through this virtual, interactive museum of AI. Enjoy the exploration!
+ """)
+     st.image("https://i.pinimg.com/originals/04/11/2c/04112c791a859d07a01001ac4f436e59.jpg")
+
+ divider()
+
+ st.header("Tokenisers and Tokenisation")
+
+ st.write("""Tokenisation is the initial step in the data preprocessing pipeline for natural language processing (NLP)
+ models. It involves breaking down a piece of text—whether a sentence, paragraph, or document—into smaller units,
+ known as "tokens". In English and many other languages, a token often corresponds to a word, but it can also be a
+ subword, character, or n-gram. The choice of token size depends on various factors, including the task at hand and
+ the language of the text.
+ """)
+
+ from transformers import AutoTokenizer
+
+ sentence = st.text_input("Sentence to explore (you can change it):", value="Tokenising text is a fundamental step for NLP models.")
+ sentence_split = sentence.split()
+ tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
+ sentence_tokenise_bert = tokenizer.tokenize(sentence)
+ sentence_encode_bert = tokenizer.encode(sentence, add_special_tokens=False)  # keep ids aligned one-to-one with the tokens above
+ sentence_encode_bert = list(zip(sentence_tokenise_bert, sentence_encode_bert))
+
+ st.write(f"""
+ Consider the sentence:
+ """)
+ st.code(f"""
+ "{sentence}"
+ """)
+
+ st.write(f"""
+ A basic word-level Tokenisation would produce the tokens:
+ """)
+ st.code(f"""
+ {sentence_split}
+ """)
+
+
+ st.write(f"""
+ However, a more sophisticated algorithm, with several optimisations, might generate a different set of tokens:
+ """)
+ st.code(f"""
+ {sentence_tokenise_bert}
+ """)
+
+ with st.expander("click to look at the code:"):
+     st.code(f"""\
+ from transformers import AutoTokenizer
+
+ sentence = st.text_input("Sentence to explore (you can change it):", value="{sentence}")
+ sentence_split = sentence.split()
+ tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
+ sentence_tokenise_bert = tokenizer.tokenize(sentence)
+ sentence_encode_bert = tokenizer.encode(sentence, add_special_tokens=False)
+ sentence_encode_bert = list(zip(sentence_tokenise_bert, sentence_encode_bert))
+ """, language='python')
+
+
+ st.write("""
+ As machine learning models, including Transformers, work with numbers rather than words, each vocabulary
+ entry is assigned a corresponding numerical value. Here is a potential key-value, vocabulary-based representation of
+ the input (so called 'token ids'):
+ """
+ )
+
+ st.code(f"""
+ {sentence_encode_bert}
+ """)
+
+
+ st.write("""
+ What distinguishes subword Tokenisation is its reliance on statistical rules and algorithms, learned from
+ the pretraining corpus. The resulting Tokeniser creates a vocabulary, which usually represents the most frequently
+ used words and subwords. For example, Byte Pair Encoding (BPE) first encodes the most frequent words as single
+ tokens, while less frequent words are represented by multiple tokens, each representing a word part.
+
+ There are numerous different Tokenisers available, including spaCy, Moses, Byte-Pair Encoding (BPE),
+ Byte-level BPE, WordPiece, Unigram, and SentencePiece. It's crucial to choose a specific Tokeniser and stick with it.
+ Changing the Tokeniser is akin to altering the model's language on the fly—imagine studying physics in English and
+ then taking the exam in French or Spanish. You might get lucky, but it's a considerable risk.
+ """)
+
+ with st.expander("""Let's train a tokeniser using our own dataset"""):
+     training_dataset = """\
+ Beautiful is better than ugly.
+ Explicit is better than implicit.
+ Simple is better than complex.
+ Complex is better than complicated.
+ Flat is better than nested.
+ Sparse is better than dense.
+ Readability counts.
+ """
+     training_dataset = st.text_area("*Training Dataset - Vocabulary:*", value=training_dataset, height=200)
+     training_dataset = training_dataset.split('\n')
+     vocabulary_size = st.number_input("Vocabulary Size:", value=100000)
+
+
+     # TODO: add more tokenisers
+     from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, trainers
+     tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
+     # tokenizer = Tokenizer(models.Unigram())
+     tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
+     tokenizer.decoder = decoders.ByteLevel()
+     trainer = trainers.BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"], vocab_size=vocabulary_size)
+
+     # trainer = trainers.UnigramTrainer(
+     #     vocab_size=20000,
+     #     initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
+     #     special_tokens=["<PAD>", "<BOS>", "<EOS>"],
+     # )
+
+     tokenizer.train_from_iterator(training_dataset, trainer=trainer)
+
+     sentence = st.text_input("*Text to tokenise:*", value="[CLS] Tokenising text is a fundamental step for NLP models. [SEP] [PAD] [PAD] [PAD]")
+     output = tokenizer.encode(sentence)
+
+     st.write("*Tokens:*")
+     st.code(f"""{output.tokens}""")
+     st.code(f"""\
+ ids: {output.ids}
+ attention_mask: {output.attention_mask}
+ """)
+
+
+
+     st.subheader("Try Yourself:")
+     st.write(f""" *Aim to find or create a comprehensive vocabulary (training dataset) for Tokenisation, which can enhance the
+ efficiency of the process. This approach helps to eliminate unknown tokens, thereby making the token sequence
+ more understandable and reducing the number of tokens.*
+ """)
+
+     st.caption("Special tokens meaning:")
+     st.write("""
+ \\#\\# prefix: indicates that the token is not preceded by whitespace; any token with this prefix should be
+ merged with the previous token when you convert the tokens back to a string.
+
+ [UNK]: Stands for "unknown". This token is used to represent any word that is not in the model's vocabulary. Since
+ most models have a fixed-size vocabulary, it's not possible to have a unique token for every possible word. The [UNK]
+ token is used as a catch-all for any words the model hasn't seen before. E.g. in our example we 'decided' that the
+ Large Language (LL) abbreviation is not part of the model's vocabulary.
+
+ [CLS]: Stands for "classification". In models like BERT, this token is added at the beginning of every input
+ sequence. The representation (embedding) of this token is used as the aggregate sequence representation for
+ classification tasks. In other words, the model is trained to encode the meaning of the entire sequence into this token.
+
+ [SEP]: Stands for "separator". This token is used to separate different sequences when the model needs to take more
+ than one input sequence. For example, in question-answering tasks, the model takes two inputs: a question and a
+ passage that contains the answer. The two inputs are separated by a [SEP] token.
+
+ [MASK]: This token is specific to models like BERT, which are trained with a masked language modelling objective.
+ During training, some percentage of the input tokens are replaced with the [MASK] token, and the model's goal is to
+ predict the original value of the masked tokens.
+
+ [PAD]: Stands for "padding". This token is used to fill in the extra spaces when batching sequences of different
+ lengths together. Since models require input sequences to be the same length, shorter sequences are extended with
+ [PAD] tokens. In our example, we extended the length of the input sequence to 16 tokens.
+
+ """)
+     st.caption("Python code:")
+     st.code(f"""
+ from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, trainers
+ tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
+ tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
+ tokenizer.decoder = decoders.ByteLevel()
+ trainer = trainers.BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"], vocab_size={vocabulary_size})
+ training_dataset = {training_dataset}
+ tokenizer.train_from_iterator(training_dataset, trainer=trainer)
+ output = tokenizer.encode("{sentence}")
+ """, language='python')
+
+
+ with st.expander("References:"):
+     st.write("""\
+ - https://huggingface.co/docs/transformers/tokenizer_summary
+ - https://huggingface.co/docs/tokenizers/training_from_memory
+ - https://en.wikipedia.org/wiki/Byte_pair_encoding
+
+ """)
+
+ divider()
+ st.header("Embeddings")
+ st.caption("TBD...")
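
A quick side note on the tokenisation logic in app.py above (not part of the commit): tokenizer.encode() adds [CLS] and [SEP] by default, so the token ids are taken with add_special_tokens=False to keep them aligned one-to-one with tokenizer.tokenize(). A minimal standalone sketch of that behaviour, assuming bert-base-uncased and the transformers version pinned in requirements.txt; exact subword splits can vary between tokeniser versions:

# Standalone sketch, not part of the committed app.py.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
sentence = "Tokenising text is a fundamental step for NLP models."

tokens = tokenizer.tokenize(sentence)                        # subword strings, no special tokens
ids = tokenizer.encode(sentence, add_special_tokens=False)   # ids aligned one-to-one with tokens
print(list(zip(tokens, ids)))                                # each subword paired with its vocabulary id

# With the default encode(), [CLS]/[SEP] ids are included and the zip above would pair tokens with the wrong ids.
print(tokenizer.convert_ids_to_tokens(tokenizer.encode(sentence)))

# Subwords carrying the "##" prefix merge back onto the previous token.
print(tokenizer.convert_tokens_to_string(tokens))
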
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ streamlit~=1.21.0
+ tokenizers~=0.13.3
+ transformers~=4.31.0
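
As a companion to the BPE description in app.py above (frequent symbol pairs get merged into single tokens, while rare words end up split into several pieces), here is a toy merge loop that illustrates the idea. It is a sketch only, not the implementation used by the tokenizers library; the corpus and the number of merge steps are made up for illustration.

# Toy BPE-style merges, for illustration only (not the tokenizers-library implementation).
from collections import Counter

corpus = ["low", "lower", "lowest", "newest", "widest"]  # hypothetical toy corpus
words = [list(word) for word in corpus]                  # start from character-level symbols

def most_frequent_pair(words):
    """Count adjacent symbol pairs across all words and return the most frequent one."""
    pairs = Counter()
    for symbols in words:
        pairs.update(zip(symbols, symbols[1:]))
    return pairs.most_common(1)[0][0]

def merge_pair(words, pair):
    """Replace every occurrence of `pair` with a single merged symbol."""
    merged = []
    for symbols in words:
        out, i = [], 0
        while i < len(symbols):
            if i + 1 < len(symbols) and (symbols[i], symbols[i + 1]) == pair:
                out.append(symbols[i] + symbols[i + 1])
                i += 2
            else:
                out.append(symbols[i])
                i += 1
        merged.append(out)
    return merged

for step in range(4):  # a handful of merge steps; ties are broken arbitrarily here
    pair = most_frequent_pair(words)
    words = merge_pair(words, pair)
    print(step, pair, words)
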