hkeshhk committed on
Commit
d15c366
1 Parent(s): aea33b6

bpetokenizer upload

tokenizer/.github/workflows/pypi.yml ADDED
@@ -0,0 +1,35 @@
+ name: python-package
+
+ on:
+   release:
+     types: [published]
+
+ jobs:
+   pypi-publish:
+     name: Upload release to PyPI
+     runs-on: ubuntu-latest
+     permissions:
+       id-token: write
+
+     steps:
+       - name: Check out repository
+         uses: actions/checkout@v4
+
+       - name: Set up Python
+         uses: actions/setup-python@v5
+         with:
+           python-version: '3.x'
+
+       - name: Install dependencies
+         run: |
+           python -m pip install --upgrade pip
+           pip install setuptools wheel
+
+       - name: Build package
+         run: |
+           python setup.py sdist bdist_wheel
+
+       - name: Publish package distributions to PyPI
+         uses: pypa/gh-action-pypi-publish@release/v1
+         with:
+           password: ${{ secrets.PYPI_API_TOKEN }}
tokenizer/.github/workflows/tests.yml ADDED
@@ -0,0 +1,41 @@
+ name: Tests
+
+ on:
+   push:
+     branches:
+       - main
+   pull_request:
+     branches:
+       - main
+
+ jobs:
+   TestBPETokenizer:
+     runs-on: ${{ matrix.os }}
+     strategy:
+       matrix:
+         os: [ubuntu-latest, windows-latest, macos-latest]
+         python-version: ["3.9", "3.10", "3.11"]
+     steps:
+       - uses: actions/checkout@v4
+
+       - name: Setup Python ${{ matrix.python-version }}
+         uses: actions/setup-python@v5
+         with:
+           python-version: ${{ matrix.python-version }}
+
+       - name: Install dependencies
+         run: |
+           python -m pip install --upgrade pip
+           pip install pytest
+           pip install regex
+
+       - name: Run Tests
+         run: |
+           python -m pytest tests/test_tokenizer.py
+
+       - name: Upload Test Results
+         uses: actions/upload-artifact@v4
+         with:
+           name: test-results
+           path: test-results.xml
+
tokenizer/.gitignore ADDED
@@ -0,0 +1,156 @@
+ # Project
+ /.vs
+ .vscode
+
+ # Log files
+ *.log
+
+ # Python virtualenv
+ .venv*
+
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ logs
+
+ .idea/*
+ .DS_Store
+
+ output/
+ *.pkl
tokenizer/README.md ADDED
@@ -0,0 +1,151 @@
+ # bpetokenizer
+
+ A Byte Pair Encoding (BPE) tokenizer that algorithmically follows the GPT tokenizer. It handles special tokens, uses a customizable regex pattern for tokenization (the gpt4 regex pattern is included), and supports saving and loading tokenizers in both `json` and `file` formats.
+
+
+ ### Overview
+
+ The Byte Pair Encoding (BPE) algorithm is a simple yet powerful method for building a vocabulary of subword units for a given text corpus. This tokenizer can be used to train the tokenizer of an LLM on a text corpus in any language.
+
+ The algorithm was first introduced in the paper [Neural Machine Translation of Rare Words with Subword Units](https://arxiv.org/pdf/1508.07909) and later used in the gpt2 tokenizer ([Language Models are Unsupervised Multitask Learners](https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf)).
+
+ The [notebook](notebooks/tokenization.ipynb) shows the BPE algorithm in detail and how the tokenizers work internally.
+
+ Every LLM (Llama, Gemini, Mistral, ...) uses its own tokenizer, trained on its own text dataset.
+
+
+ ### Features
+
+ - Implements the Byte Pair Encoding (BPE) algorithm.
+ - Handles special tokens.
+ - Uses a customizable regex pattern for tokenization.
+ - Compatible with Python 3.9 and above.
+
+
+ #### This repository has 2 different Tokenizers:
+ - `BPETokenizer`
+ - `Tokenizer`
+
+ 1. [Tokenizer](bpetokenizer/base.py): This class contains `train`, `encode`, `decode`, and the functionality to `save` and `load` the tokenizer. It also contains a few helper functions (`get_stats`, `merge`, `replace_control_characters`, ...) that implement the BPE algorithm; a minimal usage sketch follows this list.
+
+ 2. [BPETokenizer](bpetokenizer/tokenizer.py): This class shows the real power of the tokenizer (as used in the gpt4 tokenizer, [tiktoken](https://github.com/openai/tiktoken)): it uses the `GPT4_SPLIT_PATTERN` to split the text, as in the gpt4 tokenizer, and also handles `special_tokens` (refer to [sample_bpetokenizer](sample/bpetokenizer/sample_bpetokenizer.py)). It inherits the `save` and `load` functionality from the base class.
+
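A minimal sketch of the base `Tokenizer` (the toy training text here is illustrative; the learned merges, and hence the exact ids, depend on what you train on):

```py
from bpetokenizer import Tokenizer

tokenizer = Tokenizer()
tokenizer.train("low lower lowest", vocab_size=260)  # 260 - 256 = 4 merges

ids = tokenizer.encode("lower")
print(ids)                    # ids after applying the learned merges
print(tokenizer.decode(ids))  # -> "lower"

tokenizer.save("base_tokenizer")  # writes base_tokenizer.model and base_tokenizer.vocab
```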
+ ### Usage
+
+ This tutorial demonstrates the use of `special_tokens` in the tokenizer.
+
+ Install the package:
+
+ ```shell
+ pip install bpetokenizer
+ ```
+
+
+ ```py
+ from bpetokenizer import BPETokenizer
+
+ special_tokens = {
+     "<|endoftext|>": 1001,
+     "<|startoftext|>": 1002,
+     "[SPECIAL1]": 1003,
+     "[SPECIAL2]": 1004,
+ }
+
+ tokenizer = BPETokenizer(special_tokens=special_tokens)  # you can also use the _special_tokens method to register the special tokens (if not passed when initializing)
+ texts = "<|startoftext|> Hello, World! This is a sample text with the special tokens [SPECIAL1] and [SPECIAL2] to test the tokenizer.<|endoftext|>"
+
+ tokenizer.train(texts, vocab_size=310, verbose=True)
+ # tokenizer._special_tokens(special_tokens)  # if not passed when initializing the BPETokenizer
+
+ encode_text = """
+ <|startoftext|>Hello, World! This is a sample text with the special tokens [SPECIAL1] and [SPECIAL2] to test the tokenizer.
+ Hello, Universe! Another example sentence containing [SPECIAL1] and [SPECIAL2], used to ensure tokenizer's robustness.
+ Greetings, Earth! Here we have [SPECIAL1] appearing once again, followed by [SPECIAL2] in the same sentence.
+ Hello, World! This is yet another sample text, with [SPECIAL1] and [SPECIAL2] making an appearance.
+ Hey there, World! Testing the tokenizer with [SPECIAL1] and [SPECIAL2] to see if it handles special tokens properly.
+ Salutations, Planet! The tokenizer should recognize [SPECIAL1] and [SPECIAL2] in this long string of text.
+ Hello again, World! [SPECIAL1] and [SPECIAL2] are special tokens that need to be handled correctly by the tokenizer.
+ Welcome, World! Including [SPECIAL1] and [SPECIAL2] multiple times in this large text to ensure proper encoding.
+ Hi, World! Let's add [SPECIAL1] and [SPECIAL2] in various parts of this long sentence to test the tokenizer thoroughly.
+ <|endoftext|>
+ """
+ ids = tokenizer.encode(encode_text, special_tokens="all")
+ print(ids)
+
+ decode_text = tokenizer.decode(ids)
+ print(decode_text)
+
+ tokenizer.save("sample_bpetokenizer", mode="json")  # mode defaults to "file"
+ ```
+
+ Refer to [sample_bpetokenizer](sample/bpetokenizer) for an understanding of the `vocab` and `model` files of the tokenizer trained on the above texts.
+
+
+ #### To Load the Tokenizer
+
+ ```py
+ from bpetokenizer import BPETokenizer
+
+ tokenizer = BPETokenizer()
+
+ tokenizer.load("sample_bpetokenizer.json", mode="json")
+
+ encode_text = """
+ <|startoftext|>Hello, World! This is a sample text with the special tokens [SPECIAL1] and [SPECIAL2] to test the tokenizer.
+ Hello, Universe! Another example sentence containing [SPECIAL1] and [SPECIAL2], used to ensure tokenizer's robustness.
+ Greetings, Earth! Here we have [SPECIAL1] appearing once again, followed by [SPECIAL2] in the same sentence.<|endoftext|>"""
+
+ print("vocab: ", tokenizer.vocab)
+ print('---')
+ print("merges: ", tokenizer.merges)
+ print('---')
+ print("special tokens: ", tokenizer.special_tokens)
+
+ ids = tokenizer.encode(encode_text, special_tokens="all")
+ print('---')
+ print(ids)
+
+ decode_text = tokenizer.decode(ids)
+ print('---')
+ print(decode_text)
+
+ # you can also print the tokens and the text chunks split with the pattern.
+ tokens = tokenizer.tokens(encode_text, verbose=True)  # if verbose, prints the text chunks and also the pattern used to split.
+ print('---')
+ print("tokens: ", tokens)
+
+ ```
+ Refer to [load_json_vocab](sample/load_json_vocab/) and run `bpetokenizer_json` for an overview of the `vocab`, `merges`, and `special_tokens`; to view the tokens that the tokenizer splits out using the pattern, look at [tokens](sample/load_json_vocab/tokens.py).
+
+ ### Run Tests
+
+ The `tests/` folder contains the tests of the tokenizer, using pytest; a minimal example test follows the command below.
+
+ ```
+ python3 -m pytest
+ ```
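A minimal test along these lines (an illustrative sketch; the actual tests live in `tests/test_tokenizer.py`):

```py
from bpetokenizer import BPETokenizer

def test_encode_decode_roundtrip():
    tokenizer = BPETokenizer()
    text = "Hello, World! This is a sample text."
    tokenizer.train(text, vocab_size=260)  # 4 merges on a tiny corpus
    ids = tokenizer.encode(text)
    assert tokenizer.decode(ids) == text
```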
+
+ Additionally, the workflows are set up to run the tests whenever a PR is made.
+
+
+ ### Contributing
+
+ Contributions to the BPE Tokenizer are most welcome! If you would like to contribute, please follow these steps:
+
+ - Star and Fork the repository.
+ - Create a new branch (git checkout -b feature/your-feature).
+ - Commit your changes (git commit -am 'Add some feature').
+ - Push to the branch (git push origin feature/your-feature).
+ - Create a new Pull Request.
+
+ Please ensure your code follows the project's coding standards and includes appropriate tests. Also, update the documentation as necessary.
+
+
+ ### License
+
+ This project is licensed under the MIT License.
+
+ ----
+
+ *This tokenizer is inspired by [minbpe](https://github.com/karpathy/minbpe), but more optimized.
tokenizer/bpetokenizer/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .base import Tokenizer
+ from .tokenizer import BPETokenizer
+ from .version import __version__
tokenizer/bpetokenizer/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (306 Bytes).
 
tokenizer/bpetokenizer/__pycache__/base.cpython-39.pyc ADDED
Binary file (8.04 kB).
 
tokenizer/bpetokenizer/__pycache__/tokenizer.cpython-39.pyc ADDED
Binary file (6.36 kB).
 
tokenizer/bpetokenizer/__pycache__/version.cpython-39.pyc ADDED
Binary file (209 Bytes).
 
tokenizer/bpetokenizer/base.py ADDED
@@ -0,0 +1,223 @@
+ """
+ This file contains the helper functions
+ and the Base class, which has the methods to save/load the model
+ and is required to build the BPETokenizer.
+ """
+
+ import unicodedata
+
+ import regex as re
+
+ from .version import __version__
+
+
+ def get_stats(tokens, counts=None) -> dict:
+     """Get statistics of the tokens: the frequency of each consecutive pair of tokens."""
+     counts = {} if counts is None else counts
+     for pair in zip(tokens, tokens[1:]):
+         counts[pair] = counts.get(pair, 0) + 1
+     return counts
+
+
+ def merge(ids, pair, idx) -> list:
+     """Merge every occurrence of `pair` in `ids` (the list of tokens), representing it with `idx` (a new token in the vocab)."""
+     newids = []
+     i = 0
+     while i < len(ids):
+         if i < len(ids) - 1 and ids[i] == pair[0] and ids[i+1] == pair[1]:
+             newids.append(idx)
+             i += 2
+         else:
+             newids.append(ids[i])
+             i += 1
+     return newids
+
+
+ def replace_control_characters(s: str) -> str:
+     """
+     Replace control characters in a string with their unicode escape sequences. Prevents distortion.
+     Example:
+         token = b"hello\nworld\x00"
+         print(token) -> hello
+         world (and \x00 might not be visible)
+         print(replace_control_characters(token))
+         -> hello\u000aworld\u0000
+     """
+     chars = []
+     for ch in s:
+         if unicodedata.category(ch)[0] != "C":  # the unicode category of control characters starts with "C"
+             chars.append(ch)
+         else:
+             chars.append(f"\\u{ord(ch):04x}")
+     return "".join(chars)
+
+
+ def render_token(t: bytes) -> str:
+     s = t.decode('utf-8', errors='replace')  # this will replace unknown bytes with a �
+     s = replace_control_characters(s)
+     return s
+
+
+ class Tokenizer:
+     """A Base class for the tokenizer, used for training and encoding/decoding text without special tokens."""
+
+     def __init__(self):
+         self.merges = {}
+         self.pattern = ""  # the regex pattern
+         self.compiled_pattern = re.compile(self.pattern) if self.pattern else ""
+         self.special_tokens = {}
+         self.vocab = self._build_vocab() if self.merges else {}
+
+     def _build_vocab(self) -> dict:
+         """Build the vocab from the merges and special tokens. This is used to encode/decode the tokens."""
+         vocab = {idx: bytes([idx]) for idx in range(256)}
+         for (p0, p1), idx in self.merges.items():
+             vocab[idx] = vocab[p0] + vocab[p1]
+         if self.special_tokens:
+             for special, idx in self.special_tokens.items():
+                 vocab[idx] = special.encode("utf-8")
+         return vocab
+
+     def save(self, file_name, mode="file"):
+         """
+         Write metadata and vocabulary information to the model and vocab files.
+         mode: str, default="file" | "json" to save the model and vocab in json format.
+         """
+         if mode == "file":
+             model_file = file_name + ".model"
+             with open(model_file, 'w') as f:
+                 f.write(f"{__version__}\n")
+                 f.write(f"{self.pattern}\n")
+                 f.write(f"{len(self.special_tokens)}\n")
+                 if self.special_tokens:
+                     for special, idx in self.special_tokens.items():
+                         f.write(f"{special} {idx}\n")
+
+                 for idx1, idx2 in self.merges:  # the token pairs that were merged
+                     f.write(f"{idx1} {idx2}\n")
+
+             vocab_file = file_name + ".vocab"
+             inverted_merges = {idx: pair for pair, idx in self.merges.items()}
+             with open(vocab_file, "w", encoding="utf-8") as f:
+                 for idx, token in self.vocab.items():
+                     s = render_token(token)
+                     # find the children of this token, if any
+                     if idx in inverted_merges:
+                         # if this token has children, render it nicely as a merge
+                         idx0, idx1 = inverted_merges[idx]
+                         s0 = render_token(self.vocab[idx0])
+                         s1 = render_token(self.vocab[idx1])
+                         f.write(f"[{s0}][{s1}] -> [{s}] {idx}\n")
+                     else:
+                         # otherwise this is a leaf token, just print it
+                         # (these should just be the first 256 tokens, the raw bytes)
+                         f.write(f"[{s}] {idx}\n")
+         elif mode == "json":
+             import json
+             data = {
+                 "version": __version__,
+                 "pattern": str(self.pattern),
+                 "special_tokens": self.special_tokens,
+                 "merges": {str(k): v for k, v in self.merges.items()},
+                 "vocab": {idx: render_token(token) for idx, token in self.vocab.items()}
+             }
+             with open(file_name + ".json", "w", encoding="utf-8") as f:
+                 json.dump(data, f, ensure_ascii=False, indent=4)
+         else:
+             raise ValueError("mode should be either 'file' or 'json'")
+
+     def load(self, file_name, mode="file"):
+         """
+         Load the model and vocab files into the tokenizer.
+         mode: str, default="file" | "json" to load the model and vocab from json format.
+         """
+         if mode == "file":
+             assert file_name.endswith(".model")
+             merges = {}
+             special_tokens = {}
+             idx = 256
+             with open(file_name, 'r', encoding="utf-8") as f:
+                 assert f.readline().strip() == __version__
+                 self.pattern = f.readline().strip()
+                 num_special = int(f.readline().strip())  # the number of special_tokens lines
+                 for _ in range(num_special):
+                     special, special_idx = f.readline().strip().split()
+                     special_tokens[special] = int(special_idx)
+                 for line in f:
+                     idx1, idx2 = map(int, line.strip().split())
+                     merges[(idx1, idx2)] = idx
+                     idx += 1
+
+             self.merges = merges
+             self.special_tokens = special_tokens
+             self.vocab = self._build_vocab()
+
+         elif mode == "json":
+             assert file_name.endswith(".json")
+
+             import json
+             with open(file_name, "r", encoding="utf-8") as f:
+                 data = json.load(f)
+                 assert data["version"] == __version__
+                 pattern = data["pattern"]
+                 pattern_regex = re.compile(r'regex.Regex\("(.+)", flags=(regex\.\w+)\)')
+                 match = pattern_regex.match(pattern)
+                 if match:
+                     self.pattern = match.group(1)
+                 self.special_tokens = data["special_tokens"]
+                 self.inverse_special_tokens = {v: k for k, v in self.special_tokens.items()}
+                 merges = data["merges"]
+                 self.merges = {tuple(map(int, k.strip('()').split(','))): v for k, v in merges.items()}
+                 vocab = data["vocab"]
+                 self.vocab = {int(k): v.encode("utf-8") for k, v in vocab.items()}
+
+     def encode(self, texts):
+         """Encode text into token ids."""
+         text_bytes = texts.encode("utf-8")  # raw bytes
+         ids = list(map(int, text_bytes))
+         while len(ids) >= 2:
+             # find the pair with the lowest merge index
+             stats = get_stats(ids)
+             pair = min(stats, key=lambda p: self.merges.get(p, float("inf")))
+             if pair not in self.merges:
+                 break  # nothing else can be merged anymore
+             # otherwise, merge the best pair (lowest merge index)
+             idx = self.merges[pair]
+             ids = merge(ids, pair, idx)
+         return ids
+
+     def decode(self, ids):
+         """Decode token ids back into text."""
+         bytes_str = b"".join([self.vocab[idx] for idx in ids])
+         text = bytes_str.decode("utf-8", errors="replace")
+         return text
+
+     def train(self, texts, vocab_size, verbose=False):
+         """Train the tokenizer on the given text."""
+         assert vocab_size >= 256
+         num_merges = vocab_size - 256
+
+         tokens = texts.encode("utf-8")
+         ids = list(tokens)
+         merges = {}
+         vocab = {idx: bytes([idx]) for idx in range(256)}  # vocab for the first 256 bytes
+
+         # bpe algorithm
+         for i in range(num_merges):
+             stats = get_stats(ids)
+             pair = max(stats, key=stats.get)  # the highest-frequency pair
+             idx = 256 + i
+
+             ids = merge(ids, pair, idx)
+             merges[pair] = idx
+             vocab[idx] = vocab[pair[0]] + vocab[pair[1]]  # concatenation of bytes
+
+             if verbose:
+                 print(f"merging {i+1}/{num_merges}: {pair} -> {idx} ({vocab[idx]}) had {stats[pair]} frequency")
+
+         self.merges = merges
+         self.vocab = vocab
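The two helpers at the top of this file carry the whole algorithm. A quick sketch of a single BPE step using them, on the `aaabdaaabac` string from the notebook:

```py
from bpetokenizer.base import get_stats, merge

ids = list("aaabdaaabac".encode("utf-8"))  # [97, 97, 97, 98, 100, 97, 97, 97, 98, 97, 99]
stats = get_stats(ids)
best = max(stats, key=stats.get)  # (97, 97): "aa" occurs most often (4 times)
ids = merge(ids, best, 256)       # replace every non-overlapping "aa" with the new token 256
print(ids)                        # [256, 97, 98, 100, 256, 97, 98, 97, 99]
```

Training simply repeats this step `vocab_size - 256` times, registering each chosen pair in `merges` and its byte concatenation in `vocab`.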
tokenizer/bpetokenizer/tokenizer.py ADDED
@@ -0,0 +1,160 @@
+ """
+ Byte Pair Encoding tokenizer.
+
+ Algorithmically follows the GPT tokenizer:
+ https://github.com/openai/gpt-2/blob/master/src/encoder.py
+
+ The Byte Pair Encoding (BPE) algorithm is a simple algorithm that builds a vocabulary
+ of subword units for a given text corpus.
+
+ More detailed information can be found in:
+ https://github.com/Hk669/bpetokenizer/blob/main/notebooks/tokenization.ipynb
+ https://en.wikipedia.org/wiki/Byte_pair_encoding
+ https://youtu.be/zduSFxRajkE?si=Qv-yX2NUY69aIjCQ (Andrej Karpathy's tutorial on the Tokenizer)
+ """
+
+ from .base import Tokenizer, get_stats, merge
+ import regex as re
+
+ # from openai/tiktoken (used in the gpt4 tokenizer)
+ GPT4_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""  # raw string
+
+
+ class BPETokenizer(Tokenizer):
+     """Byte Pair Encoding tokenizer, which handles special tokens and a regex pattern for tokenization."""
+
+     def __init__(self, pattern=None, special_tokens=None):
+         super().__init__()
+         self.pattern = GPT4_SPLIT_PATTERN if pattern is None else pattern
+         self.compiled_pattern = re.compile(self.pattern)
+         self.special_tokens = {} if special_tokens is None else special_tokens
+         self.inverse_special_tokens = {} if special_tokens is None else {v: k for k, v in special_tokens.items()}
+
+     def train(self, texts, vocab_size, verbose=False) -> None:
+         """Train the tokenizer on the given texts and vocab size. The vocab size must be at least 256."""
+         assert vocab_size >= 256
+         num_merges = vocab_size - 256
+
+         text_chunks = re.findall(self.compiled_pattern, texts)  # split the text into chunks using the regex pattern
+
+         ids = [list(tokens.encode("utf-8")) for tokens in text_chunks]  # List[List[int]]
+         merges = {}
+         vocab = {idx: bytes([idx]) for idx in range(256)}  # vocab for the first 256 bytes
+
+         # bpe algorithm
+         for i in range(num_merges):
+             stats = {}
+             for chunk in ids:
+                 get_stats(chunk, stats)
+
+             pair = max(stats, key=stats.get)  # the highest-frequency pair
+             idx = 256 + i
+
+             ids = [merge(chunk_ids, pair, idx) for chunk_ids in ids]  # merge the most frequent pair in every chunk
+             merges[pair] = idx
+             vocab[idx] = vocab[pair[0]] + vocab[pair[1]]  # concatenation of bytes
+
+             if verbose:
+                 print(f"merging {i+1}/{num_merges}: {pair} -> {idx} ({vocab[idx]}) had {stats[pair]} frequency")
+
+         self.merges = merges
+         self.vocab = vocab
+
+     def _encode(self, _bytes) -> list:
+         """Encode bytes into token ids (the BPE algorithm)."""
+         ids = list(_bytes)
+         while len(ids) >= 2:
+             # find the pair with the lowest merge index
+             stats = get_stats(ids)
+             pair = min(stats, key=lambda p: self.merges.get(p, float("inf")))
+             if pair not in self.merges:
+                 break  # nothing else can be merged anymore
+             # otherwise, merge the best pair (lowest merge index)
+             idx = self.merges[pair]
+             ids = merge(ids, pair, idx)
+         return ids
+
+     def encode_ord(self, text) -> list:
+         """Encode text without any special-token handling."""
+         text_chunks = re.findall(self.compiled_pattern, text)
+         ids = []
+         for chunk in text_chunks:
+             _bytes = chunk.encode("utf-8")
+             chunk_ids = self._encode(_bytes)
+             ids.extend(chunk_ids)
+         return ids
+
+     def encode(self, text, special_tokens="none") -> list:
+         """
+         Encode the text into token ids.
+         If special_tokens is set to "all", the special tokens are included in the ids.
+         If set to "none", the special tokens are excluded.
+         If set to "none_raise", an error is raised if the text contains any special tokens.
+         """
+         special = None
+         if special_tokens == "all":
+             special = self.special_tokens
+         elif special_tokens == "none":
+             special = {}
+         elif special_tokens == "none_raise":
+             special = {}
+             assert all(token not in text for token in self.special_tokens)
+         else:
+             raise ValueError(f"invalid special tokens argument: {special_tokens}")
+
+         if not special:
+             return self.encode_ord(text)
+
+         special_pattern = "(" + "|".join(re.escape(k) for k in special) + ")"
+         text_chunks = re.split(special_pattern, text)
+         ids = []
+         for chunk in text_chunks:
+             if chunk in special:
+                 ids.append(special[chunk])
+             else:
+                 chunkids = self._encode(chunk.encode("utf-8"))
+                 ids.extend(chunkids)
+         return ids
+
+     def decode(self, ids) -> str:
+         """Decode token ids back into text, resolving special tokens via `inverse_special_tokens`."""
+         part_bytes = []
+         for idx in ids:
+             if idx in self.vocab:
+                 part_bytes.append(self.vocab[idx])
+             elif idx in self.inverse_special_tokens:
+                 part_bytes.append(self.inverse_special_tokens[idx].encode("utf-8"))  # special tokens are not stored in the vocab
+             else:
+                 raise ValueError(f"invalid token id: {idx}")
+         text_bytes = b"".join(part_bytes)
+         text = text_bytes.decode("utf-8", errors="replace")
+         return text
+
+     def _special_tokens(self, special_tokens) -> None:
+         """Set the special tokens for the tokenizer. If not passed when initializing, they default to empty."""
+         self.special_tokens = special_tokens
+         self.inverse_special_tokens = {v: k for k, v in special_tokens.items()}
+
+     def tokens(self, text, verbose=False) -> list:
+         """Return the string tokens that the text is split into (without special-token handling)."""
+         text_chunks = re.findall(self.compiled_pattern, text)
+
+         _tokens = []
+         for chunk in text_chunks:
+             _bytes = chunk.encode("utf-8")
+             chunk_ids = self._encode(_bytes)
+             chunk_tokens = [self.vocab[idx].decode("utf-8", errors="replace") if idx in self.vocab else f"[UNK{idx}]" for idx in chunk_ids]
+             _tokens.extend(chunk_tokens)
+         if verbose:
+             print(f"---\ntext chunks: {text_chunks}\n")
+             print(f"---\npattern: {self.pattern}\n")
+         return _tokens
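A small sketch of the two things this class adds on top of the base `Tokenizer`, namely the regex pre-split and special-token-aware encoding (the training text here is a toy example):

```py
import regex as re
from bpetokenizer.tokenizer import BPETokenizer, GPT4_SPLIT_PATTERN

# the GPT-4 pattern splits text into word-like chunks before BPE merges are applied
print(re.findall(re.compile(GPT4_SPLIT_PATTERN), "Hello, World! It's great"))
# -> ['Hello', ',', ' World', '!', ' It', "'s", ' great']

tokenizer = BPETokenizer(special_tokens={"<|endoftext|>": 1001})
tokenizer.train("Hello, World! Hello again, World!", vocab_size=260)

text = "Hello, World!<|endoftext|>"
print(tokenizer.encode(text, special_tokens="all"))   # the ids end with the special token id 1001
print(tokenizer.encode(text, special_tokens="none"))  # "<|endoftext|>" is encoded like ordinary text instead
```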
tokenizer/bpetokenizer/version.py ADDED
@@ -0,0 +1 @@
+ __version__ = "1.0.31"
tokenizer/notebooks/tokenization.ipynb ADDED
@@ -0,0 +1,764 @@
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Tokenizer\n",
+ "\n",
+ "A completely separate, independent module from the LLM, which has its own training dataset of text, on which you train the vocabulary using the BPE (byte pair encoding) algorithm. It then translates back and forth between the raw text and the sequence of integers/tokens. The LLM only deals with the tokens and never directly deals with the text.\n",
+ "\n",
+ "![image.png](../public/tokenizer.png)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "97"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# the unicode code point of the character\n",
+ "ord('a')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[3118,\n",
+ " 3136,\n",
+ " 3120,\n",
+ " 3137,\n",
+ " 32,\n",
+ " 3086,\n",
+ " 3122,\n",
+ " 3134,\n",
+ " 32,\n",
+ " 3081,\n",
+ " 3112,\n",
+ " 3149,\n",
+ " 3112,\n",
+ " 3134,\n",
+ " 3120,\n",
+ " 3137,\n",
+ " 63,\n",
+ " 32,\n",
+ " 40,\n",
+ " 72,\n",
+ " 111,\n",
+ " 119,\n",
+ " 32,\n",
+ " 97,\n",
+ " 114,\n",
+ " 101,\n",
+ " 32,\n",
+ " 121,\n",
+ " 111,\n",
+ " 117,\n",
+ " 63,\n",
+ " 41]"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tokens = [ord(c) for c in \"మీరు ఎలా ఉన్నారు? (How are you?)\"]\n",
+ "tokens"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "But having a token for each letter would increase the computation cost to generate from, and also to train, the model; so the BPE algorithm was introduced in the [GPT2 paper](https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Byte pair encoding algorithm\n",
+ "\n",
+ "Consider the string:\n",
+ "\n",
+ "`aaabdaaabac`\n",
+ "\n",
+ "The byte pair \"aa\" occurs most often in the string, so we replace it with a new byte that is not used in the `vocab`, say \"Z\".\n",
+ "The string then becomes\n",
+ "\n",
+ "```\n",
+ "ZabdZabac\n",
+ "Z = aa\n",
+ "```\n",
+ "\n",
+ "This process continues recursively, replacing byte pairs until the string/data cannot be compressed any further.\n",
+ "\n",
+ "\n",
+ "The process is then repeated with the byte pair \"ab\", replacing it with \"Y\":\n",
+ "```\n",
+ "ZYdZYac\n",
+ "Y=ab\n",
+ "Z=aa\n",
+ "```\n",
+ "then replacing \"ZY\" with \"X\":\n",
+ "```\n",
+ "XdXac\n",
+ "X=ZY\n",
+ "Y=ab\n",
+ "Z=aa\n",
+ "```\n",
+ "\n",
+ "\n"
+ ]
+ },
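The walk-through above can be reproduced in a few lines of Python; a minimal sketch of the same recursive pair-replacement idea (ties between equally frequent pairs may be broken differently than in the example):

```py
def bpe_compress(s: str):
    """Repeatedly replace the most frequent adjacent pair with a fresh symbol."""
    table = {}
    fresh = iter("ZYXWV")  # enough fresh symbols for this toy string
    while True:
        pairs = {}
        for a, b in zip(s, s[1:]):
            pairs[a + b] = pairs.get(a + b, 0) + 1
        best, count = max(pairs.items(), key=lambda kv: kv[1])
        if count < 2:
            break  # no pair repeats, so the string cannot be compressed further
        sym = next(fresh)
        table[sym] = best
        s = s.replace(best, sym)
    return s, table

print(bpe_compress("aaabdaaabac"))
# -> ('XdXac', {'Z': 'aa', 'Y': 'Za', 'X': 'Yb'})
```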
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "----\n",
+ "Autogen enables the next-gen LLM applications with a generic [multi-agent conversation](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat) framework. It offers customizable and conversable agents that integrate LLMs, tools, and humans.\n",
+ "By automating chat among multiple capable agents, one can easily make them collectively perform tasks autonomously or with human feedback, including tasks that require using tools via code.\n",
+ "\n",
+ "Features of this use case include:\n",
+ "\n",
+ "- **Multi-agent conversations**: AutoGen agents can communicate with each other to solve tasks. This allows for more complex and sophisticated applications than would be possible with a single LLM.\n",
+ "- **Customization**: AutoGen agents can be customized to meet the specific needs of an application. This includes the ability to choose the LLMs to use, the types of human input to allow, and the tools to employ.\n",
+ "- **Human participation**: AutoGen seamlessly allows human participation. This means that humans can provide input and feedback to the agents as needed.\n",
+ "\n",
+ "For [example](https://github.com/microsoft/autogen/blob/main/test/twoagent.py),\n",
+ "\n",
+ "```python\n",
+ "from autogen import AssistantAgent, UserProxyAgent, config_list_from_json\n",
+ "# Load LLM inference endpoints from an env variable or a file\n",
+ "# See https://microsoft.github.io/autogen/docs/FAQ#set-your-api-endpoints\n",
+ "# and OAI_CONFIG_LIST_sample\n",
+ "config_list = config_list_from_json(env_or_file=\"OAI_CONFIG_LIST\")\n",
+ "# You can also set config_list directly as a list, for example, config_list = [{'model': 'gpt-4', 'api_key': '<your OpenAI API key here>'},]\n",
+ "assistant = AssistantAgent(\"assistant\", llm_config={\"config_list\": config_list})\n",
+ "user_proxy = UserProxyAgent(\"user_proxy\", code_execution_config={\"work_dir\": \"coding\", \"use_docker\": False}) # IMPORTANT: set to True to run code in docker, recommended\n",
+ "user_proxy.initiate_chat(assistant, message=\"Plot a chart of NVDA and TESLA stock price change YTD.\")\n",
+ "# This initiates an automated chat between the two agents to solve the task\n",
+ "```\n",
+ "\n",
+ "more python code:\n",
+ "\n",
+ "```python\n",
+ "    def create(\n",
+ "        self,\n",
+ "        *,\n",
+ "        messages: Iterable[ChatCompletionMessageParam],\n",
+ "        model: Union[str, ChatModel],\n",
+ "        frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN,\n",
+ "        function_call: completion_create_params.FunctionCall | NotGiven = NOT_GIVEN,\n",
+ "        functions: Iterable[completion_create_params.Function] | NotGiven = NOT_GIVEN,\n",
+ "        logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN,\n",
+ "        logprobs: Optional[bool] | NotGiven = NOT_GIVEN,\n",
+ "        max_tokens: Optional[int] | NotGiven = NOT_GIVEN,\n",
+ "        n: Optional[int] | NotGiven = NOT_GIVEN,\n",
+ "        presence_penalty: Optional[float] | NotGiven = NOT_GIVEN,\n",
+ "        response_format: completion_create_params.ResponseFormat | NotGiven = NOT_GIVEN,\n",
+ "        seed: Optional[int] | NotGiven = NOT_GIVEN,\n",
+ "        stop: Union[Optional[str], List[str]] | NotGiven = NOT_GIVEN,\n",
+ "        stream: Optional[Literal[False]] | Literal[True] | NotGiven = NOT_GIVEN,\n",
+ "        stream_options: Optional[ChatCompletionStreamOptionsParam] | NotGiven = NOT_GIVEN,\n",
+ "        temperature: Optional[float] | NotGiven = NOT_GIVEN,\n",
+ "        tool_choice: ChatCompletionToolChoiceOptionParam | NotGiven = NOT_GIVEN,\n",
+ "        tools: Iterable[ChatCompletionToolParam] | NotGiven = NOT_GIVEN,\n",
+ "        top_logprobs: Optional[int] | NotGiven = NOT_GIVEN,\n",
+ "        top_p: Optional[float] | NotGiven = NOT_GIVEN,\n",
+ "        user: str | NotGiven = NOT_GIVEN,\n",
+ "        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.\n",
+ "        # The extra values given here take precedence over values defined on the client or passed to this method.\n",
+ "        extra_headers: Headers | None = None,\n",
+ "        extra_query: Query | None = None,\n",
+ "        extra_body: Body | None = None,\n",
+ "        timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,\n",
+ "    ) -> ChatCompletion | Stream[ChatCompletionChunk]:\n",
+ "        return self._post(\n",
+ "            \"/chat/completions\",\n",
+ "            body=maybe_transform(\n",
+ "                {\n",
+ "                    \"messages\": messages,\n",
+ "                    \"model\": model,\n",
+ "                    \"frequency_penalty\": frequency_penalty,\n",
+ "                    \"function_call\": function_call,\n",
+ "                    \"functions\": functions,\n",
+ "                    \"logit_bias\": logit_bias,\n",
+ "                    \"logprobs\": logprobs,\n",
+ "                    \"max_tokens\": max_tokens,\n",
+ "                    \"n\": n,\n",
+ "                    \"presence_penalty\": presence_penalty,\n",
+ "                    \"response_format\": response_format,\n",
+ "                    \"seed\": seed,\n",
+ "                    \"stop\": stop,\n",
+ "                    \"stream\": stream,\n",
+ "                    \"stream_options\": stream_options,\n",
+ "                    \"temperature\": temperature,\n",
+ "                    \"tool_choice\": tool_choice,\n",
+ "                    \"tools\": tools,\n",
+ "                    \"top_logprobs\": top_logprobs,\n",
+ "                    \"top_p\": top_p,\n",
+ "                    \"user\": user,\n",
+ "                },\n",
+ "                completion_create_params.CompletionCreateParams,\n",
+ "            ),\n",
+ "            options=make_request_options(\n",
+ "                extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout\n",
+ "            ),\n",
+ "            cast_to=ChatCompletion,\n",
+ "            stream=stream or False,\n",
+ "            stream_cls=Stream[ChatCompletionChunk],\n",
+ "        )\n",
+ "```\n",
+ "\n",
+ "length: 5397\n",
+ "----\n",
+ "[65, 117, 116, 111, 103, 101, 110, 32, 101, 110, 97, 98, 108, 101, 115, ...]"
[output truncated: the remainder of this cell's output is the full list of raw byte values, one per UTF-8 byte, of the text printed above]
111, 112, 116, 105, 111, 110, 115, 34, 58, 32, 115, 116, 114, 101, 97, 109, 95, 111, 112, 116, 105, 111, 110, 115, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 34, 116, 101, 109, 112, 101, 114, 97, 116, 117, 114, 101, 34, 58, 32, 116, 101, 109, 112, 101, 114, 97, 116, 117, 114, 101, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 34, 116, 111, 111, 108, 95, 99, 104, 111, 105, 99, 101, 34, 58, 32, 116, 111, 111, 108, 95, 99, 104, 111, 105, 99, 101, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 34, 116, 111, 111, 108, 115, 34, 58, 32, 116, 111, 111, 108, 115, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 34, 116, 111, 112, 95, 108, 111, 103, 112, 114, 111, 98, 115, 34, 58, 32, 116, 111, 112, 95, 108, 111, 103, 112, 114, 111, 98, 115, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 34, 116, 111, 112, 95, 112, 34, 58, 32, 116, 111, 112, 95, 112, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 34, 117, 115, 101, 114, 34, 58, 32, 117, 115, 101, 114, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 125, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 99, 111, 109, 112, 108, 101, 116, 105, 111, 110, 95, 99, 114, 101, 97, 116, 101, 95, 112, 97, 114, 97, 109, 115, 46, 67, 111, 109, 112, 108, 101, 116, 105, 111, 110, 67, 114, 101, 97, 116, 101, 80, 97, 114, 97, 109, 115, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 41, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 111, 112, 116, 105, 111, 110, 115, 61, 109, 97, 107, 101, 95, 114, 101, 113, 117, 101, 115, 116, 95, 111, 112, 116, 105, 111, 110, 115, 40, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 101, 120, 116, 114, 97, 95, 104, 101, 97, 100, 101, 114, 115, 61, 101, 120, 116, 114, 97, 95, 104, 101, 97, 100, 101, 114, 115, 44, 32, 101, 120, 116, 114, 97, 95, 113, 117, 101, 114, 121, 61, 101, 120, 116, 114, 97, 95, 113, 117, 101, 114, 121, 44, 32, 101, 120, 116, 114, 97, 95, 98, 111, 100, 121, 61, 101, 120, 116, 114, 97, 95, 98, 111, 100, 121, 44, 32, 116, 105, 109, 101, 111, 117, 116, 61, 116, 105, 109, 101, 111, 117, 116, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 41, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 99, 97, 115, 116, 95, 116, 111, 61, 67, 104, 97, 116, 67, 111, 109, 112, 108, 101, 116, 105, 111, 110, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 115, 116, 114, 101, 97, 109, 61, 115, 116, 114, 101, 97, 109, 32, 111, 114, 32, 70, 97, 108, 115, 101, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 115, 116, 114, 101, 97, 109, 95, 99, 108, 115, 61, 83, 116, 114, 101, 97, 109, 91, 67, 104, 97, 116, 67, 111, 109, 112, 108, 101, 116, 105, 111, 110, 67, 104, 117, 110, 107, 93, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 41, 10, 96, 96, 96, 10]\n",
240
+ "length: 5397\n"
241
+ ]
242
+ }
243
+ ],
244
+ "source": [
245
+ "text = \"\"\"Autogen enables the next-gen LLM applications with a generic [multi-agent conversation](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat) framework. It offers customizable and conversable agents that integrate LLMs, tools, and humans.\n",
246
+ "By automating chat among multiple capable agents, one can easily make them collectively perform tasks autonomously or with human feedback, including tasks that require using tools via code.\n",
247
+ "\n",
248
+ "Features of this use case include:\n",
249
+ "\n",
250
+ "- **Multi-agent conversations**: AutoGen agents can communicate with each other to solve tasks. This allows for more complex and sophisticated applications than would be possible with a single LLM.\n",
251
+ "- **Customization**: AutoGen agents can be customized to meet the specific needs of an application. This includes the ability to choose the LLMs to use, the types of human input to allow, and the tools to employ.\n",
252
+ "- **Human participation**: AutoGen seamlessly allows human participation. This means that humans can provide input and feedback to the agents as needed.\n",
253
+ "\n",
254
+ "For [example](https://github.com/microsoft/autogen/blob/main/test/twoagent.py),\n",
255
+ "\n",
256
+ "```python\n",
257
+ "from autogen import AssistantAgent, UserProxyAgent, config_list_from_json\n",
258
+ "# Load LLM inference endpoints from an env variable or a file\n",
259
+ "# See https://microsoft.github.io/autogen/docs/FAQ#set-your-api-endpoints\n",
260
+ "# and OAI_CONFIG_LIST_sample\n",
261
+ "config_list = config_list_from_json(env_or_file=\"OAI_CONFIG_LIST\")\n",
262
+ "# You can also set config_list directly as a list, for example, config_list = [{'model': 'gpt-4', 'api_key': '<your OpenAI API key here>'},]\n",
263
+ "assistant = AssistantAgent(\"assistant\", llm_config={\"config_list\": config_list})\n",
264
+ "user_proxy = UserProxyAgent(\"user_proxy\", code_execution_config={\"work_dir\": \"coding\", \"use_docker\": False}) # IMPORTANT: set to True to run code in docker, recommended\n",
265
+ "user_proxy.initiate_chat(assistant, message=\"Plot a chart of NVDA and TESLA stock price change YTD.\")\n",
266
+ "# This initiates an automated chat between the two agents to solve the task\n",
267
+ "```\n",
268
+ "\n",
269
+ "more python code:\n",
270
+ "\n",
271
+ "```python\n",
272
+ " def create(\n",
273
+ " self,\n",
274
+ " *,\n",
275
+ " messages: Iterable[ChatCompletionMessageParam],\n",
276
+ " model: Union[str, ChatModel],\n",
277
+ " frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN,\n",
278
+ " function_call: completion_create_params.FunctionCall | NotGiven = NOT_GIVEN,\n",
279
+ " functions: Iterable[completion_create_params.Function] | NotGiven = NOT_GIVEN,\n",
280
+ " logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN,\n",
281
+ " logprobs: Optional[bool] | NotGiven = NOT_GIVEN,\n",
282
+ " max_tokens: Optional[int] | NotGiven = NOT_GIVEN,\n",
283
+ " n: Optional[int] | NotGiven = NOT_GIVEN,\n",
284
+ " presence_penalty: Optional[float] | NotGiven = NOT_GIVEN,\n",
285
+ " response_format: completion_create_params.ResponseFormat | NotGiven = NOT_GIVEN,\n",
286
+ " seed: Optional[int] | NotGiven = NOT_GIVEN,\n",
287
+ " stop: Union[Optional[str], List[str]] | NotGiven = NOT_GIVEN,\n",
288
+ " stream: Optional[Literal[False]] | Literal[True] | NotGiven = NOT_GIVEN,\n",
289
+ " stream_options: Optional[ChatCompletionStreamOptionsParam] | NotGiven = NOT_GIVEN,\n",
290
+ " temperature: Optional[float] | NotGiven = NOT_GIVEN,\n",
291
+ " tool_choice: ChatCompletionToolChoiceOptionParam | NotGiven = NOT_GIVEN,\n",
292
+ " tools: Iterable[ChatCompletionToolParam] | NotGiven = NOT_GIVEN,\n",
293
+ " top_logprobs: Optional[int] | NotGiven = NOT_GIVEN,\n",
294
+ " top_p: Optional[float] | NotGiven = NOT_GIVEN,\n",
295
+ " user: str | NotGiven = NOT_GIVEN,\n",
296
+ " # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.\n",
297
+ " # The extra values given here take precedence over values defined on the client or passed to this method.\n",
298
+ " extra_headers: Headers | None = None,\n",
299
+ " extra_query: Query | None = None,\n",
300
+ " extra_body: Body | None = None,\n",
301
+ " timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,\n",
302
+ " ) -> ChatCompletion | Stream[ChatCompletionChunk]:\n",
303
+ " return self._post(\n",
304
+ " \"/chat/completions\",\n",
305
+ " body=maybe_transform(\n",
306
+ " {\n",
307
+ " \"messages\": messages,\n",
308
+ " \"model\": model,\n",
309
+ " \"frequency_penalty\": frequency_penalty,\n",
310
+ " \"function_call\": function_call,\n",
311
+ " \"functions\": functions,\n",
312
+ " \"logit_bias\": logit_bias,\n",
313
+ " \"logprobs\": logprobs,\n",
314
+ " \"max_tokens\": max_tokens,\n",
315
+ " \"n\": n,\n",
316
+ " \"presence_penalty\": presence_penalty,\n",
317
+ " \"response_format\": response_format,\n",
318
+ " \"seed\": seed,\n",
319
+ " \"stop\": stop,\n",
320
+ " \"stream\": stream,\n",
321
+ " \"stream_options\": stream_options,\n",
322
+ " \"temperature\": temperature,\n",
323
+ " \"tool_choice\": tool_choice,\n",
324
+ " \"tools\": tools,\n",
325
+ " \"top_logprobs\": top_logprobs,\n",
326
+ " \"top_p\": top_p,\n",
327
+ " \"user\": user,\n",
328
+ " },\n",
329
+ " completion_create_params.CompletionCreateParams,\n",
330
+ " ),\n",
331
+ " options=make_request_options(\n",
332
+ " extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout\n",
333
+ " ),\n",
334
+ " cast_to=ChatCompletion,\n",
335
+ " stream=stream or False,\n",
336
+ " stream_cls=Stream[ChatCompletionChunk],\n",
337
+ " )\n",
338
+ "```\n",
339
+ "\"\"\"\n",
340
+ "tokens = text.encode('utf-8') # which will produce raw byte strings\n",
341
+ "tokens = list(map(int, tokens)) # convert the byte strings to integers\n",
342
+ "print('----')\n",
343
+ "print(text)\n",
344
+ "print('length:', len(text))\n",
345
+ "print('----')\n",
346
+ "print(tokens)\n",
347
+ "print('length:', len(tokens))\n"
348
+ ]
349
+ },
350
+ {
351
+ "cell_type": "code",
352
+ "execution_count": 11,
353
+ "metadata": {},
354
+ "outputs": [
355
+ {
356
+ "name": "stdout",
357
+ "output_type": "stream",
358
+ "text": [
359
+ "[(770, (32, 32)), (86, (111, 110)), (73, (101, 110)), (66, (10, 32)), (65, (116, 105)), (57, (44, 10)), (56, (105, 111)), (55, (58, 32)), (55, (32, 116)), (52, (97, 116)), (50, (116, 111)), (50, (101, 32)), (48, (32, 78)), (47, (110, 32)), (44, (114, 101)), (40, (115, 116)), (40, (32, 97)), (38, (115, 32)), (38, (101, 114)), (36, (115, 101)), (35, (97, 108)), (35, (32, 99)), (34, (108, 101)), (32, (116, 104)), (31, (114, 97)), (30, (97, 110)), (29, (110, 116)), (28, (118, 101)), (28, (116, 114)), (28, (111, 109)), (28, (97, 109)), (27, (124, 32)), (27, (103, 101)), (27, (101, 97)), (27, (99, 111)), (27, (78, 111)), (27, (61, 32)), (27, (32, 124)), (27, (32, 61)), (26, (116, 32)), (26, (101, 115)), (25, (105, 110)), (24, (110, 115)), (24, (34, 58)), (24, (32, 34)), (23, (116, 101)), (23, (112, 108)), (23, (109, 112)), (22, (111, 116)), (22, (105, 118)), (22, (101, 116)), (22, (44, 32)), (22, (32, 115)), (21, (112, 116)), (21, (110, 97)), (21, (108, 111)), (21, (105, 115)), (21, (104, 97)), (21, (100, 101)), (21, (84, 95)), (20, (116, 71)), (20, (104, 101)), (20, (95, 71)), (20, (86, 69)), (20, (79, 84)), (20, (78, 79)), (20, (78, 44)), (20, (73, 86)), (20, (71, 105)), (20, (71, 73)), (20, (69, 78)), (19, (114, 111)), (19, (111, 114)), (19, (110, 99)), (19, (109, 97)), (18, (117, 116)), (18, (105, 99)), (18, (97, 115)), (17, (97, 114)), (17, (95, 112)), (16, (111, 108)), (16, (111, 100)), (16, (105, 116)), (16, (101, 100)), (16, (99, 97)), (16, (93, 32)), (16, (79, 112)), (15, (112, 114)), (15, (111, 112)), (15, (111, 32)), (15, (109, 101)), (15, (108, 91)), (15, (101, 120)), (15, (101, 95)), (15, (100, 32)), (15, (97, 103)), (15, (95, 99)), (15, (32, 102)), (14, (117, 115)), (14, (115, 115)), (14, (111, 103)), (14, (110, 101)), (14, (32, 111)), (14, (32, 101)), (14, (32, 79)), (13, (115, 44)), (13, (112, 101)), (13, (111, 111)), (13, (108, 105)), (13, (102, 105)), (13, (32, 112)), (13, (32, 109)), (12, (117, 110)), (12, (117, 101)), (12, (115, 58)), (12, (112, 97)), (12, (99, 104)), (12, (67, 104)), (12, (32, 105)), (11, (121, 32)), (11, (120, 116)), (11, (114, 32)), (11, (108, 115)), (11, (101, 101)), (11, (99, 116)), (11, (99, 101)), (11, (98, 108)), (10, (116, 97)), (10, (111, 117)), (10, (110, 102)), (10, (110, 100)), (10, (108, 108)), (10, (107, 101)), (10, (104, 117)), (10, (97, 98)), (10, (95, 108)), (9, (116, 115)), (9, (116, 93)), (9, (115, 111)), (9, (115, 105)), (9, (115, 97)), (9, (115, 34)), (9, (114, 115)), (9, (112, 111)), (9, (108, 116)), (9, (105, 103)), (9, (104, 111)), (9, (97, 95)), (9, (67, 111)), (9, (32, 104)), (8, (116, 121)), (8, (116, 95)), (8, (116, 67)), (8, (113, 117)), (8, (111, 102)), (8, (110, 103)), (8, (110, 95)), (8, (109, 111)), (8, (105, 97)), (8, (102, 114)), (8, (102, 111)), (8, (101, 108)), (8, (101, 44)), (8, (99, 114)), (8, (97, 32)), (8, (96, 96)), (8, (35, 32)), (8, (32, 76)), (7, (117, 114)), (7, (117, 109)), (7, (115, 46)), (7, (111, 98)), (7, (111, 97)), (7, (109, 95)), (7, (104, 105)), (7, (103, 112)), (7, (103, 105)), (7, (103, 95)), (7, (97, 117)), (7, (46, 10)), (7, (32, 84)), (7, (32, 65)), (6, (114, 109)), (6, (112, 95)), (6, (111, 115)), (6, (111, 105)), (6, (109, 105)), (6, (109, 32)), (6, (102, 117)), (6, (102, 32)), (6, (101, 99)), (6, (98, 115)), (6, (97, 112)), (6, (97, 100)), (6, (95, 102)), (6, (95, 98)), (6, (42, 42)), (6, (32, 114)), (6, (32, 110)), (6, (32, 108)), (6, (10, 10)), (5, (120, 121)), (5, (119, 111)), (5, (119, 105)), (5, (116, 117)), (5, (116, 44)), (5, (115, 112)), (5, (111, 120)), (5, (111, 99)), (5, (110, 
118)), (5, (110, 105)), (5, (109, 115)), (5, (108, 121)), (5, (108, 117)), (5, (105, 109)), (5, (105, 108)), (5, (104, 32)), (5, (103, 32)), (5, (102, 108)), (5, (101, 113)), (5, (101, 109)), (5, (100, 121)), (5, (100, 105)), (5, (99, 108)), (5, (99, 107)), (5, (98, 111)), (5, (95, 116)), (5, (95, 111)), (5, (91, 67)), (5, (84, 104)), (5, (80, 97)), (5, (76, 77)), (5, (76, 76)), (5, (34, 116)), (5, (32, 119)), (5, (32, 118)), (5, (32, 117)), (5, (32, 85)), (5, (32, 73)), (5, (10, 35)), (4, (121, 58)), (4, (121, 44)), (4, (118, 97)), (4, (117, 108)), (4, (116, 116)), (4, (116, 112)), (4, (116, 40)), (4, (115, 107)), (4, (114, 121)), (4, (114, 116)), (4, (114, 95)), (4, (114, 44)), (4, (112, 121)), (4, (111, 119)), (4, (110, 67)), (4, (110, 47)), (4, (104, 116)), (4, (102, 101)), (4, (101, 111)), (4, (101, 58)), (4, (100, 111)), (4, (98, 105)), (4, (98, 101)), (4, (93, 44)), (4, (91, 115)), (4, (91, 105)), (4, (91, 102)), (4, (85, 115)), (4, (73, 116)), (4, (65, 117)), (4, (65, 103)), (4, (47, 109)), (4, (47, 97)), (4, (46, 32)), (4, (41, 10)), (4, (40, 10)), (4, (34, 115)), (4, (34, 44)), (4, (32, 100)), (4, (32, 98)), (4, (32, 42)), (4, (32, 41)), (4, (10, 96)), (3, (121, 116)), (3, (121, 111)), (3, (121, 95)), (3, (121, 61)), (3, (121, 34)), (3, (120, 95)), (3, (118, 105)), (3, (117, 100)), (3, (117, 98)), (3, (116, 119)), (3, (116, 47)), (3, (116, 46)), (3, (116, 45)), (3, (116, 34)), (3, (115, 61)), (3, (115, 47)), (3, (114, 117)), (3, (114, 105)), (3, (114, 34)), (3, (112, 115)), (3, (112, 112)), (3, (111, 107)), (3, (111, 71)), (3, (110, 10)), (3, (109, 117)), (3, (109, 93)), (3, (108, 95)), (3, (107, 115)), (3, (105, 122)), (3, (105, 114)), (3, (105, 112)), (3, (105, 45)), (3, (102, 116)), (3, (101, 93)), (3, (101, 91)), (3, (99, 121)), (3, (99, 117)), (3, (99, 105)), (3, (98, 46)), (3, (97, 120)), (3, (97, 107)), (3, (97, 99)), (3, (95, 113)), (3, (95, 104)), (3, (93, 93)), (3, (83, 116)), (3, (76, 105)), (3, (73, 32)), (3, (71, 101)), (3, (70, 97)), (3, (65, 73)), (3, (61, 101)), (3, (58, 47)), (3, (58, 10)), (3, (47, 47)), (3, (46, 105)), (3, (45, 97)), (3, (45, 32)), (3, (42, 58)), (3, (41, 44)), (3, (41, 32)), (3, (34, 117)), (3, (34, 109)), (3, (34, 102)), (3, (32, 91)), (3, (32, 67)), (3, (32, 39)), (3, (32, 35)), (3, (10, 45)), (2, (125, 44)), (2, (125, 41)), (2, (123, 34)), (2, (122, 97)), (2, (121, 65)), (2, (121, 46)), (2, (120, 97)), (2, (119, 115)), (2, (117, 32)), (2, (116, 91)), (2, (116, 65)), (2, (116, 58)), (2, (115, 108)), (2, (114, 107)), (2, (114, 103)), (2, (114, 93)), (2, (114, 80)), (2, (112, 117)), (2, (112, 105)), (2, (112, 58)), (2, (112, 44)), (2, (112, 34)), (2, (111, 118)), (2, (111, 47)), (2, (110, 112)), (2, (110, 107)), (2, (110, 93)), (2, (110, 91)), (2, (110, 84)), (2, (110, 46)), (2, (110, 44)), (2, (110, 42)), (2, (109, 109)), (2, (109, 91)), (2, (108, 118)), (2, (108, 102)), (2, (108, 93)), (2, (108, 58)), (2, (108, 44)), (2, (108, 34)), (2, (108, 32)), (2, (107, 93)), (2, (107, 32)), (2, (106, 115)), (2, (105, 102)), (2, (103, 61)), (2, (101, 121)), (2, (101, 102)), (2, (101, 80)), (2, (101, 61)), (2, (101, 34)), (2, (101, 10)), (2, (100, 112)), (2, (100, 98)), (2, (100, 46)), (2, (99, 115)), (2, (99, 32)), (2, (98, 97)), (2, (97, 105)), (2, (96, 112)), (2, (96, 10)), (2, (95, 106)), (2, (95, 100)), (2, (95, 76)), (2, (95, 67)), (2, (93, 40)), (2, (85, 110)), (2, (84, 114)), (2, (84, 111)), (2, (83, 84)), (2, (80, 114)), (2, (80, 73)), (2, (79, 78)), (2, (79, 65)), (2, (78, 70)), (2, (77, 115)), (2, (77, 32)), (2, (76, 73)), (2, (73, 95)), (2, 
(73, 83)), (2, (73, 71)), (2, (71, 95)), (2, (70, 117)), (2, (70, 111)), (2, (70, 73)), (2, (67, 97)), (2, (67, 79)), (2, (65, 115)), (2, (65, 80)), (2, (65, 32)), (2, (61, 123)), (2, (61, 109)), (2, (61, 34)), (2, (47, 116)), (2, (47, 100)), (2, (47, 99)), (2, (46, 103)), (2, (46, 70)), (2, (40, 104)), (2, (40, 34)), (2, (39, 58)), (2, (34, 108)), (2, (34, 99)), (2, (34, 41)), (2, (32, 107)), (2, (32, 103)), (2, (32, 89)), (2, (32, 83)), (2, (32, 70)), (2, (10, 117)), (2, (10, 70)), (1, (123, 39)), (1, (123, 10)), (1, (122, 101)), (1, (121, 112)), (1, (121, 98)), (1, (121, 41)), (1, (121, 39)), (1, (120, 101)), (1, (120, 46)), (1, (120, 32)), (1, (119, 101)), (1, (119, 97)), (1, (119, 44)), (1, (118, 95)), (1, (118, 32)), (1, (117, 105)), (1, (116, 125)), (1, (116, 108)), (1, (116, 77)), (1, (116, 61)), (1, (116, 41)), (1, (116, 10)), (1, (115, 102)), (1, (115, 80)), (1, (115, 42)), (1, (115, 40)), (1, (115, 10)), (1, (114, 110)), (1, (114, 102)), (1, (114, 58)), (1, (114, 45)), (1, (112, 120)), (1, (112, 104)), (1, (111, 121)), (1, (111, 61)), (1, (110, 111)), (1, (110, 83)), (1, (110, 80)), (1, (110, 77)), (1, (110, 65)), (1, (110, 58)), (1, (110, 40)), (1, (110, 39)), (1, (110, 34)), (1, (109, 108)), (1, (109, 79)), (1, (109, 61)), (1, (109, 58)), (1, (109, 47)), (1, (109, 44)), (1, (109, 40)), (1, (109, 34)), (1, (108, 109)), (1, (108, 100)), (1, (108, 97)), (1, (108, 80)), (1, (108, 67)), (1, (108, 39)), (1, (107, 119)), (1, (107, 95)), (1, (107, 46)), (1, (107, 44)), (1, (107, 10)), (1, (105, 101)), (1, (105, 100)), (1, (105, 98)), (1, (105, 95)), (1, (103, 117)), (1, (103, 115)), (1, (103, 114)), (1, (103, 108)), (1, (103, 34)), (1, (102, 102)), (1, (102, 46)), (1, (102, 44)), (1, (101, 125)), (1, (101, 119)), (1, (101, 103)), (1, (101, 79)), (1, (101, 70)), (1, (101, 62)), (1, (101, 46)), (1, (101, 45)), (1, (101, 40)), (1, (100, 115)), (1, (100, 100)), (1, (100, 58)), (1, (100, 44)), (1, (100, 34)), (1, (100, 10)), (1, (98, 47)), (1, (97, 121)), (1, (97, 118)), (1, (95, 115)), (1, (95, 114)), (1, (95, 107)), (1, (95, 101)), (1, (93, 58)), (1, (93, 10)), (1, (91, 123)), (1, (91, 109)), (1, (91, 101)), (1, (91, 99)), (1, (91, 98)), (1, (91, 84)), (1, (91, 79)), (1, (91, 76)), (1, (91, 70)), (1, (91, 68)), (1, (89, 111)), (1, (89, 84)), (1, (86, 68)), (1, (84, 105)), (1, (84, 69)), (1, (84, 68)), (1, (84, 65)), (1, (84, 58)), (1, (84, 34)), (1, (83, 101)), (1, (83, 76)), (1, (82, 101)), (1, (82, 84)), (1, (81, 117)), (1, (81, 35)), (1, (80, 108)), (1, (80, 79)), (1, (79, 82)), (1, (78, 86)), (1, (78, 84)), (1, (77, 117)), (1, (77, 111)), (1, (77, 101)), (1, (77, 80)), (1, (77, 46)), (1, (76, 111)), (1, (76, 65)), (1, (73, 77)), (1, (72, 117)), (1, (72, 101)), (1, (70, 101)), (1, (70, 65)), (1, (69, 83)), (1, (68, 105)), (1, (68, 65)), (1, (68, 46)), (1, (67, 117)), (1, (67, 114)), (1, (66, 121)), (1, (66, 111)), (1, (65, 81)), (1, (65, 78)), (1, (62, 39)), (1, (62, 32)), (1, (61, 116)), (1, (61, 115)), (1, (61, 83)), (1, (61, 67)), (1, (60, 121)), (1, (52, 39)), (1, (47, 103)), (1, (47, 98)), (1, (47, 85)), (1, (47, 70)), (1, (46, 112)), (1, (46, 99)), (1, (46, 95)), (1, (46, 84)), (1, (46, 82)), (1, (46, 67)), (1, (46, 34)), (1, (45, 121)), (1, (45, 103)), (1, (45, 101)), (1, (45, 67)), (1, (45, 62)), (1, (45, 52)), (1, (44, 93)), (1, (42, 77)), (1, (42, 72)), (1, (42, 67)), (1, (42, 44)), (1, (40, 101)), (1, (40, 97)), (1, (39, 125)), (1, (39, 116)), (1, (39, 109)), (1, (39, 103)), (1, (39, 97)), (1, (39, 60)), (1, (39, 44)), (1, (35, 115)), (1, (34, 119)), (1, (34, 114)), (1, 
(34, 112)), (1, (34, 110)), (1, (34, 97)), (1, (34, 80)), (1, (34, 79)), (1, (34, 47)), (1, (32, 125)), (1, (32, 123)), (1, (32, 121)), (1, (32, 81)), (1, (32, 72)), (1, (32, 66)), (1, (32, 45)), (1, (10, 109)), (1, (10, 102)), (1, (10, 99)), (1, (10, 97)), (1, (10, 66))]\n"
360
+ ]
361
+ }
362
+ ],
363
+ "source": [
364
+ "def get_stats(ids):\n",
365
+ " \"\"\"\n",
366
+ " Get statistics of the token ids. includes the most common token pairs.\n",
367
+ " \"\"\"\n",
368
+ " counts = {}\n",
369
+ " for pair in zip(ids, ids[1:]):\n",
370
+ " counts[pair] = counts.get(pair, 0) + 1\n",
371
+ " return counts\n",
372
+ "\n",
373
+ "stats = get_stats(tokens)\n",
374
+ "# print(stats)\n",
375
+ "print(sorted(((v,k) for k,v in stats.items()), reverse=True))"
376
+ ]
377
+ },
378
+ {
379
+ "cell_type": "code",
380
+ "execution_count": 12,
381
+ "metadata": {},
382
+ "outputs": [
383
+ {
384
+ "data": {
385
+ "text/plain": [
386
+ "(' ', ' ')"
387
+ ]
388
+ },
389
+ "execution_count": 12,
390
+ "metadata": {},
391
+ "output_type": "execute_result"
392
+ }
393
+ ],
394
+ "source": [
395
+ "chr(32), chr(32) # the space character is the most common character in the text"
396
+ ]
397
+ },
398
+ {
399
+ "cell_type": "code",
400
+ "execution_count": 13,
401
+ "metadata": {},
402
+ "outputs": [
403
+ {
404
+ "name": "stdout",
405
+ "output_type": "stream",
406
+ "text": [
407
+ "[65, 117, 116, 111, 103, 101, 110, 32, 101, 110, 97, 98, 108, 101, 115, 32, 116, 104, 101, 32, 110, 101, 120, 116, 45, 103, 101, 110, 32, 76, 76, 77, 32, 97, 112, 112, 108, 105, 99, 97, 116, 105, 111, 110, 115, 32, 119, 105, 116, 104, 32, 97, 32, 103, 101, 110, 101, 114, 105, 99, 32, 91, 109, 117, 108, 116, 105, 45, 97, 103, 101, 110, 116, 32, 99, 111, 110, 118, 101, 114, 115, 97, 116, 105, 111, 110, 93, 40, 104, 116, 116, 112, 115, 58, 47, 47, 109, 105, 99, 114, 111, 115, 111, 102, 116, 46, 103, 105, 116, 104, 117, 98, 46, 105, 111, 47, 97, 117, 116, 111, 103, 101, 110, 47, 100, 111, 99, 115, 47, 85, 115, 101, 45, 67, 97, 115, 101, 115, 47, 97, 103, 101, 110, 116, 95, 99, 104, 97, 116, 41, 32, 102, 114, 97, 109, 101, 119, 111, 114, 107, 46, 32, 73, 116, 32, 111, 102, 102, 101, 114, 115, 32, 99, 117, 115, 116, 111, 109, 105, 122, 97, 98, 108, 101, 32, 97, 110, 100, 32, 99, 111, 110, 118, 101, 114, 115, 97, 98, 108, 101, 32, 97, 103, 101, 110, 116, 115, 32, 116, 104, 97, 116, 32, 105, 110, 116, 101, 103, 114, 97, 116, 101, 32, 76, 76, 77, 115, 44, 32, 116, 111, 111, 108, 115, 44, 32, 97, 110, 100, 32, 104, 117, 109, 97, 110, 115, 46, 10, 66, 121, 32, 97, 117, 116, 111, 109, 97, 116, 105, 110, 103, 32, 99, 104, 97, 116, 32, 97, 109, 111, 110, 103, 32, 109, 117, 108, 116, 105, 112, 108, 101, 32, 99, 97, 112, 97, 98, 108, 101, 32, 97, 103, 101, 110, 116, 115, 44, 32, 111, 110, 101, 32, 99, 97, 110, 32, 101, 97, 115, 105, 108, 121, 32, 109, 97, 107, 101, 32, 116, 104, 101, 109, 32, 99, 111, 108, 108, 101, 99, 116, 105, 118, 101, 108, 121, 32, 112, 101, 114, 102, 111, 114, 109, 32, 116, 97, 115, 107, 115, 32, 97, 117, 116, 111, 110, 111, 109, 111, 117, 115, 108, 121, 32, 111, 114, 32, 119, 105, 116, 104, 32, 104, 117, 109, 97, 110, 32, 102, 101, 101, 100, 98, 97, 99, 107, 44, 32, 105, 110, 99, 108, 117, 100, 105, 110, 103, 32, 116, 97, 115, 107, 115, 32, 116, 104, 97, 116, 32, 114, 101, 113, 117, 105, 114, 101, 32, 117, 115, 105, 110, 103, 32, 116, 111, 111, 108, 115, 32, 118, 105, 97, 32, 99, 111, 100, 101, 46, 10, 10, 70, 101, 97, 116, 117, 114, 101, 115, 32, 111, 102, 32, 116, 104, 105, 115, 32, 117, 115, 101, 32, 99, 97, 115, 101, 32, 105, 110, 99, 108, 117, 100, 101, 58, 10, 10, 45, 32, 42, 42, 77, 117, 108, 116, 105, 45, 97, 103, 101, 110, 116, 32, 99, 111, 110, 118, 101, 114, 115, 97, 116, 105, 111, 110, 115, 42, 42, 58, 32, 65, 117, 116, 111, 71, 101, 110, 32, 97, 103, 101, 110, 116, 115, 32, 99, 97, 110, 32, 99, 111, 109, 109, 117, 110, 105, 99, 97, 116, 101, 32, 119, 105, 116, 104, 32, 101, 97, 99, 104, 32, 111, 116, 104, 101, 114, 32, 116, 111, 32, 115, 111, 108, 118, 101, 32, 116, 97, 115, 107, 115, 46, 32, 84, 104, 105, 115, 32, 97, 108, 108, 111, 119, 115, 32, 102, 111, 114, 32, 109, 111, 114, 101, 32, 99, 111, 109, 112, 108, 101, 120, 32, 97, 110, 100, 32, 115, 111, 112, 104, 105, 115, 116, 105, 99, 97, 116, 101, 100, 32, 97, 112, 112, 108, 105, 99, 97, 116, 105, 111, 110, 115, 32, 116, 104, 97, 110, 32, 119, 111, 117, 108, 100, 32, 98, 101, 32, 112, 111, 115, 115, 105, 98, 108, 101, 32, 119, 105, 116, 104, 32, 97, 32, 115, 105, 110, 103, 108, 101, 32, 76, 76, 77, 46, 10, 45, 32, 42, 42, 67, 117, 115, 116, 111, 109, 105, 122, 97, 116, 105, 111, 110, 42, 42, 58, 32, 65, 117, 116, 111, 71, 101, 110, 32, 97, 103, 101, 110, 116, 115, 32, 99, 97, 110, 32, 98, 101, 32, 99, 117, 115, 116, 111, 109, 105, 122, 101, 100, 32, 116, 111, 32, 109, 101, 101, 116, 32, 116, 104, 101, 32, 115, 112, 101, 99, 105, 102, 105, 99, 32, 110, 101, 101, 100, 115, 32, 111, 102, 32, 97, 110, 32, 97, 112, 
112, 108, 105, 99, 97, 116, 105, 111, 110, 46, 32, 84, 104, 105, 115, 32, 105, 110, 99, 108, 117, 100, 101, 115, 32, 116, 104, 101, 32, 97, 98, 105, 108, 105, 116, 121, 32, 116, 111, 32, 99, 104, 111, 111, 115, 101, 32, 116, 104, 101, 32, 76, 76, 77, 115, 32, 116, 111, 32, 117, 115, 101, 44, 32, 116, 104, 101, 32, 116, 121, 112, 101, 115, 32, 111, 102, 32, 104, 117, 109, 97, 110, 32, 105, 110, 112, 117, 116, 32, 116, 111, 32, 97, 108, 108, 111, 119, 44, 32, 97, 110, 100, 32, 116, 104, 101, 32, 116, 111, 111, 108, 115, 32, 116, 111, 32, 101, 109, 112, 108, 111, 121, 46, 10, 45, 32, 42, 42, 72, 117, 109, 97, 110, 32, 112, 97, 114, 116, 105, 99, 105, 112, 97, 116, 105, 111, 110, 42, 42, 58, 32, 65, 117, 116, 111, 71, 101, 110, 32, 115, 101, 97, 109, 108, 101, 115, 115, 108, 121, 32, 97, 108, 108, 111, 119, 115, 32, 104, 117, 109, 97, 110, 32, 112, 97, 114, 116, 105, 99, 105, 112, 97, 116, 105, 111, 110, 46, 32, 84, 104, 105, 115, 32, 109, 101, 97, 110, 115, 32, 116, 104, 97, 116, 32, 104, 117, 109, 97, 110, 115, 32, 99, 97, 110, 32, 112, 114, 111, 118, 105, 100, 101, 32, 105, 110, 112, 117, 116, 32, 97, 110, 100, 32, 102, 101, 101, 100, 98, 97, 99, 107, 32, 116, 111, 32, 116, 104, 101, 32, 97, 103, 101, 110, 116, 115, 32, 97, 115, 32, 110, 101, 101, 100, 101, 100, 46, 10, 10, 70, 111, 114, 32, 91, 101, 120, 97, 109, 112, 108, 101, 93, 40, 104, 116, 116, 112, 115, 58, 47, 47, 103, 105, 116, 104, 117, 98, 46, 99, 111, 109, 47, 109, 105, 99, 114, 111, 115, 111, 102, 116, 47, 97, 117, 116, 111, 103, 101, 110, 47, 98, 108, 111, 98, 47, 109, 97, 105, 110, 47, 116, 101, 115, 116, 47, 116, 119, 111, 97, 103, 101, 110, 116, 46, 112, 121, 41, 44, 10, 10, 96, 96, 96, 112, 121, 116, 104, 111, 110, 10, 102, 114, 111, 109, 32, 97, 117, 116, 111, 103, 101, 110, 32, 105, 109, 112, 111, 114, 116, 32, 65, 115, 115, 105, 115, 116, 97, 110, 116, 65, 103, 101, 110, 116, 44, 32, 85, 115, 101, 114, 80, 114, 111, 120, 121, 65, 103, 101, 110, 116, 44, 32, 99, 111, 110, 102, 105, 103, 95, 108, 105, 115, 116, 95, 102, 114, 111, 109, 95, 106, 115, 111, 110, 10, 35, 32, 76, 111, 97, 100, 32, 76, 76, 77, 32, 105, 110, 102, 101, 114, 101, 110, 99, 101, 32, 101, 110, 100, 112, 111, 105, 110, 116, 115, 32, 102, 114, 111, 109, 32, 97, 110, 32, 101, 110, 118, 32, 118, 97, 114, 105, 97, 98, 108, 101, 32, 111, 114, 32, 97, 32, 102, 105, 108, 101, 10, 35, 32, 83, 101, 101, 32, 104, 116, 116, 112, 115, 58, 47, 47, 109, 105, 99, 114, 111, 115, 111, 102, 116, 46, 103, 105, 116, 104, 117, 98, 46, 105, 111, 47, 97, 117, 116, 111, 103, 101, 110, 47, 100, 111, 99, 115, 47, 70, 65, 81, 35, 115, 101, 116, 45, 121, 111, 117, 114, 45, 97, 112, 105, 45, 101, 110, 100, 112, 111, 105, 110, 116, 115, 10, 35, 32, 97, 110, 100, 32, 79, 65, 73, 95, 67, 79, 78, 70, 73, 71, 95, 76, 73, 83, 84, 95, 115, 97, 109, 112, 108, 101, 10, 99, 111, 110, 102, 105, 103, 95, 108, 105, 115, 116, 32, 61, 32, 99, 111, 110, 102, 105, 103, 95, 108, 105, 115, 116, 95, 102, 114, 111, 109, 95, 106, 115, 111, 110, 40, 101, 110, 118, 95, 111, 114, 95, 102, 105, 108, 101, 61, 34, 79, 65, 73, 95, 67, 79, 78, 70, 73, 71, 95, 76, 73, 83, 84, 34, 41, 10, 35, 32, 89, 111, 117, 32, 99, 97, 110, 32, 97, 108, 115, 111, 32, 115, 101, 116, 32, 99, 111, 110, 102, 105, 103, 95, 108, 105, 115, 116, 32, 100, 105, 114, 101, 99, 116, 108, 121, 32, 97, 115, 32, 97, 32, 108, 105, 115, 116, 44, 32, 102, 111, 114, 32, 101, 120, 97, 109, 112, 108, 101, 44, 32, 99, 111, 110, 102, 105, 103, 95, 108, 105, 115, 116, 32, 61, 32, 91, 123, 39, 109, 111, 100, 101, 108, 39, 58, 32, 39, 103, 112, 116, 
45, 52, 39, 44, 32, 39, 97, 112, 105, 95, 107, 101, 121, 39, 58, 32, 39, 60, 121, 111, 117, 114, 32, 79, 112, 101, 110, 65, 73, 32, 65, 80, 73, 32, 107, 101, 121, 32, 104, 101, 114, 101, 62, 39, 125, 44, 93, 10, 97, 115, 115, 105, 115, 116, 97, 110, 116, 32, 61, 32, 65, 115, 115, 105, 115, 116, 97, 110, 116, 65, 103, 101, 110, 116, 40, 34, 97, 115, 115, 105, 115, 116, 97, 110, 116, 34, 44, 32, 108, 108, 109, 95, 99, 111, 110, 102, 105, 103, 61, 123, 34, 99, 111, 110, 102, 105, 103, 95, 108, 105, 115, 116, 34, 58, 32, 99, 111, 110, 102, 105, 103, 95, 108, 105, 115, 116, 125, 41, 10, 117, 115, 101, 114, 95, 112, 114, 111, 120, 121, 32, 61, 32, 85, 115, 101, 114, 80, 114, 111, 120, 121, 65, 103, 101, 110, 116, 40, 34, 117, 115, 101, 114, 95, 112, 114, 111, 120, 121, 34, 44, 32, 99, 111, 100, 101, 95, 101, 120, 101, 99, 117, 116, 105, 111, 110, 95, 99, 111, 110, 102, 105, 103, 61, 123, 34, 119, 111, 114, 107, 95, 100, 105, 114, 34, 58, 32, 34, 99, 111, 100, 105, 110, 103, 34, 44, 32, 34, 117, 115, 101, 95, 100, 111, 99, 107, 101, 114, 34, 58, 32, 70, 97, 108, 115, 101, 125, 41, 32, 35, 32, 73, 77, 80, 79, 82, 84, 65, 78, 84, 58, 32, 115, 101, 116, 32, 116, 111, 32, 84, 114, 117, 101, 32, 116, 111, 32, 114, 117, 110, 32, 99, 111, 100, 101, 32, 105, 110, 32, 100, 111, 99, 107, 101, 114, 44, 32, 114, 101, 99, 111, 109, 109, 101, 110, 100, 101, 100, 10, 117, 115, 101, 114, 95, 112, 114, 111, 120, 121, 46, 105, 110, 105, 116, 105, 97, 116, 101, 95, 99, 104, 97, 116, 40, 97, 115, 115, 105, 115, 116, 97, 110, 116, 44, 32, 109, 101, 115, 115, 97, 103, 101, 61, 34, 80, 108, 111, 116, 32, 97, 32, 99, 104, 97, 114, 116, 32, 111, 102, 32, 78, 86, 68, 65, 32, 97, 110, 100, 32, 84, 69, 83, 76, 65, 32, 115, 116, 111, 99, 107, 32, 112, 114, 105, 99, 101, 32, 99, 104, 97, 110, 103, 101, 32, 89, 84, 68, 46, 34, 41, 10, 35, 32, 84, 104, 105, 115, 32, 105, 110, 105, 116, 105, 97, 116, 101, 115, 32, 97, 110, 32, 97, 117, 116, 111, 109, 97, 116, 101, 100, 32, 99, 104, 97, 116, 32, 98, 101, 116, 119, 101, 101, 110, 32, 116, 104, 101, 32, 116, 119, 111, 32, 97, 103, 101, 110, 116, 115, 32, 116, 111, 32, 115, 111, 108, 118, 101, 32, 116, 104, 101, 32, 116, 97, 115, 107, 10, 96, 96, 96, 10, 10, 109, 111, 114, 101, 32, 112, 121, 116, 104, 111, 110, 32, 99, 111, 100, 101, 58, 10, 10, 96, 96, 96, 112, 121, 116, 104, 111, 110, 10, 1000, 1000, 100, 101, 102, 32, 99, 114, 101, 97, 116, 101, 40, 10, 1000, 1000, 1000, 1000, 115, 101, 108, 102, 44, 10, 1000, 1000, 1000, 1000, 42, 44, 10, 1000, 1000, 1000, 1000, 109, 101, 115, 115, 97, 103, 101, 115, 58, 32, 73, 116, 101, 114, 97, 98, 108, 101, 91, 67, 104, 97, 116, 67, 111, 109, 112, 108, 101, 116, 105, 111, 110, 77, 101, 115, 115, 97, 103, 101, 80, 97, 114, 97, 109, 93, 44, 10, 1000, 1000, 1000, 1000, 109, 111, 100, 101, 108, 58, 32, 85, 110, 105, 111, 110, 91, 115, 116, 114, 44, 32, 67, 104, 97, 116, 77, 111, 100, 101, 108, 93, 44, 10, 1000, 1000, 1000, 1000, 102, 114, 101, 113, 117, 101, 110, 99, 121, 95, 112, 101, 110, 97, 108, 116, 121, 58, 32, 79, 112, 116, 105, 111, 110, 97, 108, 91, 102, 108, 111, 97, 116, 93, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 32, 61, 32, 78, 79, 84, 95, 71, 73, 86, 69, 78, 44, 10, 1000, 1000, 1000, 1000, 102, 117, 110, 99, 116, 105, 111, 110, 95, 99, 97, 108, 108, 58, 32, 99, 111, 109, 112, 108, 101, 116, 105, 111, 110, 95, 99, 114, 101, 97, 116, 101, 95, 112, 97, 114, 97, 109, 115, 46, 70, 117, 110, 99, 116, 105, 111, 110, 67, 97, 108, 108, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 32, 61, 32, 78, 79, 84, 95, 71, 73, 86, 69, 
78, 44, 10, 1000, 1000, 1000, 1000, 102, 117, 110, 99, 116, 105, 111, 110, 115, 58, 32, 73, 116, 101, 114, 97, 98, 108, 101, 91, 99, 111, 109, 112, 108, 101, 116, 105, 111, 110, 95, 99, 114, 101, 97, 116, 101, 95, 112, 97, 114, 97, 109, 115, 46, 70, 117, 110, 99, 116, 105, 111, 110, 93, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 32, 61, 32, 78, 79, 84, 95, 71, 73, 86, 69, 78, 44, 10, 1000, 1000, 1000, 1000, 108, 111, 103, 105, 116, 95, 98, 105, 97, 115, 58, 32, 79, 112, 116, 105, 111, 110, 97, 108, 91, 68, 105, 99, 116, 91, 115, 116, 114, 44, 32, 105, 110, 116, 93, 93, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 32, 61, 32, 78, 79, 84, 95, 71, 73, 86, 69, 78, 44, 10, 1000, 1000, 1000, 1000, 108, 111, 103, 112, 114, 111, 98, 115, 58, 32, 79, 112, 116, 105, 111, 110, 97, 108, 91, 98, 111, 111, 108, 93, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 32, 61, 32, 78, 79, 84, 95, 71, 73, 86, 69, 78, 44, 10, 1000, 1000, 1000, 1000, 109, 97, 120, 95, 116, 111, 107, 101, 110, 115, 58, 32, 79, 112, 116, 105, 111, 110, 97, 108, 91, 105, 110, 116, 93, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 32, 61, 32, 78, 79, 84, 95, 71, 73, 86, 69, 78, 44, 10, 1000, 1000, 1000, 1000, 110, 58, 32, 79, 112, 116, 105, 111, 110, 97, 108, 91, 105, 110, 116, 93, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 32, 61, 32, 78, 79, 84, 95, 71, 73, 86, 69, 78, 44, 10, 1000, 1000, 1000, 1000, 112, 114, 101, 115, 101, 110, 99, 101, 95, 112, 101, 110, 97, 108, 116, 121, 58, 32, 79, 112, 116, 105, 111, 110, 97, 108, 91, 102, 108, 111, 97, 116, 93, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 32, 61, 32, 78, 79, 84, 95, 71, 73, 86, 69, 78, 44, 10, 1000, 1000, 1000, 1000, 114, 101, 115, 112, 111, 110, 115, 101, 95, 102, 111, 114, 109, 97, 116, 58, 32, 99, 111, 109, 112, 108, 101, 116, 105, 111, 110, 95, 99, 114, 101, 97, 116, 101, 95, 112, 97, 114, 97, 109, 115, 46, 82, 101, 115, 112, 111, 110, 115, 101, 70, 111, 114, 109, 97, 116, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 32, 61, 32, 78, 79, 84, 95, 71, 73, 86, 69, 78, 44, 10, 1000, 1000, 1000, 1000, 115, 101, 101, 100, 58, 32, 79, 112, 116, 105, 111, 110, 97, 108, 91, 105, 110, 116, 93, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 32, 61, 32, 78, 79, 84, 95, 71, 73, 86, 69, 78, 44, 10, 1000, 1000, 1000, 1000, 115, 116, 111, 112, 58, 32, 85, 110, 105, 111, 110, 91, 79, 112, 116, 105, 111, 110, 97, 108, 91, 115, 116, 114, 93, 44, 32, 76, 105, 115, 116, 91, 115, 116, 114, 93, 93, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 32, 61, 32, 78, 79, 84, 95, 71, 73, 86, 69, 78, 44, 10, 1000, 1000, 1000, 1000, 115, 116, 114, 101, 97, 109, 58, 32, 79, 112, 116, 105, 111, 110, 97, 108, 91, 76, 105, 116, 101, 114, 97, 108, 91, 70, 97, 108, 115, 101, 93, 93, 32, 124, 32, 76, 105, 116, 101, 114, 97, 108, 91, 84, 114, 117, 101, 93, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 32, 61, 32, 78, 79, 84, 95, 71, 73, 86, 69, 78, 44, 10, 1000, 1000, 1000, 1000, 115, 116, 114, 101, 97, 109, 95, 111, 112, 116, 105, 111, 110, 115, 58, 32, 79, 112, 116, 105, 111, 110, 97, 108, 91, 67, 104, 97, 116, 67, 111, 109, 112, 108, 101, 116, 105, 111, 110, 83, 116, 114, 101, 97, 109, 79, 112, 116, 105, 111, 110, 115, 80, 97, 114, 97, 109, 93, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 32, 61, 32, 78, 79, 84, 95, 71, 73, 86, 69, 78, 44, 10, 1000, 1000, 1000, 1000, 116, 101, 109, 112, 101, 114, 97, 116, 117, 114, 101, 58, 32, 79, 112, 116, 105, 111, 110, 97, 108, 91, 102, 108, 111, 97, 116, 93, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 
32, 61, 32, 78, 79, 84, 95, 71, 73, 86, 69, 78, 44, 10, 1000, 1000, 1000, 1000, 116, 111, 111, 108, 95, 99, 104, 111, 105, 99, 101, 58, 32, 67, 104, 97, 116, 67, 111, 109, 112, 108, 101, 116, 105, 111, 110, 84, 111, 111, 108, 67, 104, 111, 105, 99, 101, 79, 112, 116, 105, 111, 110, 80, 97, 114, 97, 109, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 32, 61, 32, 78, 79, 84, 95, 71, 73, 86, 69, 78, 44, 10, 1000, 1000, 1000, 1000, 116, 111, 111, 108, 115, 58, 32, 73, 116, 101, 114, 97, 98, 108, 101, 91, 67, 104, 97, 116, 67, 111, 109, 112, 108, 101, 116, 105, 111, 110, 84, 111, 111, 108, 80, 97, 114, 97, 109, 93, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 32, 61, 32, 78, 79, 84, 95, 71, 73, 86, 69, 78, 44, 10, 1000, 1000, 1000, 1000, 116, 111, 112, 95, 108, 111, 103, 112, 114, 111, 98, 115, 58, 32, 79, 112, 116, 105, 111, 110, 97, 108, 91, 105, 110, 116, 93, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 32, 61, 32, 78, 79, 84, 95, 71, 73, 86, 69, 78, 44, 10, 1000, 1000, 1000, 1000, 116, 111, 112, 95, 112, 58, 32, 79, 112, 116, 105, 111, 110, 97, 108, 91, 102, 108, 111, 97, 116, 93, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 32, 61, 32, 78, 79, 84, 95, 71, 73, 86, 69, 78, 44, 10, 1000, 1000, 1000, 1000, 117, 115, 101, 114, 58, 32, 115, 116, 114, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 32, 61, 32, 78, 79, 84, 95, 71, 73, 86, 69, 78, 44, 10, 1000, 1000, 1000, 1000, 35, 32, 85, 115, 101, 32, 116, 104, 101, 32, 102, 111, 108, 108, 111, 119, 105, 110, 103, 32, 97, 114, 103, 117, 109, 101, 110, 116, 115, 32, 105, 102, 32, 121, 111, 117, 32, 110, 101, 101, 100, 32, 116, 111, 32, 112, 97, 115, 115, 32, 97, 100, 100, 105, 116, 105, 111, 110, 97, 108, 32, 112, 97, 114, 97, 109, 101, 116, 101, 114, 115, 32, 116, 111, 32, 116, 104, 101, 32, 65, 80, 73, 32, 116, 104, 97, 116, 32, 97, 114, 101, 110, 39, 116, 32, 97, 118, 97, 105, 108, 97, 98, 108, 101, 32, 118, 105, 97, 32, 107, 119, 97, 114, 103, 115, 46, 10, 1000, 1000, 1000, 1000, 35, 32, 84, 104, 101, 32, 101, 120, 116, 114, 97, 32, 118, 97, 108, 117, 101, 115, 32, 103, 105, 118, 101, 110, 32, 104, 101, 114, 101, 32, 116, 97, 107, 101, 32, 112, 114, 101, 99, 101, 100, 101, 110, 99, 101, 32, 111, 118, 101, 114, 32, 118, 97, 108, 117, 101, 115, 32, 100, 101, 102, 105, 110, 101, 100, 32, 111, 110, 32, 116, 104, 101, 32, 99, 108, 105, 101, 110, 116, 32, 111, 114, 32, 112, 97, 115, 115, 101, 100, 32, 116, 111, 32, 116, 104, 105, 115, 32, 109, 101, 116, 104, 111, 100, 46, 10, 1000, 1000, 1000, 1000, 101, 120, 116, 114, 97, 95, 104, 101, 97, 100, 101, 114, 115, 58, 32, 72, 101, 97, 100, 101, 114, 115, 32, 124, 32, 78, 111, 110, 101, 32, 61, 32, 78, 111, 110, 101, 44, 10, 1000, 1000, 1000, 1000, 101, 120, 116, 114, 97, 95, 113, 117, 101, 114, 121, 58, 32, 81, 117, 101, 114, 121, 32, 124, 32, 78, 111, 110, 101, 32, 61, 32, 78, 111, 110, 101, 44, 10, 1000, 1000, 1000, 1000, 101, 120, 116, 114, 97, 95, 98, 111, 100, 121, 58, 32, 66, 111, 100, 121, 32, 124, 32, 78, 111, 110, 101, 32, 61, 32, 78, 111, 110, 101, 44, 10, 1000, 1000, 1000, 1000, 116, 105, 109, 101, 111, 117, 116, 58, 32, 102, 108, 111, 97, 116, 32, 124, 32, 104, 116, 116, 112, 120, 46, 84, 105, 109, 101, 111, 117, 116, 32, 124, 32, 78, 111, 110, 101, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 32, 61, 32, 78, 79, 84, 95, 71, 73, 86, 69, 78, 44, 10, 1000, 1000, 41, 32, 45, 62, 32, 67, 104, 97, 116, 67, 111, 109, 112, 108, 101, 116, 105, 111, 110, 32, 124, 32, 83, 116, 114, 101, 97, 109, 91, 67, 104, 97, 116, 67, 111, 109, 112, 108, 101, 116, 105, 111, 110, 67, 
104, 117, 110, 107, 93, 58, 10, 1000, 1000, 1000, 1000, 114, 101, 116, 117, 114, 110, 32, 115, 101, 108, 102, 46, 95, 112, 111, 115, 116, 40, 10, 1000, 1000, 1000, 1000, 1000, 1000, 34, 47, 99, 104, 97, 116, 47, 99, 111, 109, 112, 108, 101, 116, 105, 111, 110, 115, 34, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 98, 111, 100, 121, 61, 109, 97, 121, 98, 101, 95, 116, 114, 97, 110, 115, 102, 111, 114, 109, 40, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 123, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 34, 109, 101, 115, 115, 97, 103, 101, 115, 34, 58, 32, 109, 101, 115, 115, 97, 103, 101, 115, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 34, 109, 111, 100, 101, 108, 34, 58, 32, 109, 111, 100, 101, 108, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 34, 102, 114, 101, 113, 117, 101, 110, 99, 121, 95, 112, 101, 110, 97, 108, 116, 121, 34, 58, 32, 102, 114, 101, 113, 117, 101, 110, 99, 121, 95, 112, 101, 110, 97, 108, 116, 121, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 34, 102, 117, 110, 99, 116, 105, 111, 110, 95, 99, 97, 108, 108, 34, 58, 32, 102, 117, 110, 99, 116, 105, 111, 110, 95, 99, 97, 108, 108, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 34, 102, 117, 110, 99, 116, 105, 111, 110, 115, 34, 58, 32, 102, 117, 110, 99, 116, 105, 111, 110, 115, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 34, 108, 111, 103, 105, 116, 95, 98, 105, 97, 115, 34, 58, 32, 108, 111, 103, 105, 116, 95, 98, 105, 97, 115, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 34, 108, 111, 103, 112, 114, 111, 98, 115, 34, 58, 32, 108, 111, 103, 112, 114, 111, 98, 115, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 34, 109, 97, 120, 95, 116, 111, 107, 101, 110, 115, 34, 58, 32, 109, 97, 120, 95, 116, 111, 107, 101, 110, 115, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 34, 110, 34, 58, 32, 110, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 34, 112, 114, 101, 115, 101, 110, 99, 101, 95, 112, 101, 110, 97, 108, 116, 121, 34, 58, 32, 112, 114, 101, 115, 101, 110, 99, 101, 95, 112, 101, 110, 97, 108, 116, 121, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 34, 114, 101, 115, 112, 111, 110, 115, 101, 95, 102, 111, 114, 109, 97, 116, 34, 58, 32, 114, 101, 115, 112, 111, 110, 115, 101, 95, 102, 111, 114, 109, 97, 116, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 34, 115, 101, 101, 100, 34, 58, 32, 115, 101, 101, 100, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 34, 115, 116, 111, 112, 34, 58, 32, 115, 116, 111, 112, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 34, 115, 116, 114, 101, 97, 109, 34, 58, 32, 115, 116, 114, 101, 97, 109, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 34, 115, 116, 114, 101, 97, 109, 95, 111, 112, 116, 105, 111, 110, 115, 34, 58, 32, 115, 116, 114, 101, 97, 109, 95, 111, 112, 116, 105, 111, 110, 115, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 34, 116, 101, 109, 112, 101, 114, 97, 116, 117, 114, 101, 34, 58, 32, 116, 101, 109, 112, 101, 114, 97, 116, 117, 114, 101, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 34, 116, 111, 111, 108, 95, 99, 104, 111, 105, 99, 101, 34, 58, 32, 116, 111, 111, 108, 95, 99, 104, 111, 105, 99, 101, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 34, 116, 111, 111, 108, 
115, 34, 58, 32, 116, 111, 111, 108, 115, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 34, 116, 111, 112, 95, 108, 111, 103, 112, 114, 111, 98, 115, 34, 58, 32, 116, 111, 112, 95, 108, 111, 103, 112, 114, 111, 98, 115, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 34, 116, 111, 112, 95, 112, 34, 58, 32, 116, 111, 112, 95, 112, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 34, 117, 115, 101, 114, 34, 58, 32, 117, 115, 101, 114, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 125, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 99, 111, 109, 112, 108, 101, 116, 105, 111, 110, 95, 99, 114, 101, 97, 116, 101, 95, 112, 97, 114, 97, 109, 115, 46, 67, 111, 109, 112, 108, 101, 116, 105, 111, 110, 67, 114, 101, 97, 116, 101, 80, 97, 114, 97, 109, 115, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 41, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 111, 112, 116, 105, 111, 110, 115, 61, 109, 97, 107, 101, 95, 114, 101, 113, 117, 101, 115, 116, 95, 111, 112, 116, 105, 111, 110, 115, 40, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 101, 120, 116, 114, 97, 95, 104, 101, 97, 100, 101, 114, 115, 61, 101, 120, 116, 114, 97, 95, 104, 101, 97, 100, 101, 114, 115, 44, 32, 101, 120, 116, 114, 97, 95, 113, 117, 101, 114, 121, 61, 101, 120, 116, 114, 97, 95, 113, 117, 101, 114, 121, 44, 32, 101, 120, 116, 114, 97, 95, 98, 111, 100, 121, 61, 101, 120, 116, 114, 97, 95, 98, 111, 100, 121, 44, 32, 116, 105, 109, 101, 111, 117, 116, 61, 116, 105, 109, 101, 111, 117, 116, 10, 1000, 1000, 1000, 1000, 1000, 1000, 41, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 99, 97, 115, 116, 95, 116, 111, 61, 67, 104, 97, 116, 67, 111, 109, 112, 108, 101, 116, 105, 111, 110, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 115, 116, 114, 101, 97, 109, 61, 115, 116, 114, 101, 97, 109, 32, 111, 114, 32, 70, 97, 108, 115, 101, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 115, 116, 114, 101, 97, 109, 95, 99, 108, 115, 61, 83, 116, 114, 101, 97, 109, 91, 67, 104, 97, 116, 67, 111, 109, 112, 108, 101, 116, 105, 111, 110, 67, 104, 117, 110, 107, 93, 44, 10, 1000, 1000, 1000, 1000, 41, 10, 96, 96, 96, 10]\n",
408
+ "length: 4979\n"
409
+ ]
410
+ }
411
+ ],
412
+ "source": [
413
+ "def merge(ids, pair, idx):\n",
414
+ " \"\"\"\n",
415
+ " BPE algorithm\n",
416
+ " ids: list of integers(tokens)\n",
417
+ " pair: tuple of consecutive integers\n",
418
+ " idx: new vocab token to replace the pair\n",
419
+ " \"\"\"\n",
420
+ " new_ids = []\n",
421
+ " i = 0\n",
422
+ " while i < len(ids):\n",
423
+ " if i < len(ids) - 1 and ids[i] == pair[0] and ids[i+1] == pair[1]:\n",
424
+ " new_ids.append(idx)\n",
425
+ " i += 2\n",
426
+ " else:\n",
427
+ " new_ids.append(ids[i])\n",
428
+ " i += 1\n",
429
+ " return new_ids\n",
430
+ "\n",
431
+ "# merge the most common pair\n",
432
+ "tokens2 = merge(tokens, (32, 32), 1000)\n",
433
+ "print(tokens2)\n",
434
+ "print('length: ',len(tokens2))"
435
+ ]
436
+ },
437
+ {
438
+ "cell_type": "code",
439
+ "execution_count": 14,
440
+ "metadata": {},
441
+ "outputs": [
442
+ {
443
+ "name": "stdout",
444
+ "output_type": "stream",
445
+ "text": [
446
+ "merge (32, 32) to 256\n",
447
+ "merge (256, 256) to 257\n",
448
+ "merge (257, 257) to 258\n",
449
+ "merge (111, 110) to 259\n",
450
+ "merge (101, 110) to 260\n",
451
+ "merge (116, 105) to 261\n",
452
+ "merge (10, 258) to 262\n",
453
+ "merge (58, 32) to 263\n",
454
+ "merge (44, 262) to 264\n",
455
+ "merge (261, 259) to 265\n",
456
+ "merge (101, 32) to 266\n",
457
+ "merge (116, 111) to 267\n",
458
+ "merge (32, 78) to 268\n",
459
+ "merge (97, 116) to 269\n",
460
+ "merge (115, 32) to 270\n",
461
+ "merge (101, 114) to 271\n",
462
+ "merge (114, 101) to 272\n",
463
+ "merge (97, 108) to 273\n",
464
+ "merge (116, 104) to 274\n",
465
+ "merge (115, 116) to 275\n",
466
+ "merge (97, 110) to 276\n",
467
+ "merge (260, 32) to 277\n",
468
+ "merge (97, 109) to 278\n",
469
+ "merge (108, 101) to 279\n",
470
+ "merge (32, 124) to 280\n",
471
+ "merge (105, 110) to 281\n",
472
+ "merge (34, 263) to 282\n",
473
+ "merge (111, 109) to 283\n",
474
+ "merge (61, 268) to 284\n",
475
+ "merge (44, 32) to 285\n",
476
+ "merge (280, 268) to 286\n",
477
+ "merge (257, 34) to 287\n",
478
+ "merge (264, 258) to 288\n",
479
+ "merge (115, 101) to 289\n",
480
+ "merge (108, 111) to 290\n",
481
+ "merge (84, 95) to 291\n",
482
+ "merge (105, 118) to 292\n",
483
+ "merge (292, 277) to 293\n",
484
+ "merge (112, 265) to 294\n",
485
+ "merge (111, 116) to 295\n"
486
+ ]
487
+ }
488
+ ],
489
+ "source": [
490
+ "# complete cycle\n",
491
+ "def get_stats(ids):\n",
492
+ " counts = {}\n",
493
+ " for pair in zip(ids, ids[1:]):\n",
494
+ " counts[pair] = counts.get(pair, 0) +1 \n",
495
+ " return counts\n",
496
+ "\n",
497
+ "def merge(ids, pair, idx):\n",
498
+ " newids = []\n",
499
+ " i = 0\n",
500
+ " while i < len(ids):\n",
501
+ " if i < len(ids) - 1 and ids[i] == pair[0] and ids [i+1] == pair[1]:\n",
502
+ " newids.append(idx)\n",
503
+ " i += 2\n",
504
+ " else:\n",
505
+ " newids.append(ids[i])\n",
506
+ " i += 1\n",
507
+ " return newids\n",
508
+ "\n",
509
+ "# merge all the common pairs and create a new vocab\n",
510
+ "vocab_size = 296\n",
511
+ "num_merges = vocab_size - 256 # the utf-8 vocab size is 256\n",
512
+ "ids = list(tokens)\n",
513
+ "\n",
514
+ "\n",
515
+ "merges = {}\n",
516
+ "for i in range(num_merges):\n",
517
+ " stats = get_stats(ids)\n",
518
+ " pair = max(stats, key = stats.get) # get the most common pair\n",
519
+ " idx = 256 + i # new vocab token\n",
520
+ " print(f'merge {pair} to {idx}')\n",
521
+ " ids = merge(ids, pair, idx)\n",
522
+ " merges[pair] = idx\n"
523
+ ]
524
+ },
525
+ {
526
+ "cell_type": "code",
527
+ "execution_count": 15,
528
+ "metadata": {},
529
+ "outputs": [
530
+ {
531
+ "name": "stdout",
532
+ "output_type": "stream",
533
+ "text": [
534
+ "tokens length: 5397\n",
535
+ "new tokens length: 3365\n",
536
+ "compression rate: 1.60X\n"
537
+ ]
538
+ }
539
+ ],
540
+ "source": [
541
+ "print(\"tokens length: \", len(tokens))\n",
542
+ "print(\"new tokens length: \", len(ids))\n",
543
+ "print(f\"compression rate: {len(tokens) / len(ids):.2f}X\")"
544
+ ]
545
+ },
546
+ {
547
+ "cell_type": "markdown",
548
+ "metadata": {},
549
+ "source": [
550
+ "#### decoding\n",
551
+ "\n",
552
+ "Given the sequence of integers [0, vocab_size], converting it into a string."
553
+ ]
554
+ },
555
+ {
556
+ "cell_type": "code",
557
+ "execution_count": 16,
558
+ "metadata": {},
559
+ "outputs": [
560
+ {
561
+ "name": "stdout",
562
+ "output_type": "stream",
563
+ "text": [
564
+ "---\n",
565
+ "Autogen enables the next-gen LLM applications with a generic [multi-agent conversation](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat) framework. It offers customizable and conversable agents that integrate LLMs, tools, and humans.\n",
566
+ "By automating chat among multiple capable agents, one can easily make them collectively perform tasks autonomously or with human feedback, including tasks that require using tools via code.\n",
567
+ "\n",
568
+ "Features of this use case include:\n",
569
+ "\n",
570
+ "- **Multi-agent conversations**: AutoGen agents can communicate with each other to solve tasks. This allows for more complex and sophisticated applications than would be possible with a single LLM.\n",
571
+ "- **Customization**: AutoGen agents can be customized to meet the specific needs of an application. This includes the ability to choose the LLMs to use, the types of human input to allow, and the tools to employ.\n",
572
+ "- **Human participation**: AutoGen seamlessly allows human participation. This means that humans can provide input and feedback to the agents as needed.\n",
573
+ "\n",
574
+ "For [example](https://github.com/microsoft/autogen/blob/main/test/twoagent.py),\n",
575
+ "\n",
576
+ "```python\n",
577
+ "from autogen import AssistantAgent, UserProxyAgent, config_list_from_json\n",
578
+ "# Load LLM inference endpoints from an env variable or a file\n",
579
+ "# See https://microsoft.github.io/autogen/docs/FAQ#set-your-api-endpoints\n",
580
+ "# and OAI_CONFIG_LIST_sample\n",
581
+ "config_list = config_list_from_json(env_or_file=\"OAI_CONFIG_LIST\")\n",
582
+ "# You can also set config_list directly as a list, for example, config_list = [{'model': 'gpt-4', 'api_key': '<your OpenAI API key here>'},]\n",
583
+ "assistant = AssistantAgent(\"assistant\", llm_config={\"config_list\": config_list})\n",
584
+ "user_proxy = UserProxyAgent(\"user_proxy\", code_execution_config={\"work_dir\": \"coding\", \"use_docker\": False}) # IMPORTANT: set to True to run code in docker, recommended\n",
585
+ "user_proxy.initiate_chat(assistant, message=\"Plot a chart of NVDA and TESLA stock price change YTD.\")\n",
586
+ "# This initiates an automated chat between the two agents to solve the task\n",
587
+ "```\n",
588
+ "\n",
589
+ "more python code:\n",
590
+ "\n",
591
+ "```python\n",
592
+ " def create(\n",
593
+ " self,\n",
594
+ " *,\n",
595
+ " messages: Iterable[ChatCompletionMessageParam],\n",
596
+ " model: Union[str, ChatModel],\n",
597
+ " frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN,\n",
598
+ " function_call: completion_create_params.FunctionCall | NotGiven = NOT_GIVEN,\n",
599
+ " functions: Iterable[completion_create_params.Function] | NotGiven = NOT_GIVEN,\n",
600
+ " logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN,\n",
601
+ " logprobs: Optional[bool] | NotGiven = NOT_GIVEN,\n",
602
+ " max_tokens: Optional[int] | NotGiven = NOT_GIVEN,\n",
603
+ " n: Optional[int] | NotGiven = NOT_GIVEN,\n",
604
+ " presence_penalty: Optional[float] | NotGiven = NOT_GIVEN,\n",
605
+ " response_format: completion_create_params.ResponseFormat | NotGiven = NOT_GIVEN,\n",
606
+ " seed: Optional[int] | NotGiven = NOT_GIVEN,\n",
607
+ " stop: Union[Optional[str], List[str]] | NotGiven = NOT_GIVEN,\n",
608
+ " stream: Optional[Literal[False]] | Literal[True] | NotGiven = NOT_GIVEN,\n",
609
+ " stream_options: Optional[ChatCompletionStreamOptionsParam] | NotGiven = NOT_GIVEN,\n",
610
+ " temperature: Optional[float] | NotGiven = NOT_GIVEN,\n",
611
+ " tool_choice: ChatCompletionToolChoiceOptionParam | NotGiven = NOT_GIVEN,\n",
612
+ " tools: Iterable[ChatCompletionToolParam] | NotGiven = NOT_GIVEN,\n",
613
+ " top_logprobs: Optional[int] | NotGiven = NOT_GIVEN,\n",
614
+ " top_p: Optional[float] | NotGiven = NOT_GIVEN,\n",
615
+ " user: str | NotGiven = NOT_GIVEN,\n",
616
+ " # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.\n",
617
+ " # The extra values given here take precedence over values defined on the client or passed to this method.\n",
618
+ " extra_headers: Headers | None = None,\n",
619
+ " extra_query: Query | None = None,\n",
620
+ " extra_body: Body | None = None,\n",
621
+ " timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,\n",
622
+ " ) -> ChatCompletion | Stream[ChatCompletionChunk]:\n",
623
+ " return self._post(\n",
624
+ " \"/chat/completions\",\n",
625
+ " body=maybe_transform(\n",
626
+ " {\n",
627
+ " \"messages\": messages,\n",
628
+ " \"model\": model,\n",
629
+ " \"frequency_penalty\": frequency_penalty,\n",
630
+ " \"function_call\": function_call,\n",
631
+ " \"functions\": functions,\n",
632
+ " \"logit_bias\": logit_bias,\n",
633
+ " \"logprobs\": logprobs,\n",
634
+ " \"max_tokens\": max_tokens,\n",
635
+ " \"n\": n,\n",
636
+ " \"presence_penalty\": presence_penalty,\n",
637
+ " \"response_format\": response_format,\n",
638
+ " \"seed\": seed,\n",
639
+ " \"stop\": stop,\n",
640
+ " \"stream\": stream,\n",
641
+ " \"stream_options\": stream_options,\n",
642
+ " \"temperature\": temperature,\n",
643
+ " \"tool_choice\": tool_choice,\n",
644
+ " \"tools\": tools,\n",
645
+ " \"top_logprobs\": top_logprobs,\n",
646
+ " \"top_p\": top_p,\n",
647
+ " \"user\": user,\n",
648
+ " },\n",
649
+ " completion_create_params.CompletionCreateParams,\n",
650
+ " ),\n",
651
+ " options=make_request_options(\n",
652
+ " extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout\n",
653
+ " ),\n",
654
+ " cast_to=ChatCompletion,\n",
655
+ " stream=stream or False,\n",
656
+ " stream_cls=Stream[ChatCompletionChunk],\n",
657
+ " )\n",
658
+ "```\n",
659
+ "\n",
660
+ "length: 5397\n"
661
+ ]
662
+ }
663
+ ],
664
+ "source": [
665
+ "vocab = {idx: bytes([idx]) for idx in range(256)} # utf-8 vocab\n",
666
+ "for (p0, p1), idx in merges.items():\n",
667
+ " vocab[idx] = vocab[p0] + vocab[p1] # adding the extra vocab tokens (256 - 296)\n",
668
+ "\n",
669
+ "def decode(ids):\n",
670
+ " bytetokens = b\"\".join(vocab[i] for i in ids)\n",
671
+ " text = bytetokens.decode(\"utf-8\", errors=\"replace\") # if there are any errors, replace them with a question mark\n",
672
+ " return text\n",
673
+ "\n",
674
+ "print('---')\n",
675
+ "print(decode(ids))\n",
676
+ "print('length: ', len(decode(ids)))"
677
+ ]
678
+ },
679
+ {
680
+ "cell_type": "markdown",
681
+ "metadata": {},
682
+ "source": [
683
+ "#### encoding\n",
684
+ "convert the string into the tokens"
685
+ ]
686
+ },
687
+ {
688
+ "cell_type": "code",
689
+ "execution_count": 17,
690
+ "metadata": {},
691
+ "outputs": [
692
+ {
693
+ "name": "stdout",
694
+ "output_type": "stream",
695
+ "text": [
696
+ "[104, 107]\n"
697
+ ]
698
+ }
699
+ ],
700
+ "source": [
701
+ "def encode(texts):\n",
702
+ " tokens = list(texts.encode('utf-8'))\n",
703
+ " while len(tokens) >=2:\n",
704
+ " stats = get_stats(tokens)\n",
705
+ " pair = min(stats, key=lambda p: merges.get(p, float('inf'))) # selects the pair with minimum prioroty\n",
706
+ " if pair not in merges:\n",
707
+ " break\n",
708
+ " idx = merges[pair]\n",
709
+ " tokens = merge(tokens, pair, idx)\n",
710
+ " return tokens\n",
711
+ "\n",
712
+ "print(encode(\"hk\"))"
713
+ ]
714
+ },
715
+ {
716
+ "cell_type": "markdown",
717
+ "metadata": {},
718
+ "source": [
719
+ "*the line ensures the algorithm respects the merge priorities defined\n",
720
+ "```\n",
721
+ "pair = min(stats, key=lambda p: merges.get(p, float('inf')))\n",
722
+ "```"
723
+ ]
724
+ },
725
+ {
726
+ "cell_type": "code",
727
+ "execution_count": 18,
728
+ "metadata": {},
729
+ "outputs": [
730
+ {
731
+ "name": "stdout",
732
+ "output_type": "stream",
733
+ "text": [
734
+ " presence_penalty \n"
735
+ ]
736
+ }
737
+ ],
738
+ "source": [
739
+ "print(decode(encode(\" presence_penalty \")))"
740
+ ]
741
+ }
742
+ ],
743
+ "metadata": {
744
+ "kernelspec": {
745
+ "display_name": "Python 3",
746
+ "language": "python",
747
+ "name": "python3"
748
+ },
749
+ "language_info": {
750
+ "codemirror_mode": {
751
+ "name": "ipython",
752
+ "version": 3
753
+ },
754
+ "file_extension": ".py",
755
+ "mimetype": "text/x-python",
756
+ "name": "python",
757
+ "nbconvert_exporter": "python",
758
+ "pygments_lexer": "ipython3",
759
+ "version": "3.11.4"
760
+ }
761
+ },
762
+ "nbformat": 4,
763
+ "nbformat_minor": 2
764
+ }
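
For reference, the notebook's `encode` and `decode` cells above rely on `get_stats` and `merge` helpers defined in earlier cells outside this excerpt. Below is a minimal sketch of the conventional BPE formulation of those helpers (an assumption about their behavior, not necessarily the notebook's exact code):

```python
def get_stats(ids):
    # count occurrences of each adjacent pair of token ids
    counts = {}
    for pair in zip(ids, ids[1:]):
        counts[pair] = counts.get(pair, 0) + 1
    return counts

def merge(ids, pair, idx):
    # replace every occurrence of `pair` in `ids` with the new token id `idx`
    newids = []
    i = 0
    while i < len(ids):
        if i < len(ids) - 1 and (ids[i], ids[i + 1]) == pair:
            newids.append(idx)
            i += 2
        else:
            newids.append(ids[i])
            i += 1
    return newids
```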
tokenizer/public/tokenizer.png ADDED
tokenizer/sample/bpetokenizer/sample_bpetokenizer.model ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7204cc27b30a3ec11d1c6bb741376eabc5345f4660def0d564cff1cda29a6a28
3
+ size 720
tokenizer/sample/bpetokenizer/sample_bpetokenizer.py ADDED
@@ -0,0 +1,36 @@
1
+ import sys
2
+ sys.path.append("../")
3
+
4
+ from bpetokenizer import BPETokenizer
5
+
6
+ special_tokens = {
7
+ "<|endoftext|>": 1001,
8
+ "<|startoftext|>": 1002,
9
+ "[SPECIAL1]": 1003,
10
+ "[SPECIAL2]": 1004,
11
+ }
12
+
13
+ tokenizer = BPETokenizer(special_tokens=special_tokens)
14
+ texts = "<|startoftext|> Hello, World! This is a sample text with the special tokens [SPECIAL1] and [SPECIAL2] to test the tokenizer.<|endoftext|>"
15
+
16
+ tokenizer.train(texts, vocab_size=310, verbose=True)
17
+
18
+ encode_text = """
19
+ <|startoftext|>Hello, World! This is a sample text with the special tokens [SPECIAL1] and [SPECIAL2] to test the tokenizer.
20
+ Hello, Universe! Another example sentence containing [SPECIAL1] and [SPECIAL2], used to ensure tokenizer's robustness.
21
+ Greetings, Earth! Here we have [SPECIAL1] appearing once again, followed by [SPECIAL2] in the same sentence.
22
+ Hello, World! This is yet another sample text, with [SPECIAL1] and [SPECIAL2] making an appearance.
23
+ Hey there, World! Testing the tokenizer with [SPECIAL1] and [SPECIAL2] to see if it handles special tokens properly.
24
+ Salutations, Planet! The tokenizer should recognize [SPECIAL1] and [SPECIAL2] in this long string of text.
25
+ Hello again, World! [SPECIAL1] and [SPECIAL2] are special tokens that need to be handled correctly by the tokenizer.
26
+ Welcome, World! Including [SPECIAL1] and [SPECIAL2] multiple times in this large text to ensure proper encoding.
27
+ Hi, World! Let's add [SPECIAL1] and [SPECIAL2] in various parts of this long sentence to test the tokenizer thoroughly.
28
+ <|endoftext|>
29
+ """
30
+ ids = tokenizer.encode(encode_text, special_tokens="all")
31
+ print(ids)
32
+
33
+ decode_text = tokenizer.decode(ids)
34
+ print(decode_text)
35
+
36
+ tokenizer.save("sample_bpetokenizer")
tokenizer/sample/bpetokenizer/sample_bpetokenizer.vocab ADDED
@@ -0,0 +1,310 @@
1
+ [\u0000] 0
2
+ [\u0001] 1
3
+ [\u0002] 2
4
+ [\u0003] 3
5
+ [\u0004] 4
6
+ [\u0005] 5
7
+ [\u0006] 6
8
+ [\u0007] 7
9
+ [\u0008] 8
10
+ [\u0009] 9
11
+ [\u000a] 10
12
+ [\u000b] 11
13
+ [\u000c] 12
14
+ [\u000d] 13
15
+ [\u000e] 14
16
+ [\u000f] 15
17
+ [\u0010] 16
18
+ [\u0011] 17
19
+ [\u0012] 18
20
+ [\u0013] 19
21
+ [\u0014] 20
22
+ [\u0015] 21
23
+ [\u0016] 22
24
+ [\u0017] 23
25
+ [\u0018] 24
26
+ [\u0019] 25
27
+ [\u001a] 26
28
+ [\u001b] 27
29
+ [\u001c] 28
30
+ [\u001d] 29
31
+ [\u001e] 30
32
+ [\u001f] 31
33
+ [ ] 32
34
+ [!] 33
35
+ ["] 34
36
+ [#] 35
37
+ [$] 36
38
+ [%] 37
39
+ [&] 38
40
+ ['] 39
41
+ [(] 40
42
+ [)] 41
43
+ [*] 42
44
+ [+] 43
45
+ [,] 44
46
+ [-] 45
47
+ [.] 46
48
+ [/] 47
49
+ [0] 48
50
+ [1] 49
51
+ [2] 50
52
+ [3] 51
53
+ [4] 52
54
+ [5] 53
55
+ [6] 54
56
+ [7] 55
57
+ [8] 56
58
+ [9] 57
59
+ [:] 58
60
+ [;] 59
61
+ [<] 60
62
+ [=] 61
63
+ [>] 62
64
+ [?] 63
65
+ [@] 64
66
+ [A] 65
67
+ [B] 66
68
+ [C] 67
69
+ [D] 68
70
+ [E] 69
71
+ [F] 70
72
+ [G] 71
73
+ [H] 72
74
+ [I] 73
75
+ [J] 74
76
+ [K] 75
77
+ [L] 76
78
+ [M] 77
79
+ [N] 78
80
+ [O] 79
81
+ [P] 80
82
+ [Q] 81
83
+ [R] 82
84
+ [S] 83
85
+ [T] 84
86
+ [U] 85
87
+ [V] 86
88
+ [W] 87
89
+ [X] 88
90
+ [Y] 89
91
+ [Z] 90
92
+ [[] 91
93
+ [\] 92
94
+ []] 93
95
+ [^] 94
96
+ [_] 95
97
+ [`] 96
98
+ [a] 97
99
+ [b] 98
100
+ [c] 99
101
+ [d] 100
102
+ [e] 101
103
+ [f] 102
104
+ [g] 103
105
+ [h] 104
106
+ [i] 105
107
+ [j] 106
108
+ [k] 107
109
+ [l] 108
110
+ [m] 109
111
+ [n] 110
112
+ [o] 111
113
+ [p] 112
114
+ [q] 113
115
+ [r] 114
116
+ [s] 115
117
+ [t] 116
118
+ [u] 117
119
+ [v] 118
120
+ [w] 119
121
+ [x] 120
122
+ [y] 121
123
+ [z] 122
124
+ [{] 123
125
+ [|] 124
126
+ [}] 125
127
+ [~] 126
128
+ [\u007f] 127
129
+ [�] 128
130
+ [�] 129
131
+ [�] 130
132
+ [�] 131
133
+ [�] 132
134
+ [�] 133
135
+ [�] 134
136
+ [�] 135
137
+ [�] 136
138
+ [�] 137
139
+ [�] 138
140
+ [�] 139
141
+ [�] 140
142
+ [�] 141
143
+ [�] 142
144
+ [�] 143
145
+ [�] 144
146
+ [�] 145
147
+ [�] 146
148
+ [�] 147
149
+ [�] 148
150
+ [�] 149
151
+ [�] 150
152
+ [�] 151
153
+ [�] 152
154
+ [�] 153
155
+ [�] 154
156
+ [�] 155
157
+ [�] 156
158
+ [�] 157
159
+ [�] 158
160
+ [�] 159
161
+ [�] 160
162
+ [�] 161
163
+ [�] 162
164
+ [�] 163
165
+ [�] 164
166
+ [�] 165
167
+ [�] 166
168
+ [�] 167
169
+ [�] 168
170
+ [�] 169
171
+ [�] 170
172
+ [�] 171
173
+ [�] 172
174
+ [�] 173
175
+ [�] 174
176
+ [�] 175
177
+ [�] 176
178
+ [�] 177
179
+ [�] 178
180
+ [�] 179
181
+ [�] 180
182
+ [�] 181
183
+ [�] 182
184
+ [�] 183
185
+ [�] 184
186
+ [�] 185
187
+ [�] 186
188
+ [�] 187
189
+ [�] 188
190
+ [�] 189
191
+ [�] 190
192
+ [�] 191
193
+ [�] 192
194
+ [�] 193
195
+ [�] 194
196
+ [�] 195
197
+ [�] 196
198
+ [�] 197
199
+ [�] 198
200
+ [�] 199
201
+ [�] 200
202
+ [�] 201
203
+ [�] 202
204
+ [�] 203
205
+ [�] 204
206
+ [�] 205
207
+ [�] 206
208
+ [�] 207
209
+ [�] 208
210
+ [�] 209
211
+ [�] 210
212
+ [�] 211
213
+ [�] 212
214
+ [�] 213
215
+ [�] 214
216
+ [�] 215
217
+ [�] 216
218
+ [�] 217
219
+ [�] 218
220
+ [�] 219
221
+ [�] 220
222
+ [�] 221
223
+ [�] 222
224
+ [�] 223
225
+ [�] 224
226
+ [�] 225
227
+ [�] 226
228
+ [�] 227
229
+ [�] 228
230
+ [�] 229
231
+ [�] 230
232
+ [�] 231
233
+ [�] 232
234
+ [�] 233
235
+ [�] 234
236
+ [�] 235
237
+ [�] 236
238
+ [�] 237
239
+ [�] 238
240
+ [�] 239
241
+ [�] 240
242
+ [�] 241
243
+ [�] 242
244
+ [�] 243
245
+ [�] 244
246
+ [�] 245
247
+ [�] 246
248
+ [�] 247
249
+ [�] 248
250
+ [�] 249
251
+ [�] 250
252
+ [�] 251
253
+ [�] 252
254
+ [�] 253
255
+ [�] 254
256
+ [�] 255
257
+ [ ][t] -> [ t] 256
258
+ [e][x] -> [ex] 257
259
+ [ex][t] -> [ext] 258
260
+ [ t][o] -> [ to] 259
261
+ [e][n] -> [en] 260
262
+ [<][|] -> [<|] 261
263
+ [s][t] -> [st] 262
264
+ [o][f] -> [of] 263
265
+ [of][t] -> [oft] 264
266
+ [oft][ext] -> [oftext] 265
267
+ [|][>] -> [|>] 266
268
+ [i][s] -> [is] 267
269
+ [ ][a] -> [ a] 268
270
+ [ ][s] -> [ s] 269
271
+ [ t][h] -> [ th] 270
272
+ [ th][e] -> [ the] 271
273
+ [ to][k] -> [ tok] 272
274
+ [ tok][en] -> [ token] 273
275
+ [ ][[] -> [ [] 274
276
+ [S][P] -> [SP] 275
277
+ [SP][E] -> [SPE] 276
278
+ [SPE][C] -> [SPEC] 277
279
+ [SPEC][I] -> [SPECI] 278
280
+ [SPECI][A] -> [SPECIA] 279
281
+ [SPECIA][L] -> [SPECIAL] 280
282
+ [st][a] -> [sta] 281
283
+ [sta][r] -> [star] 282
284
+ [star][t] -> [start] 283
285
+ [start][oftext] -> [startoftext] 284
286
+ [ ][H] -> [ H] 285
287
+ [ H][e] -> [ He] 286
288
+ [ He][l] -> [ Hel] 287
289
+ [ Hel][l] -> [ Hell] 288
290
+ [ Hell][o] -> [ Hello] 289
291
+ [ ][W] -> [ W] 290
292
+ [ W][o] -> [ Wo] 291
293
+ [ Wo][r] -> [ Wor] 292
294
+ [ Wor][l] -> [ Worl] 293
295
+ [ Worl][d] -> [ World] 294
296
+ [ ][T] -> [ T] 295
297
+ [ T][h] -> [ Th] 296
298
+ [ Th][is] -> [ This] 297
299
+ [ ][is] -> [ is] 298
300
+ [ s][a] -> [ sa] 299
301
+ [ sa][m] -> [ sam] 300
302
+ [ sam][p] -> [ samp] 301
303
+ [ samp][l] -> [ sampl] 302
304
+ [ sampl][e] -> [ sample] 303
305
+ [ t][ext] -> [ text] 304
306
+ [ ][w] -> [ w] 305
307
+ [ w][i] -> [ wi] 306
308
+ [ wi][t] -> [ wit] 307
309
+ [ wit][h] -> [ with] 308
310
+ [ s][p] -> [ sp] 309
tokenizer/sample/load_json_vocab/bpetokenizer_json.py ADDED
@@ -0,0 +1,24 @@
1
+ from bpetokenizer import BPETokenizer
2
+
3
+ tokenizer = BPETokenizer()
4
+
5
+ tokenizer.load("sample_bpetokenizer.json", mode="json")
6
+
7
+ encode_text = """
8
+ <|startoftext|>Hello, World! This is a sample text with the special tokens [SPECIAL1] and [SPECIAL2] to test the tokenizer.
9
+ Hello, Universe! Another example sentence containing [SPECIAL1] and [SPECIAL2], used to ensure tokenizer's robustness.
10
+ Greetings, Earth! Here we have [SPECIAL1] appearing once again, followed by [SPECIAL2] in the same sentence.<|endoftext|>"""
11
+
12
+ print("vocab: ", tokenizer.vocab)
13
+ print('---')
14
+ print("merges: ", tokenizer.merges)
15
+ print('---')
16
+ print("special tokens: ", tokenizer.special_tokens)
17
+
18
+ ids = tokenizer.encode(encode_text, special_tokens="all")
19
+ print('---')
20
+ print('Ids: ', ids)
21
+
22
+ decode_text = tokenizer.decode(ids)
23
+ print('---')
24
+ print(decode_text)
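
In the JSON vocabulary added below, merge keys are serialized as strings such as `"(32, 116)"`, since JSON objects cannot use tuple keys. A plausible sketch of turning them back into integer-pair tuples on load (an assumption about what `load(..., mode="json")` does internally; `parse_merges` is a hypothetical helper, not part of the package):

```python
import ast

def parse_merges(raw_merges):
    # parse JSON string keys like "(32, 116)" back into the tuple (32, 116)
    return {ast.literal_eval(key): idx for key, idx in raw_merges.items()}

print(parse_merges({"(32, 116)": 256}))  # {(32, 116): 256}
```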
tokenizer/sample/load_json_vocab/sample_bpetokenizer.json ADDED
@@ -0,0 +1,378 @@
1
+ {
2
+ "version": "1.0.31",
3
+ "pattern": "'(?i:[sdmt]|ll|ve|re)|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
4
+ "special_tokens": {
5
+ "<|endoftext|>": 311,
6
+ "<|startoftext|>": 312,
7
+ "[SPECIAL1]": 313,
8
+ "[SPECIAL2]": 314
9
+ },
10
+ "merges": {
11
+ "(32, 116)": 256,
12
+ "(101, 120)": 257,
13
+ "(257, 116)": 258,
14
+ "(256, 111)": 259,
15
+ "(101, 110)": 260,
16
+ "(60, 124)": 261,
17
+ "(115, 116)": 262,
18
+ "(111, 102)": 263,
19
+ "(263, 116)": 264,
20
+ "(264, 258)": 265,
21
+ "(124, 62)": 266,
22
+ "(105, 115)": 267,
23
+ "(32, 97)": 268,
24
+ "(32, 115)": 269,
25
+ "(256, 104)": 270,
26
+ "(270, 101)": 271,
27
+ "(259, 107)": 272,
28
+ "(272, 260)": 273,
29
+ "(32, 91)": 274,
30
+ "(83, 80)": 275,
31
+ "(275, 69)": 276,
32
+ "(276, 67)": 277,
33
+ "(277, 73)": 278,
34
+ "(278, 65)": 279,
35
+ "(279, 76)": 280,
36
+ "(262, 97)": 281,
37
+ "(281, 114)": 282,
38
+ "(282, 116)": 283,
39
+ "(283, 265)": 284,
40
+ "(32, 72)": 285,
41
+ "(285, 101)": 286,
42
+ "(286, 108)": 287,
43
+ "(287, 108)": 288,
44
+ "(288, 111)": 289,
45
+ "(32, 87)": 290,
46
+ "(290, 111)": 291,
47
+ "(291, 114)": 292,
48
+ "(292, 108)": 293,
49
+ "(293, 100)": 294,
50
+ "(32, 84)": 295,
51
+ "(295, 104)": 296,
52
+ "(296, 267)": 297,
53
+ "(32, 267)": 298,
54
+ "(269, 97)": 299,
55
+ "(299, 109)": 300,
56
+ "(300, 112)": 301,
57
+ "(301, 108)": 302,
58
+ "(302, 101)": 303,
59
+ "(256, 258)": 304,
60
+ "(32, 119)": 305,
61
+ "(305, 105)": 306,
62
+ "(306, 116)": 307,
63
+ "(307, 104)": 308,
64
+ "(269, 112)": 309
65
+ },
66
+ "vocab": {
67
+ "0": "\\u0000",
68
+ "1": "\\u0001",
69
+ "2": "\\u0002",
70
+ "3": "\\u0003",
71
+ "4": "\\u0004",
72
+ "5": "\\u0005",
73
+ "6": "\\u0006",
74
+ "7": "\\u0007",
75
+ "8": "\\u0008",
76
+ "9": "\\u0009",
77
+ "10": "\\u000a",
78
+ "11": "\\u000b",
79
+ "12": "\\u000c",
80
+ "13": "\\u000d",
81
+ "14": "\\u000e",
82
+ "15": "\\u000f",
83
+ "16": "\\u0010",
84
+ "17": "\\u0011",
85
+ "18": "\\u0012",
86
+ "19": "\\u0013",
87
+ "20": "\\u0014",
88
+ "21": "\\u0015",
89
+ "22": "\\u0016",
90
+ "23": "\\u0017",
91
+ "24": "\\u0018",
92
+ "25": "\\u0019",
93
+ "26": "\\u001a",
94
+ "27": "\\u001b",
95
+ "28": "\\u001c",
96
+ "29": "\\u001d",
97
+ "30": "\\u001e",
98
+ "31": "\\u001f",
99
+ "32": " ",
100
+ "33": "!",
101
+ "34": "\"",
102
+ "35": "#",
103
+ "36": "$",
104
+ "37": "%",
105
+ "38": "&",
106
+ "39": "'",
107
+ "40": "(",
108
+ "41": ")",
109
+ "42": "*",
110
+ "43": "+",
111
+ "44": ",",
112
+ "45": "-",
113
+ "46": ".",
114
+ "47": "/",
115
+ "48": "0",
116
+ "49": "1",
117
+ "50": "2",
118
+ "51": "3",
119
+ "52": "4",
120
+ "53": "5",
121
+ "54": "6",
122
+ "55": "7",
123
+ "56": "8",
124
+ "57": "9",
125
+ "58": ":",
126
+ "59": ";",
127
+ "60": "<",
128
+ "61": "=",
129
+ "62": ">",
130
+ "63": "?",
131
+ "64": "@",
132
+ "65": "A",
133
+ "66": "B",
134
+ "67": "C",
135
+ "68": "D",
136
+ "69": "E",
137
+ "70": "F",
138
+ "71": "G",
139
+ "72": "H",
140
+ "73": "I",
141
+ "74": "J",
142
+ "75": "K",
143
+ "76": "L",
144
+ "77": "M",
145
+ "78": "N",
146
+ "79": "O",
147
+ "80": "P",
148
+ "81": "Q",
149
+ "82": "R",
150
+ "83": "S",
151
+ "84": "T",
152
+ "85": "U",
153
+ "86": "V",
154
+ "87": "W",
155
+ "88": "X",
156
+ "89": "Y",
157
+ "90": "Z",
158
+ "91": "[",
159
+ "92": "\\",
160
+ "93": "]",
161
+ "94": "^",
162
+ "95": "_",
163
+ "96": "`",
164
+ "97": "a",
165
+ "98": "b",
166
+ "99": "c",
167
+ "100": "d",
168
+ "101": "e",
169
+ "102": "f",
170
+ "103": "g",
171
+ "104": "h",
172
+ "105": "i",
173
+ "106": "j",
174
+ "107": "k",
175
+ "108": "l",
176
+ "109": "m",
177
+ "110": "n",
178
+ "111": "o",
179
+ "112": "p",
180
+ "113": "q",
181
+ "114": "r",
182
+ "115": "s",
183
+ "116": "t",
184
+ "117": "u",
185
+ "118": "v",
186
+ "119": "w",
187
+ "120": "x",
188
+ "121": "y",
189
+ "122": "z",
190
+ "123": "{",
191
+ "124": "|",
192
+ "125": "}",
193
+ "126": "~",
194
+ "127": "\\u007f",
195
+ "128": "�",
196
+ "129": "�",
197
+ "130": "�",
198
+ "131": "�",
199
+ "132": "�",
200
+ "133": "�",
201
+ "134": "�",
202
+ "135": "�",
203
+ "136": "�",
204
+ "137": "�",
205
+ "138": "�",
206
+ "139": "�",
207
+ "140": "�",
208
+ "141": "�",
209
+ "142": "�",
210
+ "143": "�",
211
+ "144": "�",
212
+ "145": "�",
213
+ "146": "�",
214
+ "147": "�",
215
+ "148": "�",
216
+ "149": "�",
217
+ "150": "�",
218
+ "151": "�",
219
+ "152": "�",
220
+ "153": "�",
221
+ "154": "�",
222
+ "155": "�",
223
+ "156": "�",
224
+ "157": "�",
225
+ "158": "�",
226
+ "159": "�",
227
+ "160": "�",
228
+ "161": "�",
229
+ "162": "�",
230
+ "163": "�",
231
+ "164": "�",
232
+ "165": "�",
233
+ "166": "�",
234
+ "167": "�",
235
+ "168": "�",
236
+ "169": "�",
237
+ "170": "�",
238
+ "171": "�",
239
+ "172": "�",
240
+ "173": "�",
241
+ "174": "�",
242
+ "175": "�",
243
+ "176": "�",
244
+ "177": "�",
245
+ "178": "�",
246
+ "179": "�",
247
+ "180": "�",
248
+ "181": "�",
249
+ "182": "�",
250
+ "183": "�",
251
+ "184": "�",
252
+ "185": "�",
253
+ "186": "�",
254
+ "187": "�",
255
+ "188": "�",
256
+ "189": "�",
257
+ "190": "�",
258
+ "191": "�",
259
+ "192": "�",
260
+ "193": "�",
261
+ "194": "�",
262
+ "195": "�",
263
+ "196": "�",
264
+ "197": "�",
265
+ "198": "�",
266
+ "199": "�",
267
+ "200": "�",
268
+ "201": "�",
269
+ "202": "�",
270
+ "203": "�",
271
+ "204": "�",
272
+ "205": "�",
273
+ "206": "�",
274
+ "207": "�",
275
+ "208": "�",
276
+ "209": "�",
277
+ "210": "�",
278
+ "211": "�",
279
+ "212": "�",
280
+ "213": "�",
281
+ "214": "�",
282
+ "215": "�",
283
+ "216": "�",
284
+ "217": "�",
285
+ "218": "�",
286
+ "219": "�",
287
+ "220": "�",
288
+ "221": "�",
289
+ "222": "�",
290
+ "223": "�",
291
+ "224": "�",
292
+ "225": "�",
293
+ "226": "�",
294
+ "227": "�",
295
+ "228": "�",
296
+ "229": "�",
297
+ "230": "�",
298
+ "231": "�",
299
+ "232": "�",
300
+ "233": "�",
301
+ "234": "�",
302
+ "235": "�",
303
+ "236": "�",
304
+ "237": "�",
305
+ "238": "�",
306
+ "239": "�",
307
+ "240": "�",
308
+ "241": "�",
309
+ "242": "�",
310
+ "243": "�",
311
+ "244": "�",
312
+ "245": "�",
313
+ "246": "�",
314
+ "247": "�",
315
+ "248": "�",
316
+ "249": "�",
317
+ "250": "�",
318
+ "251": "�",
319
+ "252": "�",
320
+ "253": "�",
321
+ "254": "�",
322
+ "255": "�",
323
+ "256": " t",
324
+ "257": "ex",
325
+ "258": "ext",
326
+ "259": " to",
327
+ "260": "en",
328
+ "261": "<|",
329
+ "262": "st",
330
+ "263": "of",
331
+ "264": "oft",
332
+ "265": "oftext",
333
+ "266": "|>",
334
+ "267": "is",
335
+ "268": " a",
336
+ "269": " s",
337
+ "270": " th",
338
+ "271": " the",
339
+ "272": " tok",
340
+ "273": " token",
341
+ "274": " [",
342
+ "275": "SP",
343
+ "276": "SPE",
344
+ "277": "SPEC",
345
+ "278": "SPECI",
346
+ "279": "SPECIA",
347
+ "280": "SPECIAL",
348
+ "281": "sta",
349
+ "282": "star",
350
+ "283": "start",
351
+ "284": "startoftext",
352
+ "285": " H",
353
+ "286": " He",
354
+ "287": " Hel",
355
+ "288": " Hell",
356
+ "289": " Hello",
357
+ "290": " W",
358
+ "291": " Wo",
359
+ "292": " Wor",
360
+ "293": " Worl",
361
+ "294": " World",
362
+ "295": " T",
363
+ "296": " Th",
364
+ "297": " This",
365
+ "298": " is",
366
+ "299": " sa",
367
+ "300": " sam",
368
+ "301": " samp",
369
+ "302": " sampl",
370
+ "303": " sample",
371
+ "304": " text",
372
+ "305": " w",
373
+ "306": " wi",
374
+ "307": " wit",
375
+ "308": " with",
376
+ "309": " sp"
377
+ }
378
+ }
tokenizer/sample/load_json_vocab/tokens.py ADDED
@@ -0,0 +1,26 @@
1
+ import sys
2
+ sys.path.append('../')
3
+
4
+ from bpetokenizer import BPETokenizer
5
+
6
+ # initializing the tokenizer
7
+ tokenizer = BPETokenizer()
8
+
9
+ # load the pretrained vocab
10
+ tokenizer.load("sample_bpetokenizer.json", mode="json")
11
+
12
+ text = "<|startoftext|>This method? generates the tokens! which are split, before the tokenization using the pattern: default we use the gpt4 split pattern mentioned in the tiktoken.<|endoftext|>"
13
+
14
+
15
+ # this method returns a list of string tokens for the given text.
16
+ tokens = tokenizer.tokens(text, verbose=True) # if verbose, prints the text chunks and also the pattern used to split.
17
+ print('---')
18
+ print("tokens: ", tokens)
19
+
20
+ """
21
+ tokens: ['<|', 'st', 'ar', 't', 'oftext', '|>', 'T', 'h', 'is', ' ', 'm', 'e', 'th', 'o', 'd', '?', ' ', 'g', 'en', 'er', 'a', 't', 'e', 's', ' the', ' token',
22
+ 's', '!', ' w', 'h', 'i', 'c', 'h', ' a', 'r', 'e', ' s', 'pl', 'i', 't', ',', ' ', 'b', 'e', 'f', 'o', 'r', 'e', ' the',
23
+ ' tokeniz', 'a', 't', 'i', 'on', ' ', 'u', 's', 'ing', ' the', ' ', 'p', 'a', 't', 't', 'er', 'n', ':', ' ', 'd', 'e', 'f', 'a', 'u', 'l', 't', ' w', 'e', ' ',
24
+ 'u', 'se', ' the', ' ', 'g', 'p', 't', '4', ' s', 'pl', 'i', 't', ' ', 'p', 'a', 't', 't', 'er', 'n', ' ',
25
+ 'm', 'en', 't', 'i', 'on', 'e', 'd', ' ', 'in', ' the', ' t', 'i', 'k', 't', 'o', 'k', 'en', '.', '<|', 'en', 'd', 'oftext', '|>']
26
+ """
tokenizer/sample/tokenizer/wiki.model ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8329f536bb6d37b88b4bd5b75770c5240a06d343f32756dd34eea37c49d69e2
3
+ size 36
tokenizer/sample/tokenizer/wiki.py ADDED
@@ -0,0 +1,14 @@
1
+ from bpetokenizer import Tokenizer
2
+
3
+ text = "aaabdaaabac"
4
+ tokenizer = Tokenizer()
5
+ tokenizer.train(text, 259, verbose=True)
6
+
7
+ ids = tokenizer.encode(text)
8
+ print(ids)
9
+ print('---')
10
+
11
+ decoded_text = tokenizer.decode(ids)
12
+ print(decoded_text)
13
+
14
+ tokenizer.save("wiki")
tokenizer/sample/tokenizer/wiki.vocab ADDED
@@ -0,0 +1,259 @@
1
+ [\u0000] 0
2
+ [\u0001] 1
3
+ [\u0002] 2
4
+ [\u0003] 3
5
+ [\u0004] 4
6
+ [\u0005] 5
7
+ [\u0006] 6
8
+ [\u0007] 7
9
+ [\u0008] 8
10
+ [\u0009] 9
11
+ [\u000a] 10
12
+ [\u000b] 11
13
+ [\u000c] 12
14
+ [\u000d] 13
15
+ [\u000e] 14
16
+ [\u000f] 15
17
+ [\u0010] 16
18
+ [\u0011] 17
19
+ [\u0012] 18
20
+ [\u0013] 19
21
+ [\u0014] 20
22
+ [\u0015] 21
23
+ [\u0016] 22
24
+ [\u0017] 23
25
+ [\u0018] 24
26
+ [\u0019] 25
27
+ [\u001a] 26
28
+ [\u001b] 27
29
+ [\u001c] 28
30
+ [\u001d] 29
31
+ [\u001e] 30
32
+ [\u001f] 31
33
+ [ ] 32
34
+ [!] 33
35
+ ["] 34
36
+ [#] 35
37
+ [$] 36
38
+ [%] 37
39
+ [&] 38
40
+ ['] 39
41
+ [(] 40
42
+ [)] 41
43
+ [*] 42
44
+ [+] 43
45
+ [,] 44
46
+ [-] 45
47
+ [.] 46
48
+ [/] 47
49
+ [0] 48
50
+ [1] 49
51
+ [2] 50
52
+ [3] 51
53
+ [4] 52
54
+ [5] 53
55
+ [6] 54
56
+ [7] 55
57
+ [8] 56
58
+ [9] 57
59
+ [:] 58
60
+ [;] 59
61
+ [<] 60
62
+ [=] 61
63
+ [>] 62
64
+ [?] 63
65
+ [@] 64
66
+ [A] 65
67
+ [B] 66
68
+ [C] 67
69
+ [D] 68
70
+ [E] 69
71
+ [F] 70
72
+ [G] 71
73
+ [H] 72
74
+ [I] 73
75
+ [J] 74
76
+ [K] 75
77
+ [L] 76
78
+ [M] 77
79
+ [N] 78
80
+ [O] 79
81
+ [P] 80
82
+ [Q] 81
83
+ [R] 82
84
+ [S] 83
85
+ [T] 84
86
+ [U] 85
87
+ [V] 86
88
+ [W] 87
89
+ [X] 88
90
+ [Y] 89
91
+ [Z] 90
92
+ [[] 91
93
+ [\] 92
94
+ []] 93
95
+ [^] 94
96
+ [_] 95
97
+ [`] 96
98
+ [a] 97
99
+ [b] 98
100
+ [c] 99
101
+ [d] 100
102
+ [e] 101
103
+ [f] 102
104
+ [g] 103
105
+ [h] 104
106
+ [i] 105
107
+ [j] 106
108
+ [k] 107
109
+ [l] 108
110
+ [m] 109
111
+ [n] 110
112
+ [o] 111
113
+ [p] 112
114
+ [q] 113
115
+ [r] 114
116
+ [s] 115
117
+ [t] 116
118
+ [u] 117
119
+ [v] 118
120
+ [w] 119
121
+ [x] 120
122
+ [y] 121
123
+ [z] 122
124
+ [{] 123
125
+ [|] 124
126
+ [}] 125
127
+ [~] 126
128
+ [\u007f] 127
129
+ [�] 128
130
+ [�] 129
131
+ [�] 130
132
+ [�] 131
133
+ [�] 132
134
+ [�] 133
135
+ [�] 134
136
+ [�] 135
137
+ [�] 136
138
+ [�] 137
139
+ [�] 138
140
+ [�] 139
141
+ [�] 140
142
+ [�] 141
143
+ [�] 142
144
+ [�] 143
145
+ [�] 144
146
+ [�] 145
147
+ [�] 146
148
+ [�] 147
149
+ [�] 148
150
+ [�] 149
151
+ [�] 150
152
+ [�] 151
153
+ [�] 152
154
+ [�] 153
155
+ [�] 154
156
+ [�] 155
157
+ [�] 156
158
+ [�] 157
159
+ [�] 158
160
+ [�] 159
161
+ [�] 160
162
+ [�] 161
163
+ [�] 162
164
+ [�] 163
165
+ [�] 164
166
+ [�] 165
167
+ [�] 166
168
+ [�] 167
169
+ [�] 168
170
+ [�] 169
171
+ [�] 170
172
+ [�] 171
173
+ [�] 172
174
+ [�] 173
175
+ [�] 174
176
+ [�] 175
177
+ [�] 176
178
+ [�] 177
179
+ [�] 178
180
+ [�] 179
181
+ [�] 180
182
+ [�] 181
183
+ [�] 182
184
+ [�] 183
185
+ [�] 184
186
+ [�] 185
187
+ [�] 186
188
+ [�] 187
189
+ [�] 188
190
+ [�] 189
191
+ [�] 190
192
+ [�] 191
193
+ [�] 192
194
+ [�] 193
195
+ [�] 194
196
+ [�] 195
197
+ [�] 196
198
+ [�] 197
199
+ [�] 198
200
+ [�] 199
201
+ [�] 200
202
+ [�] 201
203
+ [�] 202
204
+ [�] 203
205
+ [�] 204
206
+ [�] 205
207
+ [�] 206
208
+ [�] 207
209
+ [�] 208
210
+ [�] 209
211
+ [�] 210
212
+ [�] 211
213
+ [�] 212
214
+ [�] 213
215
+ [�] 214
216
+ [�] 215
217
+ [�] 216
218
+ [�] 217
219
+ [�] 218
220
+ [�] 219
221
+ [�] 220
222
+ [�] 221
223
+ [�] 222
224
+ [�] 223
225
+ [�] 224
226
+ [�] 225
227
+ [�] 226
228
+ [�] 227
229
+ [�] 228
230
+ [�] 229
231
+ [�] 230
232
+ [�] 231
233
+ [�] 232
234
+ [�] 233
235
+ [�] 234
236
+ [�] 235
237
+ [�] 236
238
+ [�] 237
239
+ [�] 238
240
+ [�] 239
241
+ [�] 240
242
+ [�] 241
243
+ [�] 242
244
+ [�] 243
245
+ [�] 244
246
+ [�] 245
247
+ [�] 246
248
+ [�] 247
249
+ [�] 248
250
+ [�] 249
251
+ [�] 250
252
+ [�] 251
253
+ [�] 252
254
+ [�] 253
255
+ [�] 254
256
+ [�] 255
257
+ [a][a] -> [aa] 256
258
+ [aa][a] -> [aaa] 257
259
+ [aaa][b] -> [aaab] 258
tokenizer/setup.py ADDED
@@ -0,0 +1,38 @@
1
+ import os
2
+ from setuptools import find_packages, setup
3
+
4
+ here = os.path.abspath(os.path.dirname(__file__))
5
+
6
+ # Get the code version
7
+ version = {}
8
+ with open(os.path.join(here, "bpetokenizer/version.py")) as f:
9
+ exec(f.read(), version)
10
+ __version__ = version["__version__"]
11
+
12
+
13
+ with open("README.md", "r", encoding="utf-8") as f:
14
+ long_description = f.read()
15
+
16
+
17
+ setup(
18
+ name="bpetokenizer",
19
+ version=__version__,
20
+ description="Byte Pair Encoding Tokenizer with special tokens and regex pattern",
21
+ long_description=long_description,
22
+ long_description_content_type="text/markdown",
23
+ url="https://github.com/Hk669/bpetokenizer",
24
+ author="Hrushikesh Dokala",
25
+ author_email="hrushi669@gmail.com",
26
+ license="MIT",
27
+ packages=find_packages(include=["bpetokenizer"]),
28
+ classifiers=[
29
+ "License :: OSI Approved :: MIT License",
30
+ "Programming Language :: Python :: 3",
31
+ "Operating System :: OS Independent",
32
+ ],
33
+ install_requires=["regex"],
34
+ extras_require={
35
+ "dev": ["pytest", "twine"],
36
+ },
37
+ python_requires=">=3.9,<3.13",
38
+ )
tokenizer/tests/__pycache__/test_tokenizer.cpython-39-pytest-7.1.2.pyc ADDED
Binary file (7.46 kB).
 
tokenizer/tests/test_tokenizer.py ADDED
@@ -0,0 +1,69 @@
1
+ import os
2
+ import pytest
3
+ from bpetokenizer import BPETokenizer, Tokenizer
4
+
5
+ @pytest.fixture
6
+ def tokenizer():
7
+ return Tokenizer()
8
+
9
+ @pytest.fixture
10
+ def bpe_tokenizer():
11
+ return BPETokenizer()
12
+
13
+
14
+ def test_train():
15
+ """Test the training of the tokenizer."""
16
+ text = "aaabdaaabac"
17
+ tokenizer = Tokenizer()
18
+ tokenizer.train(text, 259, verbose=False)
19
+ assert len(tokenizer.vocab) == 259
20
+ assert len(tokenizer.merges) == 3
21
+ assert tokenizer.decode(tokenizer.encode(text)) == "aaabdaaabac"
22
+
23
+
24
+ def test_encode():
25
+ """Test the encoding of the tokenizer."""
26
+ text = "aaabdaaabac"
27
+ tokenizer = Tokenizer()
28
+ tokenizer.train(text, 259, verbose=False)
29
+ assert tokenizer.encode("aaabdaaabac") == [258, 100, 258, 97, 99]
30
+
31
+
32
+ def test_decode():
33
+ """Test the decoding of the tokenizer."""
34
+ text = "aaabdaaabac"
35
+ tokenizer = Tokenizer()
36
+ tokenizer.train(text, 259, verbose=False)
37
+ assert tokenizer.decode([258, 100, 258, 97, 99]) == "aaabdaaabac"
38
+
39
+
40
+ def test_train_bpe():
41
+ """Test the training of the BPE tokenizer."""
42
+ text = "aaabdaaabac"
43
+ tokenizer = BPETokenizer()
44
+ tokenizer.train(text, 256 + 3, verbose=False)
45
+ assert len(tokenizer.vocab) == 259
46
+ assert len(tokenizer.merges) == 3
47
+ assert tokenizer.decode(tokenizer.encode(text)) == "aaabdaaabac"
48
+
49
+
50
+ def test_train_bpe_w_special_tokens():
51
+ """Test the bpetokenizer with special tokens"""
52
+ special_tokens = {
53
+ "<|endoftext|>": 1001,
54
+ "<|startoftext|>": 1002,
55
+ "[SPECIAL1]": 1003,
56
+ "[SPECIAL2]": 1004,
57
+ }
58
+
59
+ PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
60
+ tokenizer = BPETokenizer(special_tokens=special_tokens, pattern=PATTERN)
61
+ texts = "<|startoftext|> Hello, World! This is a sample text with the special tokens [SPECIAL1] and [SPECIAL2] to test the tokenizer.<|endoftext|>"
62
+ tokenizer.train(texts, vocab_size=310, verbose=False)
63
+
64
+ assert len(tokenizer.vocab) == 310
65
+ assert len(tokenizer.merges) == 310 - 256
66
+ assert tokenizer.decode(tokenizer.encode(texts)) == texts
67
+ assert tokenizer.inverse_special_tokens == {v: k for k,v in special_tokens.items()}
68
+ assert tokenizer.special_tokens == special_tokens
69
+ assert tokenizer.pattern == PATTERN