bpetokenizer upload
- tokenizer/.github/workflows/pypi.yml +35 -0
- tokenizer/.github/workflows/tests.yml +41 -0
- tokenizer/.gitignore +156 -0
- tokenizer/README.md +151 -0
- tokenizer/bpetokenizer/__init__.py +3 -0
- tokenizer/bpetokenizer/__pycache__/__init__.cpython-39.pyc +0 -0
- tokenizer/bpetokenizer/__pycache__/base.cpython-39.pyc +0 -0
- tokenizer/bpetokenizer/__pycache__/tokenizer.cpython-39.pyc +0 -0
- tokenizer/bpetokenizer/__pycache__/version.cpython-39.pyc +0 -0
- tokenizer/bpetokenizer/base.py +223 -0
- tokenizer/bpetokenizer/tokenizer.py +160 -0
- tokenizer/bpetokenizer/version.py +1 -0
- tokenizer/notebooks/tokenization.ipynb +764 -0
- tokenizer/public/tokenizer.png +0 -0
- tokenizer/sample/bpetokenizer/sample_bpetokenizer.model +3 -0
- tokenizer/sample/bpetokenizer/sample_bpetokenizer.py +36 -0
- tokenizer/sample/bpetokenizer/sample_bpetokenizer.vocab +310 -0
- tokenizer/sample/load_json_vocab/bpetokenizer_json.py +24 -0
- tokenizer/sample/load_json_vocab/sample_bpetokenizer.json +378 -0
- tokenizer/sample/load_json_vocab/tokens.py +26 -0
- tokenizer/sample/tokenizer/wiki.model +3 -0
- tokenizer/sample/tokenizer/wiki.py +14 -0
- tokenizer/sample/tokenizer/wiki.vocab +259 -0
- tokenizer/setup.py +38 -0
- tokenizer/tests/__pycache__/test_tokenizer.cpython-39-pytest-7.1.2.pyc +0 -0
- tokenizer/tests/test_tokenizer.py +69 -0
tokenizer/.github/workflows/pypi.yml
ADDED
@@ -0,0 +1,35 @@
name: python-package

on:
  release:
    types: [published]

jobs:
  pypi-publish:
    name: Upload release to PyPI
    runs-on: ubuntu-latest
    permissions:
      id-token: write

    steps:
      - name: Check out repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.x'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install setuptools wheel

      - name: Build package
        run: |
          python setup.py sdist bdist_wheel

      - name: Publish package distributions to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          password: ${{ secrets.PYPI_API_TOKEN }}
tokenizer/.github/workflows/tests.yml
ADDED
@@ -0,0 +1,41 @@
name: Tests

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

jobs:
  TestBPETokenizer:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-latest, windows-latest, macos-latest]
        python-version: ["3.9", "3.10", "3.11"]
    steps:
      - uses: actions/checkout@v4

      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install pytest
          pip install regex

      - name: Run Tests
        run: |
          python -m pytest tests/test_tokenizer.py

      - name: Upload Test Results
        uses: actions/upload-artifact@v4
        with:
          name: test-results
          path: test-results.xml
tokenizer/.gitignore
ADDED
@@ -0,0 +1,156 @@
# Project
/.vs
.vscode

# Log files
*.log

# Python virtualenv
.venv*

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

logs

.idea/*
.DS_Store

output/
*.pkl
tokenizer/README.md
ADDED
@@ -0,0 +1,151 @@
# bpetokenizer

A Byte Pair Encoding (BPE) tokenizer which algorithmically follows along with the GPT tokenizer. The tokenizer handles special tokens and uses a customizable regex pattern for tokenization (including the GPT-4 regex pattern). It supports `save` and `load` of tokenizers in both `json` and `file` formats.

### Overview

The Byte Pair Encoding (BPE) algorithm is a simple yet powerful method for building a vocabulary of subword units for a given text corpus. You can use this tokenizer to train an LLM tokenizer on text corpora in various languages.

The algorithm was first introduced in the paper [Neural Machine Translation of Rare Words with Subword Units](https://arxiv.org/pdf/1508.07909) and later used in the GPT-2 tokenizer ([Language Models are Unsupervised Multitask Learners](https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf)).

The [notebook](notebooks/tokenization.ipynb) shows the BPE algorithm in detail and how the tokenizers work internally.

Every LLM (LLaMA, Gemini, Mistral, ...) uses its own tokenizer trained on its own text dataset.

### Features

- Implements the Byte Pair Encoding (BPE) algorithm.
- Handles special tokens.
- Uses a customizable regex pattern for tokenization.
- Compatible with Python 3.9 and above.

#### This repository has 2 different Tokenizers:

- `BPETokenizer`
- `Tokenizer`

1. [Tokenizer](bpetokenizer/base.py): This class contains `train`, `encode`, `decode`, and the functionality to `save` and `load`. It also contains a few helper functions (`get_stats`, `merge`, `replace_control_characters`, ...) used to perform the BPE algorithm.

2. [BPETokenizer](bpetokenizer/tokenizer.py): This class shows the real power of the tokenizer (as used in the GPT-4 tokenizer, [tiktoken](https://github.com/openai/tiktoken)). It uses the `GPT4_SPLIT_PATTERN` to split the text, as in the GPT-4 tokenizer, and also handles `special_tokens` (refer to [sample_bpetokenizer](sample/bpetokenizer/sample_bpetokenizer.py)). It inherits the `save` and `load` functionality from `Tokenizer`. A short sketch contrasting the two classes follows this README.

### Usage

This tutorial demonstrates the use of `special_tokens` with the tokenizer.

Install the package

```shell
pip install bpetokenizer
```

```py
from bpetokenizer import BPETokenizer

special_tokens = {
    "<|endoftext|>": 1001,
    "<|startoftext|>": 1002,
    "[SPECIAL1]": 1003,
    "[SPECIAL2]": 1004,
}

tokenizer = BPETokenizer(special_tokens=special_tokens)  # you can also use the method _special_tokens to register the special tokens (if not passed when initializing)
texts = "<|startoftext|> Hello, World! This is a sample text with the special tokens [SPECIAL1] and [SPECIAL2] to test the tokenizer.<|endoftext|>"

tokenizer.train(texts, vocab_size=310, verbose=True)
# tokenizer._special_tokens(special_tokens)  # if not passed when initializing the BPETokenizer

encode_text = """
<|startoftext|>Hello, World! This is a sample text with the special tokens [SPECIAL1] and [SPECIAL2] to test the tokenizer.
Hello, Universe! Another example sentence containing [SPECIAL1] and [SPECIAL2], used to ensure tokenizer's robustness.
Greetings, Earth! Here we have [SPECIAL1] appearing once again, followed by [SPECIAL2] in the same sentence.
Hello, World! This is yet another sample text, with [SPECIAL1] and [SPECIAL2] making an appearance.
Hey there, World! Testing the tokenizer with [SPECIAL1] and [SPECIAL2] to see if it handles special tokens properly.
Salutations, Planet! The tokenizer should recognize [SPECIAL1] and [SPECIAL2] in this long string of text.
Hello again, World! [SPECIAL1] and [SPECIAL2] are special tokens that need to be handled correctly by the tokenizer.
Welcome, World! Including [SPECIAL1] and [SPECIAL2] multiple times in this large text to ensure proper encoding.
Hi, World! Let's add [SPECIAL1] and [SPECIAL2] in various parts of this long sentence to test the tokenizer thoroughly.
<|endoftext|>
"""
ids = tokenizer.encode(encode_text, special_tokens="all")
print(ids)

decode_text = tokenizer.decode(ids)
print(decode_text)

tokenizer.save("sample_bpetokenizer", mode="json")  # mode: default is "file"
```

Refer to [sample_bpetokenizer](sample/bpetokenizer) for an understanding of the `vocab` and `model` files of the tokenizer trained on the texts above.

#### To Load the Tokenizer

```py
from bpetokenizer import BPETokenizer

tokenizer = BPETokenizer()

tokenizer.load("sample_bpetokenizer.json", mode="json")

encode_text = """
<|startoftext|>Hello, World! This is a sample text with the special tokens [SPECIAL1] and [SPECIAL2] to test the tokenizer.
Hello, Universe! Another example sentence containing [SPECIAL1] and [SPECIAL2], used to ensure tokenizer's robustness.
Greetings, Earth! Here we have [SPECIAL1] appearing once again, followed by [SPECIAL2] in the same sentence.<|endoftext|>"""

print("vocab: ", tokenizer.vocab)
print('---')
print("merges: ", tokenizer.merges)
print('---')
print("special tokens: ", tokenizer.special_tokens)

ids = tokenizer.encode(encode_text, special_tokens="all")
print('---')
print(ids)

decode_text = tokenizer.decode(ids)
print('---')
print(decode_text)

# you can also print the tokens and the text chunks split with the pattern.
tokens = tokenizer.tokens(encode_text, verbose=True)  # if verbose, prints the text chunks and also the pattern used to split.
print('---')
print("tokens: ", tokens)
```

Refer to [load_json_vocab](sample/load_json_vocab/) and run `bpetokenizer_json` to get an overview of `vocab`, `merges`, and `special_tokens`; to view the tokens split by the tokenizer's pattern, look at [tokens](sample/load_json_vocab/tokens.py).

### Run Tests

The `tests/` folder contains the tests of the tokenizer, using pytest.

```
python3 -m pytest
```

Additionally, the workflows are set up to run the tests whenever a PR is made.

### Contributing

Contributions to the BPE Tokenizer are most welcome! If you would like to contribute, please follow these steps:

- Star and fork the repository.
- Create a new branch (`git checkout -b feature/your-feature`).
- Commit your changes (`git commit -am 'Add some feature'`).
- Push to the branch (`git push origin feature/your-feature`).
- Create a new Pull Request.

Please ensure your code follows the project's coding standards and includes appropriate tests. Also, update the documentation as necessary.

### License

This project is licensed under the MIT License.

----

*This tokenizer is inspired by [minbpe](https://github.com/karpathy/minbpe), but more optimized.*
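As a quick illustration of the README's distinction between the two classes, here is a minimal sketch (assuming the package is installed as `bpetokenizer`; the toy corpus, the `vocab_size` of 260, and the special-token id 300 are made up for illustration):

```py
from bpetokenizer import Tokenizer, BPETokenizer

text = "hello world, hello tokenizer"  # hypothetical toy corpus

# base Tokenizer: plain byte-level BPE, no regex splitting, no special tokens
base = Tokenizer()
base.train(text, vocab_size=260)  # 4 merges past the 256 raw bytes
assert base.decode(base.encode(text)) == text  # BPE round-trips losslessly

# BPETokenizer: splits text with GPT4_SPLIT_PATTERN before merging, handles special tokens
bpe = BPETokenizer(special_tokens={"<|endoftext|>": 300})
bpe.train(text, vocab_size=260)
ids = bpe.encode(text + "<|endoftext|>", special_tokens="all")
print(ids[-1])  # -> 300, the id registered for the special token
```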
tokenizer/bpetokenizer/__init__.py
ADDED
@@ -0,0 +1,3 @@
from .base import Tokenizer
from .tokenizer import BPETokenizer
from .version import __version__
tokenizer/bpetokenizer/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (306 Bytes)
tokenizer/bpetokenizer/__pycache__/base.cpython-39.pyc
ADDED
Binary file (8.04 kB)
tokenizer/bpetokenizer/__pycache__/tokenizer.cpython-39.pyc
ADDED
Binary file (6.36 kB)
tokenizer/bpetokenizer/__pycache__/version.cpython-39.pyc
ADDED
Binary file (209 Bytes)
tokenizer/bpetokenizer/base.py
ADDED
@@ -0,0 +1,223 @@
"""
This file contains all the helper functions
and the Base class which has the methods to save/load the model,
also required to build the BPETokenizer.
"""

import json
import unicodedata

import regex as re

from .version import __version__


def get_stats(tokens, counts=None) -> dict:
    """Get statistics of the tokens. Includes the frequency of each consecutive pair of tokens."""
    counts = {} if counts is None else counts
    for pair in zip(tokens, tokens[1:]):
        counts[pair] = counts.get(pair, 0) + 1
    return counts


def merge(ids, pair, idx) -> list:
    """Merge every occurrence of `pair` in `ids` (the list of tokens), representing it with `idx` (the new token in the vocab)."""
    newids = []
    i = 0
    while i < len(ids):
        if i < len(ids) - 1 and ids[i] == pair[0] and ids[i+1] == pair[1]:
            newids.append(idx)
            i += 2
        else:
            newids.append(ids[i])
            i += 1
    return newids


def replace_control_characters(s: str) -> str:
    """
    Replace control characters in a string with their unicode escape sequences. Prevents distortion.
    Example:
        token = b"hello\nworld\x00"
        print(token) -> hello
        world (and \x00 might not be visible)
        print(replace_control_characters(token))
        -> hello\u000aworld\u0000
    """
    chars = []
    for ch in s:
        if unicodedata.category(ch)[0] != "C":  # the categories of control characters start with "C"
            chars.append(ch)
        else:
            chars.append(f"\\u{ord(ch):04x}")
    return "".join(chars)


def render_token(t: bytes) -> str:
    s = t.decode('utf-8', errors='replace')  # this will replace unknown bytes with a �
    s = replace_control_characters(s)
    return s


class Tokenizer:
    """A base class for the tokenizer, used for training and encoding/decoding text without special tokens."""

    def __init__(self):
        self.merges = {}
        self.pattern = ""  # the regex pattern
        self.compiled_pattern = re.compile(self.pattern) if self.pattern else ""
        self.special_tokens = {}
        self.vocab = self._build_vocab() if self.merges else {}

    def _build_vocab(self) -> dict:
        """Build the vocab from the merges and special tokens. This will be used to encode/decode the tokens."""
        vocab = {idx: bytes([idx]) for idx in range(256)}
        for (p0, p1), idx in self.merges.items():
            vocab[idx] = vocab[p0] + vocab[p1]
        if self.special_tokens:
            for special, idx in self.special_tokens.items():
                vocab[idx] = special.encode("utf-8")
        return vocab

    def save(self, file_name, mode="file"):
        """
        Writes metadata and vocabulary information to the model and vocab files.
        mode: str, default="file" | "json" to save the model and vocab in json format.
        """
        if mode == "file":
            model_file = file_name + ".model"
            with open(model_file, 'w') as f:
                f.write(f"{__version__}\n")
                f.write(f"{self.pattern}\n")
                f.write(f"{len(self.special_tokens)}\n")
                if self.special_tokens:
                    for special, idx in self.special_tokens.items():
                        f.write(f"{special} {idx}\n")

                for idx1, idx2 in self.merges:  # this yields the token pairs which were merged
                    f.write(f"{idx1} {idx2}\n")

            vocab_file = file_name + ".vocab"
            inverted_merges = {idx: pair for pair, idx in self.merges.items()}
            with open(vocab_file, "w", encoding="utf-8") as f:
                for idx, token in self.vocab.items():
                    s = render_token(token)
                    # find the children of this token, if any
                    if idx in inverted_merges:
                        # if this token has children, render it nicely as a merge
                        idx0, idx1 = inverted_merges[idx]
                        s0 = render_token(self.vocab[idx0])
                        s1 = render_token(self.vocab[idx1])
                        f.write(f"[{s0}][{s1}] -> [{s}] {idx}\n")
                    else:
                        # otherwise this is a leaf token, just print it
                        # (this should just be the first 256 tokens, the bytes)
                        f.write(f"[{s}] {idx}\n")
        elif mode == "json":
            data = {
                "version": __version__,
                "pattern": str(self.pattern),
                "special_tokens": self.special_tokens,
                "merges": {str(k): v for k, v in self.merges.items()},
                "vocab": {idx: render_token(token) for idx, token in self.vocab.items()}
            }
            with open(file_name + ".json", "w", encoding="utf-8") as f:
                json.dump(data, f, ensure_ascii=False, indent=4)
        else:
            raise ValueError("mode should be either 'file' or 'json'")

    def load(self, file_name, mode="file"):
        """
        Load the model and vocab files into the tokenizer.
        mode: str, default="file" | "json" to load the model and vocab in json format.
        """
        if mode == "file":
            assert file_name.endswith(".model")
            merges = {}
            special_tokens = {}
            idx = 256
            with open(file_name, 'r', encoding="utf-8") as f:
                assert f.readline().strip() == __version__
                self.pattern = f.readline().strip()  # keep the pattern as a single string; splitting it into words would corrupt it
                num_special = int(f.readline().strip())  # number of special_tokens lines
                for _ in range(num_special):
                    special, special_idx = f.readline().strip().split()  # use a separate name so the merge counter `idx` is not clobbered
                    special_tokens[special] = int(special_idx)
                for line in f:
                    idx1, idx2 = map(int, line.strip().split())
                    merges[(idx1, idx2)] = idx
                    idx += 1

            self.merges = merges
            self.special_tokens = special_tokens
            self.vocab = self._build_vocab()

        elif mode == "json":
            assert file_name.endswith(".json")

            with open(file_name, "r", encoding="utf-8") as f:
                data = json.load(f)
                assert data["version"] == __version__
                pattern = data["pattern"]
                # the pattern is stored as the repr of a compiled regex; extract the raw pattern string
                pattern_regex = re.compile(r'regex.Regex\("(.+)", flags=(regex\.\w+)\)')
                match = pattern_regex.match(pattern)
                if match:
                    self.pattern = match.group(1)
                self.special_tokens = data["special_tokens"]
                self.inverse_special_tokens = {v: k for k, v in self.special_tokens.items()}
                merges = data["merges"]
                self.merges = {tuple(map(int, k.strip('()').split(','))): v for k, v in merges.items()}
                vocab = data["vocab"]
                self.vocab = {int(k): v.encode("utf-8") for k, v in vocab.items()}

    def encode(self, texts):
        """Method to encode the text to ids."""
        text_bytes = texts.encode("utf-8")  # raw byte string
        ids = list(map(int, text_bytes))
        while len(ids) >= 2:
            # find the pair with the lowest merge index
            stats = get_stats(ids)
            pair = min(stats, key=lambda p: self.merges.get(p, float("inf")))

            if pair not in self.merges:
                break  # nothing else can be merged anymore
            # otherwise let's merge the best pair (lowest merge index)
            idx = self.merges[pair]
            ids = merge(ids, pair, idx)
        return ids

    def decode(self, ids):
        """Method to decode the ids to text."""
        bytes_str = b"".join([self.vocab[idx] for idx in ids])
        text = bytes_str.decode("utf-8", errors="replace")
        return text

    def train(self, texts, vocab_size, verbose=False):
        """Method for training the tokenizer."""
        assert vocab_size >= 256
        num_merges = vocab_size - 256

        tokens = texts.encode("utf-8")
        ids = list(tokens)
        merges = {}
        vocab = {idx: bytes([idx]) for idx in range(256)}  # vocab for the first 256 bytes

        # bpe algorithm
        for i in range(num_merges):
            stats = get_stats(ids)
            pair = max(stats, key=stats.get)  # returns the highest-frequency pair
            idx = 256 + i

            ids = merge(ids, pair, idx)
            merges[pair] = idx
            vocab[idx] = vocab[pair[0]] + vocab[pair[1]]  # concatenation of bytes

            if verbose:
                print(f"merging {i+1}/{num_merges}: {pair} -> {idx} ({vocab[idx]}) had {stats[pair]} frequency")

        self.merges = merges
        self.vocab = vocab
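To make the mechanics of the two helpers above concrete, here is a small sketch of a single BPE merge step using `get_stats` and `merge` exactly as defined in `base.py` (the byte sequence is just the notebook's `aaabdaaabac` example, chosen for illustration):

```py
from bpetokenizer.base import get_stats, merge

ids = [97, 97, 97, 98, 100, 97, 97, 97, 98, 97, 99]  # "aaabdaaabac" as byte values

stats = get_stats(ids)
pair = max(stats, key=stats.get)  # most frequent adjacent pair
print(pair, stats[pair])          # (97, 97) 4

# replace every occurrence of the pair with a new token id (first free id is 256)
ids = merge(ids, pair, 256)
print(ids)                        # [256, 97, 98, 100, 256, 97, 98, 97, 99]
```

Training simply repeats this step `vocab_size - 256` times, each time recording the chosen pair in `merges` and the concatenated bytes in `vocab`.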
tokenizer/bpetokenizer/tokenizer.py
ADDED
@@ -0,0 +1,160 @@
"""
Byte Pair Encoding tokenizer.

Algorithmically follows along with the GPT tokenizer:
https://github.com/openai/gpt-2/blob/master/src/encoder.py

The Byte Pair Encoding (BPE) algorithm is a simple algorithm that builds a vocabulary
of subword units for a given text corpus.

More detailed information can be found in:
https://github.com/Hk669/bpetokenizer/blob/main/notebooks/tokenization.ipynb
https://en.wikipedia.org/wiki/Byte_pair_encoding
https://youtu.be/zduSFxRajkE?si=Qv-yX2NUY69aIjCQ (Andrej Karpathy's tutorial on the Tokenizer)
"""

import regex as re

from .base import Tokenizer, get_stats, merge

# from openai/tiktoken (used in the gpt4 tokenizer)
GPT4_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""  # raw string


class BPETokenizer(Tokenizer):
    """Byte Pair Encoding tokenizer, which handles special tokens and the regex pattern for tokenization."""

    def __init__(self, pattern=None, special_tokens=None):
        super().__init__()
        self.pattern = GPT4_SPLIT_PATTERN if pattern is None else pattern
        self.compiled_pattern = re.compile(self.pattern)
        self.special_tokens = {} if special_tokens is None else special_tokens
        self.inverse_special_tokens = {} if special_tokens is None else {v: k for k, v in special_tokens.items()}

    def train(self, texts, vocab_size, verbose=False) -> None:
        """Train the tokenizer on the given texts and vocab size. The vocab size should be at least 256."""
        assert vocab_size >= 256
        num_merges = vocab_size - 256

        text_chunks = re.findall(self.compiled_pattern, texts)  # split the text into chunks with the regex pattern

        ids = [list(tokens.encode("utf-8")) for tokens in text_chunks]  # List[List[int]]
        merges = {}
        vocab = {idx: bytes([idx]) for idx in range(256)}  # vocab for the first 256 bytes

        # bpe algorithm
        for i in range(num_merges):
            stats = {}
            for chunk in ids:
                get_stats(chunk, stats)

            pair = max(stats, key=stats.get)  # returns the highest-frequency pair
            idx = 256 + i

            ids = [merge(chunk_ids, pair, idx) for chunk_ids in ids]  # merge the most frequent pair in every chunk of ids
            merges[pair] = idx
            vocab[idx] = vocab[pair[0]] + vocab[pair[1]]  # concatenation of bytes

            if verbose:
                print(f"merging {i+1}/{num_merges}: {pair} -> {idx} ({vocab[idx]}) had {stats[pair]} frequency")

        self.merges = merges
        self.vocab = vocab

    def _encode(self, _bytes) -> list:
        """Encode the bytes into token ids (BPE algorithm)."""
        ids = list(_bytes)
        while len(ids) >= 2:
            # find the pair with the lowest merge index
            stats = get_stats(ids)
            pair = min(stats, key=lambda p: self.merges.get(p, float("inf")))

            if pair not in self.merges:
                break  # nothing else can be merged anymore
            # otherwise let's merge the best pair (lowest merge index)
            idx = self.merges[pair]
            ids = merge(ids, pair, idx)
        return ids

    def encode_ord(self, text) -> list:
        """Encode the text into token ids, without any special-token handling."""
        text_chunks = re.findall(self.compiled_pattern, text)
        ids = []
        for chunk in text_chunks:
            _bytes = chunk.encode("utf-8")
            chunk_ids = self._encode(_bytes)
            ids.extend(chunk_ids)
        return ids

    def encode(self, text, special_tokens="none") -> list:
        """
        Encode the text into token ids.
        If special_tokens is set to "all", it will include the special tokens in the ids.
        If set to "none", it will exclude the special tokens.
        If set to "none_raise", it will raise an error if the text contains any special tokens.
        """
        special = None
        if special_tokens == "all":
            special = self.special_tokens
        elif special_tokens == "none":
            special = {}
        elif special_tokens == "none_raise":
            special = {}
            assert all(token not in text for token in self.special_tokens)
        else:
            raise ValueError(f"invalid special tokens argument: {special_tokens}")

        if not special:
            return self.encode_ord(text)

        special_pattern = "(" + "|".join(re.escape(k) for k in special) + ")"
        text_chunks = re.split(special_pattern, text)
        ids = []
        for chunk in text_chunks:
            if chunk in special:
                ids.append(special[chunk])
            else:
                chunkids = self._encode(chunk.encode("utf-8"))
                ids.extend(chunkids)
        return ids

    def decode(self, ids) -> str:
        """Decode the token ids back into text."""
        part_bytes = []
        for idx in ids:
            if idx in self.vocab:
                part_bytes.append(self.vocab[idx])
            elif idx in self.inverse_special_tokens:
                part_bytes.append(self.inverse_special_tokens[idx].encode("utf-8"))  # special tokens are not stored in the vocab
            elif idx in self.merges:
                pair = self.merges[idx]
                part_bytes.append(self.vocab[pair[0]] + self.vocab[pair[1]])
            else:
                raise ValueError(f"invalid token id: {idx}")
        text_bytes = b"".join(part_bytes)
        text = text_bytes.decode("utf-8", errors="replace")
        return text

    def _special_tokens(self, special_tokens) -> None:
        """Set the special tokens for the tokenizer. If not passed when initializing, they will be empty."""
        self.special_tokens = special_tokens
        self.inverse_special_tokens = {v: k for k, v in special_tokens.items()}

    def tokens(self, text, verbose=False) -> list:
        """Return the string tokens the text is split into (after applying the BPE merges)."""
        text_chunks = re.findall(self.compiled_pattern, text)

        _tokens = []
        for chunk in text_chunks:
            _bytes = chunk.encode("utf-8")
            chunk_ids = self._encode(_bytes)
            chunk_tokens = [self.vocab[idx].decode("utf-8", errors="replace") if idx in self.vocab else f"[UNK{idx}]" for idx in chunk_ids]
            _tokens.extend(chunk_tokens)
        if verbose:
            print(f"---\ntext chunks: {text_chunks}\n")
            print(f"---\npattern: {self.pattern}\n")
        return _tokens
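To illustrate how `encode` isolates special tokens before applying BPE, here is a sketch of the same splitting step in isolation (the special token mapping and the sentence are made-up examples):

```py
import regex as re

special = {"<|endoftext|>": 1001}  # hypothetical special token -> id

# the same pattern construction used in BPETokenizer.encode:
# a capturing group makes re.split keep the delimiters in the result
special_pattern = "(" + "|".join(re.escape(k) for k in special) + ")"
chunks = re.split(special_pattern, "hello world<|endoftext|>bye")
print(chunks)  # ['hello world', '<|endoftext|>', 'bye']

# chunks found in `special` map directly to their ids;
# all other chunks go through the regular BPE _encode path.
```

`re.escape` matters here: special tokens like `<|endoftext|>` contain regex metacharacters (`|`), so they must be escaped before being joined into an alternation.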
tokenizer/bpetokenizer/version.py
ADDED
@@ -0,0 +1 @@
__version__ = "1.0.31"
tokenizer/notebooks/tokenization.ipynb
ADDED
@@ -0,0 +1,764 @@
### Tokenizer

A completely separate, independent module from the LLM, which has its own training dataset of text, on which you train the vocabulary using the BPE (Byte Pair Encoding) algorithm. It then translates back and forth between the raw text and the sequence of integers/tokens. The LLM only deals with the tokens and never directly deals with the text.

![image.png](../public/tokenizer.png)

```python
# the unicode code point of the character
ord('a')
```

```
97
```

```python
tokens = [ord(c) for c in "మీరు ఎలా ఉన్నారు? (How are you?)"]
tokens
```

```
[3118, 3136, 3120, 3137, 32, 3086, 3122, 3134, 32, 3081, 3112, 3149, 3112, 3134, 3120, 3137, 63, 32, 40, 72, 111, 119, 32, 97, 114, 101, 32, 121, 111, 117, 63, 41]
```

but having a token for each letter will increase the computation cost both to generate from and to train the model, so the BPE algorithm was used in the [GPT2 paper](https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf)

### Byte pair encoding algorithm

consider the string:

`aaabdaaabac`

the byte pair "aa" occurs most often in the string, so we replace it with a new byte that is not used in the `vocab`, let's say "Z". The string then becomes

```
ZabdZabac
Z = aa
```

this process continues with recursive byte pair encoding, replacing byte pairs until the string/data cannot be compressed further.

The process is then repeated with the byte pair "ab", replacing it with "Y"

```
ZYdZYac
Y=ab
Z=aa
```

replacing "ZY" with "X"

```
XdXac
X=ZY
Y=ab
Z=aa
```

The next cell prints the training text (a description of AutoGen along with some Python code) and its raw UTF-8 byte tokens:

----
Autogen enables the next-gen LLM applications with a generic [multi-agent conversation](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat) framework. It offers customizable and conversable agents that integrate LLMs, tools, and humans.
By automating chat among multiple capable agents, one can easily make them collectively perform tasks autonomously or with human feedback, including tasks that require using tools via code.

Features of this use case include:

- **Multi-agent conversations**: AutoGen agents can communicate with each other to solve tasks. This allows for more complex and sophisticated applications than would be possible with a single LLM.
- **Customization**: AutoGen agents can be customized to meet the specific needs of an application. This includes the ability to choose the LLMs to use, the types of human input to allow, and the tools to employ.
- **Human participation**: AutoGen seamlessly allows human participation. This means that humans can provide input and feedback to the agents as needed.

For [example](https://github.com/microsoft/autogen/blob/main/test/twoagent.py),

```python
from autogen import AssistantAgent, UserProxyAgent, config_list_from_json
# Load LLM inference endpoints from an env variable or a file
# See https://microsoft.github.io/autogen/docs/FAQ#set-your-api-endpoints
# and OAI_CONFIG_LIST_sample
config_list = config_list_from_json(env_or_file="OAI_CONFIG_LIST")
# You can also set config_list directly as a list, for example, config_list = [{'model': 'gpt-4', 'api_key': '<your OpenAI API key here>'},]
assistant = AssistantAgent("assistant", llm_config={"config_list": config_list})
user_proxy = UserProxyAgent("user_proxy", code_execution_config={"work_dir": "coding", "use_docker": False}) # IMPORTANT: set to True to run code in docker, recommended
user_proxy.initiate_chat(assistant, message="Plot a chart of NVDA and TESLA stock price change YTD.")
# This initiates an automated chat between the two agents to solve the task
```

more python code:

```python
    def create(
        self,
        *,
        messages: Iterable[ChatCompletionMessageParam],
        model: Union[str, ChatModel],
        frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN,
        function_call: completion_create_params.FunctionCall | NotGiven = NOT_GIVEN,
        functions: Iterable[completion_create_params.Function] | NotGiven = NOT_GIVEN,
        logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN,
        logprobs: Optional[bool] | NotGiven = NOT_GIVEN,
        max_tokens: Optional[int] | NotGiven = NOT_GIVEN,
        n: Optional[int] | NotGiven = NOT_GIVEN,
        presence_penalty: Optional[float] | NotGiven = NOT_GIVEN,
        response_format: completion_create_params.ResponseFormat | NotGiven = NOT_GIVEN,
        seed: Optional[int] | NotGiven = NOT_GIVEN,
        stop: Union[Optional[str], List[str]] | NotGiven = NOT_GIVEN,
        stream: Optional[Literal[False]] | Literal[True] | NotGiven = NOT_GIVEN,
        stream_options: Optional[ChatCompletionStreamOptionsParam] | NotGiven = NOT_GIVEN,
        temperature: Optional[float] | NotGiven = NOT_GIVEN,
        tool_choice: ChatCompletionToolChoiceOptionParam | NotGiven = NOT_GIVEN,
        tools: Iterable[ChatCompletionToolParam] | NotGiven = NOT_GIVEN,
        top_logprobs: Optional[int] | NotGiven = NOT_GIVEN,
        top_p: Optional[float] | NotGiven = NOT_GIVEN,
        user: str | NotGiven = NOT_GIVEN,
        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
        # The extra values given here take precedence over values defined on the client or passed to this method.
        extra_headers: Headers | None = None,
        extra_query: Query | None = None,
        extra_body: Body | None = None,
        timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
    ) -> ChatCompletion | Stream[ChatCompletionChunk]:
        return self._post(
            "/chat/completions",
            body=maybe_transform(
                {
                    "messages": messages,
                    "model": model,
                    "frequency_penalty": frequency_penalty,
                    "function_call": function_call,
                    "functions": functions,
                    "logit_bias": logit_bias,
                    "logprobs": logprobs,
                    "max_tokens": max_tokens,
                    "n": n,
                    "presence_penalty": presence_penalty,
                    "response_format": response_format,
                    "seed": seed,
                    "stop": stop,
                    "stream": stream,
                    "stream_options": stream_options,
                    "temperature": temperature,
                    "tool_choice": tool_choice,
                    "tools": tools,
                    "top_logprobs": top_logprobs,
                    "top_p": top_p,
                    "user": user,
                },
                completion_create_params.CompletionCreateParams,
            ),
            options=make_request_options(
                extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
            ),
            cast_to=ChatCompletion,
            stream=stream or False,
            stream_cls=Stream[ChatCompletionChunk],
        )
```

length: 5397
----
[65, 117, 116, 111, 103, 101, 110, 32, 101, 110, 97, 98, 108, 101, 115, ...] (the raw UTF-8 byte token ids of the text above; the remainder of this output is truncated in the source)
116, 117, 114, 101, 58, 32, 79, 112, 116, 105, 111, 110, 97, 108, 91, 102, 108, 111, 97, 116, 93, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 32, 61, 32, 78, 79, 84, 95, 71, 73, 86, 69, 78, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 116, 111, 111, 108, 95, 99, 104, 111, 105, 99, 101, 58, 32, 67, 104, 97, 116, 67, 111, 109, 112, 108, 101, 116, 105, 111, 110, 84, 111, 111, 108, 67, 104, 111, 105, 99, 101, 79, 112, 116, 105, 111, 110, 80, 97, 114, 97, 109, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 32, 61, 32, 78, 79, 84, 95, 71, 73, 86, 69, 78, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 116, 111, 111, 108, 115, 58, 32, 73, 116, 101, 114, 97, 98, 108, 101, 91, 67, 104, 97, 116, 67, 111, 109, 112, 108, 101, 116, 105, 111, 110, 84, 111, 111, 108, 80, 97, 114, 97, 109, 93, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 32, 61, 32, 78, 79, 84, 95, 71, 73, 86, 69, 78, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 116, 111, 112, 95, 108, 111, 103, 112, 114, 111, 98, 115, 58, 32, 79, 112, 116, 105, 111, 110, 97, 108, 91, 105, 110, 116, 93, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 32, 61, 32, 78, 79, 84, 95, 71, 73, 86, 69, 78, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 116, 111, 112, 95, 112, 58, 32, 79, 112, 116, 105, 111, 110, 97, 108, 91, 102, 108, 111, 97, 116, 93, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 32, 61, 32, 78, 79, 84, 95, 71, 73, 86, 69, 78, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 117, 115, 101, 114, 58, 32, 115, 116, 114, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 32, 61, 32, 78, 79, 84, 95, 71, 73, 86, 69, 78, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 35, 32, 85, 115, 101, 32, 116, 104, 101, 32, 102, 111, 108, 108, 111, 119, 105, 110, 103, 32, 97, 114, 103, 117, 109, 101, 110, 116, 115, 32, 105, 102, 32, 121, 111, 117, 32, 110, 101, 101, 100, 32, 116, 111, 32, 112, 97, 115, 115, 32, 97, 100, 100, 105, 116, 105, 111, 110, 97, 108, 32, 112, 97, 114, 97, 109, 101, 116, 101, 114, 115, 32, 116, 111, 32, 116, 104, 101, 32, 65, 80, 73, 32, 116, 104, 97, 116, 32, 97, 114, 101, 110, 39, 116, 32, 97, 118, 97, 105, 108, 97, 98, 108, 101, 32, 118, 105, 97, 32, 107, 119, 97, 114, 103, 115, 46, 10, 32, 32, 32, 32, 32, 32, 32, 32, 35, 32, 84, 104, 101, 32, 101, 120, 116, 114, 97, 32, 118, 97, 108, 117, 101, 115, 32, 103, 105, 118, 101, 110, 32, 104, 101, 114, 101, 32, 116, 97, 107, 101, 32, 112, 114, 101, 99, 101, 100, 101, 110, 99, 101, 32, 111, 118, 101, 114, 32, 118, 97, 108, 117, 101, 115, 32, 100, 101, 102, 105, 110, 101, 100, 32, 111, 110, 32, 116, 104, 101, 32, 99, 108, 105, 101, 110, 116, 32, 111, 114, 32, 112, 97, 115, 115, 101, 100, 32, 116, 111, 32, 116, 104, 105, 115, 32, 109, 101, 116, 104, 111, 100, 46, 10, 32, 32, 32, 32, 32, 32, 32, 32, 101, 120, 116, 114, 97, 95, 104, 101, 97, 100, 101, 114, 115, 58, 32, 72, 101, 97, 100, 101, 114, 115, 32, 124, 32, 78, 111, 110, 101, 32, 61, 32, 78, 111, 110, 101, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 101, 120, 116, 114, 97, 95, 113, 117, 101, 114, 121, 58, 32, 81, 117, 101, 114, 121, 32, 124, 32, 78, 111, 110, 101, 32, 61, 32, 78, 111, 110, 101, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 101, 120, 116, 114, 97, 95, 98, 111, 100, 121, 58, 32, 66, 111, 100, 121, 32, 124, 32, 78, 111, 110, 101, 32, 61, 32, 78, 111, 110, 101, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 116, 105, 109, 101, 111, 117, 116, 58, 32, 102, 108, 111, 97, 116, 32, 124, 32, 104, 116, 116, 112, 120, 46, 84, 105, 109, 101, 111, 117, 116, 32, 124, 32, 78, 111, 110, 101, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 32, 61, 32, 78, 79, 84, 95, 71, 
73, 86, 69, 78, 44, 10, 32, 32, 32, 32, 41, 32, 45, 62, 32, 67, 104, 97, 116, 67, 111, 109, 112, 108, 101, 116, 105, 111, 110, 32, 124, 32, 83, 116, 114, 101, 97, 109, 91, 67, 104, 97, 116, 67, 111, 109, 112, 108, 101, 116, 105, 111, 110, 67, 104, 117, 110, 107, 93, 58, 10, 32, 32, 32, 32, 32, 32, 32, 32, 114, 101, 116, 117, 114, 110, 32, 115, 101, 108, 102, 46, 95, 112, 111, 115, 116, 40, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 34, 47, 99, 104, 97, 116, 47, 99, 111, 109, 112, 108, 101, 116, 105, 111, 110, 115, 34, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 98, 111, 100, 121, 61, 109, 97, 121, 98, 101, 95, 116, 114, 97, 110, 115, 102, 111, 114, 109, 40, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 123, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 34, 109, 101, 115, 115, 97, 103, 101, 115, 34, 58, 32, 109, 101, 115, 115, 97, 103, 101, 115, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 34, 109, 111, 100, 101, 108, 34, 58, 32, 109, 111, 100, 101, 108, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 34, 102, 114, 101, 113, 117, 101, 110, 99, 121, 95, 112, 101, 110, 97, 108, 116, 121, 34, 58, 32, 102, 114, 101, 113, 117, 101, 110, 99, 121, 95, 112, 101, 110, 97, 108, 116, 121, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 34, 102, 117, 110, 99, 116, 105, 111, 110, 95, 99, 97, 108, 108, 34, 58, 32, 102, 117, 110, 99, 116, 105, 111, 110, 95, 99, 97, 108, 108, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 34, 102, 117, 110, 99, 116, 105, 111, 110, 115, 34, 58, 32, 102, 117, 110, 99, 116, 105, 111, 110, 115, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 34, 108, 111, 103, 105, 116, 95, 98, 105, 97, 115, 34, 58, 32, 108, 111, 103, 105, 116, 95, 98, 105, 97, 115, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 34, 108, 111, 103, 112, 114, 111, 98, 115, 34, 58, 32, 108, 111, 103, 112, 114, 111, 98, 115, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 34, 109, 97, 120, 95, 116, 111, 107, 101, 110, 115, 34, 58, 32, 109, 97, 120, 95, 116, 111, 107, 101, 110, 115, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 34, 110, 34, 58, 32, 110, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 34, 112, 114, 101, 115, 101, 110, 99, 101, 95, 112, 101, 110, 97, 108, 116, 121, 34, 58, 32, 112, 114, 101, 115, 101, 110, 99, 101, 95, 112, 101, 110, 97, 108, 116, 121, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 34, 114, 101, 115, 112, 111, 110, 115, 101, 95, 102, 111, 114, 109, 97, 116, 34, 58, 32, 114, 101, 115, 112, 111, 110, 115, 101, 95, 102, 111, 114, 109, 97, 116, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 34, 115, 101, 101, 100, 34, 58, 32, 115, 101, 101, 100, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 34, 115, 116, 111, 112, 34, 58, 32, 115, 116, 111, 112, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 34, 115, 116, 114, 101, 97, 109, 34, 58, 32, 115, 116, 114, 101, 97, 109, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 34, 115, 116, 114, 101, 97, 109, 95, 
111, 112, 116, 105, 111, 110, 115, 34, 58, 32, 115, 116, 114, 101, 97, 109, 95, 111, 112, 116, 105, 111, 110, 115, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 34, 116, 101, 109, 112, 101, 114, 97, 116, 117, 114, 101, 34, 58, 32, 116, 101, 109, 112, 101, 114, 97, 116, 117, 114, 101, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 34, 116, 111, 111, 108, 95, 99, 104, 111, 105, 99, 101, 34, 58, 32, 116, 111, 111, 108, 95, 99, 104, 111, 105, 99, 101, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 34, 116, 111, 111, 108, 115, 34, 58, 32, 116, 111, 111, 108, 115, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 34, 116, 111, 112, 95, 108, 111, 103, 112, 114, 111, 98, 115, 34, 58, 32, 116, 111, 112, 95, 108, 111, 103, 112, 114, 111, 98, 115, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 34, 116, 111, 112, 95, 112, 34, 58, 32, 116, 111, 112, 95, 112, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 34, 117, 115, 101, 114, 34, 58, 32, 117, 115, 101, 114, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 125, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 99, 111, 109, 112, 108, 101, 116, 105, 111, 110, 95, 99, 114, 101, 97, 116, 101, 95, 112, 97, 114, 97, 109, 115, 46, 67, 111, 109, 112, 108, 101, 116, 105, 111, 110, 67, 114, 101, 97, 116, 101, 80, 97, 114, 97, 109, 115, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 41, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 111, 112, 116, 105, 111, 110, 115, 61, 109, 97, 107, 101, 95, 114, 101, 113, 117, 101, 115, 116, 95, 111, 112, 116, 105, 111, 110, 115, 40, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 101, 120, 116, 114, 97, 95, 104, 101, 97, 100, 101, 114, 115, 61, 101, 120, 116, 114, 97, 95, 104, 101, 97, 100, 101, 114, 115, 44, 32, 101, 120, 116, 114, 97, 95, 113, 117, 101, 114, 121, 61, 101, 120, 116, 114, 97, 95, 113, 117, 101, 114, 121, 44, 32, 101, 120, 116, 114, 97, 95, 98, 111, 100, 121, 61, 101, 120, 116, 114, 97, 95, 98, 111, 100, 121, 44, 32, 116, 105, 109, 101, 111, 117, 116, 61, 116, 105, 109, 101, 111, 117, 116, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 41, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 99, 97, 115, 116, 95, 116, 111, 61, 67, 104, 97, 116, 67, 111, 109, 112, 108, 101, 116, 105, 111, 110, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 115, 116, 114, 101, 97, 109, 61, 115, 116, 114, 101, 97, 109, 32, 111, 114, 32, 70, 97, 108, 115, 101, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 115, 116, 114, 101, 97, 109, 95, 99, 108, 115, 61, 83, 116, 114, 101, 97, 109, 91, 67, 104, 97, 116, 67, 111, 109, 112, 108, 101, 116, 105, 111, 110, 67, 104, 117, 110, 107, 93, 44, 10, 32, 32, 32, 32, 32, 32, 32, 32, 41, 10, 96, 96, 96, 10]\n",
|
240 |
+
"length: 5397\n"
|
241 |
+
]
|
242 |
+
}
|
243 |
+
],
|
244 |
+
"source": [
|
245 |
+
"text = \"\"\"Autogen enables the next-gen LLM applications with a generic [multi-agent conversation](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat) framework. It offers customizable and conversable agents that integrate LLMs, tools, and humans.\n",
|
246 |
+
"By automating chat among multiple capable agents, one can easily make them collectively perform tasks autonomously or with human feedback, including tasks that require using tools via code.\n",
|
247 |
+
"\n",
|
248 |
+
"Features of this use case include:\n",
|
249 |
+
"\n",
|
250 |
+
"- **Multi-agent conversations**: AutoGen agents can communicate with each other to solve tasks. This allows for more complex and sophisticated applications than would be possible with a single LLM.\n",
|
251 |
+
"- **Customization**: AutoGen agents can be customized to meet the specific needs of an application. This includes the ability to choose the LLMs to use, the types of human input to allow, and the tools to employ.\n",
|
252 |
+
"- **Human participation**: AutoGen seamlessly allows human participation. This means that humans can provide input and feedback to the agents as needed.\n",
|
253 |
+
"\n",
|
254 |
+
"For [example](https://github.com/microsoft/autogen/blob/main/test/twoagent.py),\n",
|
255 |
+
"\n",
|
256 |
+
"```python\n",
|
257 |
+
"from autogen import AssistantAgent, UserProxyAgent, config_list_from_json\n",
|
258 |
+
"# Load LLM inference endpoints from an env variable or a file\n",
|
259 |
+
"# See https://microsoft.github.io/autogen/docs/FAQ#set-your-api-endpoints\n",
|
260 |
+
"# and OAI_CONFIG_LIST_sample\n",
|
261 |
+
"config_list = config_list_from_json(env_or_file=\"OAI_CONFIG_LIST\")\n",
|
262 |
+
"# You can also set config_list directly as a list, for example, config_list = [{'model': 'gpt-4', 'api_key': '<your OpenAI API key here>'},]\n",
|
263 |
+
"assistant = AssistantAgent(\"assistant\", llm_config={\"config_list\": config_list})\n",
|
264 |
+
"user_proxy = UserProxyAgent(\"user_proxy\", code_execution_config={\"work_dir\": \"coding\", \"use_docker\": False}) # IMPORTANT: set to True to run code in docker, recommended\n",
|
265 |
+
"user_proxy.initiate_chat(assistant, message=\"Plot a chart of NVDA and TESLA stock price change YTD.\")\n",
|
266 |
+
"# This initiates an automated chat between the two agents to solve the task\n",
|
267 |
+
"```\n",
|
268 |
+
"\n",
|
269 |
+
"more python code:\n",
|
270 |
+
"\n",
|
271 |
+
"```python\n",
|
272 |
+
" def create(\n",
|
273 |
+
" self,\n",
|
274 |
+
" *,\n",
|
275 |
+
" messages: Iterable[ChatCompletionMessageParam],\n",
|
276 |
+
" model: Union[str, ChatModel],\n",
|
277 |
+
" frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN,\n",
|
278 |
+
" function_call: completion_create_params.FunctionCall | NotGiven = NOT_GIVEN,\n",
|
279 |
+
" functions: Iterable[completion_create_params.Function] | NotGiven = NOT_GIVEN,\n",
|
280 |
+
" logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN,\n",
|
281 |
+
" logprobs: Optional[bool] | NotGiven = NOT_GIVEN,\n",
|
282 |
+
" max_tokens: Optional[int] | NotGiven = NOT_GIVEN,\n",
|
283 |
+
" n: Optional[int] | NotGiven = NOT_GIVEN,\n",
|
284 |
+
" presence_penalty: Optional[float] | NotGiven = NOT_GIVEN,\n",
|
285 |
+
" response_format: completion_create_params.ResponseFormat | NotGiven = NOT_GIVEN,\n",
|
286 |
+
" seed: Optional[int] | NotGiven = NOT_GIVEN,\n",
|
287 |
+
" stop: Union[Optional[str], List[str]] | NotGiven = NOT_GIVEN,\n",
|
288 |
+
" stream: Optional[Literal[False]] | Literal[True] | NotGiven = NOT_GIVEN,\n",
|
289 |
+
" stream_options: Optional[ChatCompletionStreamOptionsParam] | NotGiven = NOT_GIVEN,\n",
|
290 |
+
" temperature: Optional[float] | NotGiven = NOT_GIVEN,\n",
|
291 |
+
" tool_choice: ChatCompletionToolChoiceOptionParam | NotGiven = NOT_GIVEN,\n",
|
292 |
+
" tools: Iterable[ChatCompletionToolParam] | NotGiven = NOT_GIVEN,\n",
|
293 |
+
" top_logprobs: Optional[int] | NotGiven = NOT_GIVEN,\n",
|
294 |
+
" top_p: Optional[float] | NotGiven = NOT_GIVEN,\n",
|
295 |
+
" user: str | NotGiven = NOT_GIVEN,\n",
|
296 |
+
" # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.\n",
|
297 |
+
" # The extra values given here take precedence over values defined on the client or passed to this method.\n",
|
298 |
+
" extra_headers: Headers | None = None,\n",
|
299 |
+
" extra_query: Query | None = None,\n",
|
300 |
+
" extra_body: Body | None = None,\n",
|
301 |
+
" timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,\n",
|
302 |
+
" ) -> ChatCompletion | Stream[ChatCompletionChunk]:\n",
|
303 |
+
" return self._post(\n",
|
304 |
+
" \"/chat/completions\",\n",
|
305 |
+
" body=maybe_transform(\n",
|
306 |
+
" {\n",
|
307 |
+
" \"messages\": messages,\n",
|
308 |
+
" \"model\": model,\n",
|
309 |
+
" \"frequency_penalty\": frequency_penalty,\n",
|
310 |
+
" \"function_call\": function_call,\n",
|
311 |
+
" \"functions\": functions,\n",
|
312 |
+
" \"logit_bias\": logit_bias,\n",
|
313 |
+
" \"logprobs\": logprobs,\n",
|
314 |
+
" \"max_tokens\": max_tokens,\n",
|
315 |
+
" \"n\": n,\n",
|
316 |
+
" \"presence_penalty\": presence_penalty,\n",
|
317 |
+
" \"response_format\": response_format,\n",
|
318 |
+
" \"seed\": seed,\n",
|
319 |
+
" \"stop\": stop,\n",
|
320 |
+
" \"stream\": stream,\n",
|
321 |
+
" \"stream_options\": stream_options,\n",
|
322 |
+
" \"temperature\": temperature,\n",
|
323 |
+
" \"tool_choice\": tool_choice,\n",
|
324 |
+
" \"tools\": tools,\n",
|
325 |
+
" \"top_logprobs\": top_logprobs,\n",
|
326 |
+
" \"top_p\": top_p,\n",
|
327 |
+
" \"user\": user,\n",
|
328 |
+
" },\n",
|
329 |
+
" completion_create_params.CompletionCreateParams,\n",
|
330 |
+
" ),\n",
|
331 |
+
" options=make_request_options(\n",
|
332 |
+
" extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout\n",
|
333 |
+
" ),\n",
|
334 |
+
" cast_to=ChatCompletion,\n",
|
335 |
+
" stream=stream or False,\n",
|
336 |
+
" stream_cls=Stream[ChatCompletionChunk],\n",
|
337 |
+
" )\n",
|
338 |
+
"```\n",
|
339 |
+
"\"\"\"\n",
|
340 |
+
"tokens = text.encode('utf-8') # which will produce raw byte strings\n",
|
341 |
+
"tokens = list(map(int, tokens)) # convert the byte strings to integers\n",
|
342 |
+
"print('----')\n",
|
343 |
+
"print(text)\n",
|
344 |
+
"print('length:', len(text))\n",
|
345 |
+
"print('----')\n",
|
346 |
+
"print(tokens)\n",
|
347 |
+
"print('length:', len(tokens))\n"
|
348 |
+
]
|
349 |
+
},
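A quick aside on the encoding step in the cell above: UTF-8 maps each ASCII character to a single byte and any other character to two to four bytes, so the printed lengths count bytes, not characters. A minimal sketch (the sample string here is hypothetical, not the notebook's corpus):

```python
# UTF-8 yields one byte per ASCII character, 2-4 bytes otherwise,
# so the byte-level token count can exceed the character count.
sample = "Autogen é"             # hypothetical sample string
raw = sample.encode("utf-8")     # bytes object
print(list(raw))                 # [65, 117, 116, 111, 103, 101, 110, 32, 195, 169]
print(len(sample), len(raw))     # 9 characters, 10 bytes
```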
|
350 |
+
{
|
351 |
+
"cell_type": "code",
|
352 |
+
"execution_count": 11,
|
353 |
+
"metadata": {},
|
354 |
+
"outputs": [
|
355 |
+
{
|
356 |
+
"name": "stdout",
|
357 |
+
"output_type": "stream",
|
358 |
+
"text": [
|
359 |
+
"[(770, (32, 32)), (86, (111, 110)), (73, (101, 110)), (66, (10, 32)), (65, (116, 105)), (57, (44, 10)), (56, (105, 111)), (55, (58, 32)), (55, (32, 116)), (52, (97, 116)), (50, (116, 111)), (50, (101, 32)), (48, (32, 78)), (47, (110, 32)), (44, (114, 101)), (40, (115, 116)), (40, (32, 97)), (38, (115, 32)), (38, (101, 114)), (36, (115, 101)), (35, (97, 108)), (35, (32, 99)), (34, (108, 101)), (32, (116, 104)), (31, (114, 97)), (30, (97, 110)), (29, (110, 116)), (28, (118, 101)), (28, (116, 114)), (28, (111, 109)), (28, (97, 109)), (27, (124, 32)), (27, (103, 101)), (27, (101, 97)), (27, (99, 111)), (27, (78, 111)), (27, (61, 32)), (27, (32, 124)), (27, (32, 61)), (26, (116, 32)), (26, (101, 115)), (25, (105, 110)), (24, (110, 115)), (24, (34, 58)), (24, (32, 34)), (23, (116, 101)), (23, (112, 108)), (23, (109, 112)), (22, (111, 116)), (22, (105, 118)), (22, (101, 116)), (22, (44, 32)), (22, (32, 115)), (21, (112, 116)), (21, (110, 97)), (21, (108, 111)), (21, (105, 115)), (21, (104, 97)), (21, (100, 101)), (21, (84, 95)), (20, (116, 71)), (20, (104, 101)), (20, (95, 71)), (20, (86, 69)), (20, (79, 84)), (20, (78, 79)), (20, (78, 44)), (20, (73, 86)), (20, (71, 105)), (20, (71, 73)), (20, (69, 78)), (19, (114, 111)), (19, (111, 114)), (19, (110, 99)), (19, (109, 97)), (18, (117, 116)), (18, (105, 99)), (18, (97, 115)), (17, (97, 114)), (17, (95, 112)), (16, (111, 108)), (16, (111, 100)), (16, (105, 116)), (16, (101, 100)), (16, (99, 97)), (16, (93, 32)), (16, (79, 112)), (15, (112, 114)), (15, (111, 112)), (15, (111, 32)), (15, (109, 101)), (15, (108, 91)), (15, (101, 120)), (15, (101, 95)), (15, (100, 32)), (15, (97, 103)), (15, (95, 99)), (15, (32, 102)), (14, (117, 115)), (14, (115, 115)), (14, (111, 103)), (14, (110, 101)), (14, (32, 111)), (14, (32, 101)), (14, (32, 79)), (13, (115, 44)), (13, (112, 101)), (13, (111, 111)), (13, (108, 105)), (13, (102, 105)), (13, (32, 112)), (13, (32, 109)), (12, (117, 110)), (12, (117, 101)), (12, (115, 58)), (12, (112, 97)), (12, (99, 104)), (12, (67, 104)), (12, (32, 105)), (11, (121, 32)), (11, (120, 116)), (11, (114, 32)), (11, (108, 115)), (11, (101, 101)), (11, (99, 116)), (11, (99, 101)), (11, (98, 108)), (10, (116, 97)), (10, (111, 117)), (10, (110, 102)), (10, (110, 100)), (10, (108, 108)), (10, (107, 101)), (10, (104, 117)), (10, (97, 98)), (10, (95, 108)), (9, (116, 115)), (9, (116, 93)), (9, (115, 111)), (9, (115, 105)), (9, (115, 97)), (9, (115, 34)), (9, (114, 115)), (9, (112, 111)), (9, (108, 116)), (9, (105, 103)), (9, (104, 111)), (9, (97, 95)), (9, (67, 111)), (9, (32, 104)), (8, (116, 121)), (8, (116, 95)), (8, (116, 67)), (8, (113, 117)), (8, (111, 102)), (8, (110, 103)), (8, (110, 95)), (8, (109, 111)), (8, (105, 97)), (8, (102, 114)), (8, (102, 111)), (8, (101, 108)), (8, (101, 44)), (8, (99, 114)), (8, (97, 32)), (8, (96, 96)), (8, (35, 32)), (8, (32, 76)), (7, (117, 114)), (7, (117, 109)), (7, (115, 46)), (7, (111, 98)), (7, (111, 97)), (7, (109, 95)), (7, (104, 105)), (7, (103, 112)), (7, (103, 105)), (7, (103, 95)), (7, (97, 117)), (7, (46, 10)), (7, (32, 84)), (7, (32, 65)), (6, (114, 109)), (6, (112, 95)), (6, (111, 115)), (6, (111, 105)), (6, (109, 105)), (6, (109, 32)), (6, (102, 117)), (6, (102, 32)), (6, (101, 99)), (6, (98, 115)), (6, (97, 112)), (6, (97, 100)), (6, (95, 102)), (6, (95, 98)), (6, (42, 42)), (6, (32, 114)), (6, (32, 110)), (6, (32, 108)), (6, (10, 10)), (5, (120, 121)), (5, (119, 111)), (5, (119, 105)), (5, (116, 117)), (5, (116, 44)), (5, (115, 112)), (5, (111, 120)), (5, (111, 99)), (5, (110, 118)), 
(5, (110, 105)), (5, (109, 115)), (5, (108, 121)), (5, (108, 117)), (5, (105, 109)), (5, (105, 108)), (5, (104, 32)), (5, (103, 32)), (5, (102, 108)), (5, (101, 113)), (5, (101, 109)), (5, (100, 121)), (5, (100, 105)), (5, (99, 108)), (5, (99, 107)), (5, (98, 111)), (5, (95, 116)), (5, (95, 111)), (5, (91, 67)), (5, (84, 104)), (5, (80, 97)), (5, (76, 77)), (5, (76, 76)), (5, (34, 116)), (5, (32, 119)), (5, (32, 118)), (5, (32, 117)), (5, (32, 85)), (5, (32, 73)), (5, (10, 35)), (4, (121, 58)), (4, (121, 44)), (4, (118, 97)), (4, (117, 108)), (4, (116, 116)), (4, (116, 112)), (4, (116, 40)), (4, (115, 107)), (4, (114, 121)), (4, (114, 116)), (4, (114, 95)), (4, (114, 44)), (4, (112, 121)), (4, (111, 119)), (4, (110, 67)), (4, (110, 47)), (4, (104, 116)), (4, (102, 101)), (4, (101, 111)), (4, (101, 58)), (4, (100, 111)), (4, (98, 105)), (4, (98, 101)), (4, (93, 44)), (4, (91, 115)), (4, (91, 105)), (4, (91, 102)), (4, (85, 115)), (4, (73, 116)), (4, (65, 117)), (4, (65, 103)), (4, (47, 109)), (4, (47, 97)), (4, (46, 32)), (4, (41, 10)), (4, (40, 10)), (4, (34, 115)), (4, (34, 44)), (4, (32, 100)), (4, (32, 98)), (4, (32, 42)), (4, (32, 41)), (4, (10, 96)), (3, (121, 116)), (3, (121, 111)), (3, (121, 95)), (3, (121, 61)), (3, (121, 34)), (3, (120, 95)), (3, (118, 105)), (3, (117, 100)), (3, (117, 98)), (3, (116, 119)), (3, (116, 47)), (3, (116, 46)), (3, (116, 45)), (3, (116, 34)), (3, (115, 61)), (3, (115, 47)), (3, (114, 117)), (3, (114, 105)), (3, (114, 34)), (3, (112, 115)), (3, (112, 112)), (3, (111, 107)), (3, (111, 71)), (3, (110, 10)), (3, (109, 117)), (3, (109, 93)), (3, (108, 95)), (3, (107, 115)), (3, (105, 122)), (3, (105, 114)), (3, (105, 112)), (3, (105, 45)), (3, (102, 116)), (3, (101, 93)), (3, (101, 91)), (3, (99, 121)), (3, (99, 117)), (3, (99, 105)), (3, (98, 46)), (3, (97, 120)), (3, (97, 107)), (3, (97, 99)), (3, (95, 113)), (3, (95, 104)), (3, (93, 93)), (3, (83, 116)), (3, (76, 105)), (3, (73, 32)), (3, (71, 101)), (3, (70, 97)), (3, (65, 73)), (3, (61, 101)), (3, (58, 47)), (3, (58, 10)), (3, (47, 47)), (3, (46, 105)), (3, (45, 97)), (3, (45, 32)), (3, (42, 58)), (3, (41, 44)), (3, (41, 32)), (3, (34, 117)), (3, (34, 109)), (3, (34, 102)), (3, (32, 91)), (3, (32, 67)), (3, (32, 39)), (3, (32, 35)), (3, (10, 45)), (2, (125, 44)), (2, (125, 41)), (2, (123, 34)), (2, (122, 97)), (2, (121, 65)), (2, (121, 46)), (2, (120, 97)), (2, (119, 115)), (2, (117, 32)), (2, (116, 91)), (2, (116, 65)), (2, (116, 58)), (2, (115, 108)), (2, (114, 107)), (2, (114, 103)), (2, (114, 93)), (2, (114, 80)), (2, (112, 117)), (2, (112, 105)), (2, (112, 58)), (2, (112, 44)), (2, (112, 34)), (2, (111, 118)), (2, (111, 47)), (2, (110, 112)), (2, (110, 107)), (2, (110, 93)), (2, (110, 91)), (2, (110, 84)), (2, (110, 46)), (2, (110, 44)), (2, (110, 42)), (2, (109, 109)), (2, (109, 91)), (2, (108, 118)), (2, (108, 102)), (2, (108, 93)), (2, (108, 58)), (2, (108, 44)), (2, (108, 34)), (2, (108, 32)), (2, (107, 93)), (2, (107, 32)), (2, (106, 115)), (2, (105, 102)), (2, (103, 61)), (2, (101, 121)), (2, (101, 102)), (2, (101, 80)), (2, (101, 61)), (2, (101, 34)), (2, (101, 10)), (2, (100, 112)), (2, (100, 98)), (2, (100, 46)), (2, (99, 115)), (2, (99, 32)), (2, (98, 97)), (2, (97, 105)), (2, (96, 112)), (2, (96, 10)), (2, (95, 106)), (2, (95, 100)), (2, (95, 76)), (2, (95, 67)), (2, (93, 40)), (2, (85, 110)), (2, (84, 114)), (2, (84, 111)), (2, (83, 84)), (2, (80, 114)), (2, (80, 73)), (2, (79, 78)), (2, (79, 65)), (2, (78, 70)), (2, (77, 115)), (2, (77, 32)), (2, (76, 73)), (2, (73, 95)), (2, (73, 
83)), (2, (73, 71)), (2, (71, 95)), (2, (70, 117)), (2, (70, 111)), (2, (70, 73)), (2, (67, 97)), (2, (67, 79)), (2, (65, 115)), (2, (65, 80)), (2, (65, 32)), (2, (61, 123)), (2, (61, 109)), (2, (61, 34)), (2, (47, 116)), (2, (47, 100)), (2, (47, 99)), (2, (46, 103)), (2, (46, 70)), (2, (40, 104)), (2, (40, 34)), (2, (39, 58)), (2, (34, 108)), (2, (34, 99)), (2, (34, 41)), (2, (32, 107)), (2, (32, 103)), (2, (32, 89)), (2, (32, 83)), (2, (32, 70)), (2, (10, 117)), (2, (10, 70)), (1, (123, 39)), (1, (123, 10)), (1, (122, 101)), (1, (121, 112)), (1, (121, 98)), (1, (121, 41)), (1, (121, 39)), (1, (120, 101)), (1, (120, 46)), (1, (120, 32)), (1, (119, 101)), (1, (119, 97)), (1, (119, 44)), (1, (118, 95)), (1, (118, 32)), (1, (117, 105)), (1, (116, 125)), (1, (116, 108)), (1, (116, 77)), (1, (116, 61)), (1, (116, 41)), (1, (116, 10)), (1, (115, 102)), (1, (115, 80)), (1, (115, 42)), (1, (115, 40)), (1, (115, 10)), (1, (114, 110)), (1, (114, 102)), (1, (114, 58)), (1, (114, 45)), (1, (112, 120)), (1, (112, 104)), (1, (111, 121)), (1, (111, 61)), (1, (110, 111)), (1, (110, 83)), (1, (110, 80)), (1, (110, 77)), (1, (110, 65)), (1, (110, 58)), (1, (110, 40)), (1, (110, 39)), (1, (110, 34)), (1, (109, 108)), (1, (109, 79)), (1, (109, 61)), (1, (109, 58)), (1, (109, 47)), (1, (109, 44)), (1, (109, 40)), (1, (109, 34)), (1, (108, 109)), (1, (108, 100)), (1, (108, 97)), (1, (108, 80)), (1, (108, 67)), (1, (108, 39)), (1, (107, 119)), (1, (107, 95)), (1, (107, 46)), (1, (107, 44)), (1, (107, 10)), (1, (105, 101)), (1, (105, 100)), (1, (105, 98)), (1, (105, 95)), (1, (103, 117)), (1, (103, 115)), (1, (103, 114)), (1, (103, 108)), (1, (103, 34)), (1, (102, 102)), (1, (102, 46)), (1, (102, 44)), (1, (101, 125)), (1, (101, 119)), (1, (101, 103)), (1, (101, 79)), (1, (101, 70)), (1, (101, 62)), (1, (101, 46)), (1, (101, 45)), (1, (101, 40)), (1, (100, 115)), (1, (100, 100)), (1, (100, 58)), (1, (100, 44)), (1, (100, 34)), (1, (100, 10)), (1, (98, 47)), (1, (97, 121)), (1, (97, 118)), (1, (95, 115)), (1, (95, 114)), (1, (95, 107)), (1, (95, 101)), (1, (93, 58)), (1, (93, 10)), (1, (91, 123)), (1, (91, 109)), (1, (91, 101)), (1, (91, 99)), (1, (91, 98)), (1, (91, 84)), (1, (91, 79)), (1, (91, 76)), (1, (91, 70)), (1, (91, 68)), (1, (89, 111)), (1, (89, 84)), (1, (86, 68)), (1, (84, 105)), (1, (84, 69)), (1, (84, 68)), (1, (84, 65)), (1, (84, 58)), (1, (84, 34)), (1, (83, 101)), (1, (83, 76)), (1, (82, 101)), (1, (82, 84)), (1, (81, 117)), (1, (81, 35)), (1, (80, 108)), (1, (80, 79)), (1, (79, 82)), (1, (78, 86)), (1, (78, 84)), (1, (77, 117)), (1, (77, 111)), (1, (77, 101)), (1, (77, 80)), (1, (77, 46)), (1, (76, 111)), (1, (76, 65)), (1, (73, 77)), (1, (72, 117)), (1, (72, 101)), (1, (70, 101)), (1, (70, 65)), (1, (69, 83)), (1, (68, 105)), (1, (68, 65)), (1, (68, 46)), (1, (67, 117)), (1, (67, 114)), (1, (66, 121)), (1, (66, 111)), (1, (65, 81)), (1, (65, 78)), (1, (62, 39)), (1, (62, 32)), (1, (61, 116)), (1, (61, 115)), (1, (61, 83)), (1, (61, 67)), (1, (60, 121)), (1, (52, 39)), (1, (47, 103)), (1, (47, 98)), (1, (47, 85)), (1, (47, 70)), (1, (46, 112)), (1, (46, 99)), (1, (46, 95)), (1, (46, 84)), (1, (46, 82)), (1, (46, 67)), (1, (46, 34)), (1, (45, 121)), (1, (45, 103)), (1, (45, 101)), (1, (45, 67)), (1, (45, 62)), (1, (45, 52)), (1, (44, 93)), (1, (42, 77)), (1, (42, 72)), (1, (42, 67)), (1, (42, 44)), (1, (40, 101)), (1, (40, 97)), (1, (39, 125)), (1, (39, 116)), (1, (39, 109)), (1, (39, 103)), (1, (39, 97)), (1, (39, 60)), (1, (39, 44)), (1, (35, 115)), (1, (34, 119)), (1, (34, 114)), (1, (34, 
112)), (1, (34, 110)), (1, (34, 97)), (1, (34, 80)), (1, (34, 79)), (1, (34, 47)), (1, (32, 125)), (1, (32, 123)), (1, (32, 121)), (1, (32, 81)), (1, (32, 72)), (1, (32, 66)), (1, (32, 45)), (1, (10, 109)), (1, (10, 102)), (1, (10, 99)), (1, (10, 97)), (1, (10, 66))]\n"
|
360 |
+
]
|
361 |
+
}
|
362 |
+
],
|
363 |
+
"source": [
|
364 |
+
"def get_stats(ids):\n",
|
365 |
+
" \"\"\"\n",
|
366 |
+
" Get statistics of the token ids. includes the most common token pairs.\n",
|
367 |
+
" \"\"\"\n",
|
368 |
+
" counts = {}\n",
|
369 |
+
" for pair in zip(ids, ids[1:]):\n",
|
370 |
+
" counts[pair] = counts.get(pair, 0) + 1\n",
|
371 |
+
" return counts\n",
|
372 |
+
"\n",
|
373 |
+
"stats = get_stats(tokens)\n",
|
374 |
+
"# print(stats)\n",
|
375 |
+
"print(sorted(((v,k) for k,v in stats.items()), reverse=True))"
|
376 |
+
]
|
377 |
+
},
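As a side note, the same pair counting can be expressed with `collections.Counter` from the standard library; a minimal, equivalent sketch (not used by the notebook itself):

```python
from collections import Counter

def get_stats_counter(ids):
    # Equivalent to the dict-based get_stats above:
    # count every consecutive (left, right) token pair.
    return Counter(zip(ids, ids[1:]))

# most_common(1) yields the top pair directly,
# e.g. [((32, 32), 770)] for the tokens above.
```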
|
378 |
+
{
|
379 |
+
"cell_type": "code",
|
380 |
+
"execution_count": 12,
|
381 |
+
"metadata": {},
|
382 |
+
"outputs": [
|
383 |
+
{
|
384 |
+
"data": {
|
385 |
+
"text/plain": [
|
386 |
+
"(' ', ' ')"
|
387 |
+
]
|
388 |
+
},
|
389 |
+
"execution_count": 12,
|
390 |
+
"metadata": {},
|
391 |
+
"output_type": "execute_result"
|
392 |
+
}
|
393 |
+
],
|
394 |
+
"source": [
|
395 |
+
"chr(32), chr(32) # the space character is the most common character in the text"
|
396 |
+
]
|
397 |
+
},
|
398 |
+
{
|
399 |
+
"cell_type": "code",
|
400 |
+
"execution_count": 13,
|
401 |
+
"metadata": {},
|
402 |
+
"outputs": [
|
403 |
+
{
|
404 |
+
"name": "stdout",
|
405 |
+
"output_type": "stream",
|
406 |
+
"text": [
|
407 |
+
"[65, 117, 116, 111, 103, 101, 110, 32, 101, 110, 97, 98, 108, 101, 115, 32, 116, 104, 101, 32, 110, 101, 120, 116, 45, 103, 101, 110, 32, 76, 76, 77, 32, 97, 112, 112, 108, 105, 99, 97, 116, 105, 111, 110, 115, 32, 119, 105, 116, 104, 32, 97, 32, 103, 101, 110, 101, 114, 105, 99, 32, 91, 109, 117, 108, 116, 105, 45, 97, 103, 101, 110, 116, 32, 99, 111, 110, 118, 101, 114, 115, 97, 116, 105, 111, 110, 93, 40, 104, 116, 116, 112, 115, 58, 47, 47, 109, 105, 99, 114, 111, 115, 111, 102, 116, 46, 103, 105, 116, 104, 117, 98, 46, 105, 111, 47, 97, 117, 116, 111, 103, 101, 110, 47, 100, 111, 99, 115, 47, 85, 115, 101, 45, 67, 97, 115, 101, 115, 47, 97, 103, 101, 110, 116, 95, 99, 104, 97, 116, 41, 32, 102, 114, 97, 109, 101, 119, 111, 114, 107, 46, 32, 73, 116, 32, 111, 102, 102, 101, 114, 115, 32, 99, 117, 115, 116, 111, 109, 105, 122, 97, 98, 108, 101, 32, 97, 110, 100, 32, 99, 111, 110, 118, 101, 114, 115, 97, 98, 108, 101, 32, 97, 103, 101, 110, 116, 115, 32, 116, 104, 97, 116, 32, 105, 110, 116, 101, 103, 114, 97, 116, 101, 32, 76, 76, 77, 115, 44, 32, 116, 111, 111, 108, 115, 44, 32, 97, 110, 100, 32, 104, 117, 109, 97, 110, 115, 46, 10, 66, 121, 32, 97, 117, 116, 111, 109, 97, 116, 105, 110, 103, 32, 99, 104, 97, 116, 32, 97, 109, 111, 110, 103, 32, 109, 117, 108, 116, 105, 112, 108, 101, 32, 99, 97, 112, 97, 98, 108, 101, 32, 97, 103, 101, 110, 116, 115, 44, 32, 111, 110, 101, 32, 99, 97, 110, 32, 101, 97, 115, 105, 108, 121, 32, 109, 97, 107, 101, 32, 116, 104, 101, 109, 32, 99, 111, 108, 108, 101, 99, 116, 105, 118, 101, 108, 121, 32, 112, 101, 114, 102, 111, 114, 109, 32, 116, 97, 115, 107, 115, 32, 97, 117, 116, 111, 110, 111, 109, 111, 117, 115, 108, 121, 32, 111, 114, 32, 119, 105, 116, 104, 32, 104, 117, 109, 97, 110, 32, 102, 101, 101, 100, 98, 97, 99, 107, 44, 32, 105, 110, 99, 108, 117, 100, 105, 110, 103, 32, 116, 97, 115, 107, 115, 32, 116, 104, 97, 116, 32, 114, 101, 113, 117, 105, 114, 101, 32, 117, 115, 105, 110, 103, 32, 116, 111, 111, 108, 115, 32, 118, 105, 97, 32, 99, 111, 100, 101, 46, 10, 10, 70, 101, 97, 116, 117, 114, 101, 115, 32, 111, 102, 32, 116, 104, 105, 115, 32, 117, 115, 101, 32, 99, 97, 115, 101, 32, 105, 110, 99, 108, 117, 100, 101, 58, 10, 10, 45, 32, 42, 42, 77, 117, 108, 116, 105, 45, 97, 103, 101, 110, 116, 32, 99, 111, 110, 118, 101, 114, 115, 97, 116, 105, 111, 110, 115, 42, 42, 58, 32, 65, 117, 116, 111, 71, 101, 110, 32, 97, 103, 101, 110, 116, 115, 32, 99, 97, 110, 32, 99, 111, 109, 109, 117, 110, 105, 99, 97, 116, 101, 32, 119, 105, 116, 104, 32, 101, 97, 99, 104, 32, 111, 116, 104, 101, 114, 32, 116, 111, 32, 115, 111, 108, 118, 101, 32, 116, 97, 115, 107, 115, 46, 32, 84, 104, 105, 115, 32, 97, 108, 108, 111, 119, 115, 32, 102, 111, 114, 32, 109, 111, 114, 101, 32, 99, 111, 109, 112, 108, 101, 120, 32, 97, 110, 100, 32, 115, 111, 112, 104, 105, 115, 116, 105, 99, 97, 116, 101, 100, 32, 97, 112, 112, 108, 105, 99, 97, 116, 105, 111, 110, 115, 32, 116, 104, 97, 110, 32, 119, 111, 117, 108, 100, 32, 98, 101, 32, 112, 111, 115, 115, 105, 98, 108, 101, 32, 119, 105, 116, 104, 32, 97, 32, 115, 105, 110, 103, 108, 101, 32, 76, 76, 77, 46, 10, 45, 32, 42, 42, 67, 117, 115, 116, 111, 109, 105, 122, 97, 116, 105, 111, 110, 42, 42, 58, 32, 65, 117, 116, 111, 71, 101, 110, 32, 97, 103, 101, 110, 116, 115, 32, 99, 97, 110, 32, 98, 101, 32, 99, 117, 115, 116, 111, 109, 105, 122, 101, 100, 32, 116, 111, 32, 109, 101, 101, 116, 32, 116, 104, 101, 32, 115, 112, 101, 99, 105, 102, 105, 99, 32, 110, 101, 101, 100, 115, 32, 111, 102, 32, 97, 110, 32, 97, 112, 
112, 108, 105, 99, 97, 116, 105, 111, 110, 46, 32, 84, 104, 105, 115, 32, 105, 110, 99, 108, 117, 100, 101, 115, 32, 116, 104, 101, 32, 97, 98, 105, 108, 105, 116, 121, 32, 116, 111, 32, 99, 104, 111, 111, 115, 101, 32, 116, 104, 101, 32, 76, 76, 77, 115, 32, 116, 111, 32, 117, 115, 101, 44, 32, 116, 104, 101, 32, 116, 121, 112, 101, 115, 32, 111, 102, 32, 104, 117, 109, 97, 110, 32, 105, 110, 112, 117, 116, 32, 116, 111, 32, 97, 108, 108, 111, 119, 44, 32, 97, 110, 100, 32, 116, 104, 101, 32, 116, 111, 111, 108, 115, 32, 116, 111, 32, 101, 109, 112, 108, 111, 121, 46, 10, 45, 32, 42, 42, 72, 117, 109, 97, 110, 32, 112, 97, 114, 116, 105, 99, 105, 112, 97, 116, 105, 111, 110, 42, 42, 58, 32, 65, 117, 116, 111, 71, 101, 110, 32, 115, 101, 97, 109, 108, 101, 115, 115, 108, 121, 32, 97, 108, 108, 111, 119, 115, 32, 104, 117, 109, 97, 110, 32, 112, 97, 114, 116, 105, 99, 105, 112, 97, 116, 105, 111, 110, 46, 32, 84, 104, 105, 115, 32, 109, 101, 97, 110, 115, 32, 116, 104, 97, 116, 32, 104, 117, 109, 97, 110, 115, 32, 99, 97, 110, 32, 112, 114, 111, 118, 105, 100, 101, 32, 105, 110, 112, 117, 116, 32, 97, 110, 100, 32, 102, 101, 101, 100, 98, 97, 99, 107, 32, 116, 111, 32, 116, 104, 101, 32, 97, 103, 101, 110, 116, 115, 32, 97, 115, 32, 110, 101, 101, 100, 101, 100, 46, 10, 10, 70, 111, 114, 32, 91, 101, 120, 97, 109, 112, 108, 101, 93, 40, 104, 116, 116, 112, 115, 58, 47, 47, 103, 105, 116, 104, 117, 98, 46, 99, 111, 109, 47, 109, 105, 99, 114, 111, 115, 111, 102, 116, 47, 97, 117, 116, 111, 103, 101, 110, 47, 98, 108, 111, 98, 47, 109, 97, 105, 110, 47, 116, 101, 115, 116, 47, 116, 119, 111, 97, 103, 101, 110, 116, 46, 112, 121, 41, 44, 10, 10, 96, 96, 96, 112, 121, 116, 104, 111, 110, 10, 102, 114, 111, 109, 32, 97, 117, 116, 111, 103, 101, 110, 32, 105, 109, 112, 111, 114, 116, 32, 65, 115, 115, 105, 115, 116, 97, 110, 116, 65, 103, 101, 110, 116, 44, 32, 85, 115, 101, 114, 80, 114, 111, 120, 121, 65, 103, 101, 110, 116, 44, 32, 99, 111, 110, 102, 105, 103, 95, 108, 105, 115, 116, 95, 102, 114, 111, 109, 95, 106, 115, 111, 110, 10, 35, 32, 76, 111, 97, 100, 32, 76, 76, 77, 32, 105, 110, 102, 101, 114, 101, 110, 99, 101, 32, 101, 110, 100, 112, 111, 105, 110, 116, 115, 32, 102, 114, 111, 109, 32, 97, 110, 32, 101, 110, 118, 32, 118, 97, 114, 105, 97, 98, 108, 101, 32, 111, 114, 32, 97, 32, 102, 105, 108, 101, 10, 35, 32, 83, 101, 101, 32, 104, 116, 116, 112, 115, 58, 47, 47, 109, 105, 99, 114, 111, 115, 111, 102, 116, 46, 103, 105, 116, 104, 117, 98, 46, 105, 111, 47, 97, 117, 116, 111, 103, 101, 110, 47, 100, 111, 99, 115, 47, 70, 65, 81, 35, 115, 101, 116, 45, 121, 111, 117, 114, 45, 97, 112, 105, 45, 101, 110, 100, 112, 111, 105, 110, 116, 115, 10, 35, 32, 97, 110, 100, 32, 79, 65, 73, 95, 67, 79, 78, 70, 73, 71, 95, 76, 73, 83, 84, 95, 115, 97, 109, 112, 108, 101, 10, 99, 111, 110, 102, 105, 103, 95, 108, 105, 115, 116, 32, 61, 32, 99, 111, 110, 102, 105, 103, 95, 108, 105, 115, 116, 95, 102, 114, 111, 109, 95, 106, 115, 111, 110, 40, 101, 110, 118, 95, 111, 114, 95, 102, 105, 108, 101, 61, 34, 79, 65, 73, 95, 67, 79, 78, 70, 73, 71, 95, 76, 73, 83, 84, 34, 41, 10, 35, 32, 89, 111, 117, 32, 99, 97, 110, 32, 97, 108, 115, 111, 32, 115, 101, 116, 32, 99, 111, 110, 102, 105, 103, 95, 108, 105, 115, 116, 32, 100, 105, 114, 101, 99, 116, 108, 121, 32, 97, 115, 32, 97, 32, 108, 105, 115, 116, 44, 32, 102, 111, 114, 32, 101, 120, 97, 109, 112, 108, 101, 44, 32, 99, 111, 110, 102, 105, 103, 95, 108, 105, 115, 116, 32, 61, 32, 91, 123, 39, 109, 111, 100, 101, 108, 39, 58, 32, 39, 103, 112, 116, 
45, 52, 39, 44, 32, 39, 97, 112, 105, 95, 107, 101, 121, 39, 58, 32, 39, 60, 121, 111, 117, 114, 32, 79, 112, 101, 110, 65, 73, 32, 65, 80, 73, 32, 107, 101, 121, 32, 104, 101, 114, 101, 62, 39, 125, 44, 93, 10, 97, 115, 115, 105, 115, 116, 97, 110, 116, 32, 61, 32, 65, 115, 115, 105, 115, 116, 97, 110, 116, 65, 103, 101, 110, 116, 40, 34, 97, 115, 115, 105, 115, 116, 97, 110, 116, 34, 44, 32, 108, 108, 109, 95, 99, 111, 110, 102, 105, 103, 61, 123, 34, 99, 111, 110, 102, 105, 103, 95, 108, 105, 115, 116, 34, 58, 32, 99, 111, 110, 102, 105, 103, 95, 108, 105, 115, 116, 125, 41, 10, 117, 115, 101, 114, 95, 112, 114, 111, 120, 121, 32, 61, 32, 85, 115, 101, 114, 80, 114, 111, 120, 121, 65, 103, 101, 110, 116, 40, 34, 117, 115, 101, 114, 95, 112, 114, 111, 120, 121, 34, 44, 32, 99, 111, 100, 101, 95, 101, 120, 101, 99, 117, 116, 105, 111, 110, 95, 99, 111, 110, 102, 105, 103, 61, 123, 34, 119, 111, 114, 107, 95, 100, 105, 114, 34, 58, 32, 34, 99, 111, 100, 105, 110, 103, 34, 44, 32, 34, 117, 115, 101, 95, 100, 111, 99, 107, 101, 114, 34, 58, 32, 70, 97, 108, 115, 101, 125, 41, 32, 35, 32, 73, 77, 80, 79, 82, 84, 65, 78, 84, 58, 32, 115, 101, 116, 32, 116, 111, 32, 84, 114, 117, 101, 32, 116, 111, 32, 114, 117, 110, 32, 99, 111, 100, 101, 32, 105, 110, 32, 100, 111, 99, 107, 101, 114, 44, 32, 114, 101, 99, 111, 109, 109, 101, 110, 100, 101, 100, 10, 117, 115, 101, 114, 95, 112, 114, 111, 120, 121, 46, 105, 110, 105, 116, 105, 97, 116, 101, 95, 99, 104, 97, 116, 40, 97, 115, 115, 105, 115, 116, 97, 110, 116, 44, 32, 109, 101, 115, 115, 97, 103, 101, 61, 34, 80, 108, 111, 116, 32, 97, 32, 99, 104, 97, 114, 116, 32, 111, 102, 32, 78, 86, 68, 65, 32, 97, 110, 100, 32, 84, 69, 83, 76, 65, 32, 115, 116, 111, 99, 107, 32, 112, 114, 105, 99, 101, 32, 99, 104, 97, 110, 103, 101, 32, 89, 84, 68, 46, 34, 41, 10, 35, 32, 84, 104, 105, 115, 32, 105, 110, 105, 116, 105, 97, 116, 101, 115, 32, 97, 110, 32, 97, 117, 116, 111, 109, 97, 116, 101, 100, 32, 99, 104, 97, 116, 32, 98, 101, 116, 119, 101, 101, 110, 32, 116, 104, 101, 32, 116, 119, 111, 32, 97, 103, 101, 110, 116, 115, 32, 116, 111, 32, 115, 111, 108, 118, 101, 32, 116, 104, 101, 32, 116, 97, 115, 107, 10, 96, 96, 96, 10, 10, 109, 111, 114, 101, 32, 112, 121, 116, 104, 111, 110, 32, 99, 111, 100, 101, 58, 10, 10, 96, 96, 96, 112, 121, 116, 104, 111, 110, 10, 1000, 1000, 100, 101, 102, 32, 99, 114, 101, 97, 116, 101, 40, 10, 1000, 1000, 1000, 1000, 115, 101, 108, 102, 44, 10, 1000, 1000, 1000, 1000, 42, 44, 10, 1000, 1000, 1000, 1000, 109, 101, 115, 115, 97, 103, 101, 115, 58, 32, 73, 116, 101, 114, 97, 98, 108, 101, 91, 67, 104, 97, 116, 67, 111, 109, 112, 108, 101, 116, 105, 111, 110, 77, 101, 115, 115, 97, 103, 101, 80, 97, 114, 97, 109, 93, 44, 10, 1000, 1000, 1000, 1000, 109, 111, 100, 101, 108, 58, 32, 85, 110, 105, 111, 110, 91, 115, 116, 114, 44, 32, 67, 104, 97, 116, 77, 111, 100, 101, 108, 93, 44, 10, 1000, 1000, 1000, 1000, 102, 114, 101, 113, 117, 101, 110, 99, 121, 95, 112, 101, 110, 97, 108, 116, 121, 58, 32, 79, 112, 116, 105, 111, 110, 97, 108, 91, 102, 108, 111, 97, 116, 93, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 32, 61, 32, 78, 79, 84, 95, 71, 73, 86, 69, 78, 44, 10, 1000, 1000, 1000, 1000, 102, 117, 110, 99, 116, 105, 111, 110, 95, 99, 97, 108, 108, 58, 32, 99, 111, 109, 112, 108, 101, 116, 105, 111, 110, 95, 99, 114, 101, 97, 116, 101, 95, 112, 97, 114, 97, 109, 115, 46, 70, 117, 110, 99, 116, 105, 111, 110, 67, 97, 108, 108, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 32, 61, 32, 78, 79, 84, 95, 71, 73, 86, 69, 
78, 44, 10, 1000, 1000, 1000, 1000, 102, 117, 110, 99, 116, 105, 111, 110, 115, 58, 32, 73, 116, 101, 114, 97, 98, 108, 101, 91, 99, 111, 109, 112, 108, 101, 116, 105, 111, 110, 95, 99, 114, 101, 97, 116, 101, 95, 112, 97, 114, 97, 109, 115, 46, 70, 117, 110, 99, 116, 105, 111, 110, 93, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 32, 61, 32, 78, 79, 84, 95, 71, 73, 86, 69, 78, 44, 10, 1000, 1000, 1000, 1000, 108, 111, 103, 105, 116, 95, 98, 105, 97, 115, 58, 32, 79, 112, 116, 105, 111, 110, 97, 108, 91, 68, 105, 99, 116, 91, 115, 116, 114, 44, 32, 105, 110, 116, 93, 93, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 32, 61, 32, 78, 79, 84, 95, 71, 73, 86, 69, 78, 44, 10, 1000, 1000, 1000, 1000, 108, 111, 103, 112, 114, 111, 98, 115, 58, 32, 79, 112, 116, 105, 111, 110, 97, 108, 91, 98, 111, 111, 108, 93, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 32, 61, 32, 78, 79, 84, 95, 71, 73, 86, 69, 78, 44, 10, 1000, 1000, 1000, 1000, 109, 97, 120, 95, 116, 111, 107, 101, 110, 115, 58, 32, 79, 112, 116, 105, 111, 110, 97, 108, 91, 105, 110, 116, 93, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 32, 61, 32, 78, 79, 84, 95, 71, 73, 86, 69, 78, 44, 10, 1000, 1000, 1000, 1000, 110, 58, 32, 79, 112, 116, 105, 111, 110, 97, 108, 91, 105, 110, 116, 93, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 32, 61, 32, 78, 79, 84, 95, 71, 73, 86, 69, 78, 44, 10, 1000, 1000, 1000, 1000, 112, 114, 101, 115, 101, 110, 99, 101, 95, 112, 101, 110, 97, 108, 116, 121, 58, 32, 79, 112, 116, 105, 111, 110, 97, 108, 91, 102, 108, 111, 97, 116, 93, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 32, 61, 32, 78, 79, 84, 95, 71, 73, 86, 69, 78, 44, 10, 1000, 1000, 1000, 1000, 114, 101, 115, 112, 111, 110, 115, 101, 95, 102, 111, 114, 109, 97, 116, 58, 32, 99, 111, 109, 112, 108, 101, 116, 105, 111, 110, 95, 99, 114, 101, 97, 116, 101, 95, 112, 97, 114, 97, 109, 115, 46, 82, 101, 115, 112, 111, 110, 115, 101, 70, 111, 114, 109, 97, 116, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 32, 61, 32, 78, 79, 84, 95, 71, 73, 86, 69, 78, 44, 10, 1000, 1000, 1000, 1000, 115, 101, 101, 100, 58, 32, 79, 112, 116, 105, 111, 110, 97, 108, 91, 105, 110, 116, 93, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 32, 61, 32, 78, 79, 84, 95, 71, 73, 86, 69, 78, 44, 10, 1000, 1000, 1000, 1000, 115, 116, 111, 112, 58, 32, 85, 110, 105, 111, 110, 91, 79, 112, 116, 105, 111, 110, 97, 108, 91, 115, 116, 114, 93, 44, 32, 76, 105, 115, 116, 91, 115, 116, 114, 93, 93, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 32, 61, 32, 78, 79, 84, 95, 71, 73, 86, 69, 78, 44, 10, 1000, 1000, 1000, 1000, 115, 116, 114, 101, 97, 109, 58, 32, 79, 112, 116, 105, 111, 110, 97, 108, 91, 76, 105, 116, 101, 114, 97, 108, 91, 70, 97, 108, 115, 101, 93, 93, 32, 124, 32, 76, 105, 116, 101, 114, 97, 108, 91, 84, 114, 117, 101, 93, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 32, 61, 32, 78, 79, 84, 95, 71, 73, 86, 69, 78, 44, 10, 1000, 1000, 1000, 1000, 115, 116, 114, 101, 97, 109, 95, 111, 112, 116, 105, 111, 110, 115, 58, 32, 79, 112, 116, 105, 111, 110, 97, 108, 91, 67, 104, 97, 116, 67, 111, 109, 112, 108, 101, 116, 105, 111, 110, 83, 116, 114, 101, 97, 109, 79, 112, 116, 105, 111, 110, 115, 80, 97, 114, 97, 109, 93, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 32, 61, 32, 78, 79, 84, 95, 71, 73, 86, 69, 78, 44, 10, 1000, 1000, 1000, 1000, 116, 101, 109, 112, 101, 114, 97, 116, 117, 114, 101, 58, 32, 79, 112, 116, 105, 111, 110, 97, 108, 91, 102, 108, 111, 97, 116, 93, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 
32, 61, 32, 78, 79, 84, 95, 71, 73, 86, 69, 78, 44, 10, 1000, 1000, 1000, 1000, 116, 111, 111, 108, 95, 99, 104, 111, 105, 99, 101, 58, 32, 67, 104, 97, 116, 67, 111, 109, 112, 108, 101, 116, 105, 111, 110, 84, 111, 111, 108, 67, 104, 111, 105, 99, 101, 79, 112, 116, 105, 111, 110, 80, 97, 114, 97, 109, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 32, 61, 32, 78, 79, 84, 95, 71, 73, 86, 69, 78, 44, 10, 1000, 1000, 1000, 1000, 116, 111, 111, 108, 115, 58, 32, 73, 116, 101, 114, 97, 98, 108, 101, 91, 67, 104, 97, 116, 67, 111, 109, 112, 108, 101, 116, 105, 111, 110, 84, 111, 111, 108, 80, 97, 114, 97, 109, 93, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 32, 61, 32, 78, 79, 84, 95, 71, 73, 86, 69, 78, 44, 10, 1000, 1000, 1000, 1000, 116, 111, 112, 95, 108, 111, 103, 112, 114, 111, 98, 115, 58, 32, 79, 112, 116, 105, 111, 110, 97, 108, 91, 105, 110, 116, 93, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 32, 61, 32, 78, 79, 84, 95, 71, 73, 86, 69, 78, 44, 10, 1000, 1000, 1000, 1000, 116, 111, 112, 95, 112, 58, 32, 79, 112, 116, 105, 111, 110, 97, 108, 91, 102, 108, 111, 97, 116, 93, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 32, 61, 32, 78, 79, 84, 95, 71, 73, 86, 69, 78, 44, 10, 1000, 1000, 1000, 1000, 117, 115, 101, 114, 58, 32, 115, 116, 114, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 32, 61, 32, 78, 79, 84, 95, 71, 73, 86, 69, 78, 44, 10, 1000, 1000, 1000, 1000, 35, 32, 85, 115, 101, 32, 116, 104, 101, 32, 102, 111, 108, 108, 111, 119, 105, 110, 103, 32, 97, 114, 103, 117, 109, 101, 110, 116, 115, 32, 105, 102, 32, 121, 111, 117, 32, 110, 101, 101, 100, 32, 116, 111, 32, 112, 97, 115, 115, 32, 97, 100, 100, 105, 116, 105, 111, 110, 97, 108, 32, 112, 97, 114, 97, 109, 101, 116, 101, 114, 115, 32, 116, 111, 32, 116, 104, 101, 32, 65, 80, 73, 32, 116, 104, 97, 116, 32, 97, 114, 101, 110, 39, 116, 32, 97, 118, 97, 105, 108, 97, 98, 108, 101, 32, 118, 105, 97, 32, 107, 119, 97, 114, 103, 115, 46, 10, 1000, 1000, 1000, 1000, 35, 32, 84, 104, 101, 32, 101, 120, 116, 114, 97, 32, 118, 97, 108, 117, 101, 115, 32, 103, 105, 118, 101, 110, 32, 104, 101, 114, 101, 32, 116, 97, 107, 101, 32, 112, 114, 101, 99, 101, 100, 101, 110, 99, 101, 32, 111, 118, 101, 114, 32, 118, 97, 108, 117, 101, 115, 32, 100, 101, 102, 105, 110, 101, 100, 32, 111, 110, 32, 116, 104, 101, 32, 99, 108, 105, 101, 110, 116, 32, 111, 114, 32, 112, 97, 115, 115, 101, 100, 32, 116, 111, 32, 116, 104, 105, 115, 32, 109, 101, 116, 104, 111, 100, 46, 10, 1000, 1000, 1000, 1000, 101, 120, 116, 114, 97, 95, 104, 101, 97, 100, 101, 114, 115, 58, 32, 72, 101, 97, 100, 101, 114, 115, 32, 124, 32, 78, 111, 110, 101, 32, 61, 32, 78, 111, 110, 101, 44, 10, 1000, 1000, 1000, 1000, 101, 120, 116, 114, 97, 95, 113, 117, 101, 114, 121, 58, 32, 81, 117, 101, 114, 121, 32, 124, 32, 78, 111, 110, 101, 32, 61, 32, 78, 111, 110, 101, 44, 10, 1000, 1000, 1000, 1000, 101, 120, 116, 114, 97, 95, 98, 111, 100, 121, 58, 32, 66, 111, 100, 121, 32, 124, 32, 78, 111, 110, 101, 32, 61, 32, 78, 111, 110, 101, 44, 10, 1000, 1000, 1000, 1000, 116, 105, 109, 101, 111, 117, 116, 58, 32, 102, 108, 111, 97, 116, 32, 124, 32, 104, 116, 116, 112, 120, 46, 84, 105, 109, 101, 111, 117, 116, 32, 124, 32, 78, 111, 110, 101, 32, 124, 32, 78, 111, 116, 71, 105, 118, 101, 110, 32, 61, 32, 78, 79, 84, 95, 71, 73, 86, 69, 78, 44, 10, 1000, 1000, 41, 32, 45, 62, 32, 67, 104, 97, 116, 67, 111, 109, 112, 108, 101, 116, 105, 111, 110, 32, 124, 32, 83, 116, 114, 101, 97, 109, 91, 67, 104, 97, 116, 67, 111, 109, 112, 108, 101, 116, 105, 111, 110, 67, 
104, 117, 110, 107, 93, 58, 10, 1000, 1000, 1000, 1000, 114, 101, 116, 117, 114, 110, 32, 115, 101, 108, 102, 46, 95, 112, 111, 115, 116, 40, 10, 1000, 1000, 1000, 1000, 1000, 1000, 34, 47, 99, 104, 97, 116, 47, 99, 111, 109, 112, 108, 101, 116, 105, 111, 110, 115, 34, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 98, 111, 100, 121, 61, 109, 97, 121, 98, 101, 95, 116, 114, 97, 110, 115, 102, 111, 114, 109, 40, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 123, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 34, 109, 101, 115, 115, 97, 103, 101, 115, 34, 58, 32, 109, 101, 115, 115, 97, 103, 101, 115, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 34, 109, 111, 100, 101, 108, 34, 58, 32, 109, 111, 100, 101, 108, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 34, 102, 114, 101, 113, 117, 101, 110, 99, 121, 95, 112, 101, 110, 97, 108, 116, 121, 34, 58, 32, 102, 114, 101, 113, 117, 101, 110, 99, 121, 95, 112, 101, 110, 97, 108, 116, 121, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 34, 102, 117, 110, 99, 116, 105, 111, 110, 95, 99, 97, 108, 108, 34, 58, 32, 102, 117, 110, 99, 116, 105, 111, 110, 95, 99, 97, 108, 108, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 34, 102, 117, 110, 99, 116, 105, 111, 110, 115, 34, 58, 32, 102, 117, 110, 99, 116, 105, 111, 110, 115, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 34, 108, 111, 103, 105, 116, 95, 98, 105, 97, 115, 34, 58, 32, 108, 111, 103, 105, 116, 95, 98, 105, 97, 115, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 34, 108, 111, 103, 112, 114, 111, 98, 115, 34, 58, 32, 108, 111, 103, 112, 114, 111, 98, 115, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 34, 109, 97, 120, 95, 116, 111, 107, 101, 110, 115, 34, 58, 32, 109, 97, 120, 95, 116, 111, 107, 101, 110, 115, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 34, 110, 34, 58, 32, 110, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 34, 112, 114, 101, 115, 101, 110, 99, 101, 95, 112, 101, 110, 97, 108, 116, 121, 34, 58, 32, 112, 114, 101, 115, 101, 110, 99, 101, 95, 112, 101, 110, 97, 108, 116, 121, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 34, 114, 101, 115, 112, 111, 110, 115, 101, 95, 102, 111, 114, 109, 97, 116, 34, 58, 32, 114, 101, 115, 112, 111, 110, 115, 101, 95, 102, 111, 114, 109, 97, 116, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 34, 115, 101, 101, 100, 34, 58, 32, 115, 101, 101, 100, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 34, 115, 116, 111, 112, 34, 58, 32, 115, 116, 111, 112, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 34, 115, 116, 114, 101, 97, 109, 34, 58, 32, 115, 116, 114, 101, 97, 109, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 34, 115, 116, 114, 101, 97, 109, 95, 111, 112, 116, 105, 111, 110, 115, 34, 58, 32, 115, 116, 114, 101, 97, 109, 95, 111, 112, 116, 105, 111, 110, 115, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 34, 116, 101, 109, 112, 101, 114, 97, 116, 117, 114, 101, 34, 58, 32, 116, 101, 109, 112, 101, 114, 97, 116, 117, 114, 101, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 34, 116, 111, 111, 108, 95, 99, 104, 111, 105, 99, 101, 34, 58, 32, 116, 111, 111, 108, 95, 99, 104, 111, 105, 99, 101, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 34, 116, 111, 111, 108, 
115, 34, 58, 32, 116, 111, 111, 108, 115, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 34, 116, 111, 112, 95, 108, 111, 103, 112, 114, 111, 98, 115, 34, 58, 32, 116, 111, 112, 95, 108, 111, 103, 112, 114, 111, 98, 115, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 34, 116, 111, 112, 95, 112, 34, 58, 32, 116, 111, 112, 95, 112, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 34, 117, 115, 101, 114, 34, 58, 32, 117, 115, 101, 114, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 125, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 99, 111, 109, 112, 108, 101, 116, 105, 111, 110, 95, 99, 114, 101, 97, 116, 101, 95, 112, 97, 114, 97, 109, 115, 46, 67, 111, 109, 112, 108, 101, 116, 105, 111, 110, 67, 114, 101, 97, 116, 101, 80, 97, 114, 97, 109, 115, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 41, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 111, 112, 116, 105, 111, 110, 115, 61, 109, 97, 107, 101, 95, 114, 101, 113, 117, 101, 115, 116, 95, 111, 112, 116, 105, 111, 110, 115, 40, 10, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 101, 120, 116, 114, 97, 95, 104, 101, 97, 100, 101, 114, 115, 61, 101, 120, 116, 114, 97, 95, 104, 101, 97, 100, 101, 114, 115, 44, 32, 101, 120, 116, 114, 97, 95, 113, 117, 101, 114, 121, 61, 101, 120, 116, 114, 97, 95, 113, 117, 101, 114, 121, 44, 32, 101, 120, 116, 114, 97, 95, 98, 111, 100, 121, 61, 101, 120, 116, 114, 97, 95, 98, 111, 100, 121, 44, 32, 116, 105, 109, 101, 111, 117, 116, 61, 116, 105, 109, 101, 111, 117, 116, 10, 1000, 1000, 1000, 1000, 1000, 1000, 41, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 99, 97, 115, 116, 95, 116, 111, 61, 67, 104, 97, 116, 67, 111, 109, 112, 108, 101, 116, 105, 111, 110, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 115, 116, 114, 101, 97, 109, 61, 115, 116, 114, 101, 97, 109, 32, 111, 114, 32, 70, 97, 108, 115, 101, 44, 10, 1000, 1000, 1000, 1000, 1000, 1000, 115, 116, 114, 101, 97, 109, 95, 99, 108, 115, 61, 83, 116, 114, 101, 97, 109, 91, 67, 104, 97, 116, 67, 111, 109, 112, 108, 101, 116, 105, 111, 110, 67, 104, 117, 110, 107, 93, 44, 10, 1000, 1000, 1000, 1000, 41, 10, 96, 96, 96, 10]\n",
+      "length: 4979\n"
+     ]
+    }
+   ],
+   "source": [
+    "def merge(ids, pair, idx):\n",
+    "    \"\"\"\n",
+    "    BPE algorithm\n",
+    "    ids: list of integers (tokens)\n",
+    "    pair: tuple of consecutive integers\n",
+    "    idx: new vocab token to replace the pair\n",
+    "    \"\"\"\n",
+    "    new_ids = []\n",
+    "    i = 0\n",
+    "    while i < len(ids):\n",
+    "        if i < len(ids) - 1 and ids[i] == pair[0] and ids[i+1] == pair[1]:\n",
+    "            new_ids.append(idx)\n",
+    "            i += 2\n",
+    "        else:\n",
+    "            new_ids.append(ids[i])\n",
+    "            i += 1\n",
+    "    return new_ids\n",
+    "\n",
+    "# merge the most common pair\n",
+    "tokens2 = merge(tokens, (32, 32), 1000)\n",
+    "print(tokens2)\n",
+    "print('length: ', len(tokens2))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "merge (32, 32) to 256\n",
+      "merge (256, 256) to 257\n",
+      "merge (257, 257) to 258\n",
+      "merge (111, 110) to 259\n",
+      "merge (101, 110) to 260\n",
+      "merge (116, 105) to 261\n",
+      "merge (10, 258) to 262\n",
+      "merge (58, 32) to 263\n",
+      "merge (44, 262) to 264\n",
+      "merge (261, 259) to 265\n",
+      "merge (101, 32) to 266\n",
+      "merge (116, 111) to 267\n",
+      "merge (32, 78) to 268\n",
+      "merge (97, 116) to 269\n",
+      "merge (115, 32) to 270\n",
+      "merge (101, 114) to 271\n",
+      "merge (114, 101) to 272\n",
+      "merge (97, 108) to 273\n",
+      "merge (116, 104) to 274\n",
+      "merge (115, 116) to 275\n",
+      "merge (97, 110) to 276\n",
+      "merge (260, 32) to 277\n",
+      "merge (97, 109) to 278\n",
+      "merge (108, 101) to 279\n",
+      "merge (32, 124) to 280\n",
+      "merge (105, 110) to 281\n",
+      "merge (34, 263) to 282\n",
+      "merge (111, 109) to 283\n",
+      "merge (61, 268) to 284\n",
+      "merge (44, 32) to 285\n",
+      "merge (280, 268) to 286\n",
+      "merge (257, 34) to 287\n",
+      "merge (264, 258) to 288\n",
+      "merge (115, 101) to 289\n",
+      "merge (108, 111) to 290\n",
+      "merge (84, 95) to 291\n",
+      "merge (105, 118) to 292\n",
+      "merge (292, 277) to 293\n",
+      "merge (112, 265) to 294\n",
+      "merge (111, 116) to 295\n"
+     ]
+    }
+   ],
+   "source": [
+    "# complete cycle\n",
+    "def get_stats(ids):\n",
+    "    counts = {}\n",
+    "    for pair in zip(ids, ids[1:]):\n",
+    "        counts[pair] = counts.get(pair, 0) + 1\n",
+    "    return counts\n",
+    "\n",
+    "def merge(ids, pair, idx):\n",
+    "    newids = []\n",
+    "    i = 0\n",
+    "    while i < len(ids):\n",
+    "        if i < len(ids) - 1 and ids[i] == pair[0] and ids[i+1] == pair[1]:\n",
+    "            newids.append(idx)\n",
+    "            i += 2\n",
+    "        else:\n",
+    "            newids.append(ids[i])\n",
+    "            i += 1\n",
+    "    return newids\n",
+    "\n",
+    "# merge all the common pairs and create a new vocab\n",
+    "vocab_size = 296\n",
+    "num_merges = vocab_size - 256  # the base byte vocab size is 256\n",
+    "ids = list(tokens)\n",
+    "\n",
+    "\n",
+    "merges = {}\n",
+    "for i in range(num_merges):\n",
+    "    stats = get_stats(ids)\n",
+    "    pair = max(stats, key=stats.get)  # get the most common pair\n",
+    "    idx = 256 + i  # new vocab token\n",
+    "    print(f'merge {pair} to {idx}')\n",
+    "    ids = merge(ids, pair, idx)\n",
+    "    merges[pair] = idx\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "tokens length: 5397\n",
+      "new tokens length: 3365\n",
+      "compression rate: 1.60X\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"tokens length: \", len(tokens))\n",
+    "print(\"new tokens length: \", len(ids))\n",
+    "print(f\"compression rate: {len(tokens) / len(ids):.2f}X\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### decoding\n",
+    "\n",
+    "Given a sequence of integers in the range [0, vocab_size), convert it back into a string."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "---\n",
+      "Autogen enables the next-gen LLM applications with a generic [multi-agent conversation](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat) framework. It offers customizable and conversable agents that integrate LLMs, tools, and humans.\n",
+      "By automating chat among multiple capable agents, one can easily make them collectively perform tasks autonomously or with human feedback, including tasks that require using tools via code.\n",
+      "\n",
+      "Features of this use case include:\n",
+      "\n",
+      "- **Multi-agent conversations**: AutoGen agents can communicate with each other to solve tasks. This allows for more complex and sophisticated applications than would be possible with a single LLM.\n",
+      "- **Customization**: AutoGen agents can be customized to meet the specific needs of an application. This includes the ability to choose the LLMs to use, the types of human input to allow, and the tools to employ.\n",
+      "- **Human participation**: AutoGen seamlessly allows human participation. This means that humans can provide input and feedback to the agents as needed.\n",
+      "\n",
+      "For [example](https://github.com/microsoft/autogen/blob/main/test/twoagent.py),\n",
+      "\n",
+      "```python\n",
+      "from autogen import AssistantAgent, UserProxyAgent, config_list_from_json\n",
+      "# Load LLM inference endpoints from an env variable or a file\n",
+      "# See https://microsoft.github.io/autogen/docs/FAQ#set-your-api-endpoints\n",
+      "# and OAI_CONFIG_LIST_sample\n",
+      "config_list = config_list_from_json(env_or_file=\"OAI_CONFIG_LIST\")\n",
+      "# You can also set config_list directly as a list, for example, config_list = [{'model': 'gpt-4', 'api_key': '<your OpenAI API key here>'},]\n",
+      "assistant = AssistantAgent(\"assistant\", llm_config={\"config_list\": config_list})\n",
+      "user_proxy = UserProxyAgent(\"user_proxy\", code_execution_config={\"work_dir\": \"coding\", \"use_docker\": False}) # IMPORTANT: set to True to run code in docker, recommended\n",
+      "user_proxy.initiate_chat(assistant, message=\"Plot a chart of NVDA and TESLA stock price change YTD.\")\n",
+      "# This initiates an automated chat between the two agents to solve the task\n",
+      "```\n",
+      "\n",
+      "more python code:\n",
+      "\n",
+      "```python\n",
+      "    def create(\n",
+      "        self,\n",
+      "        *,\n",
+      "        messages: Iterable[ChatCompletionMessageParam],\n",
+      "        model: Union[str, ChatModel],\n",
+      "        frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN,\n",
+      "        function_call: completion_create_params.FunctionCall | NotGiven = NOT_GIVEN,\n",
+      "        functions: Iterable[completion_create_params.Function] | NotGiven = NOT_GIVEN,\n",
+      "        logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN,\n",
+      "        logprobs: Optional[bool] | NotGiven = NOT_GIVEN,\n",
+      "        max_tokens: Optional[int] | NotGiven = NOT_GIVEN,\n",
+      "        n: Optional[int] | NotGiven = NOT_GIVEN,\n",
+      "        presence_penalty: Optional[float] | NotGiven = NOT_GIVEN,\n",
+      "        response_format: completion_create_params.ResponseFormat | NotGiven = NOT_GIVEN,\n",
+      "        seed: Optional[int] | NotGiven = NOT_GIVEN,\n",
+      "        stop: Union[Optional[str], List[str]] | NotGiven = NOT_GIVEN,\n",
+      "        stream: Optional[Literal[False]] | Literal[True] | NotGiven = NOT_GIVEN,\n",
+      "        stream_options: Optional[ChatCompletionStreamOptionsParam] | NotGiven = NOT_GIVEN,\n",
+      "        temperature: Optional[float] | NotGiven = NOT_GIVEN,\n",
+      "        tool_choice: ChatCompletionToolChoiceOptionParam | NotGiven = NOT_GIVEN,\n",
+      "        tools: Iterable[ChatCompletionToolParam] | NotGiven = NOT_GIVEN,\n",
+      "        top_logprobs: Optional[int] | NotGiven = NOT_GIVEN,\n",
+      "        top_p: Optional[float] | NotGiven = NOT_GIVEN,\n",
+      "        user: str | NotGiven = NOT_GIVEN,\n",
+      "        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.\n",
+      "        # The extra values given here take precedence over values defined on the client or passed to this method.\n",
+      "        extra_headers: Headers | None = None,\n",
+      "        extra_query: Query | None = None,\n",
+      "        extra_body: Body | None = None,\n",
+      "        timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,\n",
+      "    ) -> ChatCompletion | Stream[ChatCompletionChunk]:\n",
+      "        return self._post(\n",
+      "            \"/chat/completions\",\n",
+      "            body=maybe_transform(\n",
+      "                {\n",
+      "                    \"messages\": messages,\n",
+      "                    \"model\": model,\n",
+      "                    \"frequency_penalty\": frequency_penalty,\n",
+      "                    \"function_call\": function_call,\n",
+      "                    \"functions\": functions,\n",
+      "                    \"logit_bias\": logit_bias,\n",
+      "                    \"logprobs\": logprobs,\n",
+      "                    \"max_tokens\": max_tokens,\n",
+      "                    \"n\": n,\n",
+      "                    \"presence_penalty\": presence_penalty,\n",
+      "                    \"response_format\": response_format,\n",
+      "                    \"seed\": seed,\n",
+      "                    \"stop\": stop,\n",
+      "                    \"stream\": stream,\n",
+      "                    \"stream_options\": stream_options,\n",
+      "                    \"temperature\": temperature,\n",
+      "                    \"tool_choice\": tool_choice,\n",
+      "                    \"tools\": tools,\n",
+      "                    \"top_logprobs\": top_logprobs,\n",
+      "                    \"top_p\": top_p,\n",
+      "                    \"user\": user,\n",
+      "                },\n",
+      "                completion_create_params.CompletionCreateParams,\n",
+      "            ),\n",
+      "            options=make_request_options(\n",
+      "                extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout\n",
+      "            ),\n",
+      "            cast_to=ChatCompletion,\n",
+      "            stream=stream or False,\n",
+      "            stream_cls=Stream[ChatCompletionChunk],\n",
+      "        )\n",
+      "```\n",
+      "\n",
+      "length: 5397\n"
+     ]
+    }
+   ],
+   "source": [
+    "vocab = {idx: bytes([idx]) for idx in range(256)}  # base byte vocab (ids 0-255)\n",
+    "for (p0, p1), idx in merges.items():\n",
+    "    vocab[idx] = vocab[p0] + vocab[p1]  # adding the extra vocab tokens (256 - 296)\n",
+    "\n",
+    "def decode(ids):\n",
+    "    bytetokens = b\"\".join(vocab[i] for i in ids)\n",
+    "    text = bytetokens.decode(\"utf-8\", errors=\"replace\")  # invalid byte sequences are replaced with the Unicode replacement character\n",
+    "    return text\n",
+    "\n",
+    "print('---')\n",
+    "print(decode(ids))\n",
+    "print('length: ', len(decode(ids)))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### encoding\n",
+    "Convert a string into a sequence of tokens."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[104, 107]\n"
+     ]
+    }
+   ],
+   "source": [
+    "def encode(texts):\n",
+    "    tokens = list(texts.encode('utf-8'))\n",
+    "    while len(tokens) >= 2:\n",
+    "        stats = get_stats(tokens)\n",
+    "        pair = min(stats, key=lambda p: merges.get(p, float('inf')))  # selects the mergeable pair with the lowest merge index\n",
+    "        if pair not in merges:\n",
+    "            break\n",
+    "        idx = merges[pair]\n",
+    "        tokens = merge(tokens, pair, idx)\n",
+    "    return tokens\n",
+    "\n",
+    "print(encode(\"hk\"))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The line below ensures the algorithm respects the merge priorities defined during training, picking the adjacent pair whose merge was learned earliest:\n",
+    "```\n",
+    "pair = min(stats, key=lambda p: merges.get(p, float('inf')))\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      " presence_penalty \n"
+     ]
+    }
+   ],
+   "source": [
+    "print(decode(encode(\" presence_penalty \")))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
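A note on the merge-priority line quoted in the notebook's last markdown cell: `merges.get(p, float('inf'))` maps any pair that was never learned to infinity, so `min` returns the candidate pair whose merge was learned earliest (smallest new-token id). A minimal self-contained sketch; the two-entry `merges` table here is hypothetical, not taken from the notebook run:

```python
# Hypothetical merge table: (104, 105) i.e. "hi" was learned before (256, 33).
merges = {(104, 105): 256, (256, 33): 257}

def get_stats(ids):
    counts = {}
    for pair in zip(ids, ids[1:]):
        counts[pair] = counts.get(pair, 0) + 1
    return counts

tokens = list("hi! hi!".encode("utf-8"))  # [104, 105, 33, 32, 104, 105, 33]
stats = get_stats(tokens)
# Unlearned pairs map to inf, so min() picks the earliest-learned merge.
pair = min(stats, key=lambda p: merges.get(p, float("inf")))
print(pair)  # (104, 105), since merges[(104, 105)] == 256 is the smallest index
```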
tokenizer/public/tokenizer.png
ADDED
tokenizer/sample/bpetokenizer/sample_bpetokenizer.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7204cc27b30a3ec11d1c6bb741376eabc5345f4660def0d564cff1cda29a6a28
+size 720
tokenizer/sample/bpetokenizer/sample_bpetokenizer.py
ADDED
@@ -0,0 +1,36 @@
+import sys
+sys.path.append("../")
+
+from bpetokenizer import BPETokenizer
+
+special_tokens = {
+    "<|endoftext|>": 1001,
+    "<|startoftext|>": 1002,
+    "[SPECIAL1]": 1003,
+    "[SPECIAL2]": 1004,
+}
+
+tokenizer = BPETokenizer(special_tokens=special_tokens)
+texts = "<|startoftext|> Hello, World! This is a sample text with the special tokens [SPECIAL1] and [SPECIAL2] to test the tokenizer.<|endoftext|>"
+
+tokenizer.train(texts, vocab_size=310, verbose=True)
+
+encode_text = """
+<|startoftext|>Hello, World! This is a sample text with the special tokens [SPECIAL1] and [SPECIAL2] to test the tokenizer.
+Hello, Universe! Another example sentence containing [SPECIAL1] and [SPECIAL2], used to ensure tokenizer's robustness.
+Greetings, Earth! Here we have [SPECIAL1] appearing once again, followed by [SPECIAL2] in the same sentence.
+Hello, World! This is yet another sample text, with [SPECIAL1] and [SPECIAL2] making an appearance.
+Hey there, World! Testing the tokenizer with [SPECIAL1] and [SPECIAL2] to see if it handles special tokens properly.
+Salutations, Planet! The tokenizer should recognize [SPECIAL1] and [SPECIAL2] in this long string of text.
+Hello again, World! [SPECIAL1] and [SPECIAL2] are special tokens that need to be handled correctly by the tokenizer.
+Welcome, World! Including [SPECIAL1] and [SPECIAL2] multiple times in this large text to ensure proper encoding.
+Hi, World! Let's add [SPECIAL1] and [SPECIAL2] in various parts of this long sentence to test the tokenizer thoroughly.
+<|endoftext|>
+"""
+ids = tokenizer.encode(encode_text, special_tokens="all")
+print(ids)
+
+decode_text = tokenizer.decode(ids)
+print(decode_text)
+
+tokenizer.save("sample_bpetokenizer")
tokenizer/sample/bpetokenizer/sample_bpetokenizer.vocab
ADDED
@@ -0,0 +1,310 @@
+[\u0000] 0
+[\u0001] 1
+[\u0002] 2
+[\u0003] 3
+[\u0004] 4
+[\u0005] 5
+[\u0006] 6
+[\u0007] 7
+[\u0008] 8
+[\u0009] 9
+[\u000a] 10
+[\u000b] 11
+[\u000c] 12
+[\u000d] 13
+[\u000e] 14
+[\u000f] 15
+[\u0010] 16
+[\u0011] 17
+[\u0012] 18
+[\u0013] 19
+[\u0014] 20
+[\u0015] 21
+[\u0016] 22
+[\u0017] 23
+[\u0018] 24
+[\u0019] 25
+[\u001a] 26
+[\u001b] 27
+[\u001c] 28
+[\u001d] 29
+[\u001e] 30
+[\u001f] 31
+[ ] 32
+[!] 33
+["] 34
+[#] 35
+[$] 36
+[%] 37
+[&] 38
+['] 39
+[(] 40
+[)] 41
+[*] 42
+[+] 43
+[,] 44
+[-] 45
+[.] 46
+[/] 47
+[0] 48
+[1] 49
+[2] 50
+[3] 51
+[4] 52
+[5] 53
+[6] 54
+[7] 55
+[8] 56
+[9] 57
+[:] 58
+[;] 59
+[<] 60
+[=] 61
+[>] 62
+[?] 63
+[@] 64
+[A] 65
+[B] 66
+[C] 67
+[D] 68
+[E] 69
+[F] 70
+[G] 71
+[H] 72
+[I] 73
+[J] 74
+[K] 75
+[L] 76
+[M] 77
+[N] 78
+[O] 79
+[P] 80
+[Q] 81
+[R] 82
+[S] 83
+[T] 84
+[U] 85
+[V] 86
+[W] 87
+[X] 88
+[Y] 89
+[Z] 90
+[[] 91
+[\] 92
+[]] 93
+[^] 94
+[_] 95
+[`] 96
+[a] 97
+[b] 98
+[c] 99
+[d] 100
+[e] 101
+[f] 102
+[g] 103
+[h] 104
+[i] 105
+[j] 106
+[k] 107
+[l] 108
+[m] 109
+[n] 110
+[o] 111
+[p] 112
+[q] 113
+[r] 114
+[s] 115
+[t] 116
+[u] 117
+[v] 118
+[w] 119
+[x] 120
+[y] 121
+[z] 122
+[{] 123
+[|] 124
+[}] 125
+[~] 126
+[\u007f] 127
+[�] 128
+[�] 129
+[�] 130
+[�] 131
+[�] 132
+[�] 133
+[�] 134
+[�] 135
+[�] 136
+[�] 137
+[�] 138
+[�] 139
+[�] 140
+[�] 141
+[�] 142
+[�] 143
+[�] 144
+[�] 145
+[�] 146
+[�] 147
+[�] 148
+[�] 149
+[�] 150
+[�] 151
+[�] 152
+[�] 153
+[�] 154
+[�] 155
+[�] 156
+[�] 157
+[�] 158
+[�] 159
+[�] 160
+[�] 161
+[�] 162
+[�] 163
+[�] 164
+[�] 165
+[�] 166
+[�] 167
+[�] 168
+[�] 169
+[�] 170
+[�] 171
+[�] 172
+[�] 173
+[�] 174
+[�] 175
+[�] 176
+[�] 177
+[�] 178
+[�] 179
+[�] 180
+[�] 181
+[�] 182
+[�] 183
+[�] 184
+[�] 185
+[�] 186
+[�] 187
+[�] 188
+[�] 189
+[�] 190
+[�] 191
+[�] 192
+[�] 193
+[�] 194
+[�] 195
+[�] 196
+[�] 197
+[�] 198
+[�] 199
+[�] 200
+[�] 201
+[�] 202
+[�] 203
+[�] 204
+[�] 205
+[�] 206
+[�] 207
+[�] 208
+[�] 209
+[�] 210
+[�] 211
+[�] 212
+[�] 213
+[�] 214
+[�] 215
+[�] 216
+[�] 217
+[�] 218
+[�] 219
+[�] 220
+[�] 221
+[�] 222
+[�] 223
+[�] 224
+[�] 225
+[�] 226
+[�] 227
+[�] 228
+[�] 229
+[�] 230
+[�] 231
+[�] 232
+[�] 233
+[�] 234
+[�] 235
+[�] 236
+[�] 237
+[�] 238
+[�] 239
+[�] 240
+[�] 241
+[�] 242
+[�] 243
+[�] 244
+[�] 245
+[�] 246
+[�] 247
+[�] 248
+[�] 249
+[�] 250
+[�] 251
+[�] 252
+[�] 253
+[�] 254
+[�] 255
+[ ][t] -> [ t] 256
+[e][x] -> [ex] 257
+[ex][t] -> [ext] 258
+[ t][o] -> [ to] 259
+[e][n] -> [en] 260
+[<][|] -> [<|] 261
+[s][t] -> [st] 262
+[o][f] -> [of] 263
+[of][t] -> [oft] 264
+[oft][ext] -> [oftext] 265
+[|][>] -> [|>] 266
+[i][s] -> [is] 267
+[ ][a] -> [ a] 268
+[ ][s] -> [ s] 269
+[ t][h] -> [ th] 270
+[ th][e] -> [ the] 271
+[ to][k] -> [ tok] 272
+[ tok][en] -> [ token] 273
+[ ][[] -> [ [] 274
+[S][P] -> [SP] 275
+[SP][E] -> [SPE] 276
+[SPE][C] -> [SPEC] 277
+[SPEC][I] -> [SPECI] 278
+[SPECI][A] -> [SPECIA] 279
+[SPECIA][L] -> [SPECIAL] 280
+[st][a] -> [sta] 281
+[sta][r] -> [star] 282
+[star][t] -> [start] 283
+[start][oftext] -> [startoftext] 284
+[ ][H] -> [ H] 285
+[ H][e] -> [ He] 286
+[ He][l] -> [ Hel] 287
+[ Hel][l] -> [ Hell] 288
+[ Hell][o] -> [ Hello] 289
+[ ][W] -> [ W] 290
+[ W][o] -> [ Wo] 291
+[ Wo][r] -> [ Wor] 292
+[ Wor][l] -> [ Worl] 293
+[ Worl][d] -> [ World] 294
+[ ][T] -> [ T] 295
+[ T][h] -> [ Th] 296
+[ Th][is] -> [ This] 297
+[ ][is] -> [ is] 298
+[ s][a] -> [ sa] 299
+[ sa][m] -> [ sam] 300
+[ sam][p] -> [ samp] 301
+[ samp][l] -> [ sampl] 302
+[ sampl][e] -> [ sample] 303
+[ t][ext] -> [ text] 304
+[ ][w] -> [ w] 305
+[ w][i] -> [ wi] 306
+[ wi][t] -> [ wit] 307
+[ wit][h] -> [ with] 308
+[ s][p] -> [ sp] 309
tokenizer/sample/load_json_vocab/bpetokenizer_json.py
ADDED
@@ -0,0 +1,24 @@
+from bpetokenizer import BPETokenizer
+
+tokenizer = BPETokenizer()
+
+tokenizer.load("sample_bpetokenizer.json", mode="json")
+
+encode_text = """
+<|startoftext|>Hello, World! This is a sample text with the special tokens [SPECIAL1] and [SPECIAL2] to test the tokenizer.
+Hello, Universe! Another example sentence containing [SPECIAL1] and [SPECIAL2], used to ensure tokenizer's robustness.
+Greetings, Earth! Here we have [SPECIAL1] appearing once again, followed by [SPECIAL2] in the same sentence.<|endoftext|>"""
+
+print("vocab: ", tokenizer.vocab)
+print('---')
+print("merges: ", tokenizer.merges)
+print('---')
+print("special tokens: ", tokenizer.special_tokens)
+
+ids = tokenizer.encode(encode_text, special_tokens="all")
+print('---')
+print('Ids: ', ids)
+
+decode_text = tokenizer.decode(ids)
+print('---')
+print(decode_text)
tokenizer/sample/load_json_vocab/sample_bpetokenizer.json
ADDED
@@ -0,0 +1,378 @@
+{
+  "version": "1.0.31",
+  "pattern": "'(?i:[sdmt]|ll|ve|re)|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
+  "special_tokens": {
+    "<|endoftext|>": 311,
+    "<|startoftext|>": 312,
+    "[SPECIAL1]": 313,
+    "[SPECIAL2]": 314
+  },
+  "merges": {
+    "(32, 116)": 256,
+    "(101, 120)": 257,
+    "(257, 116)": 258,
+    "(256, 111)": 259,
+    "(101, 110)": 260,
+    "(60, 124)": 261,
+    "(115, 116)": 262,
+    "(111, 102)": 263,
+    "(263, 116)": 264,
+    "(264, 258)": 265,
+    "(124, 62)": 266,
+    "(105, 115)": 267,
+    "(32, 97)": 268,
+    "(32, 115)": 269,
+    "(256, 104)": 270,
+    "(270, 101)": 271,
+    "(259, 107)": 272,
+    "(272, 260)": 273,
+    "(32, 91)": 274,
+    "(83, 80)": 275,
+    "(275, 69)": 276,
+    "(276, 67)": 277,
+    "(277, 73)": 278,
+    "(278, 65)": 279,
+    "(279, 76)": 280,
+    "(262, 97)": 281,
+    "(281, 114)": 282,
+    "(282, 116)": 283,
+    "(283, 265)": 284,
+    "(32, 72)": 285,
+    "(285, 101)": 286,
+    "(286, 108)": 287,
+    "(287, 108)": 288,
+    "(288, 111)": 289,
+    "(32, 87)": 290,
+    "(290, 111)": 291,
+    "(291, 114)": 292,
+    "(292, 108)": 293,
+    "(293, 100)": 294,
+    "(32, 84)": 295,
+    "(295, 104)": 296,
+    "(296, 267)": 297,
+    "(32, 267)": 298,
+    "(269, 97)": 299,
+    "(299, 109)": 300,
+    "(300, 112)": 301,
+    "(301, 108)": 302,
+    "(302, 101)": 303,
+    "(256, 258)": 304,
+    "(32, 119)": 305,
+    "(305, 105)": 306,
+    "(306, 116)": 307,
+    "(307, 104)": 308,
+    "(269, 112)": 309
+  },
+  "vocab": {
+    "0": "\\u0000",
+    "1": "\\u0001",
+    "2": "\\u0002",
+    "3": "\\u0003",
+    "4": "\\u0004",
+    "5": "\\u0005",
+    "6": "\\u0006",
+    "7": "\\u0007",
+    "8": "\\u0008",
+    "9": "\\u0009",
+    "10": "\\u000a",
+    "11": "\\u000b",
+    "12": "\\u000c",
+    "13": "\\u000d",
+    "14": "\\u000e",
+    "15": "\\u000f",
+    "16": "\\u0010",
+    "17": "\\u0011",
+    "18": "\\u0012",
+    "19": "\\u0013",
+    "20": "\\u0014",
+    "21": "\\u0015",
+    "22": "\\u0016",
+    "23": "\\u0017",
+    "24": "\\u0018",
+    "25": "\\u0019",
+    "26": "\\u001a",
+    "27": "\\u001b",
+    "28": "\\u001c",
+    "29": "\\u001d",
+    "30": "\\u001e",
+    "31": "\\u001f",
+    "32": " ",
+    "33": "!",
+    "34": "\"",
+    "35": "#",
+    "36": "$",
+    "37": "%",
+    "38": "&",
+    "39": "'",
+    "40": "(",
+    "41": ")",
+    "42": "*",
+    "43": "+",
+    "44": ",",
+    "45": "-",
+    "46": ".",
+    "47": "/",
+    "48": "0",
+    "49": "1",
+    "50": "2",
+    "51": "3",
+    "52": "4",
+    "53": "5",
+    "54": "6",
+    "55": "7",
+    "56": "8",
+    "57": "9",
+    "58": ":",
+    "59": ";",
+    "60": "<",
+    "61": "=",
+    "62": ">",
+    "63": "?",
+    "64": "@",
+    "65": "A",
+    "66": "B",
+    "67": "C",
+    "68": "D",
+    "69": "E",
+    "70": "F",
+    "71": "G",
+    "72": "H",
+    "73": "I",
+    "74": "J",
+    "75": "K",
+    "76": "L",
+    "77": "M",
+    "78": "N",
+    "79": "O",
+    "80": "P",
+    "81": "Q",
+    "82": "R",
+    "83": "S",
+    "84": "T",
+    "85": "U",
+    "86": "V",
+    "87": "W",
+    "88": "X",
+    "89": "Y",
+    "90": "Z",
+    "91": "[",
+    "92": "\\",
+    "93": "]",
+    "94": "^",
+    "95": "_",
+    "96": "`",
+    "97": "a",
+    "98": "b",
+    "99": "c",
+    "100": "d",
+    "101": "e",
+    "102": "f",
+    "103": "g",
+    "104": "h",
+    "105": "i",
+    "106": "j",
+    "107": "k",
+    "108": "l",
+    "109": "m",
+    "110": "n",
+    "111": "o",
+    "112": "p",
+    "113": "q",
+    "114": "r",
+    "115": "s",
+    "116": "t",
+    "117": "u",
+    "118": "v",
+    "119": "w",
+    "120": "x",
+    "121": "y",
+    "122": "z",
+    "123": "{",
+    "124": "|",
+    "125": "}",
+    "126": "~",
+    "127": "\\u007f",
+    "128": "�",
+    "129": "�",
+    "130": "�",
+    "131": "�",
+    "132": "�",
+    "133": "�",
+    "134": "�",
+    "135": "�",
+    "136": "�",
+    "137": "�",
+    "138": "�",
+    "139": "�",
+    "140": "�",
+    "141": "�",
+    "142": "�",
+    "143": "�",
+    "144": "�",
+    "145": "�",
+    "146": "�",
+    "147": "�",
+    "148": "�",
+    "149": "�",
+    "150": "�",
+    "151": "�",
+    "152": "�",
+    "153": "�",
+    "154": "�",
+    "155": "�",
+    "156": "�",
+    "157": "�",
+    "158": "�",
+    "159": "�",
+    "160": "�",
+    "161": "�",
+    "162": "�",
+    "163": "�",
+    "164": "�",
+    "165": "�",
+    "166": "�",
+    "167": "�",
+    "168": "�",
+    "169": "�",
+    "170": "�",
+    "171": "�",
+    "172": "�",
+    "173": "�",
+    "174": "�",
+    "175": "�",
+    "176": "�",
+    "177": "�",
+    "178": "�",
+    "179": "�",
+    "180": "�",
+    "181": "�",
+    "182": "�",
+    "183": "�",
+    "184": "�",
+    "185": "�",
+    "186": "�",
+    "187": "�",
+    "188": "�",
+    "189": "�",
+    "190": "�",
+    "191": "�",
+    "192": "�",
+    "193": "�",
+    "194": "�",
+    "195": "�",
+    "196": "�",
+    "197": "�",
+    "198": "�",
+    "199": "�",
+    "200": "�",
+    "201": "�",
+    "202": "�",
+    "203": "�",
+    "204": "�",
+    "205": "�",
+    "206": "�",
+    "207": "�",
+    "208": "�",
+    "209": "�",
+    "210": "�",
+    "211": "�",
+    "212": "�",
+    "213": "�",
+    "214": "�",
+    "215": "�",
+    "216": "�",
+    "217": "�",
+    "218": "�",
+    "219": "�",
+    "220": "�",
+    "221": "�",
+    "222": "�",
+    "223": "�",
+    "224": "�",
+    "225": "�",
+    "226": "�",
+    "227": "�",
+    "228": "�",
+    "229": "�",
+    "230": "�",
+    "231": "�",
+    "232": "�",
+    "233": "�",
+    "234": "�",
+    "235": "�",
+    "236": "�",
+    "237": "�",
+    "238": "�",
+    "239": "�",
+    "240": "�",
+    "241": "�",
+    "242": "�",
+    "243": "�",
+    "244": "�",
+    "245": "�",
+    "246": "�",
+    "247": "�",
+    "248": "�",
+    "249": "�",
+    "250": "�",
+    "251": "�",
+    "252": "�",
+    "253": "�",
+    "254": "�",
+    "255": "�",
+    "256": " t",
+    "257": "ex",
+    "258": "ext",
+    "259": " to",
+    "260": "en",
+    "261": "<|",
+    "262": "st",
+    "263": "of",
+    "264": "oft",
+    "265": "oftext",
+    "266": "|>",
+    "267": "is",
+    "268": " a",
+    "269": " s",
+    "270": " th",
+    "271": " the",
+    "272": " tok",
+    "273": " token",
+    "274": " [",
+    "275": "SP",
+    "276": "SPE",
+    "277": "SPEC",
+    "278": "SPECI",
+    "279": "SPECIA",
+    "280": "SPECIAL",
+    "281": "sta",
+    "282": "star",
+    "283": "start",
+    "284": "startoftext",
+    "285": " H",
+    "286": " He",
+    "287": " Hel",
+    "288": " Hell",
+    "289": " Hello",
+    "290": " W",
+    "291": " Wo",
+    "292": " Wor",
+    "293": " Worl",
+    "294": " World",
+    "295": " T",
+    "296": " Th",
+    "297": " This",
+    "298": " is",
+    "299": " sa",
+    "300": " sam",
+    "301": " samp",
+    "302": " sampl",
+    "303": " sample",
+    "304": " text",
+    "305": " w",
+    "306": " wi",
+    "307": " wit",
+    "308": " with",
+    "309": " sp"
+  }
+}
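Since the saved state above is plain JSON, it can be inspected without the package at all; a small sketch using only the standard library (assuming the file sits in the working directory):

```python
import json

# Load the saved tokenizer state and inspect its pieces.
with open("sample_bpetokenizer.json", "r", encoding="utf-8") as f:
    state = json.load(f)

print(state["version"])         # "1.0.31"
print(state["special_tokens"])  # {"<|endoftext|>": 311, "<|startoftext|>": 312, ...}
print(len(state["merges"]))     # 54 learned merges, ids 256..309
# Merge keys are stringified pairs, e.g. "(32, 116)" -> 256 means " " + "t" -> " t".
print(state["vocab"]["256"])    # " t"
```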
tokenizer/sample/load_json_vocab/tokens.py
ADDED
@@ -0,0 +1,26 @@
+import sys
+sys.path.append('../')
+
+from bpetokenizer import BPETokenizer
+
+# initializing the tokenizer
+tokenizer = BPETokenizer()
+
+# load the pretrained vocab
+tokenizer.load("sample_bpetokenizer.json", mode="json")
+
+text = "<|startoftext|>This method? generates the tokens! which are split, before the tokenization using the pattern: default we use the gpt4 split pattern mentioned in the tiktoken.<|endoftext|>"
+
+
+# this method returns a list of tokens of the text passed.
+tokens = tokenizer.tokens(text, verbose=True)  # if verbose, prints the text chunks and also the pattern used to split.
+print('---')
+print("tokens: ", tokens)
+
+"""
+tokens: ['<|', 'st', 'ar', 't', 'oftext', '|>', 'T', 'h', 'is', ' ', 'm', 'e', 'th', 'o', 'd', '?', ' ', 'g', 'en', 'er', 'a', 't', 'e', 's', ' the', ' token',
+'s', '!', ' w', 'h', 'i', 'c', 'h', ' a', 'r', 'e', ' s', 'pl', 'i', 't', ',', ' ', 'b', 'e', 'f', 'o', 'r', 'e', ' the',
+' tokeniz', 'a', 't', 'i', 'on', ' ', 'u', 's', 'ing', ' the', ' ', 'p', 'a', 't', 't', 'er', 'n', ':', ' ', 'd', 'e', 'f', 'a', 'u', 'l', 't', ' w', 'e', ' ',
+'u', 'se', ' the', ' ', 'g', 'p', 't', '4', ' s', 'pl', 'i', 't', ' ', 'p', 'a', 't', 't', 'er', 'n', ' ',
+'m', 'en', 't', 'i', 'on', 'e', 'd', ' ', 'in', ' the', ' t', 'i', 'k', 't', 'o', 'k', 'en', '.', '<|', 'en', 'd', 'oftext', '|>']
+"""
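The split pattern the comments above refer to is the GPT-4-style pattern stored in `sample_bpetokenizer.json`. As a rough illustration of what pre-tokenization does before any merges are applied, here is a sketch using the third-party `regex` package (the project's runtime dependency; the sample text is shortened):

```python
import regex  # third-party `regex`, required for the \p{...} classes

# GPT-4-style split pattern, as stored in the saved vocab file.
PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""

chunks = regex.findall(PATTERN, "This method? generates the tokens!")
print(chunks)  # ['This', ' method', '?', ' generates', ' the', ' tokens', '!']
```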
tokenizer/sample/tokenizer/wiki.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e8329f536bb6d37b88b4bd5b75770c5240a06d343f32756dd34eea37c49d69e2
+size 36
tokenizer/sample/tokenizer/wiki.py
ADDED
@@ -0,0 +1,14 @@
+from bpetokenizer import Tokenizer
+
+text = "aaabdaaabac"
+tokenizer = Tokenizer()
+tokenizer.train(text, 259, verbose=True)
+
+ids = tokenizer.encode(text)
+print(ids)
+print('---')
+
+decoded_text = tokenizer.decode(ids)
+print(decoded_text)
+
+tokenizer.save("wiki")
tokenizer/sample/tokenizer/wiki.vocab
ADDED
@@ -0,0 +1,259 @@
+[\u0000] 0
+[\u0001] 1
+[\u0002] 2
+[\u0003] 3
+[\u0004] 4
+[\u0005] 5
+[\u0006] 6
+[\u0007] 7
+[\u0008] 8
+[\u0009] 9
+[\u000a] 10
+[\u000b] 11
+[\u000c] 12
+[\u000d] 13
+[\u000e] 14
+[\u000f] 15
+[\u0010] 16
+[\u0011] 17
+[\u0012] 18
+[\u0013] 19
+[\u0014] 20
+[\u0015] 21
+[\u0016] 22
+[\u0017] 23
+[\u0018] 24
+[\u0019] 25
+[\u001a] 26
+[\u001b] 27
+[\u001c] 28
+[\u001d] 29
+[\u001e] 30
+[\u001f] 31
+[ ] 32
+[!] 33
+["] 34
+[#] 35
+[$] 36
+[%] 37
+[&] 38
+['] 39
+[(] 40
+[)] 41
+[*] 42
+[+] 43
+[,] 44
+[-] 45
+[.] 46
+[/] 47
+[0] 48
+[1] 49
+[2] 50
+[3] 51
+[4] 52
+[5] 53
+[6] 54
+[7] 55
+[8] 56
+[9] 57
+[:] 58
+[;] 59
+[<] 60
+[=] 61
+[>] 62
+[?] 63
+[@] 64
+[A] 65
+[B] 66
+[C] 67
+[D] 68
+[E] 69
+[F] 70
+[G] 71
+[H] 72
+[I] 73
+[J] 74
+[K] 75
+[L] 76
+[M] 77
+[N] 78
+[O] 79
+[P] 80
+[Q] 81
+[R] 82
+[S] 83
+[T] 84
+[U] 85
+[V] 86
+[W] 87
+[X] 88
+[Y] 89
+[Z] 90
+[[] 91
+[\] 92
+[]] 93
+[^] 94
+[_] 95
+[`] 96
+[a] 97
+[b] 98
+[c] 99
+[d] 100
+[e] 101
+[f] 102
+[g] 103
+[h] 104
+[i] 105
+[j] 106
+[k] 107
+[l] 108
+[m] 109
+[n] 110
+[o] 111
+[p] 112
+[q] 113
+[r] 114
+[s] 115
+[t] 116
+[u] 117
+[v] 118
+[w] 119
+[x] 120
+[y] 121
+[z] 122
+[{] 123
+[|] 124
+[}] 125
+[~] 126
+[\u007f] 127
+[�] 128
+[�] 129
+[�] 130
+[�] 131
+[�] 132
+[�] 133
+[�] 134
+[�] 135
+[�] 136
+[�] 137
+[�] 138
+[�] 139
+[�] 140
+[�] 141
+[�] 142
+[�] 143
+[�] 144
+[�] 145
+[�] 146
+[�] 147
+[�] 148
+[�] 149
+[�] 150
+[�] 151
+[�] 152
+[�] 153
+[�] 154
+[�] 155
+[�] 156
+[�] 157
+[�] 158
+[�] 159
+[�] 160
+[�] 161
+[�] 162
+[�] 163
+[�] 164
+[�] 165
+[�] 166
+[�] 167
+[�] 168
+[�] 169
+[�] 170
+[�] 171
+[�] 172
+[�] 173
+[�] 174
+[�] 175
+[�] 176
+[�] 177
+[�] 178
+[�] 179
+[�] 180
+[�] 181
+[�] 182
+[�] 183
+[�] 184
+[�] 185
+[�] 186
+[�] 187
+[�] 188
+[�] 189
+[�] 190
+[�] 191
+[�] 192
+[�] 193
+[�] 194
+[�] 195
+[�] 196
+[�] 197
+[�] 198
+[�] 199
+[�] 200
+[�] 201
+[�] 202
+[�] 203
+[�] 204
+[�] 205
+[�] 206
+[�] 207
+[�] 208
+[�] 209
+[�] 210
+[�] 211
+[�] 212
+[�] 213
+[�] 214
+[�] 215
+[�] 216
+[�] 217
+[�] 218
+[�] 219
+[�] 220
+[�] 221
+[�] 222
+[�] 223
+[�] 224
+[�] 225
+[�] 226
+[�] 227
+[�] 228
+[�] 229
+[�] 230
+[�] 231
+[�] 232
+[�] 233
+[�] 234
+[�] 235
+[�] 236
+[�] 237
+[�] 238
+[�] 239
+[�] 240
+[�] 241
+[�] 242
+[�] 243
+[�] 244
+[�] 245
+[�] 246
+[�] 247
+[�] 248
+[�] 249
+[�] 250
+[�] 251
+[�] 252
+[�] 253
+[�] 254
+[�] 255
+[a][a] -> [aa] 256
+[aa][a] -> [aaa] 257
+[aaa][b] -> [aaab] 258
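The three merges recorded above can be replayed by hand to check the encoding that the test suite expects for `aaabdaaabac` (a sketch reusing the notebook's `merge` helper; the expected ids appear in `tests/test_tokenizer.py`):

```python
def merge(ids, pair, idx):
    # Replace every adjacent occurrence of `pair` in `ids` with the new token `idx`.
    new_ids, i = [], 0
    while i < len(ids):
        if i < len(ids) - 1 and (ids[i], ids[i + 1]) == pair:
            new_ids.append(idx)
            i += 2
        else:
            new_ids.append(ids[i])
            i += 1
    return new_ids

ids = list("aaabdaaabac".encode("utf-8"))
# wiki.vocab merges: [a][a] -> 256, [aa][a] -> 257, [aaa][b] -> 258
for pair, idx in [((97, 97), 256), ((256, 97), 257), ((257, 98), 258)]:
    ids = merge(ids, pair, idx)
print(ids)  # [258, 100, 258, 97, 99]
```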
tokenizer/setup.py
ADDED
@@ -0,0 +1,38 @@
+import os
+from setuptools import find_packages, setup
+
+here = os.path.abspath(os.path.dirname(__file__))
+
+# Get the code version
+version = {}
+with open(os.path.join(here, "bpetokenizer/version.py")) as f:
+    exec(f.read(), version)
+__version__ = version["__version__"]
+
+
+with open("README.md", "r", encoding="utf-8") as f:
+    long_description = f.read()
+
+
+setup(
+    name="bpetokenizer",
+    version=__version__,
+    description="Byte Pair Encoding Tokenizer with special tokens and regex pattern",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    url="https://github.com/Hk669/bpetokenizer",
+    author="Hrushikesh Dokala",
+    author_email="hrushi669@gmail.com",
+    license="MIT",
+    packages=find_packages(include=["bpetokenizer"]),
+    classifiers=[
+        "License :: OSI Approved :: MIT License",
+        "Programming Language :: Python :: 3",
+        "Operating System :: OS Independent",
+    ],
+    install_requires=["regex"],
+    extras_require={
+        "dev": ["pytest", "twine"],
+    },
+    python_requires=">=3.9,<3.13",
+)
tokenizer/tests/__pycache__/test_tokenizer.cpython-39-pytest-7.1.2.pyc
ADDED
Binary file (7.46 kB)
tokenizer/tests/test_tokenizer.py
ADDED
@@ -0,0 +1,69 @@
+import os
+import pytest
+from bpetokenizer import BPETokenizer, Tokenizer
+
+@pytest.fixture
+def tokenizer():
+    return Tokenizer()
+
+@pytest.fixture
+def bpe_tokenizer():
+    return BPETokenizer()
+
+
+def test_train():
+    """Test the training of the tokenizer."""
+    text = "aaabdaaabac"
+    tokenizer = Tokenizer()
+    tokenizer.train(text, 259, verbose=False)
+    assert len(tokenizer.vocab) == 259
+    assert len(tokenizer.merges) == 3
+    assert tokenizer.decode(tokenizer.encode(text)) == "aaabdaaabac"
+
+
+def test_encode():
+    """Test the encoding of the tokenizer."""
+    text = "aaabdaaabac"
+    tokenizer = Tokenizer()
+    tokenizer.train(text, 259, verbose=False)
+    assert tokenizer.encode("aaabdaaabac") == [258, 100, 258, 97, 99]
+
+
+def test_decode():
+    """Test the decoding of the tokenizer."""
+    text = "aaabdaaabac"
+    tokenizer = Tokenizer()
+    tokenizer.train(text, 259, verbose=False)
+    assert tokenizer.decode([258, 100, 258, 97, 99]) == "aaabdaaabac"
+
+
+def test_train_bpe():
+    """Test the training of the BPE tokenizer."""
+    text = "aaabdaaabac"
+    tokenizer = BPETokenizer()
+    tokenizer.train(text, 256 + 3, verbose=False)
+    assert len(tokenizer.vocab) == 259
+    assert len(tokenizer.merges) == 3
+    assert tokenizer.decode(tokenizer.encode(text)) == "aaabdaaabac"
+
+
+def test_train_bpe_w_special_tokens():
+    """Test the bpetokenizer with special tokens."""
+    special_tokens = {
+        "<|endoftext|>": 1001,
+        "<|startoftext|>": 1002,
+        "[SPECIAL1]": 1003,
+        "[SPECIAL2]": 1004,
+    }
+
+    PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
+    tokenizer = BPETokenizer(special_tokens=special_tokens, pattern=PATTERN)
+    texts = "<|startoftext|> Hello, World! This is a sample text with the special tokens [SPECIAL1] and [SPECIAL2] to test the tokenizer.<|endoftext|>"
+    tokenizer.train(texts, vocab_size=310, verbose=False)
+
+    assert len(tokenizer.vocab) == 310
+    assert len(tokenizer.merges) == 310 - 256
+    assert tokenizer.decode(tokenizer.encode(texts)) == texts
+    assert tokenizer.inverse_special_tokens == {v: k for k, v in special_tokens.items()}
+    assert tokenizer.special_tokens == special_tokens
+    assert tokenizer.pattern == PATTERN