winstxnhdw committed on
Commit
67f7ef2
1 Parent(s): 9bc76c4

feat: initialise repository and add model file

config.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "bos_token": "",
+   "eos_token": "<|endoftext|>",
+   "layer_norm_epsilon": null,
+   "unk_token": "<|unk|>"
+ }
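Together with model.bin and vocabulary.json below, this config.json resembles the layout of a CTranslate2 model directory, with the converted model's special tokens recorded alongside the weights. As a minimal sketch using only the standard library (the local path is a placeholder for wherever this repository is checked out), the values above can be inspected like so:

import json

# Placeholder path to a local checkout of this repository.
with open('config.json', encoding='utf-8') as file:
    config = json.load(file)

# Prints the special tokens declared above, e.g. '<|endoftext|>' and '<|unk|>'.
print(config['eos_token'], config['unk_token'])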
model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1ef2050e5feb3ed0e734426a1050c07c5da47740be500156b32546db0a12b92a
+ size 2604249008
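model.bin is stored through Git LFS, so the three lines above are only a pointer; the actual ~2.6 GB weights live in LFS storage. A minimal sketch for resolving the real file with huggingface_hub (the repository id is a placeholder, since the commit does not name it):

from huggingface_hub import hf_hub_download

# Placeholder repository id; replace with the repository this commit belongs to.
weights_path = hf_hub_download(repo_id='<user>/<repo>', filename='model.bin')
print(weights_path)  # local cache path of the downloaded binary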
pyproject.toml ADDED
@@ -0,0 +1,19 @@
+ [tool.pylint.format]
+ good-names = [
+     'x', 'y', 'z', 'id', 'ok'
+ ]
+
+ [tool.pylint.messages_control]
+ disable = [
+     'missing-module-docstring',
+     'no-name-in-module',
+     'useless-import-alias',
+     'line-too-long',
+     'too-many-arguments',
+     'too-few-public-methods'
+ ]
+
+ [tool.pyright]
+ reportUnknownVariableType = false
+ reportUnknownMemberType = false
+ reportMissingTypeStubs = false
replit_lm_tokenizer.py ADDED
@@ -0,0 +1,146 @@
+ # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """
+ Forked from the file src/transformers/models/bert_generation/tokenization_bert_generation.py from the HuggingFace Transformers library.
+ Permalink: https://github.com/huggingface/transformers/blob/04ab5605fbb4ef207b10bf2772d88c53fc242e83/src/transformers/models/bert_generation/tokenization_bert_generation.py
+
+ Tokenizer class for ReplitLM.
+ The class is modified for compatibility with a custom vocabulary and to achieve the desired encode/decode behavior for the Replit Code V1 3B model.
+ """
+ import os
+ from shutil import copyfile
+ from typing import Any
+
+ from sentencepiece import SentencePieceProcessor
+ from transformers import PreTrainedTokenizer
+
+ VOCAB_FILES_NAMES = {'vocab_file': 'spiece.model'}
+
+ class ReplitLMTokenizer(PreTrainedTokenizer):
+     """
+     Construct a ReplitLMTokenizer tokenizer, based on [SentencePiece](https://github.com/google/sentencepiece).
+     This tokenizer inherits from [`PreTrainedTokenizer`], which contains most of the main methods.
+
+     Args:
+         vocab_file (`str`):
+             [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
+             contains the vocabulary necessary to instantiate a tokenizer.
+         eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+             The end-of-sequence token.
+         bos_token (`str`, *optional*, defaults to `None`):
+             The beginning-of-sequence token.
+         unk_token (`str`, *optional*, defaults to `"<|unk|>"`):
+             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to this
+             token instead.
+         pad_token (`str`, *optional*, defaults to `"<|pad|>"`):
+             The token used for padding, for example when batching sequences of different lengths.
+         sp_model_kwargs (`dict`, *optional*):
+             Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
+             SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
+             to set:
+             - `enable_sampling`: Enable subword regularization.
+             - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-dropout.
+                 - `nbest_size = {0,1}`: No sampling is performed.
+                 - `nbest_size > 1`: Samples from the nbest_size results.
+                 - `nbest_size < 0`: Assumes that nbest_size is infinite and samples from all hypotheses (lattice)
+                   using the forward-filtering-and-backward-sampling algorithm.
+             - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+               BPE-dropout.
+     """
+     vocab_files_names = VOCAB_FILES_NAMES
+     prefix_tokens: list[int] = []
+     model_input_names = ['input_ids', 'attention_mask']
+
+     def __init__(
+         self,
+         vocab_file: str,
+         bos_token: str | None = None,
+         eos_token: str | None = '<|endoftext|>',
+         unk_token: str | None = '<|unk|>',
+         pad_token: str | None = '<|pad|>',
+         sep_token: str | None = None,
+         sp_model_kwargs: dict[str, Any] | None = None,
+         **kwargs: dict[str, Any]
+     ):
+         self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+         super().__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, pad_token=pad_token, sep_token=sep_token, sp_model_kwargs=self.sp_model_kwargs, **kwargs)
+         self.vocab_file = vocab_file
+         self.sp_model = SentencePieceProcessor(**self.sp_model_kwargs)
+         self.sp_model.Load(vocab_file)
+
+     @property
+     def vocab_size(self) -> int:
+         return self.sp_model.GetPieceSize()
+
+
+     def get_vocab(self):
+         vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+         vocab.update(self.added_tokens_encoder)
+         return vocab
+
+
+     def __getstate__(self):
+         state = self.__dict__.copy()
+         state['sp_model'] = None
+         return state
+
+
+     def __setstate__(self, dictionary: dict[Any, Any]):
+         self.__dict__ = dictionary
+         if not hasattr(self, 'sp_model_kwargs'):
+             self.sp_model_kwargs = {}
+
+         self.sp_model = SentencePieceProcessor(**self.sp_model_kwargs)
+         self.sp_model.Load(self.vocab_file)
+
+
+     def _tokenize(self, text: str, **_) -> list[str]:
+         """Take a string as input and return a list of strings (tokens) for words/sub-words."""
+         return self.sp_model.Encode(text, out_type=str)
+
+
+     def _convert_token_to_id(self, token: str) -> int:
+         """Converts a token (str) to an id using the vocab."""
+         return self.sp_model.PieceToId(token)
+
+
+     def _convert_id_to_token(self, index: int) -> str:
+         """Converts an index (integer) to a token (str) using the vocab."""
+         return self.sp_model.IdToPiece(index)
+
+
+     def convert_tokens_to_string(self, tokens: list[str]) -> str:
+         """Converts a sequence of tokens (strings) to a single string."""
+         return self.sp_model.Decode(tokens)
+
+
+     def save_vocabulary(self, save_directory: str, filename_prefix: str | None = None) -> tuple[str]:
+         if not os.path.isdir(save_directory):
+             raise ValueError(f'Vocabulary path ({save_directory}) should be a directory')
+
+         out_vocab_file = os.path.join(
+             save_directory,
+             f"{filename_prefix}{'-' if filename_prefix else ''}{VOCAB_FILES_NAMES['vocab_file']}"
+         )
+
+         if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
+             copyfile(self.vocab_file, out_vocab_file)
+
+         elif not os.path.isfile(self.vocab_file):
+             with open(out_vocab_file, 'wb') as file:
+                 content_spiece_model = self.sp_model.serialized_model_proto()
+                 file.write(content_spiece_model)  # type: ignore
+
+         return (out_vocab_file,)
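A minimal usage sketch for the class above, assuming the spiece.model added in this commit is available locally; it instantiates ReplitLMTokenizer directly rather than through AutoTokenizer (the auto_map route is shown under tokenizer_config.json below):

from replit_lm_tokenizer import ReplitLMTokenizer

# spiece.model is the SentencePiece file added in this commit.
tokenizer = ReplitLMTokenizer('spiece.model')

tokens = tokenizer.tokenize('def fibonacci(n):')  # sub-word strings from SentencePiece
ids = tokenizer.encode('def fibonacci(n):')       # the corresponding integer ids
print(tokens)
print(tokenizer.decode(ids))                      # round-trips back to the original text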
spiece.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7e1ba8b7df0701723d2d901c7a42182fe77bf0045173f2cdb474ca6ea3eb1c02
+ size 707660
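spiece.model is likewise a Git LFS pointer, here to the serialized SentencePiece model that ReplitLMTokenizer wraps. For reference, a sketch of what the tokenizer does under the hood using the raw SentencePiece API alone:

from sentencepiece import SentencePieceProcessor

sp_model = SentencePieceProcessor()
sp_model.Load('spiece.model')  # the file above, once resolved through LFS

pieces = sp_model.Encode('print("hello")', out_type=str)  # mirrors ReplitLMTokenizer._tokenize
print(pieces)
print(sp_model.Decode(pieces))                            # mirrors convert_tokens_to_string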
tokenizer_config.json ADDED
@@ -0,0 +1,18 @@
+ {
+   "auto_map": {
+     "AutoTokenizer": [
+       "replit_lm_tokenizer.ReplitLMTokenizer",
+       null
+     ]
+   },
+   "bos_token": null,
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|endoftext|>",
+   "model_max_length": 2048,
+   "pad_token": "<|pad|>",
+   "padding_side": "right",
+   "sep_token": null,
+   "sp_model_kwargs": {},
+   "tokenizer_class": "ReplitLMTokenizer",
+   "unk_token": "<|unk|>"
+ }
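The auto_map entry above is what lets transformers resolve the custom class from replit_lm_tokenizer.py when remote code is trusted. A sketch of that loading path (the directory path is a placeholder for a local checkout of this repository):

from transformers import AutoTokenizer

# trust_remote_code is required because the tokenizer class ships with this
# repository rather than with the transformers package itself.
tokenizer = AutoTokenizer.from_pretrained('path/to/this/repository', trust_remote_code=True)

print(type(tokenizer).__name__)      # expected: ReplitLMTokenizer
print(tokenizer('x = 1').input_ids)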
vocabulary.json ADDED
The diff for this file is too large to render.
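Taken together, model.bin, config.json, and vocabulary.json resemble a CTranslate2 model directory; that is an inference from the file layout, not something the commit states. If it holds, generation would follow the usual CTranslate2 pattern of passing token strings to a Generator, roughly as in this sketch (paths are placeholders):

import ctranslate2
from transformers import AutoTokenizer

model_dir = 'path/to/this/repository'  # placeholder checkout path
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
generator = ctranslate2.Generator(model_dir, device='cpu')

prompt = 'def fibonacci(n):'
tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(prompt))

results = generator.generate_batch([tokens], max_length=64, sampling_temperature=0.2)
print(tokenizer.decode(results[0].sequences_ids[0]))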