add gemma_7b
.gitattributes
CHANGED
@@ -35,4 +35,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 vocab/belle_7b_2m/belle-7b-2m/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 vocab/bloom/tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+vocab/gemma_7b/gemma-7b/tokenizer.model filter=lfs diff=lfs merge=lfs -text
+vocab/gemma_7b/gemma-7b/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 vocab/
+
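The two new entries route the Gemma tokenizer artifacts through Git LFS, so a plain clone sees small pointer stubs unless LFS fetches the blobs. A minimal sketch of listing the LFS-tracked patterns (naive line parsing, and running from the repo root is an assumption):

from pathlib import Path

def lfs_tracked_patterns(path: str = ".gitattributes") -> list[str]:
    # A pattern is LFS-tracked when its attribute list contains filter=lfs.
    patterns = []
    for line in Path(path).read_text().splitlines():
        parts = line.split()
        if parts and "filter=lfs" in parts[1:]:
            patterns.append(parts[0])  # the path pattern is the first field
    return patterns

for pattern in lfs_tracked_patterns():
    print(pattern)  # e.g. vocab/gemma_7b/gemma-7b/tokenizer.model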
vocab/gemma_7b/__init__.py
CHANGED
@@ -1,7 +1,17 @@
 
-
+import os
+import config
 from transformers import AutoTokenizer
 
 tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b", trust_remote_code=True)
 
+
+
+if config.USE_REMOTE:
+    tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b", trust_remote_code=True)
+else:
+    CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
+    TOKENIZER_DIR = os.path.join(CURRENT_DIR, "gemma-7b")
+    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR, trust_remote_code=True)
+
 tokenizer.comments = ""
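The module still resolves a single `tokenizer` object at import time; `config.USE_REMOTE` now selects between the Hub checkpoint and the vendored copy under `vocab/gemma_7b/gemma-7b/`. A hedged usage sketch (the import path assumes `vocab/` is importable as a package, as the repo layout suggests):

# Hypothetical usage; adjust the import to wherever vocab/ lives on sys.path.
from vocab.gemma_7b import tokenizer

ids = tokenizer.encode("Hello, Gemma!")
print(ids)                                   # <bos> id is prepended (add_bos_token is true)
print(tokenizer.convert_ids_to_tokens(ids))  # inspect the token strings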
vocab/gemma_7b/gemma-7b/special_tokens_map.json
ADDED
@@ -0,0 +1,30 @@
+{
+  "bos_token": {
+    "content": "<bos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<eos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
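transformers reads this map when loading the tokenizer and exposes the four entries as attributes. A small sketch (loading from the local directory assumes the LFS blobs above have actually been pulled, not just the pointer stubs):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("vocab/gemma_7b/gemma-7b")
print(tok.bos_token, tok.eos_token, tok.pad_token, tok.unk_token)
# expected: <bos> <eos> <pad> <unk>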
vocab/gemma_7b/gemma-7b/tokenizer.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d0d908b4f9326e0998815690e325b6abbd378978553e10627924dd825db7e243
+size 17477553
vocab/gemma_7b/gemma-7b/tokenizer.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2
+size 4241003
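Both files are committed as Git LFS pointer stubs: the real blob lives in LFS storage, keyed by the sha256 oid and expected byte size. A generic sketch (not part of this repo) for checking a fetched blob against its pointer:

import hashlib
from pathlib import Path

def read_pointer(path: str) -> dict:
    # Pointer stubs are "key value" lines: version, oid sha256:<hex>, size <bytes>.
    fields = dict(line.split(" ", 1) for line in Path(path).read_text().splitlines() if line)
    return {"oid": fields["oid"].split(":", 1)[1], "size": int(fields["size"])}

def verify(blob_path: str, pointer_path: str) -> bool:
    meta = read_pointer(pointer_path)
    data = Path(blob_path).read_bytes()
    return len(data) == meta["size"] and hashlib.sha256(data).hexdigest() == meta["oid"]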
vocab/gemma_7b/gemma-7b/tokenizer_config.json
ADDED
@@ -0,0 +1,49 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<bos>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<eos>",
+  "legacy": null,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<pad>",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}
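At encode time this config means `<bos>` (id 2 per `added_tokens_decoder`) is prepended and no trailing `<eos>` is appended. A sketch of the observable effect (loading by Hub id here is an assumption; the local directory works too once LFS pulls):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("google/gemma-7b")
ids = tok("hello").input_ids
print(ids[0] == tok.bos_token_id)   # True: add_bos_token is true
print(ids[-1] == tok.eos_token_id)  # False: add_eos_token is false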