Christopher Akiki committed on
Commit
431fc47
1 Parent(s): a5726e8

Add sentdex model and tokenizer

.gitattributes CHANGED
@@ -15,3 +15,4 @@
15   *.pt filter=lfs diff=lfs merge=lfs -text
16   *.pth filter=lfs diff=lfs merge=lfs -text
17   *tfevents* filter=lfs diff=lfs merge=lfs -text
18 + saved_model/**/* filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
@@ -0,0 +1 @@
1 + {"<|endoftext|>": 52000}
clone_sentdex_model_tokenizer.py ADDED
@@ -0,0 +1,9 @@
1 + #!/usr/bin/env python
2 + from transformers import AutoTokenizer, TFAutoModelForCausalLM
3 + import tensorflow as tf
4 +
5 + tokenizer = AutoTokenizer.from_pretrained("Sentdex/GPyT")
6 + model = TFAutoModelForCausalLM.from_pretrained("Sentdex/GPyT", from_pt=True)
7 +
8 + tokenizer.save_pretrained(save_directory='./')
9 + model.save_pretrained(save_directory='./', saved_model=True)
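
Note: a minimal sketch of how the converted checkpoint could be sanity-checked after running the script above. It assumes the script has already written the tokenizer and TF weights to the current directory; the prompt string is only an illustrative example.

    from transformers import AutoTokenizer, TFAutoModelForCausalLM

    # Load the tokenizer and TF weights that clone_sentdex_model_tokenizer.py saved to './'.
    tokenizer = AutoTokenizer.from_pretrained("./")
    model = TFAutoModelForCausalLM.from_pretrained("./")

    # Greedily extend a short Python prompt by a few tokens.
    inputs = tokenizer("import numpy as np", return_tensors="tf")
    output_ids = model.generate(inputs["input_ids"], max_length=24)
    print(tokenizer.decode(output_ids[0]))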
config.json ADDED
@@ -0,0 +1,31 @@
 1 + {
 2 +   "_name_or_path": "Sentdex/GPyT",
 3 +   "activation_function": "gelu_new",
 4 +   "architectures": [
 5 +     "GPT2LMHeadModel"
 6 +   ],
 7 +   "attn_pdrop": 0.1,
 8 +   "bos_token_id": 0,
 9 +   "embd_pdrop": 0.1,
10 +   "eos_token_id": 2,
11 +   "gradient_checkpointing": false,
12 +   "initializer_range": 0.02,
13 +   "layer_norm_epsilon": 1e-05,
14 +   "model_type": "gpt2",
15 +   "n_ctx": 1024,
16 +   "n_embd": 768,
17 +   "n_head": 12,
18 +   "n_inner": null,
19 +   "n_layer": 12,
20 +   "n_positions": 1024,
21 +   "resid_pdrop": 0.1,
22 +   "scale_attn_weights": true,
23 +   "summary_activation": null,
24 +   "summary_first_dropout": 0.1,
25 +   "summary_proj_to_labels": true,
26 +   "summary_type": "cls_index",
27 +   "summary_use_proj": true,
28 +   "transformers_version": "4.9.1",
29 +   "use_cache": true,
30 +   "vocab_size": 52000
31 + }
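
For reference, the config above describes a GPT-2-small-sized causal LM (12 layers, 12 heads, 768-dim embeddings, a 52000-token vocabulary). A minimal sketch of reading it back, assuming the repository has been cloned locally so the files above sit in the working directory:

    from transformers import AutoConfig

    config = AutoConfig.from_pretrained("./")  # parses the config.json shown above
    print(config.model_type, config.n_layer, config.n_head, config.n_embd, config.vocab_size)
    # expected from the values above: gpt2 12 12 768 52000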
merges.txt ADDED
The diff for this file is too large to render. See raw diff
saved_model/1/keras_metadata.pb ADDED
@@ -0,0 +1,3 @@
1 + version https://git-lfs.github.com/spec/v1
2 + oid sha256:6d421caaf137a4c2050961971319ae0316e50664cfb590c7c5f444bb5d64b76d
3 + size 67064
saved_model/1/saved_model.pb ADDED
@@ -0,0 +1,3 @@
1 + version https://git-lfs.github.com/spec/v1
2 + oid sha256:e949c88949ecccebec606466c6a09546d1142140ae8455c5571b9c4ac7db5efd
3 + size 5079982
saved_model/1/variables/variables.data-00000-of-00001 ADDED
@@ -0,0 +1,3 @@
1 + version https://git-lfs.github.com/spec/v1
2 + oid sha256:449d69e48d6fd879391fd8e811d76b6fe5a0a2ef3f9ec181d7266fdb829ffec1
3 + size 503191134
saved_model/1/variables/variables.index ADDED
@@ -0,0 +1,3 @@
1 + version https://git-lfs.github.com/spec/v1
2 + oid sha256:3d770805de94ce9274fbc80084f94a83aaa950cac80934211ef5d6741cb326b5
3 + size 8900
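
The saved_model/1/ directory is a TensorFlow SavedModel export (produced by the saved_model=True call in the script above), so it can in principle be loaded without transformers, e.g. for TF Serving. A minimal sketch, assuming the repository is checked out locally with the Git LFS files pulled:

    import tensorflow as tf

    # Load the exported SavedModel; the signatures typically include the default
    # serving signature attached by save_pretrained(..., saved_model=True).
    loaded = tf.saved_model.load("saved_model/1")
    print(list(loaded.signatures.keys()))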
special_tokens_map.json ADDED
@@ -0,0 +1 @@
1 + {"bos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "eos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "unk_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}}
tf_model.h5 ADDED
@@ -0,0 +1,3 @@
1 + version https://git-lfs.github.com/spec/v1
2 + oid sha256:4afd368410a15f1aae0c996ce4b11fd1b6205147a659668064c09418f06f2cb5
3 + size 503289416
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
tokenizer_config.json ADDED
@@ -0,0 +1 @@
1 + {"unk_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "bos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "eos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "add_prefix_space": false, "special_tokens_map_file": null, "name_or_path": "Sentdex/GPyT", "errors": "replace", "tokenizer_class": "GPT2Tokenizer"}
vocab.json ADDED
The diff for this file is too large to render. See raw diff