sarnikowski committed on
Commit
6086e12
1 Parent(s): 58d456c

release: v0.1.0

Browse files
README.md ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: da
3
+ license: cc-by-4.0
4
+ ---
5
+
6
+ # Danish ConvBERT small (cased)
7
+
8
+ [ConvBERT](https://arxiv.org/abs/2008.02496) model pretrained on a custom Danish corpus (~17.5 GB).
9
+ For details regarding data sources and training procedure, along with benchmarks on downstream tasks, go to: https://github.com/sarnikowski/danish_transformers
10
+
11
+ ## Usage
12
+
13
+ ```python
14
+ from transformers import ConvBertTokenizer, ConvBertModel
15
+
16
+ tokenizer = ConvBertTokenizer.from_pretrained("sarnikowski/convbert-small-da-cased")
17
+ model = ConvBertModel.from_pretrained("sarnikowski/convbert-small-da-cased")
18
+ ```
19
+
20
+ ## Questions?
21
+
22
+ If you have any questions, feel free to open an issue on the [danish_transformers](https://github.com/sarnikowski/danish_transformers) repository or send an email to p.sarnikowski@gmail.com.
config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": ".",
3
+ "architectures": [
4
+ "ConvBertForPreTraining"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "conv_kernel_size": 9,
9
+ "directionality": "bidi",
10
+ "embedding_size": 128,
11
+ "eos_token_id": 2,
12
+ "head_ratio": 2,
13
+ "hidden_act": "gelu",
14
+ "hidden_dropout_prob": 0.1,
15
+ "hidden_size": 256,
16
+ "initializer_range": 0.02,
17
+ "intermediate_size": 1024,
18
+ "layer_norm_eps": 1e-12,
19
+ "max_position_embeddings": 512,
20
+ "model_type": "convbert",
21
+ "num_attention_heads": 4,
22
+ "num_groups": 1,
23
+ "num_hidden_layers": 12,
24
+ "pad_token_id": 0,
25
+ "transformers_version": "4.4.0.dev0",
26
+ "type_vocab_size": 2,
27
+ "vocab_size": 28995
28
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ee5811a108a8f098e44faf91bd96f6e70351d5e4967210257fbd2a98ce6850f
3
+ size 52183983
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
tf_model.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f55327b06240e4635212dde7ffb214acd782417fd8b81dbb5d8cfa453bc8f074
3
+ size 52129664
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"do_lower_case": false}
vocab.txt ADDED
The diff for this file is too large to render. See raw diff