Jón Daðason committed on
Commit 3162693
1 Parent(s): 7df00fe

Adding model

README.md CHANGED
@@ -1,3 +1,15 @@
 ---
+language:
+- is
 license: cc-by-4.0
+datasets:
+- igc
 ---
+
+# Icelandic GPT-2 model
+This Icelandic GPT-2 language model was pretrained on the [Icelandic Gigaword Corpus](http://igc.arnastofnun.is/) (IGC, 2020 version), which contains approximately 1,532 million running words. The model was trained for 20 epochs on a TPU v3-8, with a total training time of 3 days and 21 hours. The hyperparameters used for training can be found in the [JAX/Flax documentation](https://github.com/huggingface/transformers/tree/main/examples/flax/language-modeling#train-model-1) for the Transformers library. The model uses a byte-level BPE tokenizer with a vocabulary size of 51,000.
+
+**Note**: This model was pretrained on a tokenized and sentence-segmented version of the IGC, which is reflected in the generated text. A new version of this model, trained on an untokenized version of the IGC (2022 version), is forthcoming.
+
+# Acknowledgments
+This research was supported with Cloud TPUs from Google's TPU Research Cloud (TRC).
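Since the commit adds standard GPT-2 weights in both Flax and PyTorch formats (see the files below), the checkpoint should load through the ordinary Transformers text-generation API. A minimal sketch in Python; the repository id is not stated in this commit, so the one used here is a placeholder:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder repo id -- the actual repository name is not shown in this commit.
repo_id = "username/icelandic-gpt2"

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id)

# config.json below ships task_specific_params for text-generation
# (do_sample=True, max_length=50); the same settings are passed explicitly here.
inputs = tokenizer("Ísland er", return_tensors="pt")
outputs = model.generate(**inputs, do_sample=True, max_length=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```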
config.json ADDED
@@ -0,0 +1,38 @@
+{
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.0,
+  "bos_token_id": 50256,
+  "embd_pdrop": 0.0,
+  "eos_token_id": 50256,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 1024,
+  "n_embd": 768,
+  "n_head": 12,
+  "n_inner": null,
+  "n_layer": 12,
+  "n_positions": 1024,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.0,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "task_specific_params": {
+    "text-generation": {
+      "do_sample": true,
+      "max_length": 50
+    }
+  },
+  "torch_dtype": "float32",
+  "transformers_version": "4.17.0.dev0",
+  "use_cache": true,
+  "vocab_size": 51000
+}
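These are the standard GPT-2 small dimensions (12 layers, 12 heads, 768-dimensional embeddings) with dropout disabled and the vocabulary enlarged to 51,000. As a rough sanity check, the sketch below rebuilds the config and counts parameters; the total comes to about 125M, consistent with the ~500 MB float32 Flax checkpoint added next (125M parameters × 4 bytes):

```python
from transformers import GPT2Config, GPT2LMHeadModel

# Rebuild the architecture from the fields in config.json above;
# unlisted fields fall back to the GPT-2 defaults.
config = GPT2Config(
    vocab_size=51000,
    n_positions=1024,
    n_embd=768,
    n_layer=12,
    n_head=12,
    attn_pdrop=0.0,
    embd_pdrop=0.0,
    resid_pdrop=0.0,
)

# Randomly initialised model, used only to inspect the parameter count.
model = GPT2LMHeadModel(config)
n_params = sum(p.numel() for p in model.parameters())
print(f"{n_params / 1e6:.1f}M parameters")  # roughly 125M
```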
flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f2dbccf2e3ec779ab56e2912bfdefc0aa7178c392ef3fc1f5c33c9a8ce0bab83
+size 500046616
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ba9089b8874a22f0e727485b8f078fe6df0ad94f2033de12365827dfb255a388
+size 512683881
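Both weight files are committed as Git LFS pointers: only the `oid` and `size` metadata live in the repository, and the binaries are fetched on clone. If a download needs to be checked against its pointer, something like the following works (file name taken from this commit; a local path is assumed):

```python
import hashlib

# Expected values copied from the pytorch_model.bin LFS pointer above.
EXPECTED_SHA256 = "ba9089b8874a22f0e727485b8f078fe6df0ad94f2033de12365827dfb255a388"
EXPECTED_SIZE = 512683881

def verify(path: str) -> bool:
    """Check a downloaded file against its Git LFS oid/size."""
    digest = hashlib.sha256()
    size = 0
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
            size += len(chunk)
    return size == EXPECTED_SIZE and digest.hexdigest() == EXPECTED_SHA256

print(verify("pytorch_model.bin"))  # True if the download is intact
```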
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+{"model_max_length": 512, "special_tokens_map_file": null, "tokenizer_class": "GPT2Tokenizer"}