edaiofficial committed on
Commit
a2c106c
•
1 Parent(s): 45eca8f

additional commits for edo and urhobo

en-bin/main/best.ckpt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fce36367df49ca223c81143710ecd537dbea2696be41dc8e5cef09543921c5e6
-size 184697550
+oid sha256:ac68bafa1531f3e949b79c516db30c37910eeb5beeccad20dd7136c5eb8d555d
+size 155498304
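Both versions of `best.ckpt` here are Git LFS pointer files, not the checkpoint weights themselves: the repo tracks only the `version`, `oid`, and `size` fields, and LFS fetches the actual blob by its SHA-256. A minimal sketch of reading such a pointer, assuming the three-line layout shown in the hunk (the path is illustrative):

```python
# Sketch: parse a Git LFS pointer file into its key/value fields.
# Assumes the "version / oid / size" layout shown in the diff above;
# the path "en-bin/main/best.ckpt" is an example, not a guaranteed location.

def read_lfs_pointer(path: str) -> dict:
    fields = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            if key:
                fields[key] = value
    return fields

pointer = read_lfs_pointer("en-bin/main/best.ckpt")
print(pointer["oid"])   # e.g. "sha256:ac68ba..."
print(pointer["size"])  # blob size in bytes, e.g. "155498304"
```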
en-bin/main/config.yaml CHANGED
@@ -4,14 +4,14 @@ name: "enbin_transformer"
 data:
     src: "en"
     trg: "bin"
-    train: "/content/drive/My Drive/masakhane/en-bin-baseline/train"
-    dev: "/content/drive/My Drive/masakhane/en-bin-baseline/dev"
-    test: "/content/drive/My Drive/masakhane/en-bin-baseline/test"
+    train: "/content/drive/My Drive/masakhane/en-bin-baseline/train.bpe"
+    dev: "/content/drive/My Drive/masakhane/en-bin-baseline/dev.bpe"
+    test: "/content/drive/My Drive/masakhane/en-bin-baseline/test.bpe"
     level: "bpe"
     lowercase: False
     max_sent_length: 100
-    src_vocab: "/content/drive/My Drive/masakhane/en-bin-baseline/vocab-nonBPE.txt"
-    trg_vocab: "/content/drive/My Drive/masakhane/en-bin-baseline/vocab-nonBPE.txt"
+    src_vocab: "/content/drive/My Drive/masakhane/en-bin-baseline/vocab.txt"
+    trg_vocab: "/content/drive/My Drive/masakhane/en-bin-baseline/vocab.txt"

 testing:
     beam_size: 5
@@ -39,11 +39,11 @@ training:
     eval_batch_type: "token"
     batch_multiplier: 1
     early_stopping_metric: "ppl"
-    epochs: 120 # TODO: Decrease when just playing around and checking that training works; around 30 is enough for a sanity check.
+    epochs: 400 # TODO: Decrease when just playing around and checking that training works; around 30 is enough for a sanity check.
     validation_freq: 100 # TODO: Set to at least once per epoch.
     logging_freq: 100
     eval_metric: "bleu"
-    model_dir: "/content/drive/My Drive/masakhane/en-bin-baseline/models/enbin_transformer"
+    model_dir: ""
     overwrite: True # TODO: Set to True if you want to overwrite possibly existing models.
     shuffle: True
     use_cuda: True
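The substantive change in this file is the switch from raw word-level inputs and `vocab-nonBPE.txt` to BPE-segmented files (`train.bpe`, `dev.bpe`, `test.bpe`) with a shared `vocab.txt`. A minimal sketch of how such files are commonly produced with subword-nmt, the tool the Masakhane starter notebooks typically use; the file names and the 4000-merge count are assumptions, not taken from this commit:

```python
# Sketch: learn a joint BPE model and segment each split with subword-nmt.
# "train.en-bin" (concatenated source+target text) and the 4000 merges are
# assumptions for illustration; the commit does not record these choices.
import codecs
from subword_nmt.learn_bpe import learn_bpe
from subword_nmt.apply_bpe import BPE

# Learn BPE merges on the concatenated training text.
with codecs.open("train.en-bin", encoding="utf-8") as infile, \
     codecs.open("bpe.codes.4000", "w", encoding="utf-8") as outfile:
    learn_bpe(infile, outfile, num_symbols=4000)

with codecs.open("bpe.codes.4000", encoding="utf-8") as codes:
    bpe = BPE(codes)

# Segment each split; JoeyNMT reads the *.bpe prefixes named in config.yaml.
for split in ("train", "dev", "test"):
    for lang in ("en", "bin"):
        with codecs.open(f"{split}.{lang}", encoding="utf-8") as src, \
             codecs.open(f"{split}.bpe.{lang}", "w", encoding="utf-8") as out:
            for line in src:
                out.write(bpe.process_line(line))
```

JoeyNMT appends the language suffix to the `train`/`dev`/`test` prefixes in `config.yaml`, so the segmented files would be named e.g. `train.bpe.en` and `train.bpe.bin`.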
en-bin/main/src_vocab.txt CHANGED
The diff for this file is too large to render. See raw diff
en-bin/main/trg_vocab.txt CHANGED
The diff for this file is too large to render. See raw diff
en-urh/{jw300-baseline → main}/English_to_Urhobo_BPE_notebook.ipynb RENAMED
File without changes
en-urh/{jw300-baseline → main}/English_to_Urhobo_Word-level_notebook.ipynb RENAMED
File without changes
en-urh/{jw300-baseline → main}/README.md RENAMED
File without changes
en-urh/main/best.ckpt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9c4d349b6ae2fece09947cac59d66c34d8ae9464d6c5de115479cb2f99d56e30
+size 212008098
en-urh/main/config.yaml ADDED
@@ -0,0 +1,85 @@
+
+name: "enurh_transformer"
+
+data:
+    src: "en"
+    trg: "urh"
+    train: "data/enurh/train"
+    dev: "data/enurh/dev"
+    test: "data/enurh/test"
+    level: "word"
+    lowercase: False
+    max_sent_length: 100
+    src_vocab: "data/enurh/vocab-nonBPE.txt"
+    trg_vocab: "data/enurh/vocab-nonBPE.txt"
+
+testing:
+    beam_size: 5
+    alpha: 1.0
+
+training:
+    #load_model: "/content/drive/My Drive/masakhane/en-urh-baseline/models/enurh_transformer/1.ckpt" # if uncommented, load a pre-trained model from this checkpoint
+    random_seed: 42
+    optimizer: "adam"
+    normalization: "tokens"
+    adam_betas: [0.9, 0.999]
+    scheduling: "plateau" # TODO: try switching from plateau to Noam scheduling
+    patience: 5 # For plateau: decrease learning rate by decrease_factor if validation score has not improved for this many validation rounds.
+    learning_rate_factor: 0.5 # factor for Noam scheduler (used with Transformer)
+    learning_rate_warmup: 1000 # warmup steps for Noam scheduler (used with Transformer)
+    decrease_factor: 0.7
+    loss: "crossentropy"
+    learning_rate: 0.0003
+    learning_rate_min: 0.00000001
+    weight_decay: 0.0
+    label_smoothing: 0.1
+    batch_size: 4096
+    batch_type: "token"
+    eval_batch_size: 3600
+    eval_batch_type: "token"
+    batch_multiplier: 1
+    early_stopping_metric: "ppl"
+    epochs: 150 # TODO: Decrease when just playing around and checking that training works; around 30 is enough for a sanity check.
+    validation_freq: 1000 # TODO: Set to at least once per epoch.
+    logging_freq: 100
+    eval_metric: "bleu"
+    model_dir: "models/enurh_transformer"
+    overwrite: True # TODO: Set to True if you want to overwrite possibly existing models.
+    shuffle: True
+    use_cuda: True
+    max_output_length: 100
+    print_valid_sents: [0, 1, 2, 3]
+    keep_last_ckpts: 3
+
+model:
+    initializer: "xavier"
+    bias_initializer: "zeros"
+    init_gain: 1.0
+    embed_initializer: "xavier"
+    embed_init_gain: 1.0
+    tied_embeddings: True
+    tied_softmax: True
+    encoder:
+        type: "transformer"
+        num_layers: 6
+        num_heads: 4 # TODO: Increase to 8 for larger data.
+        embeddings:
+            embedding_dim: 256 # TODO: Increase to 512 for larger data.
+            scale: True
+            dropout: 0.2
+        # typically ff_size = 4 x hidden_size
+        hidden_size: 256 # TODO: Increase to 512 for larger data.
+        ff_size: 1024 # TODO: Increase to 2048 for larger data.
+        dropout: 0.3
+    decoder:
+        type: "transformer"
+        num_layers: 6
+        num_heads: 4 # TODO: Increase to 8 for larger data.
+        embeddings:
+            embedding_dim: 256 # TODO: Increase to 512 for larger data.
+            scale: True
+            dropout: 0.2
+        # typically ff_size = 4 x hidden_size
+        hidden_size: 256 # TODO: Increase to 512 for larger data.
+        ff_size: 1024 # TODO: Increase to 2048 for larger data.
+        dropout: 0.3
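The `training` block above keeps `scheduling: "plateau"` but already carries Noam-style knobs (`learning_rate_factor`, `learning_rate_warmup`), matching the TODO about trying Noam scheduling. A sketch of the Noam learning-rate curve from "Attention Is All You Need", plugged with this config's values (`hidden_size: 256`, warmup 1000, factor 0.5); JoeyNMT's internal implementation may differ in detail:

```python
# Sketch of the Noam schedule: linear warmup for `warmup` steps,
# then decay proportional to step**-0.5, scaled by model size and a factor.
# Values mirror the config above; this is illustrative, not JoeyNMT's code.

def noam_lr(step: int, model_size: int = 256, factor: float = 0.5,
            warmup: int = 1000) -> float:
    step = max(step, 1)  # avoid 0**-0.5 at the first step
    return factor * model_size ** -0.5 * min(step ** -0.5,
                                             step * warmup ** -1.5)

for step in (100, 1000, 10000):
    print(step, f"{noam_lr(step):.6f}")  # peaks near step 1000 (~0.000988)
```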
en-urh/main/drive-download-20211020T042645Z-001.zip ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:231461f7c6bbe7fb25ab1a6662d8cf2cf755d2a5a20669e6b3bd2db893e79a81
+size 194637565
en-urh/main/src_vocab.txt ADDED
The diff for this file is too large to render. See raw diff
en-urh/{jw300-baseline → main}/test.en RENAMED
File without changes
en-urh/{jw300-baseline → main}/test.urh RENAMED
File without changes
en-urh/main/trg_vocab.txt ADDED
The diff for this file is too large to render. See raw diff
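With the config and vocabularies committed, training would typically be launched through JoeyNMT's CLI. A sketch assuming JoeyNMT is installed (`pip install joeynmt`) and that the relative `data/enurh/...` paths in the config resolve from the working directory; the config path mirrors this repo's layout and is an assumption:

```python
# Sketch: launch JoeyNMT training on the en-urh config added above.
# Checkpoints such as best.ckpt land in the config's model_dir
# ("models/enurh_transformer").
import subprocess

subprocess.run(
    ["python3", "-m", "joeynmt", "train", "en-urh/main/config.yaml"],
    check=True,
)
```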