nicholasKluge committed
Commit eb32620
1 Parent(s): 36f7a59

Update README.md

Files changed (1)
  1. README.md +27 -31
README.md CHANGED
@@ -66,37 +66,33 @@ This repository has the [source code](https://github.com/Nkluge-correa/Aira) use
 
  ## Training Set-up
 
- | Section        | Setting                     | Value                                |
- |----------------|-----------------------------|--------------------------------------|
- | Model args.    | vocab_size                  | 32000                                |
- |                | hidden_size                 | 768                                  |
- |                | intermediate_size           | 3072                                 |
- |                | max_position_embeddings     | 2048                                 |
- |                | num_attention_heads         | 12                                   |
- |                | num_hidden_layers           | 12                                   |
- |                | num_key_value_heads         | 12                                   |
- |                | torch_dtype                 | "float32"                            |
- | Data args.     | dataset_name                | "nicholasKluge/portuguese-corpus-v3" |
- |                | dataset_split               | "train"                              |
- |                | train_num_samples           | 1831873                              |
- |                | val_num_samples             | 18000                                |
- |                | block_size                  | 2048                                 |
- | Training args. | evaluation_strategy         | "steps"                              |
- |                | eval_steps                  | 100000                               |
- |                | per_device_train_batch_size | 4                                    |
- |                | per_device_eval_batch_size  | 4                                    |
- |                | gradient_accumulation_steps | 1                                    |
- |                | learning_rate               | 0.0006                               |
- |                | adam_epsilon                | 0.00000001                           |
- |                | weight_decay                | 0.01                                 |
- |                | lr_scheduler_type           | "cosine"                             |
- |                | warmup_ratio                | 0.01                                 |
- |                | num_train_epochs            | 1                                    |
- |                | gradient_checkpointing      | false                                |
- |                | seed                        | 42                                   |
- |                | mixed_precision             | 'no'                                 |
- |                | checkpointing_steps         | 22000                                |
- |                | tf32                        | true                                 |
+ | Arguments                   | Value                                |
+ |-----------------------------|--------------------------------------|
+ | vocabulary size             | 32000                                |
+ | hidden dimension size       | 768                                  |
+ | intermediate dimension size | 3072                                 |
+ | context length              | 2048                                 |
+ | nº attention heads          | 12                                   |
+ | nº hidden layers            | 12                                   |
+ | nº key value heads          | 12                                   |
+ | nº training samples         | 1831873                              |
+ | validation samples          | 18000                                |
+ | nº epochs                   | 1                                    |
+ | evaluation steps            | 100000                               |
+ | train batch size            | 4                                    |
+ | eval batch size             | 4                                    |
+ | gradient accumulation steps | 1                                    |
+ | learning rate               | 0.0006                               |
+ | adam epsilon                | 0.00000001                           |
+ | weight decay                | 0.01                                 |
+ | scheduler type              | "cosine"                             |
+ | warmup ratio                | 0.01                                 |
+ | gradient checkpointing      | false                                |
+ | seed                        | 42                                   |
+ | mixed precision             | 'no'                                 |
+ | torch dtype                 | "float32"                            |
+ | tf32                        | true                                 |
+
 
  ## Basic usage
 
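
For reference, here is a minimal sketch of how the arguments in the updated table could be expressed with Hugging Face `transformers`. The use of `LlamaConfig`, `TrainingArguments`, and the `output_dir` value are assumptions made for illustration only; the actual training script (which uses Accelerate-style options such as `mixed_precision` and `checkpointing_steps`) lives in the linked Aira repository.

```python
# Sketch of the table above as Transformers objects. LlamaConfig and
# TrainingArguments are assumptions for illustration, not the repository's
# own training code; mixed_precision and checkpointing_steps are
# Accelerate-style options and are therefore not shown here.
from transformers import LlamaConfig, TrainingArguments

model_config = LlamaConfig(
    vocab_size=32000,
    hidden_size=768,
    intermediate_size=3072,
    max_position_embeddings=2048,   # context length
    num_attention_heads=12,
    num_hidden_layers=12,
    num_key_value_heads=12,
    torch_dtype="float32",
)

training_args = TrainingArguments(
    output_dir="checkpoints",       # hypothetical path, not given in the table
    evaluation_strategy="steps",
    eval_steps=100_000,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    learning_rate=6e-4,
    adam_epsilon=1e-8,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    warmup_ratio=0.01,
    num_train_epochs=1,
    gradient_checkpointing=False,
    seed=42,
    tf32=True,
)
```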