=== Dataset Information ===

Total tokens: 268,435,456
Estimated size: 1.00 GB
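The size figure is consistent with one 4-byte (int32) ID per token: 268,435,456 tokens x 4 bytes = 1,073,741,824 bytes, i.e. exactly 1 GiB. A minimal sketch of that arithmetic, assuming the 4-bytes-per-token encoding (the report does not state how the estimate was computed):

    # Back out the "Estimated size" figure from the token count.
    # Assumption: each token is stored as a 4-byte (int32) ID; the report
    # does not say how the estimate was computed.
    TOTAL_TOKENS = 268_435_456  # 2**28
    BYTES_PER_TOKEN = 4

    size_bytes = TOTAL_TOKENS * BYTES_PER_TOKEN
    print(f"{size_bytes / 2**30:.2f} GiB")  # -> 1.00 GiB
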
=== Model Architecture ===

return_dict: True
output_hidden_states: False
output_attentions: False
torchscript: False
torch_dtype: float32
use_bfloat16: False
tf_legacy_loss: False
pruned_heads: {}
tie_word_embeddings: True
chunk_size_feed_forward: 0
is_encoder_decoder: False
is_decoder: False
cross_attention_hidden_size: None
add_cross_attention: False
tie_encoder_decoder: False
max_length: 20
min_length: 0
do_sample: False
early_stopping: False
num_beams: 1
num_beam_groups: 1
diversity_penalty: 0.0
temperature: 1.0
top_k: 50
top_p: 1.0
typical_p: 1.0
repetition_penalty: 1.0
length_penalty: 1.0
no_repeat_ngram_size: 0
encoder_no_repeat_ngram_size: 0
bad_words_ids: None
num_return_sequences: 1
output_scores: False
return_dict_in_generate: False
forced_bos_token_id: None
forced_eos_token_id: None
remove_invalid_values: False
exponential_decay_length_penalty: None
suppress_tokens: None
begin_suppress_tokens: None
architectures: ['RobertaForMaskedLM']
finetuning_task: None
id2label: {0: 'LABEL_0', 1: 'LABEL_1'}
label2id: {'LABEL_0': 0, 'LABEL_1': 1}
tokenizer_class: None
prefix: None
bos_token_id: 0
pad_token_id: 1
eos_token_id: 2
sep_token_id: None
decoder_start_token_id: None
task_specific_params: None
problem_type: None
_name_or_path:
_attn_implementation_autoset: False
transformers_version: 4.46.3
vocab_size: 50265
hidden_size: 768
num_hidden_layers: 12
num_attention_heads: 12
hidden_act: gelu
intermediate_size: 3072
hidden_dropout_prob: 0.1
attention_probs_dropout_prob: 0.1
max_position_embeddings: 514
type_vocab_size: 1
initializer_range: 0.02
layer_norm_eps: 1e-05
position_embedding_type: absolute
use_cache: True
classifier_dropout: None
model_type: roberta
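The shape parameters (12 layers, 12 attention heads, hidden size 768, vocabulary 50,265) match RoBERTa-base; max_position_embeddings is 514 rather than 512 because RoBERTa offsets position ids past the padding index, reserving two extra slots. Most other fields (num_beams, top_k, temperature, and the rest of the generation block) are transformers defaults that play no role in masked-language-model training. A minimal sketch of rebuilding the model from this dump, passing only the fields that define the architecture:

    # Sketch: reconstruct the architecture above with Hugging Face transformers
    # (4.46.3, as recorded in the dump). Generation-related fields are library
    # defaults and are omitted.
    from transformers import RobertaConfig, RobertaForMaskedLM

    config = RobertaConfig(
        vocab_size=50265,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=514,
        type_vocab_size=1,
        initializer_range=0.02,
        layer_norm_eps=1e-5,
        position_embedding_type="absolute",
        bos_token_id=0,
        pad_token_id=1,
        eos_token_id=2,
    )
    model = RobertaForMaskedLM(config)
    print(sum(p.numel() for p in model.parameters()))  # ~125M parameters
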
=== Training Parameters ===

learning_rate: 0.0007
batch_size: 64
gradient_accumulation: 5
warmup_steps: 10000
total_steps: 20000
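These settings give an effective batch size of 64 x 5 = 320 sequences per optimizer step, with warmup covering the first half of the 20,000 steps. A sketch of a matching setup, assuming AdamW and a linear warmup/decay schedule (the report names neither the optimizer nor the schedule), reusing `model` from the architecture sketch above; `loader` is a hypothetical DataLoader yielding tokenized batches of size 64:

    # Sketch: optimizer and LR schedule matching the numbers above.
    # Assumptions: AdamW and linear warmup followed by linear decay; the
    # report does not name the optimizer or the schedule shape.
    import torch
    from transformers import get_linear_schedule_with_warmup

    optimizer = torch.optim.AdamW(model.parameters(), lr=7e-4)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=10_000,
        num_training_steps=20_000,
    )

    # Gradient accumulation: step the optimizer once per 5 micro-batches, for
    # an effective batch of 64 * 5 = 320 sequences. `loader` is a placeholder.
    for step, batch in enumerate(loader):
        loss = model(**batch).loss / 5  # scale so gradients average over 320
        loss.backward()
        if (step + 1) % 5 == 0:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
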
=== Dataset Configuration ===

Dataset: oscar-corpus/mOSCAR
Subset: fra_Latn
Split: train
Streaming: True
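A sketch of loading this exact configuration with the Hugging Face datasets library; access to oscar-corpus datasets on the Hub is typically gated, so this assumes the terms have been accepted and the environment is authenticated:

    # Sketch: stream the French (fra_Latn) subset of mOSCAR as configured above.
    # Streaming iterates over the corpus without downloading it in full.
    # Note: oscar-corpus datasets are typically gated on the Hub (assumption:
    # access has been granted and the user is logged in).
    from itertools import islice

    from datasets import load_dataset

    dataset = load_dataset(
        "oscar-corpus/mOSCAR",
        "fra_Latn",
        split="train",
        streaming=True,
    )

    # Peek at the first record to confirm the stream works.
    for example in islice(dataset, 1):
        print(example)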