{
  "_name_or_path": "rpt-torch-1",
  "add_null_attn": true,
  "architectures": [
    "RPTForCausalLM"
  ],
  "attn_pdrop": 0.0,
  "augment_across_neighbors": true,
  "augment_neighbors": true,
  "aux_loss_schedule_steps": 12500,
  "aux_scale": 0.1,
  "bos_token_id": 0,
  "cca_freq": 1,
  "chunk_size": 64,
  "document_length": 16384,
  "embd_pdrop": 0.0,
  "eos_token_id": 1,
  "fcm_max_ratio": 0.0,
  "fcm_min_ratio": 0.0,
  "gated_ff": true,
  "hidden_size": 2048,
  "initializer_range": 1,
  "intermediate_size": 5504,
  "margin_schedule_steps": 56250,
  "max_margin": 4,
  "max_sequence_length": 4096,
  "mesh_dim": null,
  "model_type": "rpt",
  "mult_in_complex": false,
  "n_windows": 1,
  "num_attention_heads": 16,
  "num_document_chunks": 256,
  "num_hidden_layers": 22,
  "num_key_value_heads": 16,
  "num_neighbors": 2,
  "num_scored_neighbors": 20,
  "num_sequence_chunks": 64,
  "palm_init": true,
  "remat_attention": "",
  "remat_block": "nothing_saveable",
  "remat_mlp": "",
  "resid_pdrop": 0.0,
  "retriever_fill_value": -10000.0,
  "return_ret_metrics": true,
  "rms_norm_eps": 1e-06,
  "rms_one_baseline": true,
  "rot_dim": 0,
  "run_modules": "all",
  "scan_attention": false,
  "scan_key_chunk_size": 2048,
  "scan_mlp": false,
  "scan_mlp_chunk_size": 1024,
  "scan_query_chunk_size": 1024,
  "scheduled_sampling_max_prob": 1.0,
  "scheduled_sampling_min_prob": 0.01,
  "sliding_window": false,
  "ss_schedule_steps": 56250,
  "stride": 1024,
  "threshold_nei_scores": 0.0,
  "tie_word_embeddings": true,
  "torch_dtype": "float32",
  "transformers_version": "4.38.0",
  "use_cache": true,
  "use_cca_norm2": false,
  "use_xnei_bias": true,
  "vocab_size": 50277,
  "window_length": 2048
}