BERTu-ud / config.json
KurtMica's picture
Model files.
933e371
{
"dataset_reader": {
"type": "multitask",
"readers": {
"ud": {
"type": "universal_dependencies",
"token_indexers": {
"transformer": {
"type": "pretrained_transformer_mismatched",
"max_length": 512,
"model_name": "MLRS/BERTu"
}
}
}
}
},
"model": {
"type": "multitask",
"arg_name_mapping": {
"backbone": {
"tokens": "text",
"words": "text"
}
},
"backbone": {
"type": "embedder_and_mask",
"text_field_embedder": {
"token_embedders": {
"transformer": {
"type": "pretrained_transformer_mismatched_with_dropout",
"last_layer_only": false,
"layer_dropout": 0.1,
"max_length": 512,
"model_name": "MLRS/BERTu",
"tokenizer_kwargs": {},
"train_parameters": true
}
}
}
},
"heads": {
"ud": {
"type": "biaffine_parser",
"arc_representation_dim": 100,
"dropout": 0.3,
"encoder": {
"type": "pass_through",
"input_dim": 768
},
"initializer": {
"regexes": [
[
".*projection.*weight",
{
"type": "xavier_uniform"
}
],
[
".*projection.*bias",
{
"type": "zero"
}
],
[
".*tag_bilinear.*weight",
{
"type": "xavier_uniform"
}
],
[
".*tag_bilinear.*bias",
{
"type": "zero"
}
],
[
".*weight_ih.*",
{
"type": "xavier_uniform"
}
],
[
".*weight_hh.*",
{
"type": "orthogonal"
}
],
[
".*bias_ih.*",
{
"type": "zero"
}
],
[
".*bias_hh.*",
{
"type": "lstm_hidden_bias"
}
]
]
},
"input_dropout": 0.3,
"tag_representation_dim": 100,
"use_mst_decoding_for_validation": true
}
}
},
"train_data_path": {
"ud": "ud-treebanks-v2.8/UD_Maltese-MUDT/mt_mudt-ud-train.conllu"
},
"validation_data_path": {
"ud": "ud-treebanks-v2.8/UD_Maltese-MUDT/mt_mudt-ud-dev.conllu"
},
"trainer": {
"callbacks": [
{
"tensorboard_writer": {
"should_log_learning_rate": true,
"should_log_parameter_statistics": true
},
"type": "tensorboard"
}
],
"cuda_device": 0,
"grad_norm": 5,
"learning_rate_scheduler": {
"type": "ulmfit_sqrt",
"affected_group_count": 2,
"decay_factor": 0.05,
"discriminative_fine_tuning": true,
"factor": 5,
"gradual_unfreezing": true,
"model_size": 1,
"start_step": 9,
"warmup_steps": 9
},
"num_epochs": 200,
"optimizer": {
"type": "huggingface_adamw",
"betas": [
0.9,
0.999
],
"correct_bias": false,
"lr": 0.0005,
"parameter_groups": [
[
[
"text_field_embedder.*transformer_model.embeddings.*_embeddings.*",
"text_field_embedder.*transformer_model.encoder.*.(key|query|value|dense).weight"
],
{}
],
[
[
"text_field_embedder.*transformer_model.embeddings.LayerNorm.*",
"text_field_embedder.*transformer_model.encoder.*.output.LayerNorm.*",
"text_field_embedder.*transformer_model.encoder.*.(key|query|value|dense).bias",
"text_field_embedder.*transformer_model.pooler.dense.bias"
],
{
"weight_decay": 0
}
],
[
[
"text_field_embedder.*._scalar_mix.*",
"text_field_embedder.*transformer_model.pooler.dense.weight",
"_head_sentinel",
"head_arc_feedforward._linear_layers.*.weight",
"child_arc_feedforward._linear_layers.*.weight",
"head_tag_feedforward._linear_layers.*.weight",
"child_tag_feedforward._linear_layers.*.weight",
"arc_attention._weight_matrix",
"tag_bilinear.weight",
"tag_projection_layer._module.weight",
"crf",
"linear.weight",
"tagger_linear.weight"
],
{}
],
[
[
"head_arc_feedforward._linear_layers.*.bias",
"child_arc_feedforward._linear_layers.*.bias",
"head_tag_feedforward._linear_layers.*.bias",
"child_tag_feedforward._linear_layers.*.bias",
"arc_attention._bias",
"tag_bilinear.bias",
"tag_projection_layer._module.bias",
"linear.bias",
"tagger_linear.bias"
],
{
"weight_decay": 0
}
]
],
"weight_decay": 0.01
},
"patience": 20,
"validation_metric": [
"+ud_LAS"
]
},
"data_loader": {
"type": "multitask",
"scheduler": {
"type": "unbalanced_homogeneous_roundrobin",
"batch_size": 128,
"dataset_sizes": {
"ud": 1123
}
},
"shuffle": true
},
"numpy_seed": 2460,
"pytorch_seed": 246,
"random_seed": 24601,
"validation_data_loader": {
"type": "multitask",
"scheduler": {
"type": "homogeneous_roundrobin",
"batch_size": 128
},
"shuffle": true
}
}