2023-05-13 15:32:42,395 - INFO - allennlp.common.params - random_seed = 13370 2023-05-13 15:32:42,395 - INFO - allennlp.common.params - numpy_seed = 1337 2023-05-13 15:32:42,395 - INFO - allennlp.common.params - pytorch_seed = 133 2023-05-13 15:32:42,396 - INFO - allennlp.common.checks - Pytorch version: 1.11.0+cu102 2023-05-13 15:32:42,396 - INFO - allennlp.common.params - type = default 2023-05-13 15:32:42,397 - INFO - allennlp.common.params - dataset_reader.type = seq2rel 2023-05-13 15:32:42,397 - INFO - allennlp.common.params - dataset_reader.max_instances = None 2023-05-13 15:32:42,397 - INFO - allennlp.common.params - dataset_reader.manual_distributed_sharding = False 2023-05-13 15:32:42,397 - INFO - allennlp.common.params - dataset_reader.manual_multiprocess_sharding = False 2023-05-13 15:32:42,397 - INFO - allennlp.common.params - dataset_reader.target_namespace = target_tokens 2023-05-13 15:32:42,398 - INFO - allennlp.common.params - dataset_reader.source_tokenizer.type = pretrained_transformer 2023-05-13 15:32:42,398 - INFO - allennlp.common.params - dataset_reader.source_tokenizer.model_name = microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext 2023-05-13 15:32:42,398 - INFO - allennlp.common.params - dataset_reader.source_tokenizer.add_special_tokens = True 2023-05-13 15:32:42,398 - INFO - allennlp.common.params - dataset_reader.source_tokenizer.max_length = 512 2023-05-13 15:32:42,398 - INFO - allennlp.common.params - dataset_reader.source_tokenizer.tokenizer_kwargs.additional_special_tokens = ['@ARG@', '@TRIGGER@'] 2023-05-13 15:32:42,399 - INFO - allennlp.common.params - dataset_reader.source_tokenizer.tokenizer_kwargs.do_lower_case = True 2023-05-13 15:32:42,399 - INFO - allennlp.common.params - dataset_reader.source_tokenizer.verification_tokens = None 2023-05-13 15:32:48,173 - INFO - allennlp.common.params - dataset_reader.target_tokenizer.type = pretrained_transformer 2023-05-13 15:32:48,174 - INFO - allennlp.common.params - dataset_reader.target_tokenizer.model_name = microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext 2023-05-13 15:32:48,174 - INFO - allennlp.common.params - dataset_reader.target_tokenizer.add_special_tokens = False 2023-05-13 15:32:48,174 - INFO - allennlp.common.params - dataset_reader.target_tokenizer.max_length = None 2023-05-13 15:32:48,174 - INFO - allennlp.common.params - dataset_reader.target_tokenizer.tokenizer_kwargs.additional_special_tokens = ['@ARG@', '@TRIGGER@', '@OSP@', '@start@', '@end@', ';'] 2023-05-13 15:32:48,174 - INFO - allennlp.common.params - dataset_reader.target_tokenizer.tokenizer_kwargs.do_lower_case = True 2023-05-13 15:32:48,174 - INFO - allennlp.common.params - dataset_reader.target_tokenizer.verification_tokens = None 2023-05-13 15:32:55,200 - INFO - allennlp.common.params - dataset_reader.source_token_indexers.tokens.type = pretrained_transformer 2023-05-13 15:32:55,200 - INFO - allennlp.common.params - dataset_reader.source_token_indexers.tokens.token_min_padding_length = 0 2023-05-13 15:32:55,201 - INFO - allennlp.common.params - dataset_reader.source_token_indexers.tokens.model_name = microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext 2023-05-13 15:32:55,201 - INFO - allennlp.common.params - dataset_reader.source_token_indexers.tokens.namespace = tags 2023-05-13 15:32:55,201 - INFO - allennlp.common.params - dataset_reader.source_token_indexers.tokens.max_length = None 2023-05-13 15:32:55,201 - INFO - allennlp.common.params - dataset_reader.source_token_indexers.tokens.tokenizer_kwargs.additional_special_tokens = ['@ARG@', '@TRIGGER@'] 2023-05-13 15:32:55,201 - INFO - allennlp.common.params - dataset_reader.source_token_indexers.tokens.tokenizer_kwargs.do_lower_case = True 2023-05-13 15:32:55,202 - INFO - allennlp.common.params - dataset_reader.max_length = 512 2023-05-13 15:32:55,202 - INFO - allennlp.common.params - train_data_path = ../granular/train_transform.tsv 2023-05-13 15:32:55,203 - INFO - allennlp.common.params - datasets_for_vocab_creation = None 2023-05-13 15:32:55,203 - INFO - allennlp.common.params - validation_dataset_reader = None 2023-05-13 15:32:55,203 - INFO - allennlp.common.params - validation_data_path = ../granular/dev_transform.tsv 2023-05-13 15:32:55,203 - INFO - allennlp.common.params - test_data_path = None 2023-05-13 15:32:55,203 - INFO - allennlp.common.params - evaluate_on_test = False 2023-05-13 15:32:55,203 - INFO - allennlp.common.params - batch_weight_key = 2023-05-13 15:32:55,203 - INFO - allennlp.common.params - data_loader.type = multiprocess 2023-05-13 15:32:55,204 - INFO - allennlp.common.params - data_loader.batch_size = None 2023-05-13 15:32:55,204 - INFO - allennlp.common.params - data_loader.drop_last = False 2023-05-13 15:32:55,204 - INFO - allennlp.common.params - data_loader.shuffle = False 2023-05-13 15:32:55,204 - INFO - allennlp.common.params - data_loader.batch_sampler.type = bucket 2023-05-13 15:32:55,204 - INFO - allennlp.common.params - data_loader.batch_sampler.batch_size = 4 2023-05-13 15:32:55,204 - INFO - allennlp.common.params - data_loader.batch_sampler.sorting_keys = ['source_tokens'] 2023-05-13 15:32:55,205 - INFO - allennlp.common.params - data_loader.batch_sampler.padding_noise = 0.1 2023-05-13 15:32:55,205 - INFO - allennlp.common.params - data_loader.batch_sampler.drop_last = False 2023-05-13 15:32:55,205 - INFO - allennlp.common.params - data_loader.batch_sampler.shuffle = True 2023-05-13 15:32:55,205 - INFO - allennlp.common.params - data_loader.batches_per_epoch = None 2023-05-13 15:32:55,205 - INFO - allennlp.common.params - data_loader.num_workers = 0 2023-05-13 15:32:55,205 - INFO - allennlp.common.params - data_loader.max_instances_in_memory = None 2023-05-13 15:32:55,205 - INFO - allennlp.common.params - data_loader.start_method = fork 2023-05-13 15:32:55,205 - INFO - allennlp.common.params - data_loader.cuda_device = None 2023-05-13 15:32:55,205 - INFO - allennlp.common.params - data_loader.quiet = False 2023-05-13 15:32:55,205 - INFO - allennlp.common.params - data_loader.collate_fn = 2023-05-13 15:32:55,206 - INFO - tqdm - loading instances: 0it [00:00, ?it/s] 2023-05-13 15:32:55,206 - INFO - seq2rel.dataset_reader - Reading instances from lines in file at: ../granular/train_transform.tsv 2023-05-13 15:32:59,867 - INFO - allennlp.common.params - validation_data_loader.type = multiprocess 2023-05-13 15:32:59,867 - INFO - allennlp.common.params - validation_data_loader.batch_size = None 2023-05-13 15:32:59,867 - INFO - allennlp.common.params - validation_data_loader.drop_last = False 2023-05-13 15:32:59,868 - INFO - allennlp.common.params - validation_data_loader.shuffle = False 2023-05-13 15:32:59,868 - INFO - allennlp.common.params - validation_data_loader.batch_sampler.type = bucket 2023-05-13 15:32:59,868 - INFO - allennlp.common.params - validation_data_loader.batch_sampler.batch_size = 128 2023-05-13 15:32:59,868 - INFO - allennlp.common.params - validation_data_loader.batch_sampler.sorting_keys = ['source_tokens'] 2023-05-13 15:32:59,868 - INFO - allennlp.common.params - validation_data_loader.batch_sampler.padding_noise = 0 2023-05-13 15:32:59,868 - INFO - allennlp.common.params - validation_data_loader.batch_sampler.drop_last = False 2023-05-13 15:32:59,868 - INFO - allennlp.common.params - validation_data_loader.batch_sampler.shuffle = True 2023-05-13 15:32:59,869 - INFO - allennlp.common.params - validation_data_loader.batches_per_epoch = None 2023-05-13 15:32:59,869 - INFO - allennlp.common.params - validation_data_loader.num_workers = 0 2023-05-13 15:32:59,869 - INFO - allennlp.common.params - validation_data_loader.max_instances_in_memory = None 2023-05-13 15:32:59,869 - INFO - allennlp.common.params - validation_data_loader.start_method = fork 2023-05-13 15:32:59,869 - INFO - allennlp.common.params - validation_data_loader.cuda_device = None 2023-05-13 15:32:59,869 - INFO - allennlp.common.params - validation_data_loader.quiet = False 2023-05-13 15:32:59,869 - INFO - allennlp.common.params - validation_data_loader.collate_fn = 2023-05-13 15:32:59,869 - INFO - tqdm - loading instances: 0it [00:00, ?it/s] 2023-05-13 15:32:59,870 - INFO - seq2rel.dataset_reader - Reading instances from lines in file at: ../granular/dev_transform.tsv 2023-05-13 15:33:01,994 - INFO - allennlp.common.params - vocabulary.type = from_instances 2023-05-13 15:33:01,994 - INFO - allennlp.common.params - vocabulary.min_count = None 2023-05-13 15:33:01,995 - INFO - allennlp.common.params - vocabulary.non_padded_namespaces = ('*tags', '*labels') 2023-05-13 15:33:01,995 - INFO - allennlp.common.params - vocabulary.pretrained_files = None 2023-05-13 15:33:01,995 - INFO - allennlp.common.params - vocabulary.only_include_pretrained_words = False 2023-05-13 15:33:01,995 - INFO - allennlp.common.params - vocabulary.min_pretrained_embeddings = None 2023-05-13 15:33:01,995 - INFO - allennlp.common.params - vocabulary.padding_token = @@PADDING@@ 2023-05-13 15:33:01,996 - INFO - allennlp.common.params - vocabulary.oov_token = @@UNKNOWN@@ 2023-05-13 15:33:01,996 - INFO - allennlp.data.vocabulary - Fitting token dictionary from dataset. 2023-05-13 15:33:01,996 - INFO - tqdm - building vocab: 0it [00:00, ?it/s] 2023-05-13 15:33:02,129 - INFO - allennlp.common.params - model.type = copynet_seq2rel 2023-05-13 15:33:02,130 - INFO - allennlp.common.params - model.regularizer = None 2023-05-13 15:33:02,130 - INFO - allennlp.common.params - model.source_embedder.type = basic 2023-05-13 15:33:02,130 - INFO - allennlp.common.params - model.source_embedder.token_embedders.tokens.type = pretrained_transformer 2023-05-13 15:33:02,131 - INFO - allennlp.common.params - model.source_embedder.token_embedders.tokens.model_name = microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext 2023-05-13 15:33:02,131 - INFO - allennlp.common.params - model.source_embedder.token_embedders.tokens.max_length = None 2023-05-13 15:33:02,131 - INFO - allennlp.common.params - model.source_embedder.token_embedders.tokens.sub_module = None 2023-05-13 15:33:02,131 - INFO - allennlp.common.params - model.source_embedder.token_embedders.tokens.train_parameters = True 2023-05-13 15:33:02,131 - INFO - allennlp.common.params - model.source_embedder.token_embedders.tokens.eval_mode = False 2023-05-13 15:33:02,131 - INFO - allennlp.common.params - model.source_embedder.token_embedders.tokens.last_layer_only = True 2023-05-13 15:33:02,131 - INFO - allennlp.common.params - model.source_embedder.token_embedders.tokens.override_weights_file = None 2023-05-13 15:33:02,131 - INFO - allennlp.common.params - model.source_embedder.token_embedders.tokens.override_weights_strip_prefix = None 2023-05-13 15:33:02,132 - INFO - allennlp.common.params - model.source_embedder.token_embedders.tokens.reinit_modules = 2 2023-05-13 15:33:02,132 - INFO - allennlp.common.params - model.source_embedder.token_embedders.tokens.load_weights = True 2023-05-13 15:33:02,132 - INFO - allennlp.common.params - model.source_embedder.token_embedders.tokens.gradient_checkpointing = None 2023-05-13 15:33:02,132 - INFO - allennlp.common.params - model.source_embedder.token_embedders.tokens.tokenizer_kwargs.additional_special_tokens = ['@ARG@', '@TRIGGER@'] 2023-05-13 15:33:02,132 - INFO - allennlp.common.params - model.source_embedder.token_embedders.tokens.tokenizer_kwargs.do_lower_case = True 2023-05-13 15:33:02,132 - INFO - allennlp.common.params - model.source_embedder.token_embedders.tokens.transformer_kwargs = None 2023-05-13 15:33:12,743 - INFO - allennlp.common.params - model.encoder = None 2023-05-13 15:33:12,744 - INFO - allennlp.common.params - model.label_smoothing = None 2023-05-13 15:33:12,744 - INFO - allennlp.common.params - model.target_embedding_dim = 256 2023-05-13 15:33:12,744 - INFO - allennlp.common.params - model.scheduled_sampling_ratio = 0.0 2023-05-13 15:33:12,744 - INFO - allennlp.common.params - model.copy_token = @COPY@ 2023-05-13 15:33:12,744 - INFO - allennlp.common.params - model.target_namespace = target_tokens 2023-05-13 15:33:12,744 - INFO - allennlp.common.params - model.tensor_based_metric = None 2023-05-13 15:33:12,744 - INFO - allennlp.common.params - model.token_based_metric = None 2023-05-13 15:33:12,744 - INFO - allennlp.common.params - model.initializer = 2023-05-13 15:33:12,745 - INFO - allennlp.common.params - model.target_tokenizer.type = pretrained_transformer 2023-05-13 15:33:12,745 - INFO - allennlp.common.params - model.target_tokenizer.model_name = microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext 2023-05-13 15:33:12,745 - INFO - allennlp.common.params - model.target_tokenizer.add_special_tokens = False 2023-05-13 15:33:12,745 - INFO - allennlp.common.params - model.target_tokenizer.max_length = None 2023-05-13 15:33:12,745 - INFO - allennlp.common.params - model.target_tokenizer.tokenizer_kwargs.additional_special_tokens = ['@ARG@', '@TRIGGER@', '@OSP@', '@start@', '@end@', ';'] 2023-05-13 15:33:12,745 - INFO - allennlp.common.params - model.target_tokenizer.tokenizer_kwargs.do_lower_case = True 2023-05-13 15:33:12,745 - INFO - allennlp.common.params - model.target_tokenizer.verification_tokens = None 2023-05-13 15:33:12,746 - INFO - allennlp.common.params - model.dropout = 0.1 2023-05-13 15:33:12,746 - INFO - allennlp.common.params - model.weight_dropout = 0.5 2023-05-13 15:33:12,747 - INFO - allennlp.common.params - model.sequence_based_metrics.0.type = f1_seq2rel 2023-05-13 15:33:12,747 - INFO - allennlp.common.params - model.sequence_based_metrics.0.labels = ['OSP'] 2023-05-13 15:33:12,747 - INFO - allennlp.common.params - model.sequence_based_metrics.0.threshold = None 2023-05-13 15:33:12,747 - INFO - allennlp.common.params - model.sequence_based_metrics.0.ordered_ents = False 2023-05-13 15:33:12,747 - INFO - allennlp.common.params - model.sequence_based_metrics.0.remove_duplicate_ents = True 2023-05-13 15:33:12,747 - INFO - allennlp.common.params - model.sequence_based_metrics.0.average = micro 2023-05-13 15:33:12,747 - INFO - allennlp.common.params - model.init_decoder_state_strategy = mean 2023-05-13 15:33:12,748 - INFO - allennlp.common.params - model.attention.type = multihead_attention 2023-05-13 15:33:12,748 - INFO - allennlp.common.params - model.attention.normalize = True 2023-05-13 15:33:12,748 - INFO - allennlp.common.params - model.attention.num_heads = 6 2023-05-13 15:33:12,815 - INFO - allennlp.common.params - model.beam_search.type = beam_search 2023-05-13 15:33:12,815 - INFO - allennlp.common.params - model.beam_search.max_steps = 96 2023-05-13 15:33:12,815 - INFO - allennlp.common.params - model.beam_search.beam_size = 1 2023-05-13 15:33:12,815 - INFO - allennlp.common.params - model.beam_search.per_node_beam_size = None 2023-05-13 15:33:12,815 - INFO - allennlp.common.params - model.beam_search.sampler = None 2023-05-13 15:33:12,815 - INFO - allennlp.common.params - model.beam_search.min_steps = None 2023-05-13 15:33:12,816 - INFO - allennlp.common.params - model.beam_search.final_sequence_scorer.type = length-normalized-sequence-log-prob 2023-05-13 15:33:12,816 - INFO - allennlp.common.params - model.beam_search.final_sequence_scorer.length_penalty = 1 2023-05-13 15:33:12,816 - INFO - allennlp.common.params - model.beam_search.constraints = None 2023-05-13 15:33:12,816 - INFO - allennlp.nn.initializers - Initializing parameters 2023-05-13 15:33:12,817 - INFO - allennlp.nn.initializers - Done initializing parameters; the following parameters are using their default initialization from their code 2023-05-13 15:33:12,817 - INFO - allennlp.nn.initializers - _attention._multihead_attn.in_proj_bias 2023-05-13 15:33:12,817 - INFO - allennlp.nn.initializers - _attention._multihead_attn.in_proj_weight 2023-05-13 15:33:12,817 - INFO - allennlp.nn.initializers - _attention._multihead_attn.out_proj.bias 2023-05-13 15:33:12,817 - INFO - allennlp.nn.initializers - _attention._multihead_attn.out_proj.weight 2023-05-13 15:33:12,817 - INFO - allennlp.nn.initializers - _decoder_cell.bias_hh 2023-05-13 15:33:12,817 - INFO - allennlp.nn.initializers - _decoder_cell.bias_ih 2023-05-13 15:33:12,817 - INFO - allennlp.nn.initializers - _decoder_cell.weight_hh 2023-05-13 15:33:12,817 - INFO - allennlp.nn.initializers - _decoder_cell.weight_ih 2023-05-13 15:33:12,817 - INFO - allennlp.nn.initializers - _input_projection_layer.bias 2023-05-13 15:33:12,817 - INFO - allennlp.nn.initializers - _input_projection_layer.weight 2023-05-13 15:33:12,818 - INFO - allennlp.nn.initializers - _output_copying_layer.bias 2023-05-13 15:33:12,818 - INFO - allennlp.nn.initializers - _output_copying_layer.weight 2023-05-13 15:33:12,818 - INFO - allennlp.nn.initializers - _output_generation_layer.bias 2023-05-13 15:33:12,818 - INFO - allennlp.nn.initializers - _output_generation_layer.weight 2023-05-13 15:33:12,818 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.embeddings.LayerNorm.bias 2023-05-13 15:33:12,818 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.embeddings.LayerNorm.weight 2023-05-13 15:33:12,818 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.embeddings.position_embeddings.weight 2023-05-13 15:33:12,818 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.embeddings.token_type_embeddings.weight 2023-05-13 15:33:12,818 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.embeddings.word_embeddings.weight 2023-05-13 15:33:12,818 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.output.LayerNorm.bias 2023-05-13 15:33:12,818 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.output.LayerNorm.weight 2023-05-13 15:33:12,818 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.output.dense.bias 2023-05-13 15:33:12,818 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.output.dense.weight 2023-05-13 15:33:12,818 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.self.key.bias 2023-05-13 15:33:12,818 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.self.key.weight 2023-05-13 15:33:12,818 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.self.query.bias 2023-05-13 15:33:12,818 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.self.query.weight 2023-05-13 15:33:12,818 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.self.value.bias 2023-05-13 15:33:12,818 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.self.value.weight 2023-05-13 15:33:12,818 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.intermediate.dense.bias 2023-05-13 15:33:12,819 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.intermediate.dense.weight 2023-05-13 15:33:12,819 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.output.LayerNorm.bias 2023-05-13 15:33:12,819 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.output.LayerNorm.weight 2023-05-13 15:33:12,819 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.output.dense.bias 2023-05-13 15:33:12,819 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.output.dense.weight 2023-05-13 15:33:12,819 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.output.LayerNorm.bias 2023-05-13 15:33:12,819 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.output.LayerNorm.weight 2023-05-13 15:33:12,819 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.output.dense.bias 2023-05-13 15:33:12,819 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.output.dense.weight 2023-05-13 15:33:12,819 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.self.key.bias 2023-05-13 15:33:12,819 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.self.key.weight 2023-05-13 15:33:12,819 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.self.query.bias 2023-05-13 15:33:12,819 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.self.query.weight 2023-05-13 15:33:12,819 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.self.value.bias 2023-05-13 15:33:12,819 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.self.value.weight 2023-05-13 15:33:12,819 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.intermediate.dense.bias 2023-05-13 15:33:12,819 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.intermediate.dense.weight 2023-05-13 15:33:12,819 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.output.LayerNorm.bias 2023-05-13 15:33:12,819 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.output.LayerNorm.weight 2023-05-13 15:33:12,819 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.output.dense.bias 2023-05-13 15:33:12,819 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.output.dense.weight 2023-05-13 15:33:12,820 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.output.LayerNorm.bias 2023-05-13 15:33:12,820 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.output.LayerNorm.weight 2023-05-13 15:33:12,820 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.output.dense.bias 2023-05-13 15:33:12,820 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.output.dense.weight 2023-05-13 15:33:12,820 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.self.key.bias 2023-05-13 15:33:12,820 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.self.key.weight 2023-05-13 15:33:12,820 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.self.query.bias 2023-05-13 15:33:12,820 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.self.query.weight 2023-05-13 15:33:12,820 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.self.value.bias 2023-05-13 15:33:12,820 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.self.value.weight 2023-05-13 15:33:12,820 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.intermediate.dense.bias 2023-05-13 15:33:12,820 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.intermediate.dense.weight 2023-05-13 15:33:12,820 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.output.LayerNorm.bias 2023-05-13 15:33:12,820 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.output.LayerNorm.weight 2023-05-13 15:33:12,820 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.output.dense.bias 2023-05-13 15:33:12,821 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.output.dense.weight 2023-05-13 15:33:12,821 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.output.LayerNorm.bias 2023-05-13 15:33:12,821 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.output.LayerNorm.weight 2023-05-13 15:33:12,821 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.output.dense.bias 2023-05-13 15:33:12,821 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.output.dense.weight 2023-05-13 15:33:12,821 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.self.key.bias 2023-05-13 15:33:12,821 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.self.key.weight 2023-05-13 15:33:12,821 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.self.query.bias 2023-05-13 15:33:12,821 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.self.query.weight 2023-05-13 15:33:12,821 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.self.value.bias 2023-05-13 15:33:12,821 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.self.value.weight 2023-05-13 15:33:12,821 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.intermediate.dense.bias 2023-05-13 15:33:12,821 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.intermediate.dense.weight 2023-05-13 15:33:12,821 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.output.LayerNorm.bias 2023-05-13 15:33:12,821 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.output.LayerNorm.weight 2023-05-13 15:33:12,821 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.output.dense.bias 2023-05-13 15:33:12,821 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.output.dense.weight 2023-05-13 15:33:12,821 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.output.LayerNorm.bias 2023-05-13 15:33:12,821 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.output.LayerNorm.weight 2023-05-13 15:33:12,822 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.output.dense.bias 2023-05-13 15:33:12,822 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.output.dense.weight 2023-05-13 15:33:12,822 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.self.key.bias 2023-05-13 15:33:12,822 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.self.key.weight 2023-05-13 15:33:12,822 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.self.query.bias 2023-05-13 15:33:12,822 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.self.query.weight 2023-05-13 15:33:12,822 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.self.value.bias 2023-05-13 15:33:12,822 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.self.value.weight 2023-05-13 15:33:12,822 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.intermediate.dense.bias 2023-05-13 15:33:12,822 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.intermediate.dense.weight 2023-05-13 15:33:12,822 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.output.LayerNorm.bias 2023-05-13 15:33:12,822 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.output.LayerNorm.weight 2023-05-13 15:33:12,822 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.output.dense.bias 2023-05-13 15:33:12,822 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.output.dense.weight 2023-05-13 15:33:12,830 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.output.LayerNorm.bias 2023-05-13 15:33:12,830 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.output.LayerNorm.weight 2023-05-13 15:33:12,830 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.output.dense.bias 2023-05-13 15:33:12,830 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.output.dense.weight 2023-05-13 15:33:12,830 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.self.key.bias 2023-05-13 15:33:12,830 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.self.key.weight 2023-05-13 15:33:12,830 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.self.query.bias 2023-05-13 15:33:12,830 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.self.query.weight 2023-05-13 15:33:12,830 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.self.value.bias 2023-05-13 15:33:12,830 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.self.value.weight 2023-05-13 15:33:12,830 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.intermediate.dense.bias 2023-05-13 15:33:12,830 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.intermediate.dense.weight 2023-05-13 15:33:12,830 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.output.LayerNorm.bias 2023-05-13 15:33:12,831 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.output.LayerNorm.weight 2023-05-13 15:33:12,831 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.output.dense.bias 2023-05-13 15:33:12,831 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.output.dense.weight 2023-05-13 15:33:12,831 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.output.LayerNorm.bias 2023-05-13 15:33:12,831 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.output.LayerNorm.weight 2023-05-13 15:33:12,831 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.output.dense.bias 2023-05-13 15:33:12,831 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.output.dense.weight 2023-05-13 15:33:12,831 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.self.key.bias 2023-05-13 15:33:12,831 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.self.key.weight 2023-05-13 15:33:12,831 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.self.query.bias 2023-05-13 15:33:12,831 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.self.query.weight 2023-05-13 15:33:12,831 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.self.value.bias 2023-05-13 15:33:12,831 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.self.value.weight 2023-05-13 15:33:12,831 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.intermediate.dense.bias 2023-05-13 15:33:12,832 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.intermediate.dense.weight 2023-05-13 15:33:12,832 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.output.LayerNorm.bias 2023-05-13 15:33:12,832 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.output.LayerNorm.weight 2023-05-13 15:33:12,832 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.output.dense.bias 2023-05-13 15:33:12,832 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.output.dense.weight 2023-05-13 15:33:12,832 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.output.LayerNorm.bias 2023-05-13 15:33:12,832 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.output.LayerNorm.weight 2023-05-13 15:33:12,832 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.output.dense.bias 2023-05-13 15:33:12,832 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.output.dense.weight 2023-05-13 15:33:12,832 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.self.key.bias 2023-05-13 15:33:12,832 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.self.key.weight 2023-05-13 15:33:12,832 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.self.query.bias 2023-05-13 15:33:12,832 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.self.query.weight 2023-05-13 15:33:12,833 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.self.value.bias 2023-05-13 15:33:12,833 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.self.value.weight 2023-05-13 15:33:12,833 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.intermediate.dense.bias 2023-05-13 15:33:12,833 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.intermediate.dense.weight 2023-05-13 15:33:12,833 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.output.LayerNorm.bias 2023-05-13 15:33:12,833 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.output.LayerNorm.weight 2023-05-13 15:33:12,833 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.output.dense.bias 2023-05-13 15:33:12,833 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.output.dense.weight 2023-05-13 15:33:12,833 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.output.LayerNorm.bias 2023-05-13 15:33:12,833 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.output.LayerNorm.weight 2023-05-13 15:33:12,833 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.output.dense.bias 2023-05-13 15:33:12,833 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.output.dense.weight 2023-05-13 15:33:12,833 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.self.key.bias 2023-05-13 15:33:12,833 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.self.key.weight 2023-05-13 15:33:12,833 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.self.query.bias 2023-05-13 15:33:12,834 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.self.query.weight 2023-05-13 15:33:12,834 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.self.value.bias 2023-05-13 15:33:12,834 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.self.value.weight 2023-05-13 15:33:12,834 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.intermediate.dense.bias 2023-05-13 15:33:12,834 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.intermediate.dense.weight 2023-05-13 15:33:12,834 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.output.LayerNorm.bias 2023-05-13 15:33:12,834 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.output.LayerNorm.weight 2023-05-13 15:33:12,834 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.output.dense.bias 2023-05-13 15:33:12,834 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.output.dense.weight 2023-05-13 15:33:12,834 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.output.LayerNorm.bias 2023-05-13 15:33:12,834 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.output.LayerNorm.weight 2023-05-13 15:33:12,834 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.output.dense.bias 2023-05-13 15:33:12,834 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.output.dense.weight 2023-05-13 15:33:12,834 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.self.key.bias 2023-05-13 15:33:12,834 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.self.key.weight 2023-05-13 15:33:12,835 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.self.query.bias 2023-05-13 15:33:12,835 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.self.query.weight 2023-05-13 15:33:12,835 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.self.value.bias 2023-05-13 15:33:12,835 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.self.value.weight 2023-05-13 15:33:12,835 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.intermediate.dense.bias 2023-05-13 15:33:12,835 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.intermediate.dense.weight 2023-05-13 15:33:12,835 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.output.LayerNorm.bias 2023-05-13 15:33:12,835 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.output.LayerNorm.weight 2023-05-13 15:33:12,835 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.output.dense.bias 2023-05-13 15:33:12,835 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.output.dense.weight 2023-05-13 15:33:12,835 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.output.LayerNorm.bias 2023-05-13 15:33:12,835 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.output.LayerNorm.weight 2023-05-13 15:33:12,835 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.output.dense.bias 2023-05-13 15:33:12,835 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.output.dense.weight 2023-05-13 15:33:12,835 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.self.key.bias 2023-05-13 15:33:12,835 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.self.key.weight 2023-05-13 15:33:12,835 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.self.query.bias 2023-05-13 15:33:12,835 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.self.query.weight 2023-05-13 15:33:12,835 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.self.value.bias 2023-05-13 15:33:12,835 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.self.value.weight 2023-05-13 15:33:12,836 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.intermediate.dense.bias 2023-05-13 15:33:12,836 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.intermediate.dense.weight 2023-05-13 15:33:12,836 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.output.LayerNorm.bias 2023-05-13 15:33:12,836 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.output.LayerNorm.weight 2023-05-13 15:33:12,836 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.output.dense.bias 2023-05-13 15:33:12,836 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.output.dense.weight 2023-05-13 15:33:12,836 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.output.LayerNorm.bias 2023-05-13 15:33:12,836 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.output.LayerNorm.weight 2023-05-13 15:33:12,836 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.output.dense.bias 2023-05-13 15:33:12,836 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.output.dense.weight 2023-05-13 15:33:12,836 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.self.key.bias 2023-05-13 15:33:12,836 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.self.key.weight 2023-05-13 15:33:12,836 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.self.query.bias 2023-05-13 15:33:12,836 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.self.query.weight 2023-05-13 15:33:12,836 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.self.value.bias 2023-05-13 15:33:12,836 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.self.value.weight 2023-05-13 15:33:12,836 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.intermediate.dense.bias 2023-05-13 15:33:12,836 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.intermediate.dense.weight 2023-05-13 15:33:12,836 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.output.LayerNorm.bias 2023-05-13 15:33:12,836 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.output.LayerNorm.weight 2023-05-13 15:33:12,837 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.output.dense.bias 2023-05-13 15:33:12,837 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.output.dense.weight 2023-05-13 15:33:12,837 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.pooler.dense.bias 2023-05-13 15:33:12,837 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.pooler.dense.weight 2023-05-13 15:33:12,837 - INFO - allennlp.nn.initializers - _target_embedder.weight 2023-05-13 15:33:13,241 - INFO - allennlp.common.params - trainer.type = gradient_descent 2023-05-13 15:33:13,241 - INFO - allennlp.common.params - trainer.cuda_device = None 2023-05-13 15:33:13,241 - INFO - allennlp.common.params - trainer.distributed = False 2023-05-13 15:33:13,241 - INFO - allennlp.common.params - trainer.world_size = 1 2023-05-13 15:33:13,242 - INFO - allennlp.common.params - trainer.patience = None 2023-05-13 15:33:13,242 - INFO - allennlp.common.params - trainer.validation_metric = +fscore 2023-05-13 15:33:13,242 - INFO - allennlp.common.params - trainer.num_epochs = 25 2023-05-13 15:33:13,242 - INFO - allennlp.common.params - trainer.grad_norm = 1 2023-05-13 15:33:13,242 - INFO - allennlp.common.params - trainer.grad_clipping = None 2023-05-13 15:33:13,242 - INFO - allennlp.common.params - trainer.num_gradient_accumulation_steps = 1 2023-05-13 15:33:13,242 - INFO - allennlp.common.params - trainer.use_amp = True 2023-05-13 15:33:13,242 - INFO - allennlp.common.params - trainer.no_grad = None 2023-05-13 15:33:13,243 - INFO - allennlp.common.params - trainer.momentum_scheduler = None 2023-05-13 15:33:13,243 - INFO - allennlp.common.params - trainer.moving_average = None 2023-05-13 15:33:13,243 - INFO - allennlp.common.params - trainer.enable_default_callbacks = True 2023-05-13 15:33:13,243 - INFO - allennlp.common.params - trainer.run_confidence_checks = True 2023-05-13 15:33:13,243 - INFO - allennlp.common.params - trainer.grad_scaling = True 2023-05-13 15:33:16,509 - INFO - allennlp.common.params - trainer.optimizer.type = huggingface_adamw 2023-05-13 15:33:16,510 - INFO - allennlp.common.params - trainer.optimizer.lr = 0.0004 2023-05-13 15:33:16,510 - INFO - allennlp.common.params - trainer.optimizer.betas = (0.9, 0.999) 2023-05-13 15:33:16,510 - INFO - allennlp.common.params - trainer.optimizer.eps = 1e-08 2023-05-13 15:33:16,510 - INFO - allennlp.common.params - trainer.optimizer.weight_decay = 0 2023-05-13 15:33:16,510 - INFO - allennlp.common.params - trainer.optimizer.correct_bias = True 2023-05-13 15:33:16,511 - INFO - allennlp.training.optimizers - Done constructing parameter groups. 2023-05-13 15:33:16,511 - INFO - allennlp.training.optimizers - Group 0: ['_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.self.value.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.self.query.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.self.value.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.self.key.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.self.query.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.intermediate.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.self.key.weight', '_source_embedder.token_embedder_tokens.transformer_model.embeddings.word_embeddings.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.self.query.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.self.query.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.self.key.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.self.value.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.self.value.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.self.key.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.intermediate.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.self.query.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.self.query.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.self.query.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.self.value.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.self.query.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.intermediate.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.intermediate.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.self.query.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.self.key.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.intermediate.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.self.key.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.intermediate.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.intermediate.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.self.value.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.self.key.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.self.value.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.intermediate.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.self.query.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.self.key.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.self.value.weight', '_source_embedder.token_embedder_tokens.transformer_model.embeddings.token_type_embeddings.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.intermediate.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.self.value.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.self.value.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.self.key.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.self.key.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.self.key.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.intermediate.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.self.query.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.intermediate.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.pooler.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.self.value.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.self.key.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.intermediate.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.self.query.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.self.value.weight', '_source_embedder.token_embedder_tokens.transformer_model.embeddings.position_embeddings.weight'], {'lr': 2e-05, 'weight_decay': 0.01} 2023-05-13 15:33:16,511 - INFO - allennlp.training.optimizers - Group 1: ['_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.self.query.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.self.query.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.intermediate.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.self.query.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.intermediate.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.pooler.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.intermediate.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.self.value.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.intermediate.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.self.query.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.self.query.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.self.value.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.self.key.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.self.value.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.self.key.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.self.key.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.self.query.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.intermediate.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.self.query.bias', '_source_embedder.token_embedder_tokens.transformer_model.embeddings.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.self.value.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.self.value.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.self.key.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.intermediate.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.self.key.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.self.query.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.self.value.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.intermediate.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.self.value.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.self.value.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.self.key.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.self.key.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.intermediate.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.self.query.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.self.value.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.self.key.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.self.key.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.self.query.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.intermediate.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.self.value.bias', '_source_embedder.token_embedder_tokens.transformer_model.embeddings.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.intermediate.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.intermediate.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.intermediate.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.self.query.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.self.key.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.self.query.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.self.value.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.self.value.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.self.key.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.self.key.bias'], {'lr': 2e-05, 'weight_decay': 0} 2023-05-13 15:33:16,548 - INFO - allennlp.training.optimizers - Group 2: ['_output_generation_layer.bias', '_attention._multihead_attn.out_proj.bias', '_decoder_cell.weight_hh_raw', '_output_copying_layer.bias', '_target_embedder.weight', '_input_projection_layer.weight', '_decoder_cell.module.bias_hh', '_attention._multihead_attn.in_proj_bias', '_output_generation_layer.weight', '_output_copying_layer.weight', '_decoder_cell.module.weight_ih', '_input_projection_layer.bias', '_attention._multihead_attn.out_proj.weight', '_decoder_cell.module.bias_ih', '_attention._multihead_attn.in_proj_weight'], {} 2023-05-13 15:33:16,548 - INFO - allennlp.training.optimizers - Number of trainable parameters: 118547721 2023-05-13 15:33:16,551 - INFO - allennlp.common.util - The following parameters are Frozen (without gradient): 2023-05-13 15:33:16,552 - INFO - allennlp.common.util - The following parameters are Tunable (with gradient): 2023-05-13 15:33:16,552 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.embeddings.word_embeddings.weight 2023-05-13 15:33:16,552 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.embeddings.position_embeddings.weight 2023-05-13 15:33:16,552 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.embeddings.token_type_embeddings.weight 2023-05-13 15:33:16,552 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.embeddings.LayerNorm.weight 2023-05-13 15:33:16,552 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.embeddings.LayerNorm.bias 2023-05-13 15:33:16,552 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.self.query.weight 2023-05-13 15:33:16,552 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.self.query.bias 2023-05-13 15:33:16,552 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.self.key.weight 2023-05-13 15:33:16,552 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.self.key.bias 2023-05-13 15:33:16,552 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.self.value.weight 2023-05-13 15:33:16,552 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.self.value.bias 2023-05-13 15:33:16,552 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.output.dense.weight 2023-05-13 15:33:16,553 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.output.dense.bias 2023-05-13 15:33:16,553 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.output.LayerNorm.weight 2023-05-13 15:33:16,553 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.output.LayerNorm.bias 2023-05-13 15:33:16,553 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.intermediate.dense.weight 2023-05-13 15:33:16,553 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.intermediate.dense.bias 2023-05-13 15:33:16,553 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.output.dense.weight 2023-05-13 15:33:16,553 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.output.dense.bias 2023-05-13 15:33:16,553 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.output.LayerNorm.weight 2023-05-13 15:33:16,553 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.output.LayerNorm.bias 2023-05-13 15:33:16,553 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.self.query.weight 2023-05-13 15:33:16,553 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.self.query.bias 2023-05-13 15:33:16,553 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.self.key.weight 2023-05-13 15:33:16,553 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.self.key.bias 2023-05-13 15:33:16,553 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.self.value.weight 2023-05-13 15:33:16,553 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.self.value.bias 2023-05-13 15:33:16,554 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.output.dense.weight 2023-05-13 15:33:16,554 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.output.dense.bias 2023-05-13 15:33:16,554 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.output.LayerNorm.weight 2023-05-13 15:33:16,554 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.output.LayerNorm.bias 2023-05-13 15:33:16,554 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.intermediate.dense.weight 2023-05-13 15:33:16,554 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.intermediate.dense.bias 2023-05-13 15:33:16,554 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.output.dense.weight 2023-05-13 15:33:16,554 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.output.dense.bias 2023-05-13 15:33:16,554 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.output.LayerNorm.weight 2023-05-13 15:33:16,554 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.output.LayerNorm.bias 2023-05-13 15:33:16,554 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.self.query.weight 2023-05-13 15:33:16,554 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.self.query.bias 2023-05-13 15:33:16,554 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.self.key.weight 2023-05-13 15:33:16,554 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.self.key.bias 2023-05-13 15:33:16,554 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.self.value.weight 2023-05-13 15:33:16,554 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.self.value.bias 2023-05-13 15:33:16,555 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.output.dense.weight 2023-05-13 15:33:16,555 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.output.dense.bias 2023-05-13 15:33:16,555 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.output.LayerNorm.weight 2023-05-13 15:33:16,555 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.output.LayerNorm.bias 2023-05-13 15:33:16,555 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.intermediate.dense.weight 2023-05-13 15:33:16,555 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.intermediate.dense.bias 2023-05-13 15:33:16,555 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.output.dense.weight 2023-05-13 15:33:16,555 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.output.dense.bias 2023-05-13 15:33:16,555 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.output.LayerNorm.weight 2023-05-13 15:33:16,555 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.output.LayerNorm.bias 2023-05-13 15:33:16,555 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.self.query.weight 2023-05-13 15:33:16,555 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.self.query.bias 2023-05-13 15:33:16,555 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.self.key.weight 2023-05-13 15:33:16,555 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.self.key.bias 2023-05-13 15:33:16,555 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.self.value.weight 2023-05-13 15:33:16,555 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.self.value.bias 2023-05-13 15:33:16,555 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.output.dense.weight 2023-05-13 15:33:16,556 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.output.dense.bias 2023-05-13 15:33:16,556 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.output.LayerNorm.weight 2023-05-13 15:33:16,556 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.output.LayerNorm.bias 2023-05-13 15:33:16,556 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.intermediate.dense.weight 2023-05-13 15:33:16,556 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.intermediate.dense.bias 2023-05-13 15:33:16,556 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.output.dense.weight 2023-05-13 15:33:16,556 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.output.dense.bias 2023-05-13 15:33:16,556 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.output.LayerNorm.weight 2023-05-13 15:33:16,556 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.output.LayerNorm.bias 2023-05-13 15:33:16,556 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.self.query.weight 2023-05-13 15:33:16,556 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.self.query.bias 2023-05-13 15:33:16,556 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.self.key.weight 2023-05-13 15:33:16,556 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.self.key.bias 2023-05-13 15:33:16,556 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.self.value.weight 2023-05-13 15:33:16,556 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.self.value.bias 2023-05-13 15:33:16,556 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.output.dense.weight 2023-05-13 15:33:16,557 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.output.dense.bias 2023-05-13 15:33:16,557 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.output.LayerNorm.weight 2023-05-13 15:33:16,557 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.output.LayerNorm.bias 2023-05-13 15:33:16,557 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.intermediate.dense.weight 2023-05-13 15:33:16,557 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.intermediate.dense.bias 2023-05-13 15:33:16,557 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.output.dense.weight 2023-05-13 15:33:16,557 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.output.dense.bias 2023-05-13 15:33:16,557 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.output.LayerNorm.weight 2023-05-13 15:33:16,557 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.output.LayerNorm.bias 2023-05-13 15:33:16,557 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.self.query.weight 2023-05-13 15:33:16,557 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.self.query.bias 2023-05-13 15:33:16,557 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.self.key.weight 2023-05-13 15:33:16,557 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.self.key.bias 2023-05-13 15:33:16,557 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.self.value.weight 2023-05-13 15:33:16,557 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.self.value.bias 2023-05-13 15:33:16,557 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.output.dense.weight 2023-05-13 15:33:16,557 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.output.dense.bias 2023-05-13 15:33:16,558 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.output.LayerNorm.weight 2023-05-13 15:33:16,558 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.output.LayerNorm.bias 2023-05-13 15:33:16,558 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.intermediate.dense.weight 2023-05-13 15:33:16,558 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.intermediate.dense.bias 2023-05-13 15:33:16,558 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.output.dense.weight 2023-05-13 15:33:16,558 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.output.dense.bias 2023-05-13 15:33:16,558 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.output.LayerNorm.weight 2023-05-13 15:33:16,558 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.output.LayerNorm.bias 2023-05-13 15:33:16,558 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.self.query.weight 2023-05-13 15:33:16,558 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.self.query.bias 2023-05-13 15:33:16,558 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.self.key.weight 2023-05-13 15:33:16,558 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.self.key.bias 2023-05-13 15:33:16,558 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.self.value.weight 2023-05-13 15:33:16,558 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.self.value.bias 2023-05-13 15:33:16,558 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.output.dense.weight 2023-05-13 15:33:16,558 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.output.dense.bias 2023-05-13 15:33:16,558 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.output.LayerNorm.weight 2023-05-13 15:33:16,559 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.output.LayerNorm.bias 2023-05-13 15:33:16,559 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.intermediate.dense.weight 2023-05-13 15:33:16,559 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.intermediate.dense.bias 2023-05-13 15:33:16,559 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.output.dense.weight 2023-05-13 15:33:16,559 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.output.dense.bias 2023-05-13 15:33:16,559 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.output.LayerNorm.weight 2023-05-13 15:33:16,559 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.output.LayerNorm.bias 2023-05-13 15:33:16,559 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.self.query.weight 2023-05-13 15:33:16,650 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.self.query.bias 2023-05-13 15:33:16,650 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.self.key.weight 2023-05-13 15:33:16,650 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.self.key.bias 2023-05-13 15:33:16,650 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.self.value.weight 2023-05-13 15:33:16,651 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.self.value.bias 2023-05-13 15:33:16,651 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.output.dense.weight 2023-05-13 15:33:16,651 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.output.dense.bias 2023-05-13 15:33:16,651 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.output.LayerNorm.weight 2023-05-13 15:33:16,651 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.output.LayerNorm.bias 2023-05-13 15:33:16,651 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.intermediate.dense.weight 2023-05-13 15:33:16,651 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.intermediate.dense.bias 2023-05-13 15:33:16,651 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.output.dense.weight 2023-05-13 15:33:16,651 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.output.dense.bias 2023-05-13 15:33:16,651 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.output.LayerNorm.weight 2023-05-13 15:33:16,651 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.output.LayerNorm.bias 2023-05-13 15:33:16,652 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.self.query.weight 2023-05-13 15:33:16,652 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.self.query.bias 2023-05-13 15:33:16,652 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.self.key.weight 2023-05-13 15:33:16,652 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.self.key.bias 2023-05-13 15:33:16,652 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.self.value.weight 2023-05-13 15:33:16,652 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.self.value.bias 2023-05-13 15:33:16,652 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.output.dense.weight 2023-05-13 15:33:16,652 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.output.dense.bias 2023-05-13 15:33:16,652 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.output.LayerNorm.weight 2023-05-13 15:33:16,652 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.output.LayerNorm.bias 2023-05-13 15:33:16,652 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.intermediate.dense.weight 2023-05-13 15:33:16,653 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.intermediate.dense.bias 2023-05-13 15:33:16,653 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.output.dense.weight 2023-05-13 15:33:16,653 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.output.dense.bias 2023-05-13 15:33:16,653 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.output.LayerNorm.weight 2023-05-13 15:33:16,653 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.output.LayerNorm.bias 2023-05-13 15:33:16,653 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.self.query.weight 2023-05-13 15:33:16,653 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.self.query.bias 2023-05-13 15:33:16,653 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.self.key.weight 2023-05-13 15:33:16,653 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.self.key.bias 2023-05-13 15:33:16,653 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.self.value.weight 2023-05-13 15:33:16,653 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.self.value.bias 2023-05-13 15:33:16,653 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.output.dense.weight 2023-05-13 15:33:16,653 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.output.dense.bias 2023-05-13 15:33:16,654 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.output.LayerNorm.weight 2023-05-13 15:33:16,654 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.output.LayerNorm.bias 2023-05-13 15:33:16,654 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.intermediate.dense.weight 2023-05-13 15:33:16,654 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.intermediate.dense.bias 2023-05-13 15:33:16,654 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.output.dense.weight 2023-05-13 15:33:16,654 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.output.dense.bias 2023-05-13 15:33:16,654 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.output.LayerNorm.weight 2023-05-13 15:33:16,654 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.output.LayerNorm.bias 2023-05-13 15:33:16,654 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.self.query.weight 2023-05-13 15:33:16,654 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.self.query.bias 2023-05-13 15:33:16,654 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.self.key.weight 2023-05-13 15:33:16,655 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.self.key.bias 2023-05-13 15:33:16,655 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.self.value.weight 2023-05-13 15:33:16,655 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.self.value.bias 2023-05-13 15:33:16,655 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.output.dense.weight 2023-05-13 15:33:16,655 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.output.dense.bias 2023-05-13 15:33:16,655 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.output.LayerNorm.weight 2023-05-13 15:33:16,655 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.output.LayerNorm.bias 2023-05-13 15:33:16,655 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.intermediate.dense.weight 2023-05-13 15:33:16,655 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.intermediate.dense.bias 2023-05-13 15:33:16,655 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.output.dense.weight 2023-05-13 15:33:16,655 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.output.dense.bias 2023-05-13 15:33:16,655 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.output.LayerNorm.weight 2023-05-13 15:33:16,656 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.output.LayerNorm.bias 2023-05-13 15:33:16,656 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.self.query.weight 2023-05-13 15:33:16,656 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.self.query.bias 2023-05-13 15:33:16,656 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.self.key.weight 2023-05-13 15:33:16,656 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.self.key.bias 2023-05-13 15:33:16,656 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.self.value.weight 2023-05-13 15:33:16,656 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.self.value.bias 2023-05-13 15:33:16,656 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.output.dense.weight 2023-05-13 15:33:16,656 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.output.dense.bias 2023-05-13 15:33:16,656 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.output.LayerNorm.weight 2023-05-13 15:33:16,656 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.output.LayerNorm.bias 2023-05-13 15:33:16,656 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.intermediate.dense.weight 2023-05-13 15:33:16,656 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.intermediate.dense.bias 2023-05-13 15:33:16,657 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.output.dense.weight 2023-05-13 15:33:16,657 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.output.dense.bias 2023-05-13 15:33:16,657 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.output.LayerNorm.weight 2023-05-13 15:33:16,657 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.output.LayerNorm.bias 2023-05-13 15:33:16,657 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.pooler.dense.weight 2023-05-13 15:33:16,657 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.pooler.dense.bias 2023-05-13 15:33:16,657 - INFO - allennlp.common.util - _target_embedder.weight 2023-05-13 15:33:16,657 - INFO - allennlp.common.util - _attention._multihead_attn.in_proj_weight 2023-05-13 15:33:16,657 - INFO - allennlp.common.util - _attention._multihead_attn.in_proj_bias 2023-05-13 15:33:16,657 - INFO - allennlp.common.util - _attention._multihead_attn.out_proj.weight 2023-05-13 15:33:16,657 - INFO - allennlp.common.util - _attention._multihead_attn.out_proj.bias 2023-05-13 15:33:16,657 - INFO - allennlp.common.util - _input_projection_layer.weight 2023-05-13 15:33:16,657 - INFO - allennlp.common.util - _input_projection_layer.bias 2023-05-13 15:33:16,657 - INFO - allennlp.common.util - _decoder_cell.weight_hh_raw 2023-05-13 15:33:16,657 - INFO - allennlp.common.util - _decoder_cell.module.weight_ih 2023-05-13 15:33:16,657 - INFO - allennlp.common.util - _decoder_cell.module.bias_ih 2023-05-13 15:33:16,657 - INFO - allennlp.common.util - _decoder_cell.module.bias_hh 2023-05-13 15:33:16,658 - INFO - allennlp.common.util - _output_generation_layer.weight 2023-05-13 15:33:16,658 - INFO - allennlp.common.util - _output_generation_layer.bias 2023-05-13 15:33:16,658 - INFO - allennlp.common.util - _output_copying_layer.weight 2023-05-13 15:33:16,658 - INFO - allennlp.common.util - _output_copying_layer.bias 2023-05-13 15:33:16,658 - INFO - allennlp.common.params - trainer.learning_rate_scheduler.type = linear_with_warmup 2023-05-13 15:33:16,658 - INFO - allennlp.common.params - trainer.learning_rate_scheduler.warmup_steps = 2906 2023-05-13 15:33:16,658 - INFO - allennlp.common.params - trainer.learning_rate_scheduler.last_epoch = -1 2023-05-13 15:33:16,658 - INFO - allennlp.common.params - trainer.checkpointer.type = default 2023-05-13 15:33:16,659 - INFO - allennlp.common.params - trainer.checkpointer.save_completed_epochs = True 2023-05-13 15:33:16,659 - INFO - allennlp.common.params - trainer.checkpointer.save_every_num_seconds = None 2023-05-13 15:33:16,659 - INFO - allennlp.common.params - trainer.checkpointer.save_every_num_batches = None 2023-05-13 15:33:16,659 - INFO - allennlp.common.params - trainer.checkpointer.keep_most_recent_by_count = 1 2023-05-13 15:33:16,659 - INFO - allennlp.common.params - trainer.checkpointer.keep_most_recent_by_age = None 2023-05-13 15:33:16,659 - INFO - allennlp.common.params - trainer.callbacks.0.type = should_validate_callback 2023-05-13 15:33:16,659 - INFO - allennlp.common.params - trainer.callbacks.0.validation_start = 15 2023-05-13 15:33:16,660 - INFO - allennlp.common.params - trainer.callbacks.0.validation_interval = 1 2023-05-13 15:33:16,660 - WARNING - allennlp.training.gradient_descent_trainer - You provided a validation dataset but patience was set to None, meaning that early stopping is disabled 2023-05-13 15:33:16,661 - INFO - allennlp.training.gradient_descent_trainer - Beginning training. 2023-05-13 15:33:16,662 - INFO - allennlp.training.gradient_descent_trainer - Epoch 0/24 2023-05-13 15:33:16,662 - INFO - allennlp.training.gradient_descent_trainer - Worker 0 memory usage: 4.3G 2023-05-13 15:33:16,662 - INFO - allennlp.training.gradient_descent_trainer - GPU 0 memory usage: 453M 2023-05-13 15:33:16,663 - INFO - allennlp.training.gradient_descent_trainer - Training 2023-05-13 15:33:16,663 - INFO - tqdm - 0%| | 0/1163 [00:00") 2023-05-13 15:33:26,761 - INFO - tqdm - batch_loss: 22.0751, loss: 13.0001 ||: 6%|5 | 64/1163 [00:10<03:36, 5.07it/s] 2023-05-13 15:33:36,821 - INFO - tqdm - batch_loss: 1.7625, loss: 11.5015 ||: 12%|#2 | 140/1163 [00:20<02:02, 8.35it/s] 2023-05-13 15:33:46,936 - INFO - tqdm - batch_loss: 8.3574, loss: 11.3590 ||: 17%|#7 | 201/1163 [00:30<02:18, 6.95it/s] 2023-05-13 15:33:57,017 - INFO - tqdm - batch_loss: 28.9927, loss: 10.0826 ||: 24%|##3 | 277/1163 [00:40<02:44, 5.38it/s] 2023-05-13 15:34:07,119 - INFO - tqdm - batch_loss: 9.8202, loss: 9.8024 ||: 29%|##9 | 340/1163 [00:50<02:01, 6.77it/s] 2023-05-13 15:34:17,238 - INFO - tqdm - batch_loss: 11.5271, loss: 9.1016 ||: 35%|###5 | 408/1163 [01:00<02:15, 5.58it/s] 2023-05-13 15:34:27,387 - INFO - tqdm - batch_loss: 7.2027, loss: 8.6231 ||: 41%|####1 | 480/1163 [01:10<01:28, 7.73it/s] 2023-05-13 15:34:37,404 - INFO - tqdm - batch_loss: 10.3921, loss: 8.1658 ||: 47%|####6 | 544/1163 [01:20<02:01, 5.10it/s] 2023-05-13 15:34:47,524 - INFO - tqdm - batch_loss: 2.4879, loss: 7.8877 ||: 53%|#####3 | 620/1163 [01:30<01:25, 6.32it/s] 2023-05-13 15:34:57,608 - INFO - tqdm - batch_loss: 0.0219, loss: 7.5193 ||: 59%|#####9 | 687/1163 [01:40<01:00, 7.81it/s] 2023-05-13 15:35:07,708 - INFO - tqdm - batch_loss: 1.8938, loss: 7.1715 ||: 66%|######5 | 762/1163 [01:51<01:05, 6.16it/s] 2023-05-13 15:35:17,805 - INFO - tqdm - batch_loss: 0.0029, loss: 6.9154 ||: 71%|#######1 | 830/1163 [02:01<00:38, 8.63it/s] 2023-05-13 15:35:27,904 - INFO - tqdm - batch_loss: 0.0070, loss: 6.7275 ||: 77%|#######7 | 896/1163 [02:11<00:41, 6.50it/s] 2023-05-13 15:35:37,987 - INFO - tqdm - batch_loss: 1.5801, loss: 6.3753 ||: 83%|########3 | 969/1163 [02:21<00:27, 7.07it/s] 2023-05-13 15:35:48,133 - INFO - tqdm - batch_loss: 0.8383, loss: 6.1441 ||: 89%|########9 | 1036/1163 [02:31<00:19, 6.60it/s] 2023-05-13 15:35:58,316 - INFO - tqdm - batch_loss: 1.5910, loss: 5.8352 ||: 96%|#########6| 1117/1163 [02:41<00:06, 6.68it/s] 2023-05-13 15:36:05,705 - INFO - tqdm - batch_loss: 2.2784, loss: 5.7149 ||: 100%|#########9| 1158/1163 [02:49<00:00, 5.30it/s] 2023-05-13 15:36:05,844 - INFO - tqdm - batch_loss: 3.6173, loss: 5.7131 ||: 100%|#########9| 1159/1163 [02:49<00:00, 5.75it/s] 2023-05-13 15:36:06,018 - INFO - tqdm - batch_loss: 3.0704, loss: 5.7108 ||: 100%|#########9| 1160/1163 [02:49<00:00, 5.76it/s] 2023-05-13 15:36:06,252 - INFO - tqdm - batch_loss: 6.7589, loss: 5.7117 ||: 100%|#########9| 1161/1163 [02:49<00:00, 5.21it/s] 2023-05-13 15:36:06,518 - INFO - tqdm - batch_loss: 2.4498, loss: 5.7040 ||: 100%|##########| 1163/1163 [02:49<00:00, 6.08it/s] 2023-05-13 15:36:06,519 - INFO - tqdm - batch_loss: 2.4498, loss: 5.7040 ||: 100%|##########| 1163/1163 [02:49<00:00, 6.85it/s] 2023-05-13 15:36:06,521 - INFO - allennlp.training.callbacks.console_logger - Training | Validation 2023-05-13 15:36:06,521 - INFO - allennlp.training.callbacks.console_logger - gpu_0_memory_MB | 453.306 | N/A 2023-05-13 15:36:06,521 - INFO - allennlp.training.callbacks.console_logger - loss | 5.704 | N/A 2023-05-13 15:36:06,521 - INFO - allennlp.training.callbacks.console_logger - worker_0_memory_MB | 4427.691 | N/A 2023-05-13 15:36:11,824 - INFO - allennlp.training.gradient_descent_trainer - Epoch duration: 0:02:55.162598 2023-05-13 15:36:11,825 - INFO - allennlp.training.gradient_descent_trainer - Estimated training time remaining: 1:07:56 2023-05-13 15:36:11,825 - INFO - allennlp.training.gradient_descent_trainer - Epoch 1/24 2023-05-13 15:36:11,825 - INFO - allennlp.training.gradient_descent_trainer - Worker 0 memory usage: 4.4G 2023-05-13 15:36:11,826 - INFO - allennlp.training.gradient_descent_trainer - GPU 0 memory usage: 2.9G 2023-05-13 15:36:11,829 - INFO - allennlp.training.gradient_descent_trainer - Training 2023-05-13 15:36:11,829 - INFO - tqdm - 0%| | 0/1163 [00:00