igorktech committed
Commit 060b9a1
Parent: 4c00a22

Update modelling_hat.py

Files changed (1)
  1. modelling_hat.py +113 -110
modelling_hat.py CHANGED
@@ -319,116 +319,119 @@ class SentenceClassifierOutput(ModelOutput):
     sentence_attentions: Optional[Tuple[torch.FloatTensor]] = None


-class HATConfig(PretrainedConfig):
-    r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.HAT`.
-    It is used to instantiate a HAT model according to the specified arguments,
-    defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration
-    to that of the HAT `kiddothe2b/hat-base-4096 <https://huggingface.co/kiddothe2b/hat-base-4096>`__ architecture.
-
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
-
-
-    Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 30522):
-            Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.BertModel` or
-            :class:`~transformers.TFBertModel`.
-        max_sentences (:obj:`int`, `optional`, defaults to 64):
-            The maximum number of sentences that this model might ever be used with.
-        max_sentence_size (:obj:`int`, `optional`, defaults to 128):
-            The maximum sentence length that this model might ever be used with.
-        model_max_length (:obj:`int`, `optional`, defaults to 8192):
-            The maximum sequence length (max_sentences * max_sentence_size) that this model might ever be used with
-        encoder_layout (:obj:`Dict`):
-            The sentence/document encoder layout.
-        hidden_size (:obj:`int`, `optional`, defaults to 768):
-            Dimensionality of the encoder layers and the pooler layer.
-        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
-            Number of hidden layers in the Transformer encoder.
-        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
-            Number of attention heads for each attention layer in the Transformer encoder.
-        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
-            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
-        hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
-            The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout ratio for the attention probabilities.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
-            The maximum sequence length that this model might ever be used with. Typically set this to something large
-            just in case (e.g., 512 or 1024 or 2048).
-        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
-            The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.BertModel` or
-            :class:`~transformers.TFBertModel`.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
-            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
-            The epsilon used by the layer normalization layers.
-        position_embedding_type (:obj:`str`, `optional`, defaults to :obj:`"absolute"`):
-            Type of position embedding. Choose one of :obj:`"absolute"`, :obj:`"relative_key"`,
-            :obj:`"relative_key_query"`. For positional embeddings use :obj:`"absolute"`. For more information on
-            :obj:`"relative_key"`, please refer to `Self-Attention with Relative Position Representations (Shaw et al.)
-            <https://arxiv.org/abs/1803.02155>`__. For more information on :obj:`"relative_key_query"`, please refer to
-            `Method 4` in `Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)
-            <https://arxiv.org/abs/2009.13658>`__.
-        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether or not the model should return the last key/values attentions (not used by all models). Only
-            relevant if ``config.is_decoder=True``.
-        classifier_dropout (:obj:`float`, `optional`):
-            The dropout ratio for the classification head.
-    """
-    model_type = "hierarchical-transformer"
-
-    def __init__(
-        self,
-        vocab_size=30522,
-        hidden_size=768,
-        max_sentences=64,
-        max_sentence_size=128,
-        model_max_length=8192,
-        num_hidden_layers=12,
-        num_attention_heads=12,
-        intermediate_size=3072,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=512,
-        type_vocab_size=2,
-        initializer_range=0.02,
-        layer_norm_eps=1e-12,
-        pad_token_id=0,
-        position_embedding_type="absolute",
-        encoder_layout=None,
-        use_cache=True,
-        classifier_dropout=None,
-        **kwargs
-    ):
-        super().__init__(pad_token_id=pad_token_id, **kwargs)
-
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.max_sentences = max_sentences
-        self.max_sentence_size = max_sentence_size
-        self.model_max_length = model_max_length
-        self.encoder_layout = encoder_layout
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.hidden_act = hidden_act
-        self.intermediate_size = intermediate_size
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.type_vocab_size = type_vocab_size
-        self.initializer_range = initializer_range
-        self.layer_norm_eps = layer_norm_eps
-        self.position_embedding_type = position_embedding_type
-        self.use_cache = use_cache
-        self.classifier_dropout = classifier_dropout
-
+# class HATConfig(PretrainedConfig):
+#     r"""
+#     This is the configuration class to store the configuration of a :class:`~transformers.HAT`.
+#     It is used to instantiate a HAT model according to the specified arguments,
+#     defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration
+#     to that of the HAT `kiddothe2b/hat-base-4096 <https://huggingface.co/kiddothe2b/hat-base-4096>`__ architecture.
+
+#     Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
+#     outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+
+
+#     Args:
+#         vocab_size (:obj:`int`, `optional`, defaults to 30522):
+#             Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the
+#             :obj:`inputs_ids` passed when calling :class:`~transformers.BertModel` or
+#             :class:`~transformers.TFBertModel`.
+#         max_sentences (:obj:`int`, `optional`, defaults to 64):
+#             The maximum number of sentences that this model might ever be used with.
+#         max_sentence_size (:obj:`int`, `optional`, defaults to 128):
+#             The maximum sentence length that this model might ever be used with.
+#         model_max_length (:obj:`int`, `optional`, defaults to 8192):
+#             The maximum sequence length (max_sentences * max_sentence_size) that this model might ever be used with
+#         encoder_layout (:obj:`Dict`):
+#             The sentence/document encoder layout.
+#         hidden_size (:obj:`int`, `optional`, defaults to 768):
+#             Dimensionality of the encoder layers and the pooler layer.
+#         num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+#             Number of hidden layers in the Transformer encoder.
+#         num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+#             Number of attention heads for each attention layer in the Transformer encoder.
+#         intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+#             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
+#         hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
+#             The non-linear activation function (function or string) in the encoder and pooler. If string,
+#             :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
+#         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+#             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+#         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+#             The dropout ratio for the attention probabilities.
+#         max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+#             The maximum sequence length that this model might ever be used with. Typically set this to something large
+#             just in case (e.g., 512 or 1024 or 2048).
+#         type_vocab_size (:obj:`int`, `optional`, defaults to 2):
+#             The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.BertModel` or
+#             :class:`~transformers.TFBertModel`.
+#         initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+#             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+#         layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+#             The epsilon used by the layer normalization layers.
+#         position_embedding_type (:obj:`str`, `optional`, defaults to :obj:`"absolute"`):
+#             Type of position embedding. Choose one of :obj:`"absolute"`, :obj:`"relative_key"`,
+#             :obj:`"relative_key_query"`. For positional embeddings use :obj:`"absolute"`. For more information on
+#             :obj:`"relative_key"`, please refer to `Self-Attention with Relative Position Representations (Shaw et al.)
+#             <https://arxiv.org/abs/1803.02155>`__. For more information on :obj:`"relative_key_query"`, please refer to
+#             `Method 4` in `Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)
+#             <https://arxiv.org/abs/2009.13658>`__.
+#         use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+#             Whether or not the model should return the last key/values attentions (not used by all models). Only
+#             relevant if ``config.is_decoder=True``.
+#         classifier_dropout (:obj:`float`, `optional`):
+#             The dropout ratio for the classification head.
+#     """
+#     model_type = "hierarchical-transformer"
+
+#     def __init__(
+#         self,
+#         vocab_size=30522,
+#         hidden_size=768,
+#         max_sentences=64,
+#         max_sentence_size=128,
+#         model_max_length=8192,
+#         num_hidden_layers=12,
+#         num_attention_heads=12,
+#         intermediate_size=3072,
+#         hidden_act="gelu",
+#         hidden_dropout_prob=0.1,
+#         attention_probs_dropout_prob=0.1,
+#         max_position_embeddings=512,
+#         type_vocab_size=2,
+#         initializer_range=0.02,
+#         layer_norm_eps=1e-12,
+#         pad_token_id=0,
+#         position_embedding_type="absolute",
+#         encoder_layout=None,
+#         use_cache=True,
+#         classifier_dropout=None,
+#         **kwargs
+#     ):
+#         super().__init__(pad_token_id=pad_token_id, **kwargs)
+
+#         self.vocab_size = vocab_size
+#         self.hidden_size = hidden_size
+#         self.max_sentences = max_sentences
+#         self.max_sentence_size = max_sentence_size
+#         self.model_max_length = model_max_length
+#         self.encoder_layout = encoder_layout
+#         self.num_hidden_layers = num_hidden_layers
+#         self.num_attention_heads = num_attention_heads
+#         self.hidden_act = hidden_act
+#         self.intermediate_size = intermediate_size
+#         self.hidden_dropout_prob = hidden_dropout_prob
+#         self.attention_probs_dropout_prob = attention_probs_dropout_prob
+#         self.max_position_embeddings = max_position_embeddings
+#         self.type_vocab_size = type_vocab_size
+#         self.initializer_range = initializer_range
+#         self.layer_norm_eps = layer_norm_eps
+#         self.position_embedding_type = position_embedding_type
+#         self.use_cache = use_cache
+#         self.classifier_dropout = classifier_dropout
+
+
+
+from configuration_hat import HATConfig

 class HATEmbeddings(nn.Module):
     """