#!/usr/bin/env python
# coding: utf-8

from dataclasses import dataclass, field
from datetime import datetime
from typing import List, Optional

from transformers.file_utils import ExplicitEnum

task_to_keys = {
    "mimic3-50": ("mimic3-50"),
    "mimic3-full": ("mimic3-full"),
}


class TransformerLayerUpdateStrategy(ExplicitEnum):
    NO = "no"
    LAST = "last"
    ALL = "all"


class DocumentPoolingStrategy(ExplicitEnum):
    FLAT = "flat"
    MAX = "max"
    MEAN = "mean"


@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.

    Using `HfArgumentParser` we can turn this class into argparse arguments to be able to specify them on the
    command line.
    """

    task_name: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the task to train on: " + ", ".join(task_to_keys.keys())},
    )
    dataset_name: Optional[str] = field(
        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
    )
    dataset_config_name: Optional[str] = field(
        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
    )
    max_seq_length: int = field(
        default=128,
        metadata={
            "help": "The maximum total input sequence length after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded."
        },
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
    )
    pad_to_max_length: bool = field(
        default=True,
        metadata={
            "help": "Whether to pad all samples to `max_seq_length`. "
            "If False, will pad the samples dynamically when batching to the maximum length in the batch."
        },
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
            "value if set."
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
            "value if set."
        },
    )
    max_predict_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this "
            "value if set."
        },
    )
    train_file: Optional[str] = field(
        default=None, metadata={"help": "A csv or a json file containing the training data."}
    )
    validation_file: Optional[str] = field(
        default=None, metadata={"help": "A csv or a json file containing the validation data."}
    )
    test_file: Optional[str] = field(
        default=None, metadata={"help": "A csv or a json file containing the test data."}
    )

    # customized data arguments
    label_dictionary_file: Optional[str] = field(
        default=None, metadata={"help": "The file containing the label dictionary."}
    )
    code_max_seq_length: int = field(
        default=128,
        metadata={"help": "The maximum total input sequence length after tokenization for code long titles."},
    )
    code_batch_size: int = field(
        default=8,
        metadata={"help": "The batch size for generating code representations."},
    )
    ignore_keys_for_eval: Optional[List[str]] = field(
        default=None, metadata={"help": "The list of keys to be ignored during the evaluation process."}
    )
    use_cached_datasets: bool = field(
        default=True,
        metadata={
            "help": "Whether to use cached datasets to save preprocessing time. The cached datasets were "
            "preprocessed and saved into the data folder."
        },
    )
    data_segmented: bool = field(
        default=False, metadata={"help": "Whether the dataset is segmented or not."}
    )
    lazy_loading: bool = field(
        default=False,
        metadata={"help": "If the dataset is larger than 500MB, please use lazy loading."},
    )

    def __post_init__(self):
        if self.task_name is not None:
            self.task_name = self.task_name.lower()
            if self.task_name not in task_to_keys.keys():
                raise ValueError("Unknown task, you should pick one in " + ",".join(task_to_keys.keys()))
        elif self.dataset_name is not None:
            pass
        elif self.train_file is None or self.validation_file is None:
            raise ValueError("Need a training/validation file.")
        elif self.label_dictionary_file is None:
            raise ValueError("A label dictionary must be provided.")
        else:
            train_extension = self.train_file.split(".")[-1]
            assert train_extension in ["csv", "json"], "`train_file` should be a csv or a json file."
            validation_extension = self.validation_file.split(".")[-1]
            assert (
                validation_extension == train_extension
            ), "`validation_file` should have the same extension (csv or json) as `train_file`."


@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
    )
    use_fast_tokenizer: bool = field(
        default=True,
        metadata={"help": "Whether to use one of the fast tokenizers (backed by the tokenizers library) or not."},
    )
    model_revision: str = field(
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )
    use_auth_token: bool = field(
        default=False,
        metadata={
            "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
            "with private models)."
        },
    )

    # customized model arguments
    d_model: int = field(
        default=768,
        metadata={"help": "Hidden size of the model. Should be the same as the base transformer model."},
    )
    dropout: float = field(default=0.1, metadata={"help": "Dropout of the transformer layer."})
    dropout_att: float = field(default=0.1, metadata={"help": "Dropout of the label-wise attention layer."})
    num_chunks_per_document: int = field(default=0.1, metadata={"help": "Number of chunks per document."})
    transformer_layer_update_strategy: TransformerLayerUpdateStrategy = field(
        default="all",
        metadata={"help": "Which transformer layers to update when training."},
    )
    use_code_representation: bool = field(
        default=True,
        metadata={
            "help": "Whether to use code representations as the initial parameters of the code vectors in the "
            "attention layer."
        },
    )
    multi_head_attention: bool = field(
        default=True, metadata={"help": "Whether to use multi-head attention for different chunks."}
    )
    chunk_attention: bool = field(
        default=True, metadata={"help": "Whether to use chunk attention for each label."}
    )
    multi_head_chunk_attention: bool = field(
        default=True, metadata={"help": "Whether to use multi-head chunk attention for each label."}
    )
    num_hidden_layers: int = field(
        default=2, metadata={"help": "Number of hidden layers in the Longformer."}
    )
    linear_init_mean: float = field(
        default=0.0, metadata={"help": "Mean value for initializing linear layer weights."}
    )
    linear_init_std: float = field(
        default=0.03, metadata={"help": "Standard deviation for initializing linear layer weights."}
    )
    document_pooling_strategy: DocumentPoolingStrategy = field(
        default="flat",
        metadata={
            "help": "How to pool the document representation after the label-wise attention layer for each label."
        },
    )
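

# Usage sketch (not part of the original arguments module): a minimal example of how these
# dataclasses are typically consumed via `HfArgumentParser`, as the docstring above suggests.
# The script name, CLI values, and JSON config path in the comments are illustrative
# assumptions only; the actual training entry point may wire these up differently.
if __name__ == "__main__":
    import sys

    from transformers import HfArgumentParser, TrainingArguments

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # All arguments can also be supplied in a single JSON file, e.g.
        #   python <this_script>.py config.json
        model_args, data_args, training_args = parser.parse_json_file(json_file=sys.argv[1])
    else:
        # Or passed on the command line, e.g.
        #   python <this_script>.py --model_name_or_path <pretrained-model> \
        #       --task_name mimic3-50 --output_dir ./output
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    print(data_args.task_name, model_args.transformer_layer_update_strategy)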