meghanaraok committed
Commit e9066e1
1 Parent(s): ddf05a4

Upload run_coding.py

Files changed (1)
  1. run_coding.py +213 -0
run_coding.py ADDED
@@ -0,0 +1,213 @@
#!/usr/bin/env python
# coding: utf-8

# In[7]:


from dataclasses import dataclass, field
from datetime import datetime
from typing import List, Optional
from transformers.file_utils import ExplicitEnum

task_to_keys = {
    "mimic3-50": ("mimic3-50"),
    "mimic3-full": ("mimic3-full"),
}

class TransformerLayerUpdateStrategy(ExplicitEnum):
    NO = "no"
    LAST = "last"
    ALL = "all"

class DocumentPoolingStrategy(ExplicitEnum):
    FLAT = "flat"
    MAX = "max"
    MEAN = "mean"


@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.

    Using `HfArgumentParser` we can turn this class
    into argparse arguments to be able to specify them on
    the command line.
    """

    task_name: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the task to train on: " + ", ".join(task_to_keys.keys())},
    )
    dataset_name: Optional[str] = field(
        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
    )
    dataset_config_name: Optional[str] = field(
        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
    )
    max_seq_length: int = field(
        default=128,
        metadata={
            "help": "The maximum total input sequence length after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded."
        },
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
    )
    pad_to_max_length: bool = field(
        default=True,
        metadata={
            "help": "Whether to pad all samples to `max_seq_length`. "
            "If False, will pad the samples dynamically when batching to the maximum length in the batch."
        },
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
            "value if set."
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
            "value if set."
        },
    )
    max_predict_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this "
            "value if set."
        },
    )
    train_file: Optional[str] = field(
        default=None, metadata={"help": "A csv or a json file containing the training data."}
    )
    validation_file: Optional[str] = field(
        default=None, metadata={"help": "A csv or a json file containing the validation data."}
    )
    test_file: Optional[str] = field(default=None, metadata={"help": "A csv or a json file containing the test data."})

    # customized data arguments
    label_dictionary_file: Optional[str] = field(
        default=None, metadata={"help": "A file containing the label dictionary."}
    )
    code_max_seq_length: int = field(
        default=128,
        metadata={
            "help": "The maximum total input sequence length after tokenization for code long titles"
        },
    )
    code_batch_size: int = field(
        default=8,
        metadata={
            "help": "The batch size for generating code representations"
        },
    )
    ignore_keys_for_eval: Optional[List[str]] = field(
        default=None, metadata={"help": "The list of keys to be ignored during the evaluation process."}
    )
    use_cached_datasets: bool = field(
        default=True,
        metadata={"help": "Whether to use cached datasets to save preprocessing time. The cached datasets were "
                          "preprocessed and saved into the data folder."})
    data_segmented: bool = field(
        default=False,
        metadata={"help": "Whether the dataset is segmented or not"})

    lazy_loading: bool = field(
        default=False,
        metadata={"help": "If the dataset is larger than 500MB, please use lazy loading"})

    def __post_init__(self):
        if self.task_name is not None:
            self.task_name = self.task_name.lower()
            if self.task_name not in task_to_keys.keys():
                raise ValueError("Unknown task, you should pick one in " + ",".join(task_to_keys.keys()))
        elif self.dataset_name is not None:
            pass
        elif self.train_file is None or self.validation_file is None:
            raise ValueError("Need a training/validation file")
        elif self.label_dictionary_file is None:
            raise ValueError("label dictionary must be provided")
        else:
            train_extension = self.train_file.split(".")[-1]
            assert train_extension in ["csv", "json"], "`train_file` should be a csv or a json file."
            validation_extension = self.validation_file.split(".")[-1]
            assert (
                validation_extension == train_extension
            ), "`validation_file` should have the same extension (csv or json) as `train_file`."


@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
    )
    use_fast_tokenizer: bool = field(
        default=True,
        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
    )
    model_revision: str = field(
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )
    use_auth_token: bool = field(
        default=False,
        metadata={
            "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
            "with private models)."
        },
    )
    # Customized model arguments
    d_model: int = field(default=768, metadata={"help": "Hidden size of the model. Should be the same as the base "
                                                        "transformer model"})
    dropout: float = field(default=0.1, metadata={"help": "Dropout of the transformer layer"})
    dropout_att: float = field(default=0.1, metadata={"help": "Dropout of the label-wise attention layer"})
    num_chunks_per_document: int = field(default=0.1, metadata={"help": "Num of chunks per document"})
    transformer_layer_update_strategy: TransformerLayerUpdateStrategy = field(
        default="all",
        metadata={"help": "Which transformer layers to update during training"})
    use_code_representation: bool = field(
        default=True,
        metadata={"help": "Whether to use code representations as the "
                          "initial parameters of the code vectors in the attention layer"})
    multi_head_attention: bool = field(
        default=True,
        metadata={"help": "Whether to use multi-head attention for different chunks"})
    chunk_attention: bool = field(
        default=True,
        metadata={"help": "Whether to use chunk attention for each label"})

    multi_head_chunk_attention: bool = field(
        default=True,
        metadata={"help": "Whether to use multi-head chunk attention for each label"})

    num_hidden_layers: int = field(
        default=2, metadata={"help": "Num of hidden layers in the Longformer"}
    )

    linear_init_mean: float = field(default=0.0, metadata={"help": "Mean value for initializing linear layer weights"})
    linear_init_std: float = field(default=0.03, metadata={"help": "Standard deviation value for initializing linear "
                                                                   "layer weights"})
    document_pooling_strategy: DocumentPoolingStrategy = field(
        default="flat",
        metadata={"help": "How to pool the document representation after the label-wise attention layer for each label"})
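
As the class docstrings note, these dataclasses are meant to be handed to `HfArgumentParser` so that their fields become command-line flags. The snippet below is a minimal usage sketch, not part of the committed file: it assumes the rest of run_coding.py combines `ModelArguments` and `DataTrainingArguments` with the standard `transformers.TrainingArguments` in the usual Transformers example-script pattern.

# Illustrative sketch only (not part of run_coding.py): parsing the dataclasses
# defined above into command-line arguments, Transformers-style.
# TrainingArguments comes from the transformers library, not from this file.
from transformers import HfArgumentParser, TrainingArguments

parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
model_args, data_args, training_args = parser.parse_args_into_dataclasses()

# e.g. python run_coding.py --model_name_or_path bert-base-uncased \
#        --task_name mimic3-50 --output_dir ./out
print(model_args.model_name_or_path, data_args.task_name, training_args.output_dir)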