kkawamu1 committed on
Commit
3f13a7b
1 Parent(s): edfc42e

Commit codes

__init__.py ADDED
File without changes
app/__init__.py ADDED
File without changes
app/components.py ADDED
@@ -0,0 +1,217 @@
+ import collections
+ import os
+ from typing import Dict
+
+ import streamlit as st
+ from datasets import get_dataset_config_names
+ from jinja2 import Environment, FileSystemLoader
+
+ import utils
+ from configuration import OPTIMIZERS_ACCELERATE, OPTIMIZERS_TRAINER, TASKS, TASKS_TO_PIPELINE_TAG
+ from utils import (get_dataset_infos_dict, get_datasets, get_model_to_model_id,
+                    render_features)
+
+
+ def show_API_component(inputs: Dict[str, str]) -> Dict[str, str]:
+     template_dict = collections.defaultdict()
+     template_dirs = [
+         f for f in os.scandir("templates") if f.is_dir() and f.name != "example"
+     ]
+     template_dirs = sorted(template_dirs, key=lambda e: e.name)
+     for template_dir in template_dirs:
+         template_dict[template_dir.name] = template_dir.path
+     st.write("## API")
+     inputs['api'] = st.selectbox(
+         "Which Hugging Face API do you want to use?", list(template_dict.keys())
+     )
+     inputs['template_dir'] = template_dict.get(inputs['api'])
+     return inputs
+
+
+ def show_model_component(inputs: Dict[str, str]) -> Dict[str, str]:
+     model_info = get_model_to_model_id()
+     models = model_info['model_to_model_id']
+     models_pipeline = model_info["model_to_pipeline_tag"]
+     st.write("## Model")
+     models_for_task = []
+     for model in models:
+         if models_pipeline[model] == inputs["nlp_task"]:
+             models_for_task.append(model)
+     model = st.selectbox("Which model?", list(models_for_task))
+     inputs["model_checkpoint"] = models.get(model)
+     inputs["pretrained"] = st.checkbox("Use pre-trained model")
+     return inputs
+
+
+ def show_task_component(inputs: Dict[str, str]) -> Dict[str, str]:
+     st.write("## Task")
+     task = st.selectbox("Which task?", TASKS)
+     inputs["task"] = task
+     inputs["nlp_task"] = st.selectbox(
+         "Which NLP task?", TASKS_TO_PIPELINE_TAG[task])
+     return inputs
+
+
+ def show_input_data_component(inputs: Dict[str, str]) -> Dict[str, str]:
+     st.write("## Input data")
+     english_datasets = get_datasets()
+     english_datasets_for_task = []
+
+     for dataset in english_datasets:
+         for task_category in english_datasets[dataset]:
+             if task_category == inputs["nlp_task"]:
+                 english_datasets_for_task.append(dataset)
+                 break  # avoid appending the same dataset more than once
+
+     inputs["dataset"] = st.selectbox(
+         "Which one?", tuple(english_datasets_for_task)
+     )
+
+     configs = get_dataset_config_names(inputs["dataset"])
+     inputs["subset"] = st.selectbox("Which subset?", list(configs))
+
+     data_info_dict = get_dataset_infos_dict(
+         inputs["dataset"], inputs["subset"])
+
+     assert data_info_dict.splits is not None
+     if 'train' in list(data_info_dict.splits.keys()):
+         train_index = list(data_info_dict.splits.keys()).index('train')
+     else:
+         train_index = 0
+
+     inputs["train"] = st.selectbox("Which split for training?", list(
+         data_info_dict.splits.keys()), index=train_index)
+
+     if 'validation' in list(data_info_dict.splits.keys()):
+         validation_index = list(
+             data_info_dict.splits.keys()).index('validation')
+     else:
+         validation_index = len(list(data_info_dict.splits.keys())) - 1
+
+     inputs["validation"] = st.selectbox("Which split for validation?", list(
+         data_info_dict.splits.keys()), index=validation_index)
+
+     assert data_info_dict.features is not None
+     feature_index = 0
+     if inputs["nlp_task"] == 'translation':
+         if 'translation' in list(data_info_dict.features.keys()):
+             feature_index = list(
+                 data_info_dict.features.keys()).index('translation')
+
+     inputs["feature"] = st.selectbox(
+         "Which data feature?", list(data_info_dict.features.keys()), feature_index)
+
+     if inputs["feature"] == 'translation':
+         inputs["source_language"] = st.selectbox(
+             "Which language for source?", list(data_info_dict.features['translation'].languages))
+         inputs["target_language"] = st.selectbox(
+             "Which language for target?", list(data_info_dict.features['translation'].languages))
+
+     return inputs
+
+
+ def show_preprocessing_component(inputs: Dict[str, str]) -> Dict[str, str]:
+     st.write("## Preprocessing")
+     inputs["block_size"] = st.number_input(
+         "The length of each block (i.e. context size)", 1, None, 128)
+
+     if inputs["task"] == "MaskedLM":
+         inputs["mlm_probability"] = st.number_input(
+             "The probability with which to (randomly) mask tokens in the input", 0.0, 1.00, 0.15)
+         inputs["whole_word_masking"] = st.checkbox(
+             "Use whole word masking")
+     return inputs
+
+
+ def show_training_component(inputs: Dict[str, str]) -> Dict[str, str]:
+     st.write("## Training")
+
+     # inputs['with_tracker'] = st.selectbox(
+     #     "Loggers to monitor the training", ["none", "all", "tensorboard", "wandb", "comet_ml"])
+     inputs["seed"] = st.number_input(
+         "Seed", 1, None, 4)
+
+     if inputs['api'] == 'Accelerate':
+         optimizer_dict_to_use = OPTIMIZERS_ACCELERATE
+     else:
+         optimizer_dict_to_use = OPTIMIZERS_TRAINER
+
+     inputs["optimizer"] = st.selectbox(
+         "Optimizer", list(optimizer_dict_to_use.keys()))
+     default_lr = optimizer_dict_to_use[inputs["optimizer"]]
+     inputs["lr"] = st.number_input(
+         "Learning rate", 0.000, None, default_lr, format="%f"
+     )
+     inputs["use_weight_decay"] = st.checkbox("Use weight decay")
+     if inputs["use_weight_decay"]:
+         inputs["weight_decay"] = st.number_input(
+             "Weight decay", 0.000, None, 0.01, format="%f"
+         )
+
+     inputs["gradient_accumulation_steps"] = st.number_input(
+         "Gradient Accumulation Steps", 1, None, 8)
+
+     inputs['lr_scheduler_type'] = st.selectbox(
+         "The scheduler type to use", ["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"])
+     inputs['num_warmup_steps'] = st.number_input(
+         "Num warmup steps", 0, None, 0)
+     inputs["batch_size"] = st.number_input("Batch size", 1, None, 32)
+     inputs["num_epochs"] = st.number_input("Epochs", 1, None, 3)
+     return inputs
+
+
+ def show_dataset_view_component(inputs: Dict[str, str]) -> Dict[str, str]:
+     data_info_dict = get_dataset_infos_dict(
+         inputs["dataset"], inputs["subset"])
+     st.write(f'## Dataset view: {inputs["dataset"]}/{inputs["subset"]}')
+     st.markdown(
+         "*Homepage*: "
+         + data_info_dict.homepage
+         + "\n\n*Dataset*: https://github.com/huggingface/datasets/blob/master/datasets/%s/%s.py"
+         % (inputs["dataset"], inputs["dataset"])
+     )
+     s = []
+     s.append('dataset' + "=" + inputs["dataset"])
+     s.append('config' + "=" + inputs["subset"])
+     st.markdown(
+         "*Permalink*: https://huggingface.co/datasets/viewer/?"
+         + "&".join(s)
+     )
+     # https://github.com/huggingface/datasets-viewer/blob/master/run.py#L282
+     st.write(f'{data_info_dict.description}')
+     st.write(render_features(data_info_dict.features))
+     # TODO: if the dataset is too big, switch to streaming mode
+     # TODO: cache this part of the code
+     # selected_dataset = load_dataset(
+     #     inputs["dataset"], inputs["subset"], split=inputs["train"], streaming=True)
+     # print(selected_dataset)
+     # print(next(iter(selected_dataset)))
+     return inputs
+
+
+ def show_code_component(inputs: Dict[str, str]) -> Dict[str, str]:
+     # Generate code and notebook based on the .py.jinja files in the template dir.
+     env = Environment(
+         loader=FileSystemLoader(inputs['template_dir']), trim_blocks=True, lstrip_blocks=True,
+     )
+
+     template = env.get_template(f'task_templates/{inputs["nlp_task"]}.py.jinja')
+     code = template.render(header=utils.code_header, notebook=False, **inputs)
+     notebook_code = template.render(
+         header=utils.notebook_header, notebook=True, **inputs)
+
+     notebook = utils.to_notebook(notebook_code)
+
+     st.write(f'## Code view: {inputs["api"]}')
+     st.write("")  # add vertical space
+     col1, col2 = st.beta_columns(2)
+     with col1:
+         utils.download_button(code, "generated-code.py", "🐍 Download (.py)")
+     with col2:
+         utils.download_button(
+             notebook, "generated-notebook.ipynb", "📓 Download (.ipynb)")
+     colab_error = st.empty()
+     # Display code.
+     st.code(code)
+     return inputs
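
For orientation, every component above reads and writes a single flat `inputs` dict that the Jinja templates later consume. A sketch of what it might hold after one full pass through the sidebar (the keys are the ones set above; the values are illustrative, not taken from the commit):

    inputs = {
        'api': 'Accelerate', 'template_dir': 'templates/Accelerate',
        'task': 'MaskedLM', 'nlp_task': 'fill-mask',
        'model_checkpoint': 'bert-base-uncased', 'pretrained': True,
        'dataset': 'imdb', 'subset': 'plain_text',
        'train': 'train', 'validation': 'test', 'feature': 'text',
        'block_size': 128, 'mlm_probability': 0.15, 'whole_word_masking': False,
        'seed': 4, 'optimizer': 'AdamW', 'lr': 0.0001,
        'use_weight_decay': False, 'gradient_accumulation_steps': 8,
        'lr_scheduler_type': 'linear', 'num_warmup_steps': 0,
        'batch_size': 32, 'num_epochs': 3,
    }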
app/configuration.py ADDED
@@ -0,0 +1,17 @@
+ INCLUDED_USERS = ['google', 'EleutherAI',
+                   "Helsinki-NLP", "bigscience", "facebook", "openai", "microsoft"]
+
+ # TODO: create a template for text2text-generation
+ # TASKS_TO_PIPELINE_TAG = {
+ #     "CausalLM": ['text-generation'], "MaskedLM": ["fill-mask"], "Seq2SeqLM": ['text2text-generation', 'translation']}
+ TASKS_TO_PIPELINE_TAG = {
+     "CausalLM": ['text-generation'], "MaskedLM": ["fill-mask"], "Seq2SeqLM": ['translation']}
+
+
+ TASKS = list(TASKS_TO_PIPELINE_TAG.keys())
+
+ OPTIMIZERS_ACCELERATE = {
+     "AdamW": 0.0001, "Adadelta": 1.0, "Adagrad": 0.01, "Adam": 0.001, "SparseAdam": 0.001, "Adamax": 0.002, "ASGD": 0.01, "LBFGS": 1.0, "NAdam": 0.002, "RAdam": 0.001, "RMSprop": 0.01, "Rprop": 0.01, "SGD": 0.01
+ }
+
+ OPTIMIZERS_TRAINER = {'adamw_hf': 0.0001, 'adamw_torch': 0.0001, 'adamw_apex_fused': 0.0001, 'adafactor': 0.0001}
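
The values in these two maps are the per-optimizer default learning rates that `show_training_component` pre-fills in the learning-rate field. For example (illustrative interactive session):

    >>> OPTIMIZERS_ACCELERATE['SGD']
    0.01
    >>> OPTIMIZERS_TRAINER['adafactor']
    0.0001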
app/main.py ADDED
@@ -0,0 +1,36 @@
+ import streamlit as st
+
+ from components import (show_API_component, show_code_component,
+                         show_dataset_view_component, show_input_data_component,
+                         show_model_component, show_preprocessing_component,
+                         show_task_component, show_training_component)
+
+ st.set_page_config(
+     page_title="Training Code Generator for Hugging Face Models", layout="wide"
+ )
+
+ st.markdown("<br>", unsafe_allow_html=True)
+
+ """
+ # Training Code Generator for Hugging Face Models 🤗
+ """
+ st.markdown("<br>", unsafe_allow_html=True)
+ """
+ ---
+ """
+
+ inputs = {}
+
+ with st.sidebar:
+     st.info(
+         "**Select the configuration**"
+     )
+     inputs = show_API_component(inputs)
+     inputs = show_task_component(inputs)
+     inputs = show_model_component(inputs)
+     inputs = show_input_data_component(inputs)
+     inputs = show_preprocessing_component(inputs)
+     inputs = show_training_component(inputs)
+
+ inputs = show_dataset_view_component(inputs)
+ inputs = show_code_component(inputs)
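
Assuming the layout above, the app should start from the repository root with (a sketch; Streamlit puts the script's directory on `sys.path`, which is why `main.py` can use the flat `from components import ...` imports):

    pip install -r requirements.txt
    streamlit run app/main.py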
app/utils.py ADDED
@@ -0,0 +1,216 @@
+ import base64
+ import importlib.util
+ import math
+ import re
+ import uuid
+ from types import ModuleType
+ from typing import Dict
+
+ import datasets
+ import jupytext
+ import requests
+ import streamlit as st
+ from datasets import DatasetInfo, get_dataset_infos
+ from datasets.info import DatasetInfosDict
+
+ from configuration import INCLUDED_USERS, TASKS_TO_PIPELINE_TAG
+
+
+ def import_from_file(module_name: str, filepath: str) -> ModuleType:
+     """
+     Imports a module from file.
+     Args:
+         module_name (str): Assigned to the module's __name__ parameter (does not
+             influence how the module is named outside of this function)
+         filepath (str): Path to the .py file
+     Returns:
+         The module
+     """
+     spec = importlib.util.spec_from_file_location(module_name, filepath)
+     module = importlib.util.module_from_spec(spec)
+     spec.loader.exec_module(module)
+     return module
+
+
+ def notebook_header(text: str):
+     """
+     Insert section header into a jinja file, formatted as a notebook cell.
+
+     Leave 2 blank lines before the header.
+     """
+     return f"""# # {text}
+ """
+
+
+ def code_header(text: str):
+     """
+     Insert section header into a jinja file, formatted as a Python comment.
+
+     Leave 2 blank lines before the header.
+     """
+     separator_len = (75 - len(text)) / 2
+     separator_len_left = math.floor(separator_len)
+     separator_len_right = math.ceil(separator_len)
+     return f"# {'-' * separator_len_left} {text} {'-' * separator_len_right}"
+
+
+ def to_notebook(code: str) -> str:
+     """Converts Python code to Jupyter notebook format."""
+     notebook = jupytext.reads(code, fmt="py")
+     return jupytext.writes(notebook, fmt="ipynb")
+
+
+ def download_button(
+     object_to_download: str, download_filename: str, button_text: str
+ ):
+     """
+     Generates a button to download the given object_to_download.
+
+     From: https://discuss.streamlit.io/t/a-download-button-with-custom-css/4220
+     Params:
+     ------
+     object_to_download (str): The object to be downloaded.
+     download_filename (str): filename and extension of file. e.g. mydata.csv,
+         some_txt_output.txt
+     button_text (str): Text to display on the download button (e.g. 'click here to download file')
+     Examples:
+     --------
+     download_button(your_csv_str, 'YOUR_DF.csv', 'Click to download data!')
+     download_button(your_str, 'YOUR_STRING.txt', 'Click to download text!')
+     """
+
+     # some strings <-> bytes conversions necessary here
+     b64 = base64.b64encode(object_to_download.encode()).decode()
+
+     button_uuid = str(uuid.uuid4()).replace("-", "")
+     button_id = re.sub(r"\d+", "", button_uuid)
+
+     custom_css = f"""
+         <style>
+             #{button_id} {{
+                 display: inline-flex;
+                 align-items: center;
+                 justify-content: center;
+                 background-color: rgb(255, 255, 255);
+                 color: rgb(38, 39, 48);
+                 padding: .25rem .75rem;
+                 position: relative;
+                 text-decoration: none;
+                 border-radius: 4px;
+                 border-width: 1px;
+                 border-style: solid;
+                 border-color: rgb(230, 234, 241);
+                 border-image: initial;
+             }}
+             #{button_id}:hover {{
+                 border-color: rgb(246, 51, 102);
+                 color: rgb(246, 51, 102);
+             }}
+             #{button_id}:active {{
+                 box-shadow: none;
+                 background-color: rgb(246, 51, 102);
+                 color: white;
+             }}
+         </style> """
+
+     dl_link = (
+         custom_css
+         + f'<a download="{download_filename}" id="{button_id}" href="data:file/txt;base64,{b64}">{button_text}</a><br><br>'
+     )
+
+     st.markdown(dl_link, unsafe_allow_html=True)
+
+
+ @st.cache
+ def get_model_to_model_id() -> Dict[str, Dict[str, str]]:
+     requests.get("https://huggingface.co")
+     response = requests.get("https://huggingface.co/api/models")
+     tags = response.json()
+     model_to_model_id = {}
+     model_to_pipeline_tag = {}
+
+     for model in tags:
+         model_name = model['modelId']
+         is_community_model = "/" in model_name
+         if is_community_model:
+             user = model_name.split("/")[0]
+             if user not in INCLUDED_USERS:
+                 continue
+
+         # TODO: right now, models without a defined pipeline tag are skipped
+         if "pipeline_tag" in model:
+             model_to_model_id[model['id']] = model['modelId']
+             model_to_pipeline_tag[model['id']] = model["pipeline_tag"]
+     return {"model_to_model_id": model_to_model_id, "model_to_pipeline_tag": model_to_pipeline_tag}
+
+
+ @st.cache
+ def get_datasets() -> Dict[str, list]:
+     english_datasets = {}
+     response = requests.get(
+         "https://huggingface.co/api/datasets?full=true&languages=en")
+     tags = response.json()
+     for dataset in tags:
+         dataset_name = dataset["id"]
+
+         is_community_dataset = "/" in dataset_name
+         if is_community_dataset:
+             # user = dataset_name.split("/")[0]
+             # if user in INCLUDED_USERS:
+             #     english_datasets.append(dataset_name)
+             continue
+
+         if "cardData" not in dataset:
+             continue
+         metadata = dataset["cardData"]
+
+         if "languages" not in metadata:
+             continue
+
+         if "task_categories" not in metadata:
+             continue
+
+         task_is_valid = False
+         for task_category in metadata["task_categories"]:
+             if any(task_category in task for task in list(TASKS_TO_PIPELINE_TAG.values())):
+                 task_is_valid = True
+         if not task_is_valid:
+             continue
+
+         languages = metadata["languages"]
+
+         if "en" in languages or "en-US" in languages:
+             english_datasets[dataset_name] = metadata["task_categories"]
+     return english_datasets
+
+
+ @st.cache
+ def get_dataset_infos_dict(dataset: str, subset: str) -> DatasetInfo:
+     return DatasetInfosDict(get_dataset_infos(dataset))[subset]
+
+ # https://github.com/huggingface/datasets-viewer/blob/master/run.py#L49
+
+
+ def render_features(features):
+     # TODO: render the translation object with its language tags
+     if isinstance(features, dict):
+         return {k: render_features(v) for k, v in features.items()}
+     if isinstance(features, datasets.features.ClassLabel):
+         return features.names
+
+     if isinstance(features, datasets.features.Value):
+         return features.dtype
+
+     if isinstance(features, datasets.features.Sequence):
+         return {"[]": render_features(features.feature)}
+     return features
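
As a quick illustration of the header helper used by the templates, `code_header` centers the section title in a 79-character comment rule (doctest-style, computed from the definition above):

    >>> code_header("Setup")
    '# ----------------------------------- Setup -----------------------------------'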
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ streamlit
+ datasets
+ jupytext
+ Jinja2
templates/Accelerate/task_templates/fill-mask.py.jinja ADDED
@@ -0,0 +1,271 @@
+ # Before running, install required packages:
+ {% if notebook %}
+
+ !
+ {%- else %}
+ #
+ {%- endif %}
+ pip install datasets transformers[sentencepiece] accelerate codecarbon
+
+ import collections
+ import logging
+ import math
+
+ import datasets
+ import numpy as np
+ import torch
+ import transformers
+ from accelerate import Accelerator
+ from accelerate.logging import get_logger
+ from accelerate.utils import set_seed
+ from codecarbon import EmissionsTracker
+ from datasets import load_dataset
+ from torch.optim import {{ optimizer }}
+ from torch.utils.data import DataLoader
+ from tqdm.auto import tqdm
+ from transformers import (AutoConfig, AutoModelForMaskedLM, AutoTokenizer,
+                           DataCollatorForLanguageModeling, Trainer,
+                           TrainingArguments, default_data_collator,
+                           get_scheduler)
+ from transformers.utils.versions import require_version
+
+ {{ header("Setup") }}
+
+ tracker = EmissionsTracker(log_level='error')
+ tracker.start()
+
+ logger = get_logger(__name__)
+ require_version("datasets>=1.8.0")
+
+ accelerator = Accelerator()
+ set_seed({{ seed }})
+
+ logging.basicConfig(
+     format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+     datefmt="%m/%d/%Y %H:%M:%S",
+     level=logging.ERROR,
+ )
+ logger.info(accelerator.state, main_process_only=False)
+ if accelerator.is_local_main_process:
+     datasets.utils.logging.set_verbosity_warning()
+     transformers.utils.logging.set_verbosity_info()
+ else:
+     datasets.utils.logging.set_verbosity_error()
+     transformers.utils.logging.set_verbosity_error()
+
+ {{ header("Load model and dataset") }}
+
+ {% if subset == 'default' %}
+ datasets = load_dataset('{{dataset}}')
+ {% else %}
+ datasets = load_dataset('{{dataset}}', '{{ subset }}')
+ {% endif %}
+ model_checkpoint = "{{model_checkpoint}}"
+ tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
+ {% if pretrained %}
+ model = AutoModelFor{{task}}.from_pretrained(model_checkpoint)
+ {% else %}
+ config = AutoConfig.from_pretrained(model_checkpoint)
+ model = AutoModelFor{{task}}.from_config(config)
+ {% endif %}
+ model.resize_token_embeddings(len(tokenizer))
+ model_name = model_checkpoint.split("/")[-1]
+ if tokenizer.pad_token is None:
+     tokenizer.pad_token = tokenizer.eos_token
+
+ {{ header("Preprocessing") }}
+
+ def tokenize_function(examples):
+     result = tokenizer(examples["{{ feature }}"])
+ {% if task=="MaskedLM" %}
+ {% if whole_word_masking %}
+     if tokenizer.is_fast:
+         result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
+ {% endif %}
+ {% endif %}
+     return result
+
+ with accelerator.main_process_first():
+     tokenized_datasets = datasets.map(
+         tokenize_function, batched=True, num_proc=4,
+         remove_columns=list(set(sum(list(datasets.column_names.values()), []))),
+         desc="Running tokenizer on dataset",
+     )
+
+ block_size = {{ block_size }}
+
+ def group_texts(examples):
+     # Concatenate all texts.
+     concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+     total_length = len(concatenated_examples[list(examples.keys())[0]])
+     # We drop the small remainder; we could add padding instead if the model
+     # supported it. You can customize this part to your needs.
+     total_length = (total_length // block_size) * block_size
+     # Split by chunks of max_len.
+     result = {
+         k: [t[i: i + block_size] for i in range(0, total_length, block_size)]
+         for k, t in concatenated_examples.items()
+     }
+     result["labels"] = result["input_ids"].copy()
+     return result
+
+ with accelerator.main_process_first():
+     lm_datasets = tokenized_datasets.map(
+         group_texts,
+         batched=True,
+         batch_size=1000,
+         num_proc=4,
+         desc=f"Grouping texts in chunks of {block_size}",
+     )
+
+ {% if whole_word_masking %}
+ def whole_word_masking_data_collator(features):
+     for feature in features:
+         word_ids = feature.pop("word_ids")
+
+         # Create a map between words and corresponding token indices
+         mapping = collections.defaultdict(list)
+         current_word_index = -1
+         current_word = None
+         for idx, word_id in enumerate(word_ids):
+             if word_id is not None:
+                 if word_id != current_word:
+                     current_word = word_id
+                     current_word_index += 1
+                 mapping[current_word_index].append(idx)
+
+         # Randomly mask words
+         wwm_probability = {{ mlm_probability }}
+         mask = np.random.binomial(1, wwm_probability, (len(mapping),))
+         input_ids = feature["input_ids"]
+         labels = feature["labels"]
+         new_labels = [-100] * len(labels)
+         for word_id in np.where(mask)[0]:
+             word_id = word_id.item()
+             for idx in mapping[word_id]:
+                 new_labels[idx] = labels[idx]
+                 input_ids[idx] = tokenizer.mask_token_id
+         # Only masked tokens keep a label; all other positions are ignored (-100).
+         feature["labels"] = new_labels
+
+     return default_data_collator(features)
+
+ data_collator = whole_word_masking_data_collator
+ {% else %}
+ data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability={{ mlm_probability }})
+ {% endif %}
+
+ def insert_random_mask(batch):
+     features = [dict(zip(batch, t)) for t in zip(*batch.values())]
+     masked_inputs = data_collator(features)
+     # Create a new "masked" column for each column in the dataset
+     return {"masked_" + k: v.numpy() for k, v in masked_inputs.items()}
+
+ {% if whole_word_masking %}
+ # Keep the "word_ids" column here: the whole-word-masking collator pops it from
+ # each feature at batching time.
+ {% endif %}
+ with accelerator.main_process_first():
+     eval_dataset = lm_datasets["{{ validation }}"].map(
+         insert_random_mask,
+         batched=True,
+         remove_columns=lm_datasets["{{ validation }}"].column_names,
+         desc="Inserting a random mask on eval dataset",
+     )
+
+ eval_dataset = eval_dataset.rename_columns(
+     {
+         name: name.split('masked_')[1] for name in eval_dataset.features.keys()
+     }
+ )
+
+ batch_size = {{ batch_size }}
+ train_dataloader = DataLoader(
+     lm_datasets["{{ train }}"],
+     shuffle=True,
+     batch_size=batch_size,
+     collate_fn=data_collator,
+ )
+ eval_dataloader = DataLoader(
+     eval_dataset, batch_size=batch_size, collate_fn=default_data_collator
+ )
+
+ {{ header("Training") }}
+
+ {% if use_weight_decay %}
+ weight_decay = {{ weight_decay }}
+ def get_grouped_params(model, no_decay=["bias", "LayerNorm.weight"]):
+     params_with_wd, params_without_wd = [], []
+     for n, p in model.named_parameters():
+         if any(nd in n for nd in no_decay):
+             params_without_wd.append(p)
+         else:
+             params_with_wd.append(p)
+     return [
+         {"params": params_with_wd, "weight_decay": weight_decay},
+         {"params": params_without_wd, "weight_decay": 0.0},
+     ]
+
+ optimizer = {{ optimizer }}(get_grouped_params(model), lr={{ lr }})
+ {% else %}
+ optimizer = {{ optimizer }}(model.parameters(), lr={{ lr }})
+ {% endif %}
+
+ model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
+     model, optimizer, train_dataloader, eval_dataloader
+ )
+
+ num_train_epochs = {{ num_epochs }}
+ gradient_accumulation_steps = {{ gradient_accumulation_steps }}
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / gradient_accumulation_steps)
+ max_train_steps = num_train_epochs * num_update_steps_per_epoch
+ output_dir = f"{model_name}-finetuned"
+
+ lr_scheduler = get_scheduler(
+     '{{ lr_scheduler_type }}',
+     optimizer=optimizer,
+     num_warmup_steps={{ num_warmup_steps }},
+     num_training_steps=max_train_steps,
+ )
+
+ progress_bar = tqdm(range(max_train_steps), disable=not accelerator.is_local_main_process)
+ for epoch in range(num_train_epochs):
+     # Training
+     model.train()
+     for step, batch in enumerate(train_dataloader):
+         outputs = model(**batch)
+         loss = outputs.loss / gradient_accumulation_steps
+         accelerator.backward(loss)
+
+         if step % gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
+             # TODO: let the user decide on the clip-grad-norm value
+             accelerator.clip_grad_norm_(model.parameters(), 1.0)
+             optimizer.step()
+             lr_scheduler.step()
+             optimizer.zero_grad()
+             progress_bar.update(1)
+
+     # Evaluation
+     model.eval()
+     losses = []
+     for step, batch in enumerate(eval_dataloader):
+         with torch.no_grad():
+             outputs = model(**batch)
+
+         loss = outputs.loss
+         losses.append(accelerator.gather(loss.repeat(batch_size)))
+
+     losses = torch.cat(losses)
+     losses = losses[: len(eval_dataset)]
+     try:
+         eval_loss = torch.mean(losses)
+         perplexity = math.exp(eval_loss)
+     except OverflowError:
+         perplexity = float("inf")
+     accelerator.print({"loss/eval": eval_loss, "perplexity": perplexity})
+     model.train()
+
+ accelerator.wait_for_everyone()
+ unwrapped_model = accelerator.unwrap_model(model)
+ unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
+ if accelerator.is_main_process:
+     tokenizer.save_pretrained(output_dir)
+
+ emissions = tracker.stop()
+ accelerator.print(f'Emissions: {emissions} kg')
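
A note on the evaluation block above: it reports perplexity as the exponential of the mean eval loss, perplexity = exp(eval_loss), with per-batch losses gathered across processes and truncated to the eval set size. A tiny numeric illustration (made-up loss value):

    >>> import math
    >>> round(math.exp(2.3), 3)  # an eval loss of 2.3 corresponds to perplexity ~10
    9.974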
templates/Accelerate/task_templates/text-generation.py.jinja ADDED
@@ -0,0 +1,207 @@
+ # Before running, install required packages:
+ {% if notebook %}
+
+ !
+ {%- else %}
+ #
+ {%- endif %}
+ pip install datasets transformers[sentencepiece] accelerate sacrebleu==1.4.14 codecarbon sacremoses
+
+ import collections
+ import logging
+ import math
+ import random
+
+ import datasets
+ import numpy as np
+ import torch
+ import transformers
+ from accelerate import Accelerator
+ from accelerate.logging import get_logger
+ from accelerate.utils import set_seed
+ from codecarbon import EmissionsTracker
+ from datasets import load_dataset
+ from torch.optim import {{ optimizer }}
+ from torch.utils.data import DataLoader
+ from tqdm.auto import tqdm
+ from transformers import (AutoConfig, AutoModelForCausalLM, AutoModelForMaskedLM, AutoTokenizer,
+                           DataCollatorForLanguageModeling, Trainer,
+                           TrainingArguments, default_data_collator,
+                           get_scheduler)
+ from transformers.utils.versions import require_version
+
+ {{ header("Setup") }}
+
+ tracker = EmissionsTracker(log_level='error')
+ tracker.start()
+
+ logger = get_logger(__name__)
+ require_version("datasets>=1.8.0")
+
+ accelerator = Accelerator()
+ set_seed({{ seed }})
+
+ logging.basicConfig(
+     format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+     datefmt="%m/%d/%Y %H:%M:%S",
+     level=logging.ERROR,
+ )
+ logger.info(accelerator.state, main_process_only=False)
+ if accelerator.is_local_main_process:
+     datasets.utils.logging.set_verbosity_warning()
+     transformers.utils.logging.set_verbosity_info()
+ else:
+     datasets.utils.logging.set_verbosity_error()
+     transformers.utils.logging.set_verbosity_error()
+
+ {{ header("Load model and dataset") }}
+
+ {% if subset == 'default' %}
+ datasets = load_dataset('{{dataset}}')
+ {% else %}
+ datasets = load_dataset('{{dataset}}', '{{ subset }}')
+ {% endif %}
+ model_checkpoint = "{{model_checkpoint}}"
+ tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
+ {% if pretrained %}
+ model = AutoModelFor{{task}}.from_pretrained(model_checkpoint)
+ {% else %}
+ config = AutoConfig.from_pretrained(model_checkpoint)
+ model = AutoModelFor{{task}}.from_config(config)
+ {% endif %}
+ model.resize_token_embeddings(len(tokenizer))
+ model_name = model_checkpoint.split("/")[-1]
+
+ if tokenizer.pad_token is None:
+     tokenizer.pad_token = tokenizer.eos_token
+
+ {{ header("Preprocessing") }}
+
+ def tokenize_function(examples):
+     result = tokenizer(examples["{{ feature }}"])
+ {% if task=="MaskedLM" %}
+ {% if whole_word_masking %}
+     if tokenizer.is_fast:
+         result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
+ {% endif %}
+ {% endif %}
+     return result
+
+ with accelerator.main_process_first():
+     tokenized_datasets = datasets.map(
+         tokenize_function, batched=True, num_proc=4,
+         remove_columns=list(set(sum(list(datasets.column_names.values()), []))),
+         desc="Running tokenizer on dataset",
+     )
+
+ block_size = {{ block_size }}
+
+ def group_texts(examples):
+     # Concatenate all texts.
+     concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+     total_length = len(concatenated_examples[list(examples.keys())[0]])
+     # We drop the small remainder; we could add padding instead if the model
+     # supported it. You can customize this part to your needs.
+     total_length = (total_length // block_size) * block_size
+     # Split by chunks of max_len.
+     result = {
+         k: [t[i: i + block_size] for i in range(0, total_length, block_size)]
+         for k, t in concatenated_examples.items()
+     }
+     result["labels"] = result["input_ids"].copy()
+     return result
+
+ with accelerator.main_process_first():
+     lm_datasets = tokenized_datasets.map(
+         group_texts,
+         batched=True,
+         batch_size=1000,
+         num_proc=4,
+         desc=f"Grouping texts in chunks of {block_size}",
+     )
+
+ data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
+ batch_size = {{ batch_size }}
+ train_dataloader = DataLoader(lm_datasets["{{ train }}"], batch_size=batch_size, shuffle=True, collate_fn=data_collator)
+ eval_dataloader = DataLoader(lm_datasets["{{ validation }}"], batch_size=batch_size, collate_fn=data_collator)
+
+ {{ header("Training") }}
+
+ {% if use_weight_decay %}
+ weight_decay = {{ weight_decay }}
+ def get_grouped_params(model, no_decay=["bias", "LayerNorm.weight"]):
+     params_with_wd, params_without_wd = [], []
+     for n, p in model.named_parameters():
+         if any(nd in n for nd in no_decay):
+             params_without_wd.append(p)
+         else:
+             params_with_wd.append(p)
+     return [
+         {"params": params_with_wd, "weight_decay": weight_decay},
+         {"params": params_without_wd, "weight_decay": 0.0},
+     ]
+
+ optimizer = {{ optimizer }}(get_grouped_params(model), lr={{ lr }})
+ {% else %}
+ optimizer = {{ optimizer }}(model.parameters(), lr={{ lr }})
+ {% endif %}
+
+ model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
+     model, optimizer, train_dataloader, eval_dataloader
+ )
+
+ num_train_epochs = {{ num_epochs }}
+ gradient_accumulation_steps = {{ gradient_accumulation_steps }}
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / gradient_accumulation_steps)
+ max_train_steps = num_train_epochs * num_update_steps_per_epoch
+ output_dir = f"{model_name}-finetuned"
+
+ lr_scheduler = get_scheduler(
+     '{{ lr_scheduler_type }}',
+     optimizer=optimizer,
+     num_warmup_steps={{ num_warmup_steps }},
+     num_training_steps=max_train_steps,
+ )
+
+ progress_bar = tqdm(range(max_train_steps), disable=not accelerator.is_local_main_process)
+ for epoch in range(num_train_epochs):
+     # Training
+     model.train()
+     for step, batch in enumerate(train_dataloader):
+         outputs = model(**batch)
+         loss = outputs.loss / gradient_accumulation_steps
+         accelerator.backward(loss)
+         if step % gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
+             # TODO: let the user decide on the clip-grad-norm value
+             accelerator.clip_grad_norm_(model.parameters(), 1.0)
+             optimizer.step()
+             lr_scheduler.step()
+             optimizer.zero_grad()
+             progress_bar.update(1)
+
+     # Evaluation
+     model.eval()
+     losses = []
+     for step, batch in enumerate(eval_dataloader):
+         with torch.no_grad():
+             outputs = model(**batch)
+
+         loss = outputs.loss
+         losses.append(accelerator.gather(loss.repeat(batch_size)))
+
+     losses = torch.cat(losses)
+     losses = losses[: len(eval_dataloader.dataset)]
+     try:
+         eval_loss = torch.mean(losses)
+         perplexity = math.exp(eval_loss)
+     except OverflowError:
+         perplexity = float("inf")
+
+     accelerator.print(f"epoch {epoch}: perplexity: {perplexity} eval_loss: {eval_loss}")
+     model.train()
+
+ accelerator.wait_for_everyone()
+ unwrapped_model = accelerator.unwrap_model(model)
+ unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
+ if accelerator.is_main_process:
+     tokenizer.save_pretrained(output_dir)
+
+ emissions = tracker.stop()
+ accelerator.print(f'Emissions: {emissions} kg')
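
One detail worth calling out in this training loop: with gradient accumulation, the optimizer only steps every `gradient_accumulation_steps` batches, so the effective batch size per update is the product of the two. With the app's defaults (illustrative):

    batch_size = 32
    gradient_accumulation_steps = 8
    effective_batch_size = batch_size * gradient_accumulation_steps  # 256 sequences per optimizer step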
templates/Accelerate/task_templates/translation.py.jinja ADDED
@@ -0,0 +1,287 @@
+ # Before running, install required packages:
+ {% if notebook %}
+
+ !
+ {%- else %}
+ #
+ {%- endif %}
+ pip install datasets transformers[sentencepiece] accelerate sacrebleu==1.4.14 codecarbon sacremoses
+
+ import collections
+ import logging
+ import math
+ import random
+
+ import babel
+ import datasets
+ import numpy as np
+ import torch
+ import transformers
+ from accelerate import Accelerator
+ from accelerate.logging import get_logger
+ from accelerate.utils import set_seed
+ from codecarbon import EmissionsTracker
+ from datasets import load_dataset, load_metric
+ from torch.optim import {{ optimizer }}
+ from torch.utils.data import DataLoader
+ from tqdm.auto import tqdm
+ from transformers import (AutoConfig, AutoModelForSeq2SeqLM, AutoTokenizer,
+                           DataCollatorForLanguageModeling,
+                           DataCollatorForSeq2Seq, MBartTokenizer,
+                           MBartTokenizerFast, Trainer, TrainingArguments,
+                           default_data_collator, get_scheduler)
+ from transformers.utils.versions import require_version
+
+ {{ header("Setup") }}
+
+ tracker = EmissionsTracker(log_level='error')
+ tracker.start()
+
+ logger = get_logger(__name__)
+ require_version("datasets>=1.8.0")
+
+ accelerator = Accelerator()
+ set_seed({{ seed }})
+
+ logging.basicConfig(
+     format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+     datefmt="%m/%d/%Y %H:%M:%S",
+     level=logging.ERROR,
+ )
+ logger.info(accelerator.state, main_process_only=False)
+ if accelerator.is_local_main_process:
+     datasets.utils.logging.set_verbosity_warning()
+     transformers.utils.logging.set_verbosity_info()
+ else:
+     datasets.utils.logging.set_verbosity_error()
+     transformers.utils.logging.set_verbosity_error()
+
+ {{ header("Load model and dataset") }}
+
+ {% if subset == 'default' %}
+ datasets = load_dataset('{{dataset}}')
+ {% else %}
+ datasets = load_dataset('{{dataset}}', '{{ subset }}')
+ {% endif %}
+ metric = load_metric("sacrebleu")
+ model_checkpoint = "{{model_checkpoint}}"
+ tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
+ {% if pretrained %}
+ model = AutoModelFor{{task}}.from_pretrained(model_checkpoint)
+ {% else %}
+ config = AutoConfig.from_pretrained(model_checkpoint)
+ model = AutoModelFor{{task}}.from_config(config)
+ {% endif %}
+ model.resize_token_embeddings(len(tokenizer))
+ model_name = model_checkpoint.split("/")[-1]
+
+ {{ header("Preprocessing") }}
+
+ source_lang = '{{ source_language }}'
+ target_lang = '{{ target_language }}'
+ {% if 'mbart' in model_checkpoint %}
+
+ # Set decoder_start_token_id
+ if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)):
+     assert (
+         target_lang is not None and source_lang is not None
+     ), "mBart requires --target_lang and --source_lang"
+     if isinstance(tokenizer, MBartTokenizer):
+         model.config.decoder_start_token_id = tokenizer.lang_code_to_id[target_lang]
+     else:
+         model.config.decoder_start_token_id = tokenizer.convert_tokens_to_ids(target_lang)
+
+ {% endif %}
+ {% if 't5' in model_checkpoint %}
+ if model_checkpoint in ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"]:
+     for language in (source_lang, target_lang):
+         if language != language[:2]:
+             logging.warning(
+                 'Extended language code %s not supported. Falling back on %s.',
+                 language, language[:2]
+             )
+     lang_id_to_string = {
+         source_lang: babel.Locale(source_lang[:2]).english_name,
+         target_lang: babel.Locale(target_lang[:2]).english_name,
+     }
+     src_str = 'translate {}'.format(lang_id_to_string[source_lang])
+     tgt_str = ' to {}: '.format(lang_id_to_string[target_lang])
+     prefix = src_str + tgt_str
+ else:
+     prefix = ""
+ {% else %}
+ prefix = ""
+ {% endif %}
+ {% if 'mbart' in model_checkpoint %}
+
+ # For translation we set the codes of our source and target languages (only useful for mBART, the others will
+ # ignore those attributes).
+ if isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)):
+     label = ['ar_AR', 'cs_CZ', 'de_DE', 'en_XX', 'es_XX', 'et_EE', 'fi_FI', 'fr_XX', 'gu_IN', 'hi_IN', 'it_IT', 'ja_XX', 'kk_KZ', 'ko_KR', 'lt_LT', 'lv_LV', 'my_MM', 'ne_NP', 'nl_XX', 'ro_RO', 'ru_RU', 'si_LK', 'tr_TR', 'vi_VN', 'zh_CN']
+     source_code = [item for item in label if item.startswith(source_lang)][0]
+     target_code = [item for item in label if item.startswith(target_lang)][0]
+     if source_lang is not None:
+         tokenizer.src_lang = source_code
+     if target_lang is not None:
+         tokenizer.tgt_lang = target_code
+ {% endif %}
+ max_input_length = {{ block_size }}
+ max_target_length = {{ block_size }}
+
+ def preprocess_function(examples):
+     inputs = [prefix + ex[source_lang] for ex in examples["translation"]]
+     targets = [ex[target_lang] for ex in examples["translation"]]
+     model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
+
+     # Set up the tokenizer for targets
+     with tokenizer.as_target_tokenizer():
+         labels = tokenizer(targets, max_length=max_target_length, truncation=True)
+
+     model_inputs["labels"] = labels["input_ids"]
+     return model_inputs
+
+ with accelerator.main_process_first():
+     tokenized_datasets = datasets.map(
+         preprocess_function, batched=True, num_proc=4,
+         remove_columns=list(set(sum(list(datasets.column_names.values()), []))),
+         desc="Running tokenizer on dataset",
+     )
+
+ data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, pad_to_multiple_of=8 if accelerator.use_fp16 else None)
+ batch_size = {{ batch_size }}
+ train_dataloader = DataLoader(tokenized_datasets["{{ train }}"], batch_size=batch_size, shuffle=True, collate_fn=data_collator)
+ eval_dataloader = DataLoader(tokenized_datasets["{{ validation }}"], batch_size=batch_size, collate_fn=data_collator)
+
+ {{ header("Training") }}
+
+ def compute_metrics(eval_preds):
+     preds, labels = eval_preds
+     # In case the model returns more than the prediction logits
+     if isinstance(preds, tuple):
+         preds = preds[0]
+
+     decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
+
+     # Replace -100s in the labels as we can't decode them
+     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
+     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
+
+     # Some simple post-processing
+     decoded_preds = [pred.strip() for pred in decoded_preds]
+     decoded_labels = [[label.strip()] for label in decoded_labels]
+
+     result = metric.compute(predictions=decoded_preds,
+                             references=decoded_labels)
+     return {"bleu": result["score"]}
+
+
+ def postprocess(predictions, labels):
+     predictions = predictions.cpu().numpy()
+     labels = labels.cpu().numpy()
+
+     decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
+
+     # Replace -100 in the labels as we can't decode them.
+     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
+     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
+
+     # Some simple post-processing
+     decoded_preds = [pred.strip() for pred in decoded_preds]
+     decoded_labels = [[label.strip()] for label in decoded_labels]
+     return decoded_preds, decoded_labels
+
+ {% if use_weight_decay %}
+ weight_decay = {{ weight_decay }}
+ def get_grouped_params(model, no_decay=["bias", "LayerNorm.weight"]):
+     params_with_wd, params_without_wd = [], []
+     for n, p in model.named_parameters():
+         if any(nd in n for nd in no_decay):
+             params_without_wd.append(p)
+         else:
+             params_with_wd.append(p)
+     return [
+         {"params": params_with_wd, "weight_decay": weight_decay},
+         {"params": params_without_wd, "weight_decay": 0.0},
+     ]
+
+ optimizer = {{ optimizer }}(get_grouped_params(model), lr={{ lr }})
+ {% else %}
+ optimizer = {{ optimizer }}(model.parameters(), lr={{ lr }})
+ {% endif %}
+ model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
+     model, optimizer, train_dataloader, eval_dataloader
+ )
+
+ num_train_epochs = {{ num_epochs }}
+ gradient_accumulation_steps = {{ gradient_accumulation_steps }}
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / gradient_accumulation_steps)
+ max_train_steps = num_train_epochs * num_update_steps_per_epoch
+ output_dir = f"{model_name}-finetuned"
+
+ lr_scheduler = get_scheduler(
+     '{{ lr_scheduler_type }}',
+     optimizer=optimizer,
+     num_warmup_steps={{ num_warmup_steps }},
+     num_training_steps=max_train_steps,
+ )
+
+ progress_bar = tqdm(range(max_train_steps), disable=not accelerator.is_local_main_process)
+ for epoch in range(num_train_epochs):
+     # Training
+     model.train()
+     for step, batch in enumerate(train_dataloader):
+         outputs = model(**batch)
+         loss = outputs.loss
+         loss = loss / gradient_accumulation_steps
+         accelerator.backward(loss)
+
+         if step % gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
+             optimizer.step()
+             lr_scheduler.step()
+             optimizer.zero_grad()
+             progress_bar.update(1)
+
+     # Evaluation
+     model.eval()
+     samples_seen = 0
+     for step, batch in enumerate(eval_dataloader):
+         with torch.no_grad():
+             generated_tokens = accelerator.unwrap_model(model).generate(
+                 batch["input_ids"],
+                 attention_mask=batch["attention_mask"],
+                 max_length=128,
+             )
+         labels = batch["labels"]
+
+         # Necessary to pad predictions and labels for being gathered
+         generated_tokens = accelerator.pad_across_processes(
+             generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
+         )
+         labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)
+
+         predictions_gathered = accelerator.gather(generated_tokens)
+         labels_gathered = accelerator.gather(labels)
+
+         decoded_preds, decoded_labels = postprocess(predictions_gathered, labels_gathered)
+
+         if accelerator.num_processes > 1:
+             if step == len(eval_dataloader) - 1:
+                 decoded_preds = decoded_preds[: len(
+                     eval_dataloader.dataset) - samples_seen]
+                 decoded_labels = decoded_labels[: len(
+                     eval_dataloader.dataset) - samples_seen]
+             else:
+                 samples_seen += len(decoded_labels)
+         metric.add_batch(predictions=decoded_preds, references=decoded_labels)
+
+     results = metric.compute()
+     print(f"epoch {epoch}, BLEU score: {results['score']:.2f}")
+
+ # Save and upload
+ accelerator.wait_for_everyone()
+ unwrapped_model = accelerator.unwrap_model(model)
+ unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
+ if accelerator.is_main_process:
+     tokenizer.save_pretrained(output_dir)
+
+ emissions = tracker.stop()
+ print(f'Emissions: {emissions} kg')
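
The templates are normally rendered through `show_code_component`, but they can also be rendered standalone for debugging. A minimal sketch, assuming it is run from the repository root and using made-up sidebar values (everything passed to `render` below is illustrative):

    import sys
    sys.path.insert(0, 'app')  # the app modules use flat imports (configuration, utils)
    from jinja2 import Environment, FileSystemLoader
    from utils import code_header

    env = Environment(loader=FileSystemLoader('templates/Accelerate'),
                      trim_blocks=True, lstrip_blocks=True)
    template = env.get_template('task_templates/translation.py.jinja')
    code = template.render(
        header=code_header, notebook=False,
        dataset='wmt16', subset='ro-en', model_checkpoint='t5-small',
        task='Seq2SeqLM', pretrained=True, feature='translation',
        source_language='en', target_language='ro',
        train='train', validation='validation', block_size=128, seed=4,
        optimizer='AdamW', lr=0.0001, use_weight_decay=False,
        gradient_accumulation_steps=8, lr_scheduler_type='linear',
        num_warmup_steps=0, batch_size=32, num_epochs=3,
    )
    print(code)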
templates/Trainer/task_templates/fill-mask.py.jinja ADDED
@@ -0,0 +1,185 @@
+ # Before running, install required packages:
+ {% if notebook %}
+
+ !
+ {%- else %}
+ #
+ {%- endif %}
+ pip install datasets transformers
+
+ import collections
+ import math
+ import logging
+
+ import numpy as np
+ import transformers
+ import datasets
+ from datasets import load_dataset
+ from transformers import (AutoConfig, AutoModelForMaskedLM, AutoTokenizer,
+                           DataCollatorForLanguageModeling, Trainer,
+                           TrainingArguments, default_data_collator, set_seed)
+ from transformers.testing_utils import CaptureLogger
+ from transformers.utils.versions import require_version
+
+ {{ header("Setup") }}
+
+
+ logger = logging.getLogger(__name__)
+ require_version("datasets>=1.8.0")
+ set_seed({{ seed }})
+ logging.basicConfig(
+     format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+     datefmt="%m/%d/%Y %H:%M:%S",
+     level=logging.ERROR,
+ )
+ datasets.utils.logging.set_verbosity_warning()
+ transformers.utils.logging.set_verbosity_info()
+
+
+ {{ header("Load model and dataset") }}
+
+ {% if subset == 'default' %}
+ datasets = load_dataset('{{dataset}}')
+ {% else %}
+ datasets = load_dataset('{{dataset}}', '{{ subset }}')
+ {% endif %}
+ model_checkpoint = "{{model_checkpoint}}"
+ tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
+ {% if pretrained %}
+ model = AutoModelFor{{task}}.from_pretrained(model_checkpoint)
+ {% else %}
+ config = AutoConfig.from_pretrained(model_checkpoint)
+ model = AutoModelFor{{task}}.from_config(config)
+ {% endif %}
+ model.resize_token_embeddings(len(tokenizer))
+ model_name = model_checkpoint.split("/")[-1]
+
+ if tokenizer.pad_token is None:
+     tokenizer.pad_token = tokenizer.eos_token
+
+ {{ header("Preprocessing") }}
+
+ # This function will be pickled by the Hasher; force logger loading before
+ # tokenize_function to avoid a _LazyModule error.
+ tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")
+
+ def tokenize_function(examples):
+     with CaptureLogger(tok_logger) as cl:
+         result = tokenizer(examples["{{ feature }}"])
+     if "Token indices sequence length is longer than the" in cl.out:
+         tok_logger.warning(
+             "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits"
+             " before being passed to the model."
+         )
+     if tokenizer.is_fast:
+         result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
+     return result
+
+ tokenized_datasets = datasets.map(
+     tokenize_function, batched=True, num_proc=4,
+     remove_columns=list(set(sum(list(datasets.column_names.values()), []))),
+     desc="Running tokenizer on dataset",
+ )
+ block_size = {{ block_size }}
+
+ def group_texts(examples):
+     # Concatenate all texts.
+     concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+     total_length = len(concatenated_examples[list(examples.keys())[0]])
+     # We drop the small remainder; we could add padding instead if the model
+     # supported it. You can customize this part to your needs.
+     total_length = (total_length // block_size) * block_size
+     # Split by chunks of max_len.
+     result = {
+         k: [t[i: i + block_size] for i in range(0, total_length, block_size)]
+         for k, t in concatenated_examples.items()
+     }
+     result["labels"] = result["input_ids"].copy()
+     return result
+
+ lm_datasets = tokenized_datasets.map(
+     group_texts,
+     batched=True,
+     batch_size=1000,
+     num_proc=4,
+     desc=f"Grouping texts in chunks of {block_size}",
+ )
+
+ {{ header("Training") }}
+
+ training_args = TrainingArguments(
+     output_dir=f"{model_name}-finetuned",
+     per_device_train_batch_size={{ batch_size }},
+     per_device_eval_batch_size={{ batch_size }},
+     evaluation_strategy='epoch',
+     logging_strategy='epoch',
+     save_strategy='epoch',
+     optim='{{ optimizer }}',
+     learning_rate={{ lr }},
+     num_train_epochs={{ num_epochs }},
+     gradient_accumulation_steps={{ gradient_accumulation_steps }},
+     lr_scheduler_type='{{ lr_scheduler_type }}',
+     warmup_steps={{ num_warmup_steps }},
+ {% if use_weight_decay %}
+     weight_decay={{ weight_decay }},
+ {% endif %}
+     push_to_hub=False,
+     dataloader_num_workers=0,
+ {% if task=="MaskedLM" %}
+ {% if whole_word_masking %}
+     remove_unused_columns=False,
+ {% endif %}
+ {% endif %}
+     load_best_model_at_end=True,
+     log_level='error'
+ )
+
+
+ {% if whole_word_masking %}
+ def whole_word_masking_data_collator(features):
+     for feature in features:
+         word_ids = feature.pop("word_ids")
+
+         # Create a map between words and corresponding token indices
+         mapping = collections.defaultdict(list)
+         current_word_index = -1
+         current_word = None
+         for idx, word_id in enumerate(word_ids):
+             if word_id is not None:
+                 if word_id != current_word:
+                     current_word = word_id
+                     current_word_index += 1
+                 mapping[current_word_index].append(idx)
+
+         # Randomly mask words
+         wwm_probability = {{ mlm_probability }}
+         mask = np.random.binomial(1, wwm_probability, (len(mapping),))
+         input_ids = feature["input_ids"]
+         labels = feature["labels"]
+         new_labels = [-100] * len(labels)
+         for word_id in np.where(mask)[0]:
+             word_id = word_id.item()
+             for idx in mapping[word_id]:
+                 new_labels[idx] = labels[idx]
+                 input_ids[idx] = tokenizer.mask_token_id
+         # Only masked tokens keep a label; all other positions are ignored (-100).
+         feature["labels"] = new_labels
+
+     return default_data_collator(features)
+
+ data_collator = whole_word_masking_data_collator
+ {% else %}
+ data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability={{ mlm_probability }})
+ {% endif %}
+
+
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=lm_datasets["{{ train }}"],
+     eval_dataset=lm_datasets["{{ validation }}"],
+     data_collator=data_collator,
+ )
+
+ train_result = trainer.train()
+ trainer.save_model()
+ trainer.log_metrics("train", train_result.metrics)
+ trainer.save_metrics("train", train_result.metrics)
+ trainer.save_state()
+ eval_results = trainer.evaluate()
+ eval_results["perplexity"] = math.exp(eval_results['eval_loss'])
+ print(f"Perplexity: {eval_results['perplexity']:.2f}")
+ trainer.log_metrics("eval", eval_results)
+ trainer.save_metrics("eval", eval_results)
templates/Trainer/task_templates/text-generation.py.jinja ADDED
@@ -0,0 +1,152 @@
+ # Before running, install required packages:
+ {% if notebook %}
+ !pip install datasets transformers
+ {% else %}
+ # pip install datasets transformers
+ {% endif %}
+ 
+ import collections
+ import logging
+ import math
+ 
+ import numpy as np
+ import transformers
+ import datasets
+ from datasets import load_dataset
+ from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer,
+                           DataCollatorForLanguageModeling, Trainer,
+                           TrainingArguments, default_data_collator, set_seed)
+ from transformers.testing_utils import CaptureLogger
+ from transformers.utils.versions import require_version
+ 
+ {{ header("Setup") }}
+ 
+ 
+ logger = logging.getLogger(__name__)
+ require_version("datasets>=1.8.0")
+ set_seed({{ seed }})
+ logging.basicConfig(
+     format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+     datefmt="%m/%d/%Y %H:%M:%S",
+     level=logging.ERROR,
+ )
+ datasets.utils.logging.set_verbosity_warning()
+ transformers.utils.logging.set_verbosity_info()
+ 
+ 
+ {{ header("Load model and dataset") }}
+ 
+ {% if subset == 'default' %}
+ raw_datasets = load_dataset('{{ dataset }}')
+ {% else %}
+ raw_datasets = load_dataset('{{ dataset }}', '{{ subset }}')
+ {% endif %}
+ model_checkpoint = "{{ model_checkpoint }}"
+ tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
+ {% if pretrained %}
+ model = AutoModelFor{{ task }}.from_pretrained(model_checkpoint)
+ {% else %}
+ config = AutoConfig.from_pretrained(model_checkpoint)
+ model = AutoModelFor{{ task }}.from_config(config)
+ {% endif %}
+ model.resize_token_embeddings(len(tokenizer))
+ model_name = model_checkpoint.split("/")[-1]
+ 
+ if tokenizer.pad_token is None:
+     tokenizer.pad_token = tokenizer.eos_token
+ 
+ {{ header("Preprocessing") }}
+ 
+ # This logger is pickled by the datasets Hasher, so force-load it before
+ # tokenize_function is defined to avoid a _LazyModule error.
+ tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")
+ 
+ def tokenize_function(examples):
+     with CaptureLogger(tok_logger) as cl:
+         result = tokenizer(examples["{{ feature }}"])
+     if "Token indices sequence length is longer than the" in cl.out:
+         tok_logger.warning(
+             "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits"
+             " before being passed to the model."
+         )
+     if tokenizer.is_fast:
+         result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
+     return result
+ 
+ tokenized_datasets = raw_datasets.map(
+     tokenize_function,
+     batched=True,
+     num_proc=4,
+     remove_columns=list(set(sum(list(raw_datasets.column_names.values()), []))),
+     desc="Running tokenizer on dataset",
+ )
+ block_size = {{ block_size }}
+ 
+ def group_texts(examples):
+     # Concatenate all texts.
+     concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+     total_length = len(concatenated_examples[list(examples.keys())[0]])
+     # Drop the small remainder; padding could be added instead if the model
+     # supported it. Customize this part to your needs.
+     total_length = (total_length // block_size) * block_size
+     # Split into chunks of block_size.
+     result = {
+         k: [t[i: i + block_size] for i in range(0, total_length, block_size)]
+         for k, t in concatenated_examples.items()
+     }
+     result["labels"] = result["input_ids"].copy()
+     return result
+ 
+ lm_datasets = tokenized_datasets.map(
+     group_texts,
+     batched=True,
+     batch_size=1000,
+     num_proc=4,
+     desc=f"Grouping texts in chunks of {block_size}",
+ )
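+ # Worked example (hypothetical numbers): with block_size = 4, rows with
+ # input_ids [1, 2, 3] and [4, 5, 6, 7, 8] are concatenated to [1, ..., 8]
+ # and regrouped into [1, 2, 3, 4] and [5, 6, 7, 8]. "labels" is a copy of
+ # "input_ids"; the model shifts them internally for the causal LM loss.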
+ 
+ {{ header("Training") }}
+ 
+ training_args = TrainingArguments(
+     output_dir=f"{model_name}-finetuned",
+     per_device_train_batch_size={{ batch_size }},
+     per_device_eval_batch_size={{ batch_size }},
+     evaluation_strategy='epoch',
+     logging_strategy='epoch',
+     save_strategy='epoch',
+     optim='{{ optimizer }}',
+     learning_rate={{ lr }},
+     num_train_epochs={{ num_epochs }},
+     gradient_accumulation_steps={{ gradient_accumulation_steps }},
+     lr_scheduler_type='{{ lr_scheduler_type }}',
+     warmup_steps={{ num_warmup_steps }},
+     {% if use_weight_decay %}
+     weight_decay={{ weight_decay }},
+     {% endif %}
+     push_to_hub=False,
+     dataloader_num_workers=0,
+     {% if task == "MaskedLM" %}
+     {% if whole_word_masking %}
+     remove_unused_columns=False,
+     {% endif %}
+     {% endif %}
+     load_best_model_at_end=True,
+     log_level='error'
+ )
+ 
+ # mlm=False gives plain (causal) language-modeling batches without masking.
+ data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
+ 
+ 
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=lm_datasets["{{ train }}"],
+     eval_dataset=lm_datasets["{{ validation }}"],
+     data_collator=data_collator,
+ )
+ 
+ train_result = trainer.train()
+ trainer.save_model()
+ trainer.log_metrics("train", train_result.metrics)
+ trainer.save_metrics("train", train_result.metrics)
+ trainer.save_state()
+ eval_results = trainer.evaluate()
+ eval_results["perplexity"] = math.exp(eval_results['eval_loss'])
+ print(f"Perplexity: {eval_results['perplexity']:.2f}")
+ trainer.log_metrics("eval", eval_results)
+ trainer.save_metrics("eval", eval_results)
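Note: a minimal sketch of how such a template could be rendered with jinja2. All values are hypothetical, and `header` is assumed to be a callable that formats the `{{ header(...) }}` section banners:

    from jinja2 import Environment, FileSystemLoader

    env = Environment(
        loader=FileSystemLoader("templates/Trainer/task_templates"),
        trim_blocks=True,
        lstrip_blocks=True,
    )
    template = env.get_template("text-generation.py.jinja")
    code = template.render(
        notebook=False,
        seed=42,
        dataset="wikitext",             # hypothetical example values
        subset="wikitext-2-raw-v1",
        model_checkpoint="gpt2",
        task="CausalLM",
        pretrained=True,
        feature="text",
        block_size=128,
        batch_size=8,
        optimizer="adamw_hf",
        lr=5e-5,
        num_epochs=3,
        gradient_accumulation_steps=1,
        lr_scheduler_type="linear",
        num_warmup_steps=0,
        use_weight_decay=False,
        train="train",
        validation="validation",
        header=lambda title: f"# ---------- {title} ----------",
    )
    print(code)  # the generated training script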
templates/Trainer/task_templates/translation.py.jinja ADDED
@@ -0,0 +1,218 @@
+ # Before running, install required packages:
+ {% if notebook %}
+ !pip install datasets transformers[sentencepiece] accelerate sacrebleu==1.4.14 sacremoses babel
+ {% else %}
+ # pip install datasets transformers[sentencepiece] accelerate sacrebleu==1.4.14 sacremoses babel
+ {% endif %}
+ 
+ import collections
+ import logging
+ import math
+ import random
+ 
+ import babel
+ import datasets
+ import numpy as np
+ import torch
+ import transformers
+ from datasets import load_dataset, load_metric
+ from torch.utils.data import DataLoader
+ from tqdm.auto import tqdm
+ from transformers import (AutoConfig, AutoModelForSeq2SeqLM, AutoTokenizer,
+                           DataCollatorForLanguageModeling,
+                           DataCollatorForSeq2Seq, MBartTokenizer,
+                           MBartTokenizerFast, Seq2SeqTrainer,
+                           Seq2SeqTrainingArguments, default_data_collator,
+                           get_scheduler, set_seed)
+ from transformers.utils.versions import require_version
+ 
+ {{ header("Setup") }}
+ 
+ 
+ logger = logging.getLogger(__name__)
+ require_version("datasets>=1.8.0")
+ set_seed({{ seed }})
+ logging.basicConfig(
+     format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+     datefmt="%m/%d/%Y %H:%M:%S",
+     level=logging.ERROR,
+ )
+ datasets.utils.logging.set_verbosity_warning()
+ transformers.utils.logging.set_verbosity_info()
+ 
+ {{ header("Load model and dataset") }}
+ 
+ {% if subset == 'default' %}
+ raw_datasets = load_dataset('{{ dataset }}')
+ {% else %}
+ raw_datasets = load_dataset('{{ dataset }}', '{{ subset }}')
+ {% endif %}
+ metric = load_metric("sacrebleu")
+ model_checkpoint = "{{ model_checkpoint }}"
+ tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
+ {% if pretrained %}
+ model = AutoModelFor{{ task }}.from_pretrained(model_checkpoint)
+ {% else %}
+ config = AutoConfig.from_pretrained(model_checkpoint)
+ model = AutoModelFor{{ task }}.from_config(config)
+ {% endif %}
+ model.resize_token_embeddings(len(tokenizer))
+ model_name = model_checkpoint.split("/")[-1]
+ 
+ {{ header("Preprocessing") }}
+ 
+ source_lang = '{{ source_language }}'
+ target_lang = '{{ target_language }}'
+ {% if 'mbart' in model_checkpoint %}
+ 
+ # Set decoder_start_token_id
+ if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)):
+     assert (
+         target_lang is not None and source_lang is not None
+     ), "mBART requires --target_lang and --source_lang"
+     if isinstance(tokenizer, MBartTokenizer):
+         model.config.decoder_start_token_id = tokenizer.lang_code_to_id[target_lang]
+     else:
+         model.config.decoder_start_token_id = tokenizer.convert_tokens_to_ids(target_lang)
+ 
+ {% endif %}
+ {% if 't5' in model_checkpoint %}
+ if model_checkpoint in ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"]:
+     for language in (source_lang, target_lang):
+         if language != language[:2]:
+             logging.warning(
+                 'Extended language code %s not supported. Falling back on %s.',
+                 language, language[:2]
+             )
+     lang_id_to_string = {
+         source_lang: babel.Locale(source_lang[:2]).english_name,
+         target_lang: babel.Locale(target_lang[:2]).english_name,
+     }
+     src_str = 'translate {}'.format(lang_id_to_string[source_lang])
+     tgt_str = ' to {}: '.format(lang_id_to_string[target_lang])
+     prefix = src_str + tgt_str
+ else:
+     prefix = ""
+ {% else %}
+ prefix = ""
+ {% endif %}
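+ {# Example (hypothetical languages): with a T5 checkpoint, source_lang='en'
+    and target_lang='de' yield prefix "translate English to German: ", which
+    is prepended to every source sentence below. #}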
+ {% if 'mbart' in model_checkpoint %}
+ 
+ # For translation we set the codes of our source and target languages
+ # (only useful for mBART; other models ignore these attributes).
+ if isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)):
+     mbart_lang_codes = ['ar_AR', 'cs_CZ', 'de_DE', 'en_XX', 'es_XX', 'et_EE', 'fi_FI', 'fr_XX', 'gu_IN', 'hi_IN', 'it_IT', 'ja_XX', 'kk_KZ', 'ko_KR', 'lt_LT', 'lv_LV', 'my_MM', 'ne_NP', 'nl_XX', 'ro_RO', 'ru_RU', 'si_LK', 'tr_TR', 'vi_VN', 'zh_CN']
+     source_code = [item for item in mbart_lang_codes if item.startswith(source_lang)][0]
+     target_code = [item for item in mbart_lang_codes if item.startswith(target_lang)][0]
+     if source_lang is not None:
+         tokenizer.src_lang = source_code
+     if target_lang is not None:
+         tokenizer.tgt_lang = target_code
+ {% endif %}
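+ {# Example (hypothetical languages): source_lang='en' resolves to 'en_XX'
+    and target_lang='ro' to 'ro_RO' in the language-code list above. #}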
+ max_input_length = {{ block_size }}
+ max_target_length = {{ block_size }}
+ 
+ def preprocess_function(examples):
+     inputs = [prefix + ex[source_lang] for ex in examples["translation"]]
+     targets = [ex[target_lang] for ex in examples["translation"]]
+     model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
+ 
+     # Set up the tokenizer for targets
+     with tokenizer.as_target_tokenizer():
+         labels = tokenizer(targets, max_length=max_target_length, truncation=True)
+ 
+     model_inputs["labels"] = labels["input_ids"]
+     return model_inputs
+ 
+ 
+ tokenized_datasets = raw_datasets.map(
+     preprocess_function,
+     batched=True,
+     num_proc=4,
+     remove_columns=list(set(sum(list(raw_datasets.column_names.values()), []))),
+     desc="Running tokenizer on dataset",
+ )
+ 
+ data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
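+ # DataCollatorForSeq2Seq pads inputs dynamically, pads labels with -100 so
+ # padding is ignored by the loss, and prepares decoder_input_ids from the
+ # labels when the model supports it.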
+ batch_size = {{ batch_size }}
+ 
+ {{ header("Training") }}
+ 
+ def compute_metrics(eval_preds):
+     preds, labels = eval_preds
+     # In case the model returns more than the prediction logits
+     if isinstance(preds, tuple):
+         preds = preds[0]
+ 
+     decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
+ 
+     # Replace -100 in the labels as we can't decode it
+     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
+     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
+ 
+     # Some simple post-processing
+     decoded_preds = [pred.strip() for pred in decoded_preds]
+     decoded_labels = [[label.strip()] for label in decoded_labels]
+ 
+     result = metric.compute(predictions=decoded_preds,
+                             references=decoded_labels)
+     return {"bleu": result["score"]}
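+ # Note: each decoded label is wrapped in a list above because sacrebleu
+ # scores every prediction against a list of possible references.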
+ 
+ 
+ def postprocess(predictions, labels):
+     predictions = predictions.cpu().numpy()
+     labels = labels.cpu().numpy()
+ 
+     decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
+ 
+     # Replace -100 in the labels as we can't decode it.
+     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
+     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
+ 
+     # Some simple post-processing
+     decoded_preds = [pred.strip() for pred in decoded_preds]
+     decoded_labels = [[label.strip()] for label in decoded_labels]
+     return decoded_preds, decoded_labels
+ 
+ 
+ training_args = Seq2SeqTrainingArguments(
+     output_dir=f"{model_name}-finetuned",
+     per_device_train_batch_size={{ batch_size }},
+     per_device_eval_batch_size={{ batch_size }},
+     evaluation_strategy='epoch',
+     logging_strategy='epoch',
+     save_strategy='epoch',
+     optim='{{ optimizer }}',
+     learning_rate={{ lr }},
+     num_train_epochs={{ num_epochs }},
+     gradient_accumulation_steps={{ gradient_accumulation_steps }},
+     lr_scheduler_type='{{ lr_scheduler_type }}',
+     warmup_steps={{ num_warmup_steps }},
+     {% if use_weight_decay %}
+     weight_decay={{ weight_decay }},
+     {% endif %}
+     push_to_hub=False,
+     dataloader_num_workers=0,
+     # Generate during evaluation so compute_metrics receives token ids
+     predict_with_generate=True,
+     load_best_model_at_end=True,
+     log_level='error'
+ )
+ 
+ trainer = Seq2SeqTrainer(
+     model=model,
+     args=training_args,
+     train_dataset=tokenized_datasets["{{ train }}"],
+     eval_dataset=tokenized_datasets["{{ validation }}"],
+     data_collator=data_collator,
+     tokenizer=tokenizer,
+     compute_metrics=compute_metrics,
+ )
+ 
+ train_result = trainer.train()
+ trainer.save_model()
+ trainer.log_metrics("train", train_result.metrics)
+ trainer.save_metrics("train", train_result.metrics)
+ trainer.save_state()
+ eval_results = trainer.evaluate()
+ trainer.log_metrics("eval", eval_results)
+ trainer.save_metrics("eval", eval_results)
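Note: a minimal, self-contained sketch (toy data) of the metric path used in compute_metrics above: replace the -100 label padding, then score detokenized text with sacrebleu, which expects one list of references per prediction:

    import numpy as np
    from datasets import load_metric

    # -100 marks label padding; swap it for a pad id before decoding.
    labels = np.array([[42, 7, -100, -100]])
    pad_token_id = 0  # stand-in for tokenizer.pad_token_id
    print(np.where(labels != -100, labels, pad_token_id))  # [[42  7  0  0]]

    metric = load_metric("sacrebleu")
    result = metric.compute(
        predictions=["the cat sat on the mat"],
        references=[["the cat sat on the mat"]],  # one reference list per prediction
    )
    print(f"BLEU: {result['score']:.1f}")  # 100.0 for an exact match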