Narsil HF staff committed on
Commit
39367a0
·
1 Parent(s): bbb5216

Saving original script used.

Browse files
Files changed (1) hide show
  1. create_dummy_models.py +370 -0
create_dummy_models.py ADDED
@@ -0,0 +1,370 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
4
+
5
+ import copy
6
+ import re
7
+ import importlib
8
+ import os
9
+ import tempfile
10
+ from collections import OrderedDict
11
+ import string
12
+
13
+ import h5py
14
+ import numpy as np
15
+ import torch
16
+ from tqdm import tqdm
17
+
18
+ from transformers import (
19
+ AutoTokenizer,
20
+ CONFIG_MAPPING,
21
+ MODEL_FOR_CAUSAL_LM_MAPPING,
22
+ MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
23
+ MODEL_FOR_MASKED_LM_MAPPING,
24
+ MODEL_FOR_MULTIPLE_CHOICE_MAPPING,
25
+ MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING,
26
+ MODEL_FOR_OBJECT_DETECTION_MAPPING,
27
+ MODEL_FOR_PRETRAINING_MAPPING,
28
+ MODEL_FOR_QUESTION_ANSWERING_MAPPING,
29
+ MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
30
+ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
31
+ MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
32
+ MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
33
+ MODEL_MAPPING,
34
+ MODEL_WITH_LM_HEAD_MAPPING,
35
+ TF_MODEL_FOR_CAUSAL_LM_MAPPING,
36
+ TF_MODEL_FOR_MASKED_LM_MAPPING,
37
+ TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING,
38
+ TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING,
39
+ TF_MODEL_FOR_PRETRAINING_MAPPING,
40
+ TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING,
41
+ TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
42
+ TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
43
+ TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
44
+ TF_MODEL_MAPPING,
45
+ TF_MODEL_WITH_LM_HEAD_MAPPING,
46
+ logging,
47
+ )
48
+
49
# Only let the transformers logger report errors while the script runs.
logging.set_verbosity_error()
# Root directory for the generated weights. `os.getenv("HOME")` is `None` when the variable is
# unset, which used to yield the literal path "None/data/weights"; fall back to the platform's
# home-directory lookup in that case.
HOME = os.getenv("HOME") or os.path.expanduser("~")
weights_path = f"{HOME}/data/weights"
52
+
53
+
54
def to_snake_case(name):
    """
    Convert a CamelCase identifier to lower-case snake_case.

    Recipe from
    https://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-snake-case

    Args:
        name: Identifier to convert, e.g. `"BertForMaskedLM"`.

    Returns:
        The converted identifier, e.g. `"bert_for_masked_lm"`.
    """
    substitutions = (
        # Underscore before a capitalised word ("XxModel" -> "Xx_Model").
        ("(.)([A-Z][a-z]+)", r"\1_\2"),
        # Collapse a doubled underscore that ends up in front of a capital.
        ("__([A-Z])", r"_\1"),
        # Underscore between a lower-case letter or digit and the capital that follows it.
        ("([a-z0-9])([A-Z])", r"\1_\2"),
    )
    converted = name
    for pattern, replacement in substitutions:
        converted = re.sub(pattern, replacement, converted)
    return converted.lower()
60
+
61
+
62
def flattened(somelist):
    """
    Flatten `somelist` by exactly one level.

    Items that are tuples or lists are replaced by their elements; every other item is kept as is.
    Nested containers inside those tuples/lists are not flattened further.

    Args:
        somelist: Iterable whose items may be tuples or lists.

    Returns:
        A new list with one level of nesting removed.
    """
    result = []
    for element in somelist:
        result += element if isinstance(element, (list, tuple)) else [element]
    return result
70
+
71
+
72
+ # UTILITY METHODS
73
def get_tiny_config_from_class(configuration_class):
    """
    Build a tiny configuration for `configuration_class` by borrowing the settings of its `ModelTester`.

    For a class named `<Name>Config`, the tester is looked up as `<Name>ModelTester` in the module
    `tests.test_modeling_<model_type>` (dashes in the model type are replaced by underscores).

    Args:
        configuration_class: Configuration class exposing a `model_type` attribute and named `<Name>Config`.

    Returns:
        The result of the tester's `get_pipeline_config()` when that method exists, otherwise the result of
        `get_config()`. `None` when the test module cannot be imported, when it has no matching tester, or when
        the tester offers neither method.
    """
    model_type = configuration_class.model_type
    tester_name = configuration_class.__name__.split("Config")[0] + "ModelTester"
    module_name = ".test_modeling_" + model_type.replace("-", "_")

    try:
        test_module = importlib.import_module(module_name, package="tests")
        tester_class = getattr(test_module, tester_name, None)
    except ModuleNotFoundError:
        print(f"Will not build {model_type}: no model tester or cannot find the testing module from the model name.")
        return None

    if tester_class is None:
        return None

    tester = tester_class(parent=None)

    # Prefer the pipeline-specific configuration when the tester provides one.
    for getter_name in ("get_pipeline_config", "get_config"):
        if hasattr(tester, getter_name):
            return getattr(tester, getter_name)()
    return None
102
+
103
+
104
def eventual_create_tokenizer(dirname, architecture, config):
    """
    Make sure `dirname` holds a loadable tokenizer, creating a tiny one when none is there yet.

    Nothing is done when a tokenizer already loads from `dirname`, when no documentation checkpoint can be
    found for `architecture`, or when no tiny tokenizer can be derived from that checkpoint.

    Args:
        dirname: Directory the tokenizer files are written to.
        architecture: Model class used to look up the checkpoint the tokenizer is derived from.
        config: Tiny model configuration; supplies `vocab_size` and, when present, `max_position_embeddings`.

    Raises:
        AssertionError: If the tiny tokenizer has more tokens than `config.vocab_size`.
    """
    try:
        AutoTokenizer.from_pretrained(dirname, local_files_only=True)
    except (KeyboardInterrupt, SystemExit):
        # A user abort must stop the script instead of being treated as "no tokenizer yet".
        raise
    except BaseException:
        # Any other failure means there is no usable tokenizer in `dirname` yet.
        # NOTE(review): the original used a bare `except`, presumably because the tokenizer backend can
        # raise errors that do not derive from `Exception` -- confirm before narrowing this further.
        pass
    else:
        return

    checkpoint = get_checkpoint_from_architecture(architecture)
    if checkpoint is None:
        return
    tokenizer = get_tiny_tokenizer_from_checkpoint(checkpoint)
    if tokenizer is None:
        return
    if hasattr(config, "max_position_embeddings"):
        tokenizer.model_max_length = config.max_position_embeddings

    # Explicit raise rather than `assert`, so the check survives `python -O`.
    if tokenizer.vocab_size > config.vocab_size:
        raise AssertionError(
            f"Tokenizer vocabulary ({tokenizer.vocab_size}) is larger than the config vocabulary "
            f"({config.vocab_size})."
        )

    try:
        tokenizer.save_pretrained(dirname)
    except Exception:
        # Best effort: the backend file written below is attempted regardless of this outcome.
        pass
    try:
        tokenizer._tokenizer.save(f"{dirname}/tokenizer.json")
    except Exception:
        return
    # Make sure what was just written can be loaded back.
    AutoTokenizer.from_pretrained(dirname, local_files_only=True)
131
+
132
+
133
def build_pt_architecture(architecture, config):
    """
    Create, save and reload tiny weights for one PyTorch architecture.

    The files go to `<weights_path>/<model_type>/<snake_case architecture name>`. Nothing is rebuilt when the
    architecture already loads from that directory, and `DPRQuestionEncoder` architectures are skipped.

    Args:
        architecture: Model class to instantiate.
        config: Tiny configuration for the model. It is modified in place for the Reformer heads.
    """
    target_dir = os.path.join(weights_path, config.model_type, to_snake_case(architecture.__name__))
    try:
        architecture.from_pretrained(target_dir, local_files_only=True)
        # Already created
        print(f"{target_dir} already created")
        return
    except Exception:
        pass

    architecture_name = architecture.__name__
    if "DPRQuestionEncoder" in architecture_name:
        # Not supported
        return

    # The Reformer heads need an explicit decoder/encoder setting.
    for marker, is_decoder in (("ReformerModelWithLMHead", True), ("ReformerForMaskedLM", False)):
        if marker in architecture_name:
            config.is_decoder = is_decoder

    os.makedirs(target_dir, exist_ok=True)
    config.save_pretrained(target_dir)
    eventual_create_tokenizer(target_dir, architecture, config)

    # No checkpoint path and an empty state dict: the model is built from the config alone.
    tiny_model = architecture.from_pretrained(None, config=config, state_dict={}, local_files_only=True)
    tiny_model.save_pretrained(target_dir)

    # Make sure we can load what we just saved
    architecture.from_pretrained(target_dir, local_files_only=True)
163
+
164
+
165
def build_pytorch_weights_from_multiple_architectures(pytorch_architectures):
    """
    Create the PyTorch tiny models for every configuration class that has a tiny configuration.

    Args:
        pytorch_architectures: Mapping from configuration class to its architectures. Entries may be
            single classes or tuples/lists of classes; they are flattened by one level.
    """
    progress = tqdm(pytorch_architectures.items(), desc="Building PyTorch weights")
    for config_class, architectures in progress:
        tiny_config = get_tiny_config_from_class(config_class)
        if tiny_config is not None:
            for architecture in flattened(architectures):
                # Each architecture gets its own copy because the build step may modify the config.
                build_pt_architecture(architecture, copy.deepcopy(tiny_config))
177
+
178
+
179
def build_tf_architecture(architecture, config):
    """
    Create, save and reload tiny weights for one TensorFlow architecture.

    The directory is the one used for the architecture name without its `TF` prefix, and the model is
    loaded from that directory with `from_pt=True`. Nothing is rebuilt when the architecture already loads
    from there, and `DPRQuestionEncoder` architectures are skipped.

    Args:
        architecture: TensorFlow model class whose name starts with `TF`.
        config: Tiny configuration for the model. It is modified in place (`num_labels`, and `is_decoder`
            for the Reformer heads).

    Raises:
        ValueError: If the model cannot be loaded from the directory.
    """
    architecture_name = architecture.__name__
    # [2:] remove TF prefix of architecture name
    target_dir = os.path.join(weights_path, config.model_type, to_snake_case(architecture_name[2:]))
    try:
        architecture.from_pretrained(target_dir, local_files_only=True)
        # Already created
        return
    except Exception:
        pass

    if "DPRQuestionEncoder" in architecture_name:
        # Not supported
        return

    # The Reformer heads need an explicit decoder/encoder setting.
    for marker, is_decoder in (("ReformerModelWithLMHead", True), ("ReformerForMaskedLM", False)):
        if marker in architecture_name:
            config.is_decoder = is_decoder

    config.num_labels = 2

    os.makedirs(target_dir, exist_ok=True)
    config.save_pretrained(target_dir)
    eventual_create_tokenizer(target_dir, architecture, config)

    try:
        tiny_model = architecture.from_pretrained(target_dir, config=config, from_pt=True, local_files_only=True)
    except Exception as e:
        raise ValueError(f"Couldn't load {architecture.__name__}.") from e
    tiny_model.save_pretrained(target_dir)

    # Make sure what was just saved can be loaded back.
    architecture.from_pretrained(target_dir, local_files_only=True)
212
+
213
+
214
def build_tensorflow_weights_from_multiple_architectures(tensorflow_architectures):
    """
    Create the TensorFlow tiny models for every configuration class that has a tiny configuration.

    Args:
        tensorflow_architectures: Mapping from configuration class to its architectures. Entries may be
            single classes or tuples/lists of classes; they are flattened by one level.
    """
    progress = tqdm(tensorflow_architectures.items(), desc="Building TensorFlow weights")
    for config_class, architectures in progress:
        tiny_config = get_tiny_config_from_class(config_class)
        if tiny_config is not None:
            for architecture in flattened(architectures):
                # Each architecture gets its own copy because the build step modifies the config.
                build_tf_architecture(architecture, copy.deepcopy(tiny_config))
225
+
226
+
227
def get_tiny_tokenizer_from_checkpoint(checkpoint):
    """
    Retrain the fast tokenizer of `checkpoint` on a tiny alphanumeric vocabulary.

    The training data is the ASCII letters, the digits and the space character, and the requested
    vocabulary size equals the number of those characters.

    Args:
        checkpoint: Checkpoint identifier passed to `AutoTokenizer.from_pretrained` (local files only).

    Returns:
        The retrained tokenizer, or `None` when the checkpoint's tokenizer cannot be loaded, when its class
        name does not end in `Fast`, or when retraining fails.
    """
    try:
        tokenizer = AutoTokenizer.from_pretrained(checkpoint, local_files_only=True)
    except Exception:
        return None

    # Only tokenizers whose class name ends in "Fast" are retrained.
    if not tokenizer.__class__.__name__.endswith("Fast"):
        return None

    vocabulary = string.ascii_letters + string.digits + " "
    try:
        return tokenizer.train_new_from_iterator(vocabulary, vocab_size=len(vocabulary), show_progress=False)
    except (KeyboardInterrupt, SystemExit):
        # A user abort must stop the script instead of silently skipping this tokenizer.
        raise
    except BaseException:
        # NOTE(review): the original used a bare `except`, presumably because the tokenizer backend can
        # raise errors that do not derive from `Exception` -- confirm before narrowing this further.
        return None
242
+
243
+
244
def get_checkpoint_from_architecture(architecture):
    """
    Return the `_CHECKPOINT_FOR_DOC` value of the module that defines `architecture`.

    Args:
        architecture: Class whose `__module__` names the module to inspect.

    Returns:
        The module-level `_CHECKPOINT_FOR_DOC` value, or `None` when the module cannot be imported or does
        not define that attribute.
    """
    try:
        defining_module = importlib.import_module(architecture.__module__)
    except Exception:
        # The architecture is ignored when its module cannot be imported.
        return None

    # Modules without the attribute give no checkpoint.
    return getattr(defining_module, "_CHECKPOINT_FOR_DOC", None)
256
+
257
+
258
def pt_architectures():
    """
    Build the tiny PyTorch weights for every known configuration class, then verify them.

    Each configuration class in `CONFIG_MAPPING` is paired with the architectures registered for it in the
    PyTorch auto-model mappings. After building, every saved checkpoint is reloaded.

    Raises:
        ValueError: If reloading a checkpoint reports missing keys.
    """
    pytorch_mappings = [
        MODEL_MAPPING,
        MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
        MODEL_FOR_MASKED_LM_MAPPING,
        MODEL_FOR_PRETRAINING_MAPPING,
        MODEL_FOR_CAUSAL_LM_MAPPING,
        MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
        MODEL_FOR_MULTIPLE_CHOICE_MAPPING,
        MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING,
        MODEL_FOR_OBJECT_DETECTION_MAPPING,
        MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
        MODEL_WITH_LM_HEAD_MAPPING,
        MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
        MODEL_FOR_QUESTION_ANSWERING_MAPPING,
        MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
    ]

    # Configuration class -> architectures found for it across the mappings above.
    architectures_by_config = {}
    for config_class in CONFIG_MAPPING.values():
        architectures_by_config[config_class] = [
            mapping[config_class] for mapping in pytorch_mappings if config_class in mapping
        ]

    build_pytorch_weights_from_multiple_architectures(architectures_by_config)
    print("Built PyTorch weights")

    progress = tqdm(architectures_by_config.items(), desc="Checking PyTorch weights validity")
    for config_class, architectures in progress:
        # Configurations without a tiny config were not built, so there is nothing to check.
        if get_tiny_config_from_class(config_class) is None:
            continue

        for architecture in flattened(architectures):
            if "DPRQuestionEncoder" in architecture.__name__:
                continue

            checkpoint_dir = os.path.join(
                weights_path, config_class.model_type, to_snake_case(architecture.__name__)
            )
            _, loading_info = architecture.from_pretrained(
                checkpoint_dir, output_loading_info=True, local_files_only=True
            )
            missing_keys = loading_info["missing_keys"]
            if len(missing_keys) > 0:
                raise ValueError(f"Missing weights when loading PyTorch checkpoints: {missing_keys}")

    print("Checked PyTorch weights")
305
+
306
+
307
def tf_architectures():
    """
    Build the tiny TensorFlow weights for every known configuration class, then verify them.

    Each configuration class in `CONFIG_MAPPING` is paired with the architectures registered for it in the
    TensorFlow auto-model mappings. After building, every saved checkpoint is reloaded. Missing keys whose
    name contains "dropout" are tolerated.

    Raises:
        ValueError: If a checkpoint cannot be loaded, or if it misses keys other than dropout ones.
    """
    tensorflow_mappings = [
        TF_MODEL_MAPPING,
        TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
        TF_MODEL_FOR_MASKED_LM_MAPPING,
        TF_MODEL_FOR_PRETRAINING_MAPPING,
        TF_MODEL_FOR_CAUSAL_LM_MAPPING,
        TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING,
        TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING,
        TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
        TF_MODEL_WITH_LM_HEAD_MAPPING,
        TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING,
        TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
    ]
    # Configuration class -> architectures found for it across the mappings above.
    architectures_by_config = {
        config: [
            tensorflow_mapping[config] for tensorflow_mapping in tensorflow_mappings if config in tensorflow_mapping
        ]
        for config in CONFIG_MAPPING.values()
    }
    build_tensorflow_weights_from_multiple_architectures(architectures_by_config)
    print("Built TensorFlow weights")

    for config, architectures in tqdm(architectures_by_config.items(), desc="Checking TensorFlow weights validity"):
        # Configurations without a tiny config were not built, so there is nothing to check.
        if get_tiny_config_from_class(config) is None:
            continue

        for architecture in flattened(architectures):
            if "DPRQuestionEncoder" in architecture.__name__:
                # Not supported. `continue` rather than `return`, so the remaining architectures
                # and configurations are still checked.
                continue

            # [2:] to remove TF prefix
            dirname = os.path.join(weights_path, config.model_type, to_snake_case(architecture.__name__[2:]))
            try:
                _, loading_info = architecture.from_pretrained(
                    dirname, output_loading_info=True, local_files_only=True
                )
            except Exception as e:
                raise ValueError(f"Couldn't load {architecture.__name__}") from e

            # Keys containing "dropout" are allowed to be missing.
            required_weights_missing = [
                missing_key for missing_key in loading_info["missing_keys"] if "dropout" not in missing_key
            ]
            if required_weights_missing:
                raise ValueError(f"Found missing weights in {architecture}: {required_weights_missing}")

    print("Checked TensorFlow weights")
361
+
362
+
363
def main():
    """Build and check the tiny PyTorch weights first, then the TensorFlow ones."""
    for build_and_check in (pt_architectures, tf_architectures):
        build_and_check()


if __name__ == "__main__":
    main()