Pradeep Kumar committed
Commit: c130734
Parent(s): b64b72d

Upload 10 files
export_tfhub.py ADDED
@@ -0,0 +1,219 @@
+# Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+r"""Exports a BERT-like encoder and its preprocessing as SavedModels for TF Hub.
+
+This tool creates preprocessor and encoder SavedModels suitable for uploading
+to https://tfhub.dev that implement the preprocessor and encoder APIs defined
+at https://www.tensorflow.org/hub/common_saved_model_apis/text.
+
+For a full usage guide, see
+https://github.com/tensorflow/models/blob/master/official/nlp/docs/tfhub.md
+
+Minimal usage examples:
+
+1) Exporting an Encoder from checkpoint and config.
+
+```
+export_tfhub \
+  --encoder_config_file=${BERT_DIR:?}/bert_encoder.yaml \
+  --model_checkpoint_path=${BERT_DIR:?}/bert_model.ckpt \
+  --vocab_file=${BERT_DIR:?}/vocab.txt \
+  --export_type=model \
+  --export_path=/tmp/bert_model
+```
+
+An --encoder_config_file can specify encoder types other than BERT.
+For BERT, a --bert_config_file in the legacy JSON format can be passed instead.
+
+Flag --vocab_file (and flag --do_lower_case, whose default value is guessed
+from the vocab_file path) captures how BertTokenizer was used in pre-training.
+Use flag --sp_model_file instead if SentencepieceTokenizer was used.
+
+Changing --export_type to model_with_mlm additionally creates an `.mlm`
+subobject on the exported SavedModel that can be called to produce
+the logits of the Masked Language Model task from pretraining.
+The help string for flag --model_checkpoint_path explains the checkpoint
+formats required for each --export_type.
+
+
+2) Exporting a preprocessor SavedModel
+
+```
+export_tfhub \
+  --vocab_file ${BERT_DIR:?}/vocab.txt \
+  --export_type preprocessing --export_path /tmp/bert_preprocessing
+```
+
+Be sure to use flag values that match the encoder and how it has been
+pre-trained (see above for --vocab_file vs --sp_model_file).
+
+If your encoder has been trained with text preprocessing for which tfhub.dev
+already has a SavedModel, you could guide your users to reuse that one instead
+of exporting and publishing your own.
+
+TODO(b/175369555): When exporting to users of TensorFlow 2.4, add flag
+`--experimental_disable_assert_in_preprocessing`.
+"""
+
+from absl import app
+from absl import flags
+import gin
+
+from official.legacy.bert import configs
+from official.modeling import hyperparams
+from official.nlp.configs import encoders
+from official.nlp.tools import export_tfhub_lib
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_enum(
+    "export_type", "model",
+    ["model", "model_with_mlm", "preprocessing"],
+    "The overall type of SavedModel to export. Flags "
+    "--bert_config_file/--encoder_config_file and --vocab_file/--sp_model_file "
+    "control which particular encoder model and preprocessing are exported.")
+flags.DEFINE_string(
+    "export_path", None,
+    "Directory to which the SavedModel is written.")
+flags.DEFINE_string(
+    "encoder_config_file", None,
+    "A yaml file representing `encoders.EncoderConfig` to define the encoder "
+    "(BERT or other). "
+    "Exactly one of --bert_config_file and --encoder_config_file can be set. "
+    "Needed for --export_type model and model_with_mlm.")
+flags.DEFINE_string(
+    "bert_config_file", None,
+    "A JSON file with a legacy BERT configuration to define the BERT encoder. "
+    "Exactly one of --bert_config_file and --encoder_config_file can be set. "
+    "Needed for --export_type model and model_with_mlm.")
+flags.DEFINE_bool(
+    "copy_pooler_dense_to_encoder", False,
+    "When the model is trained using `BertPretrainerV2`, the pooler layer "
+    "of the next sentence prediction task exists in the `ClassificationHead` "
+    "passed to `BertPretrainerV2`. If True, we will copy this pooler's dense "
+    "layer to the encoder that is exported by this tool (as in classic BERT). "
+    "Using `BertPretrainerV2` and leaving this False exports an untrained "
+    "(randomly initialized) pooling layer, which some authors recommend for "
+    "subsequent fine-tuning.")
+flags.DEFINE_string(
+    "model_checkpoint_path", None,
+    "File path to a pre-trained model checkpoint. "
+    "For --export_type model, this has to be an object-based (TF2) checkpoint "
+    "that can be restored to `tf.train.Checkpoint(encoder=encoder)` "
+    "for the `encoder` defined by the config file. "
+    "(Legacy checkpoints with `model=` instead of `encoder=` are also "
+    "supported for now.) "
+    "For --export_type model_with_mlm, it must be restorable to "
+    "`tf.train.Checkpoint(**BertPretrainerV2(...).checkpoint_items)`. "
+    "(For now, `tf.train.Checkpoint(pretrainer=BertPretrainerV2(...))` is also "
+    "accepted.)")
+flags.DEFINE_string(
+    "vocab_file", None,
+    "For encoders trained on BertTokenizer input: "
+    "the vocabulary file that the encoder model was trained with. "
+    "Exactly one of --vocab_file and --sp_model_file can be set. "
+    "Needed for --export_type model, model_with_mlm and preprocessing.")
+flags.DEFINE_string(
+    "sp_model_file", None,
+    "For encoders trained on SentencepieceTokenizer input: "
+    "the SentencePiece .model file that the encoder model was trained with. "
+    "Exactly one of --vocab_file and --sp_model_file can be set. "
+    "Needed for --export_type model, model_with_mlm and preprocessing.")
+flags.DEFINE_bool(
+    "do_lower_case", None,
+    "Whether to lowercase before tokenization. "
+    "If left as None, and --vocab_file is set, do_lower_case will be enabled "
+    "if 'uncased' appears in the name of --vocab_file. "
+    "If left as None, and --sp_model_file is set, do_lower_case defaults to "
+    "True. Needed for --export_type model, model_with_mlm and preprocessing.")
+flags.DEFINE_integer(
+    "default_seq_length", 128,
+    "The sequence length of preprocessing results from the "
+    "top-level preprocess method. This is also the default "
+    "sequence length for the bert_pack_inputs subobject. "
+    "Needed for --export_type preprocessing.")
+flags.DEFINE_bool(
+    "tokenize_with_offsets", False,  # TODO(b/181866850)
+    "Whether to export a .tokenize_with_offsets subobject for "
+    "--export_type preprocessing.")
+flags.DEFINE_multi_string(
+    "gin_file", default=None,
+    help="List of paths to the config files.")
+flags.DEFINE_multi_string(
+    "gin_params", default=None,
+    help="List of Gin bindings.")
+flags.DEFINE_bool(  # TODO(b/175369555): Remove this flag and its use.
+    "experimental_disable_assert_in_preprocessing", False,
+    "Export a preprocessing model without tf.Assert ops. "
+    "Usually, that would be a bad idea, except TF2.4 has an issue with "
+    "Assert ops in tf.functions used in Dataset.map() on a TPU worker, "
+    "and omitting the Assert ops lets SavedModels avoid the issue.")
+
+
+def main(argv):
+  if len(argv) > 1:
+    raise app.UsageError("Too many command-line arguments.")
+  gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
+
+  if bool(FLAGS.vocab_file) == bool(FLAGS.sp_model_file):
+    raise ValueError("Exactly one of `vocab_file` and `sp_model_file` "
+                     "can be specified, but got %s and %s." %
+                     (FLAGS.vocab_file, FLAGS.sp_model_file))
+  do_lower_case = export_tfhub_lib.get_do_lower_case(
+      FLAGS.do_lower_case, FLAGS.vocab_file, FLAGS.sp_model_file)
+
+  if FLAGS.export_type in ("model", "model_with_mlm"):
+    if bool(FLAGS.bert_config_file) == bool(FLAGS.encoder_config_file):
+      raise ValueError("Exactly one of `bert_config_file` and "
+                       "`encoder_config_file` can be specified, but got "
+                       "%s and %s." %
+                       (FLAGS.bert_config_file, FLAGS.encoder_config_file))
+    if FLAGS.bert_config_file:
+      bert_config = configs.BertConfig.from_json_file(FLAGS.bert_config_file)
+      encoder_config = None
+    else:
+      bert_config = None
+      encoder_config = encoders.EncoderConfig()
+      encoder_config = hyperparams.override_params_dict(
+          encoder_config, FLAGS.encoder_config_file, is_strict=True)
+    export_tfhub_lib.export_model(
+        FLAGS.export_path,
+        bert_config=bert_config,
+        encoder_config=encoder_config,
+        model_checkpoint_path=FLAGS.model_checkpoint_path,
+        vocab_file=FLAGS.vocab_file,
+        sp_model_file=FLAGS.sp_model_file,
+        do_lower_case=do_lower_case,
+        with_mlm=FLAGS.export_type == "model_with_mlm",
+        copy_pooler_dense_to_encoder=FLAGS.copy_pooler_dense_to_encoder)
+
+  elif FLAGS.export_type == "preprocessing":
+    export_tfhub_lib.export_preprocessing(
+        FLAGS.export_path,
+        vocab_file=FLAGS.vocab_file,
+        sp_model_file=FLAGS.sp_model_file,
+        do_lower_case=do_lower_case,
+        default_seq_length=FLAGS.default_seq_length,
+        tokenize_with_offsets=FLAGS.tokenize_with_offsets,
+        experimental_disable_assert=(
+            FLAGS.experimental_disable_assert_in_preprocessing))
+
+  else:
+    raise app.UsageError(
+        "Unknown value '%s' for flag --export_type" % FLAGS.export_type)
+
+
+if __name__ == "__main__":
+  app.run(main)
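Below is a minimal, hedged sketch (not part of the commit) of consuming an encoder exported with --export_type=model, assuming the /tmp/bert_model path from the docstring example and an installed tensorflow_hub package; the input and output keys follow the TF Hub text API linked above.

```
# Sketch under the above assumptions; the all-zero inputs are only a
# smoke test, real ids come from the matching preprocessor.
import numpy as np
import tensorflow_hub as hub

encoder = hub.KerasLayer("/tmp/bert_model", trainable=False)
dummy_ids = np.zeros((1, 128), dtype=np.int32)
outputs = encoder(dict(
    input_word_ids=dummy_ids,   # Token ids.
    input_mask=dummy_ids,       # 1 = real token, 0 = padding.
    input_type_ids=dummy_ids))  # Segment ids.
print(outputs["pooled_output"].shape)  # (1, hidden_size)
```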
export_tfhub_lib.py ADDED
@@ -0,0 +1,493 @@
+# Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Library of components of export_tfhub.py. See docstring there for more."""
+
+import contextlib
+import hashlib
+import os
+import tempfile
+
+from typing import Optional, Text, Tuple
+
+# Import libraries
+from absl import logging
+import tensorflow as tf, tf_keras
+# pylint: disable=g-direct-tensorflow-import  TODO(b/175369555): Remove these.
+from tensorflow.core.protobuf import saved_model_pb2
+from tensorflow.python.ops import control_flow_assert
+# pylint: enable=g-direct-tensorflow-import
+from official.legacy.bert import configs
+from official.modeling import tf_utils
+from official.nlp.configs import encoders
+from official.nlp.modeling import layers
+from official.nlp.modeling import models
+from official.nlp.modeling import networks
+
+
+def get_bert_encoder(bert_config):
+  """Returns a BertEncoder with dict outputs."""
+  bert_encoder = networks.BertEncoder(
+      vocab_size=bert_config.vocab_size,
+      hidden_size=bert_config.hidden_size,
+      num_layers=bert_config.num_hidden_layers,
+      num_attention_heads=bert_config.num_attention_heads,
+      intermediate_size=bert_config.intermediate_size,
+      activation=tf_utils.get_activation(bert_config.hidden_act),
+      dropout_rate=bert_config.hidden_dropout_prob,
+      attention_dropout_rate=bert_config.attention_probs_dropout_prob,
+      max_sequence_length=bert_config.max_position_embeddings,
+      type_vocab_size=bert_config.type_vocab_size,
+      initializer=tf_keras.initializers.TruncatedNormal(
+          stddev=bert_config.initializer_range),
+      embedding_width=bert_config.embedding_size,
+      dict_outputs=True)
+
+  return bert_encoder
+
+
+def get_do_lower_case(do_lower_case, vocab_file=None, sp_model_file=None):
+  """Returns do_lower_case, replacing None by a guess from vocab file name."""
+  if do_lower_case is not None:
+    return do_lower_case
+  elif vocab_file:
+    do_lower_case = "uncased" in vocab_file
+    logging.info("Using do_lower_case=%s based on name of vocab_file=%s",
+                 do_lower_case, vocab_file)
+    return do_lower_case
+  elif sp_model_file:
+    do_lower_case = True  # All public ALBERTs (as of Oct 2020) do it.
+    logging.info("Defaulting to do_lower_case=%s for Sentencepiece tokenizer",
+                 do_lower_case)
+    return do_lower_case
+  else:
+    raise ValueError("Must set vocab_file or sp_model_file.")
+
+
+def _create_model(
+    *,
+    bert_config: Optional[configs.BertConfig] = None,
+    encoder_config: Optional[encoders.EncoderConfig] = None,
+    with_mlm: bool,
+) -> Tuple[tf_keras.Model, tf_keras.Model]:
+  """Creates the model to export and the model to restore the checkpoint.
+
+  Args:
+    bert_config: A legacy `BertConfig` to create a `BertEncoder` object.
+      Exactly one of encoder_config and bert_config must be set.
+    encoder_config: An `EncoderConfig` to create an encoder of the configured
+      type (`BertEncoder` or other).
+    with_mlm: A bool to control the second component of the result. If True,
+      will create a `BertPretrainerV2` object; otherwise, will create a
+      `BertEncoder` object.
+
+  Returns:
+    A Tuple of (1) a Keras model that will be exported, (2) a `BertPretrainerV2`
+    object or `BertEncoder` object depending on the value of the `with_mlm`
+    argument, which contains the first model and will be used for restoring
+    weights from the checkpoint.
+  """
+  if (bert_config is not None) == (encoder_config is not None):
+    raise ValueError("Exactly one of `bert_config` and `encoder_config` "
+                     "can be specified, but got %s and %s" %
+                     (bert_config, encoder_config))
+
+  if bert_config is not None:
+    encoder = get_bert_encoder(bert_config)
+  else:
+    encoder = encoders.build_encoder(encoder_config)
+
+  # Convert from list of named inputs to dict of inputs keyed by name.
+  # Only the latter accepts a dict of inputs after restoring from SavedModel.
+  if isinstance(encoder.inputs, (list, tuple)):
+    encoder_inputs_dict = {x.name: x for x in encoder.inputs}
+  else:
+    # encoder.inputs by default is dict for BertEncoderV2.
+    encoder_inputs_dict = encoder.inputs
+  encoder_output_dict = encoder(encoder_inputs_dict)
+  # For interchangeability with other text representations,
+  # add "default" as an alias for BERT's whole-input representations.
+  encoder_output_dict["default"] = encoder_output_dict["pooled_output"]
+  core_model = tf_keras.Model(
+      inputs=encoder_inputs_dict, outputs=encoder_output_dict)
+
+  if with_mlm:
+    if bert_config is not None:
+      hidden_act = bert_config.hidden_act
+    else:
+      assert encoder_config is not None
+      hidden_act = encoder_config.get().hidden_activation
+
+    pretrainer = models.BertPretrainerV2(
+        encoder_network=encoder,
+        mlm_activation=tf_utils.get_activation(hidden_act))
+
+    if isinstance(pretrainer.inputs, dict):
+      pretrainer_inputs_dict = pretrainer.inputs
+    else:
+      pretrainer_inputs_dict = {x.name: x for x in pretrainer.inputs}
+    pretrainer_output_dict = pretrainer(pretrainer_inputs_dict)
+    mlm_model = tf_keras.Model(
+        inputs=pretrainer_inputs_dict, outputs=pretrainer_output_dict)
+    # Set `_auto_track_sub_layers` to False, so that the additional weights
+    # from the `mlm` sub-object will not be included in the core model.
+    # TODO(b/169210253): Use a public API when available.
+    core_model._auto_track_sub_layers = False  # pylint: disable=protected-access
+    core_model.mlm = mlm_model
+    return core_model, pretrainer
+  else:
+    return core_model, encoder
+
+
+def export_model(export_path: Text,
+                 *,
+                 bert_config: Optional[configs.BertConfig] = None,
+                 encoder_config: Optional[encoders.EncoderConfig] = None,
+                 model_checkpoint_path: Text,
+                 with_mlm: bool,
+                 copy_pooler_dense_to_encoder: bool = False,
+                 vocab_file: Optional[Text] = None,
+                 sp_model_file: Optional[Text] = None,
+                 do_lower_case: Optional[bool] = None) -> None:
+  """Exports an Encoder as SavedModel after restoring pre-trained weights.
+
+  The exported SavedModel implements a superset of the Encoder API for
+  Text embeddings with Transformer Encoders described at
+  https://www.tensorflow.org/hub/common_saved_model_apis/text.
+
+  In particular, the exported SavedModel can be used in the following way:
+
+  ```
+  # Calls default interface (encoder only).
+
+  encoder = hub.load(...)
+  encoder_inputs = dict(
+      input_word_ids=...,  # Shape [batch, seq_length], dtype=int32
+      input_mask=...,      # Shape [batch, seq_length], dtype=int32
+      input_type_ids=...,  # Shape [batch, seq_length], dtype=int32
+  )
+  encoder_outputs = encoder(encoder_inputs)
+  assert encoder_outputs.keys() == {
+      "pooled_output",    # Shape [batch_size, width], dtype=float32
+      "default",          # Alias for "pooled_output" (aligns with other models).
+      "sequence_output",  # Shape [batch_size, seq_length, width], dtype=float32
+      "encoder_outputs",  # List of Tensors with outputs of all transformer layers.
+  }
+  ```
+
+  If `with_mlm` is True, the exported SavedModel can also be called in the
+  following way:
+
+  ```
+  # Calls expanded interface that includes logits of the Masked Language Model.
+  mlm_inputs = dict(
+      input_word_ids=...,       # Shape [batch, seq_length], dtype=int32
+      input_mask=...,           # Shape [batch, seq_length], dtype=int32
+      input_type_ids=...,       # Shape [batch, seq_length], dtype=int32
+      masked_lm_positions=...,  # Shape [batch, num_predictions], dtype=int32
+  )
+  mlm_outputs = encoder.mlm(mlm_inputs)
+  assert mlm_outputs.keys() == {
+      "pooled_output",    # Shape [batch, width], dtype=float32
+      "sequence_output",  # Shape [batch, seq_length, width], dtype=float32
+      "encoder_outputs",  # List of Tensors with outputs of all transformer layers.
+      "mlm_logits",       # Shape [batch, num_predictions, vocab_size], dtype=float32
+  }
+  ```
+
+  Args:
+    export_path: The SavedModel output directory.
+    bert_config: An optional `configs.BertConfig` object. Note: exactly one of
+      `bert_config` and the following `encoder_config` must be specified.
+    encoder_config: An optional `encoders.EncoderConfig` object.
+    model_checkpoint_path: The path to the checkpoint.
+    with_mlm: Whether to export the additional mlm sub-object.
+    copy_pooler_dense_to_encoder: Whether to copy the pooler's dense layer used
+      in the next sentence prediction task to the encoder.
+    vocab_file: The path to the wordpiece vocab file, or None.
+    sp_model_file: The path to the sentencepiece model file, or None. Exactly
+      one of vocab_file and sp_model_file must be set.
+    do_lower_case: Whether to lower-case text before tokenization.
+  """
+  if with_mlm:
+    core_model, pretrainer = _create_model(
+        bert_config=bert_config,
+        encoder_config=encoder_config,
+        with_mlm=with_mlm)
+    encoder = pretrainer.encoder_network
+    # This supports both the new pretrainer checkpoints produced by TF-NLP
+    # and checkpoints converted from TF1 (original BERT, SmallBERTs).
+    checkpoint_items = pretrainer.checkpoint_items
+    checkpoint = tf.train.Checkpoint(**checkpoint_items)
+  else:
+    core_model, encoder = _create_model(
+        bert_config=bert_config,
+        encoder_config=encoder_config,
+        with_mlm=with_mlm)
+    checkpoint = tf.train.Checkpoint(
+        model=encoder,  # Legacy checkpoints.
+        encoder=encoder)
+  checkpoint.restore(model_checkpoint_path).assert_existing_objects_matched()
+
+  if copy_pooler_dense_to_encoder:
+    logging.info("Copy pooler's dense layer to the encoder.")
+    pooler_checkpoint = tf.train.Checkpoint(
+        **{"next_sentence.pooler_dense": encoder.pooler_layer})
+    pooler_checkpoint.restore(
+        model_checkpoint_path).assert_existing_objects_matched()
+
+  # Before SavedModels for preprocessing appeared in Oct 2020, the encoders
+  # provided this information to let users do preprocessing themselves.
+  # We keep doing that for now. It helps users to upgrade incrementally.
+  # Moreover, it offers an escape hatch for advanced users who want the
+  # full vocab, not the high-level operations from the preprocessing model.
+  if vocab_file:
+    core_model.vocab_file = tf.saved_model.Asset(vocab_file)
+    if do_lower_case is None:
+      raise ValueError("Must pass do_lower_case if passing vocab_file.")
+    core_model.do_lower_case = tf.Variable(do_lower_case, trainable=False)
+  elif sp_model_file:
+    # This was used by ALBERT, with implied values of do_lower_case=True
+    # and strip_diacritics=True.
+    core_model.sp_model_file = tf.saved_model.Asset(sp_model_file)
+  else:
+    raise ValueError("Must set vocab_file or sp_model_file.")
+  core_model.save(export_path, include_optimizer=False, save_format="tf")
+
+
+class BertPackInputsSavedModelWrapper(tf.train.Checkpoint):
+  """Wraps a BertPackInputs layer for export to SavedModel.
+
+  The wrapper object is suitable for use with `tf.saved_model.save()` and
+  `.load()`. The wrapper object is callable with inputs and outputs like the
+  BertPackInputs layer, but differs from saving an unwrapped Keras object:
+
+    - The inputs can be a list of 1 or 2 RaggedTensors of dtype int32 and
+      ragged rank 1 or 2. (In Keras, saving to a tf.function in a SavedModel
+      would fix the number of RaggedTensors and their ragged rank.)
+    - The call accepts an optional keyword argument `seq_length=` to override
+      the layer's .seq_length hyperparameter. (In Keras, a hyperparameter
+      could not be changed after saving to a tf.function in a SavedModel.)
+  """
+
+  def __init__(self, bert_pack_inputs: layers.BertPackInputs):
+    super().__init__()
+
+    # Preserve the layer's configured seq_length as a default but make it
+    # overridable. Having this dynamically determined default argument
+    # requires self.__call__ to be defined in this indirect way.
+    default_seq_length = bert_pack_inputs.seq_length
+
+    @tf.function(autograph=False)
+    def call(inputs, seq_length=default_seq_length):
+      return layers.BertPackInputs.bert_pack_inputs(
+          inputs,
+          seq_length=seq_length,
+          start_of_sequence_id=bert_pack_inputs.start_of_sequence_id,
+          end_of_segment_id=bert_pack_inputs.end_of_segment_id,
+          padding_id=bert_pack_inputs.padding_id)
+
+    self.__call__ = call
+
+    # Trace concrete functions for all supported input signatures: 1 or 2
+    # segments of RaggedTensors with ragged rank 1 or 2.
+    for ragged_rank in range(1, 3):
+      for num_segments in range(1, 3):
+        _ = self.__call__.get_concrete_function(
+            [tf.RaggedTensorSpec([None] * (ragged_rank + 1), dtype=tf.int32)
+             for _ in range(num_segments)],
+            seq_length=tf.TensorSpec([], tf.int32))
+
+
+def create_preprocessing(*,
+                         vocab_file: Optional[str] = None,
+                         sp_model_file: Optional[str] = None,
+                         do_lower_case: bool,
+                         tokenize_with_offsets: bool,
+                         default_seq_length: int) -> tf_keras.Model:
+  """Returns a preprocessing Model for given tokenization parameters.
+
+  This function builds a Keras Model with attached subobjects suitable for
+  saving to a SavedModel. The resulting SavedModel implements the Preprocessor
+  API for Text embeddings with Transformer Encoders described at
+  https://www.tensorflow.org/hub/common_saved_model_apis/text.
+
+  Args:
+    vocab_file: The path to the wordpiece vocab file, or None.
+    sp_model_file: The path to the sentencepiece model file, or None. Exactly
+      one of vocab_file and sp_model_file must be set. This determines the
+      type of tokenizer that is used.
+    do_lower_case: Whether to do lower case.
+    tokenize_with_offsets: Whether to include the .tokenize_with_offsets
+      subobject.
+    default_seq_length: The sequence length of preprocessing results from the
+      root callable. This is also the default sequence length for the
+      bert_pack_inputs subobject.
+
+  Returns:
+    A tf_keras.Model object with several attached subobjects, suitable for
+    saving as a preprocessing SavedModel.
+  """
+  # Select tokenizer.
+  if bool(vocab_file) == bool(sp_model_file):
+    raise ValueError("Must set exactly one of vocab_file, sp_model_file")
+  if vocab_file:
+    tokenize = layers.BertTokenizer(
+        vocab_file=vocab_file,
+        lower_case=do_lower_case,
+        tokenize_with_offsets=tokenize_with_offsets)
+  else:
+    tokenize = layers.SentencepieceTokenizer(
+        model_file_path=sp_model_file,
+        lower_case=do_lower_case,
+        strip_diacritics=True,  # Strip diacritics to follow ALBERT model.
+        tokenize_with_offsets=tokenize_with_offsets)
+
+  # The root object of the preprocessing model can be called to do
+  # one-shot preprocessing for users with single-sentence inputs.
+  sentences = tf_keras.layers.Input(shape=(), dtype=tf.string, name="sentences")
+  if tokenize_with_offsets:
+    tokens, start_offsets, limit_offsets = tokenize(sentences)
+  else:
+    tokens = tokenize(sentences)
+  pack = layers.BertPackInputs(
+      seq_length=default_seq_length,
+      special_tokens_dict=tokenize.get_special_tokens_dict())
+  model_inputs = pack(tokens)
+  preprocessing = tf_keras.Model(sentences, model_inputs)
+
+  # Individual steps of preprocessing are made available as named subobjects
+  # to enable more general preprocessing. For saving, they need to be Models
+  # in their own right.
+  preprocessing.tokenize = tf_keras.Model(sentences, tokens)
+  # Provide an equivalent to tokenize.get_special_tokens_dict().
+  preprocessing.tokenize.get_special_tokens_dict = tf.train.Checkpoint()
+  preprocessing.tokenize.get_special_tokens_dict.__call__ = tf.function(
+      lambda: tokenize.get_special_tokens_dict(),  # pylint: disable=unnecessary-lambda
+      input_signature=[])
+  if tokenize_with_offsets:
+    preprocessing.tokenize_with_offsets = tf_keras.Model(
+        sentences, [tokens, start_offsets, limit_offsets])
+    preprocessing.tokenize_with_offsets.get_special_tokens_dict = (
+        preprocessing.tokenize.get_special_tokens_dict)
+  # Conceptually, this should be
+  # preprocessing.bert_pack_inputs = tf_keras.Model(tokens, model_inputs)
+  # but technicalities require us to use a wrapper (see comments there).
+  # In particular, seq_length can be overridden when calling this.
+  preprocessing.bert_pack_inputs = BertPackInputsSavedModelWrapper(pack)
+
+  return preprocessing
+
+
+def _move_to_tmpdir(file_path: Optional[Text], tmpdir: Text) -> Optional[Text]:
+  """Returns new path with same basename and hash of original path."""
+  if file_path is None:
+    return None
+  olddir, filename = os.path.split(file_path)
+  hasher = hashlib.sha1()
+  hasher.update(olddir.encode("utf-8"))
+  target_dir = os.path.join(tmpdir, hasher.hexdigest())
+  target_file = os.path.join(target_dir, filename)
+  tf.io.gfile.mkdir(target_dir)
+  tf.io.gfile.copy(file_path, target_file)
+  return target_file
+
+
+def export_preprocessing(export_path: Text,
+                         *,
+                         vocab_file: Optional[Text] = None,
+                         sp_model_file: Optional[Text] = None,
+                         do_lower_case: bool,
+                         tokenize_with_offsets: bool,
+                         default_seq_length: int,
+                         experimental_disable_assert: bool = False) -> None:
+  """Exports preprocessing to a SavedModel for TF Hub."""
+  with tempfile.TemporaryDirectory() as tmpdir:
+    # TODO(b/175369555): Remove experimental_disable_assert and its use.
+    with _maybe_disable_assert(experimental_disable_assert):
+      preprocessing = create_preprocessing(
+          vocab_file=_move_to_tmpdir(vocab_file, tmpdir),
+          sp_model_file=_move_to_tmpdir(sp_model_file, tmpdir),
+          do_lower_case=do_lower_case,
+          tokenize_with_offsets=tokenize_with_offsets,
+          default_seq_length=default_seq_length)
+      preprocessing.save(
+          export_path, include_optimizer=False, save_format="tf")
+    if experimental_disable_assert:
+      _check_no_assert(export_path)
+  # It helps the unit test to prevent stray copies of the vocab file.
+  if tf.io.gfile.exists(tmpdir):
+    raise IOError("Failed to clean up TemporaryDirectory")
+
+
+# TODO(b/175369555): Remove all workarounds for this bug of TensorFlow 2.4
+# when this bug is no longer a concern for publishing new models.
+# TensorFlow 2.4 has a placement issue with Assert ops in tf.functions called
+# from Dataset.map() on a TPU worker. They end up on the TPU coordinator,
+# and invoking them from the TPU worker is either inefficient (when possible)
+# or impossible (notably when using "headless" TPU workers on Cloud that do
+# not have a channel to the coordinator). The bug has been fixed in time for
+# TF 2.5. To work around this, the following code avoids Assert ops in the
+# exported SavedModels. It monkey-patches calls to tf.Assert from inside
+# TensorFlow and replaces them by a no-op while building the exported model.
+# This is fragile, so _check_no_assert() validates the result. The resulting
+# model should be fine to read on future versions of TF, even if this
+# workaround at export time may break eventually. (Failing unit tests will
+# tell.)
+
+
+def _dont_assert(condition, data, summarize=None, name="Assert"):
+  """The no-op version of tf.Assert installed by _maybe_disable_assert."""
+  del condition, data, summarize  # Unused.
+  if tf.executing_eagerly():
+    return
+  with tf.name_scope(name):
+    return tf.no_op(name="dont_assert")
+
+
+@contextlib.contextmanager
+def _maybe_disable_assert(disable_assert):
+  """Scoped monkey patch of control_flow_assert.Assert to a no-op."""
+  if not disable_assert:
+    yield
+    return
+
+  original_assert = control_flow_assert.Assert
+  control_flow_assert.Assert = _dont_assert
+  yield
+  control_flow_assert.Assert = original_assert
+
+
+def _check_no_assert(saved_model_path):
+  """Raises AssertionError if SavedModel contains Assert ops."""
+  saved_model_filename = os.path.join(saved_model_path, "saved_model.pb")
+  with tf.io.gfile.GFile(saved_model_filename, "rb") as f:
+    saved_model = saved_model_pb2.SavedModel.FromString(f.read())
+
+  assert_nodes = []
+  graph_def = saved_model.meta_graphs[0].graph_def
+  assert_nodes += [
+      "node '{}' in global graph".format(n.name)
+      for n in graph_def.node
+      if n.op == "Assert"
+  ]
+  for fdef in graph_def.library.function:
+    assert_nodes += [
+        "node '{}' in function '{}'".format(n.name, fdef.signature.name)
+        for n in fdef.node_def
+        if n.op == "Assert"
+    ]
+  if assert_nodes:
+    raise AssertionError(
+        "Internal tool error: "
+        "failed to suppress {} Assert ops in SavedModel:\n{}".format(
+            len(assert_nodes), "\n".join(assert_nodes[:10])))
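A hedged end-to-end sketch (not part of the commit) of chaining the two SavedModels produced by export_preprocessing() and export_model(), per the TF Hub text API referenced in the docstrings; the /tmp paths are the example paths from export_tfhub.py.

```
import tensorflow as tf
import tensorflow_hub as hub

preprocessor = hub.load("/tmp/bert_preprocessing")
encoder = hub.load("/tmp/bert_model")

sentences = tf.constant(["Hello, TF Hub.", "A second sentence."])
encoder_inputs = preprocessor(sentences)   # One-shot tokenize + pack.
outputs = encoder(encoder_inputs)
embeddings = outputs["pooled_output"]      # Shape [2, hidden_size].

# Finer-grained subobjects, including the seq_length override provided by
# BertPackInputsSavedModelWrapper above:
tokens = preprocessor.tokenize(sentences)  # RaggedTensor of token ids.
packed = preprocessor.bert_pack_inputs([tokens], seq_length=64)
```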
export_tfhub_lib_test.py ADDED
@@ -0,0 +1,1080 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 The TensorFlow Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Tests export_tfhub_lib."""
16
+
17
+ import os
18
+ import tempfile
19
+
20
+ from absl.testing import parameterized
21
+ import numpy as np
22
+ import tensorflow as tf, tf_keras
23
+ from tensorflow import estimator as tf_estimator
24
+ import tensorflow_hub as hub
25
+ import tensorflow_text as text
26
+
27
+ from sentencepiece import SentencePieceTrainer
28
+ from official.legacy.bert import configs
29
+ from official.modeling import tf_utils
30
+ from official.nlp.configs import encoders
31
+ from official.nlp.modeling import layers
32
+ from official.nlp.modeling import models
33
+ from official.nlp.tools import export_tfhub_lib
34
+
35
+
36
+ def _get_bert_config_or_encoder_config(use_bert_config,
37
+ hidden_size,
38
+ num_hidden_layers,
39
+ encoder_type="albert",
40
+ vocab_size=100):
41
+ """Generates config args for export_tfhub_lib._create_model().
42
+
43
+ Args:
44
+ use_bert_config: bool. If True, returns legacy BertConfig.
45
+ hidden_size: int.
46
+ num_hidden_layers: int.
47
+ encoder_type: str. Can be ['albert', 'bert', 'bert_v2']. If use_bert_config
48
+ == True, then model_type is not used.
49
+ vocab_size: int.
50
+
51
+ Returns:
52
+ bert_config, encoder_config. Only one is not None. If
53
+ `use_bert_config` == True, the first config is valid. Otherwise
54
+ `bert_config` == None.
55
+ """
56
+ if use_bert_config:
57
+ bert_config = configs.BertConfig(
58
+ vocab_size=vocab_size,
59
+ hidden_size=hidden_size,
60
+ intermediate_size=32,
61
+ max_position_embeddings=128,
62
+ num_attention_heads=2,
63
+ num_hidden_layers=num_hidden_layers)
64
+ encoder_config = None
65
+ else:
66
+ bert_config = None
67
+ if encoder_type == "albert":
68
+ encoder_config = encoders.EncoderConfig(
69
+ type="albert",
70
+ albert=encoders.AlbertEncoderConfig(
71
+ vocab_size=vocab_size,
72
+ embedding_width=16,
73
+ hidden_size=hidden_size,
74
+ intermediate_size=32,
75
+ max_position_embeddings=128,
76
+ num_attention_heads=2,
77
+ num_layers=num_hidden_layers,
78
+ dropout_rate=0.1))
79
+ else:
80
+ # encoder_type can be 'bert' or 'bert_v2'.
81
+ model_config = encoders.BertEncoderConfig(
82
+ vocab_size=vocab_size,
83
+ embedding_size=16,
84
+ hidden_size=hidden_size,
85
+ intermediate_size=32,
86
+ max_position_embeddings=128,
87
+ num_attention_heads=2,
88
+ num_layers=num_hidden_layers,
89
+ dropout_rate=0.1)
90
+ kwargs = {"type": encoder_type, encoder_type: model_config}
91
+ encoder_config = encoders.EncoderConfig(**kwargs)
92
+
93
+ return bert_config, encoder_config
94
+
95
+
96
+ def _get_vocab_or_sp_model_dummy(temp_dir, use_sp_model):
97
+ """Returns tokenizer asset args for export_tfhub_lib.export_model()."""
98
+ dummy_file = os.path.join(temp_dir, "dummy_file.txt")
99
+ with tf.io.gfile.GFile(dummy_file, "w") as f:
100
+ f.write("dummy content")
101
+ if use_sp_model:
102
+ vocab_file, sp_model_file = None, dummy_file
103
+ else:
104
+ vocab_file, sp_model_file = dummy_file, None
105
+ return vocab_file, sp_model_file
106
+
107
+
108
+ def _read_asset(asset: tf.saved_model.Asset):
109
+ return tf.io.gfile.GFile(asset.asset_path.numpy()).read()
110
+
111
+
112
+ def _find_lambda_layers(layer):
113
+ """Returns list of all Lambda layers in a Keras model."""
114
+ if isinstance(layer, tf_keras.layers.Lambda):
115
+ return [layer]
116
+ elif hasattr(layer, "layers"): # It's nested, like a Model.
117
+ result = []
118
+ for l in layer.layers:
119
+ result += _find_lambda_layers(l)
120
+ return result
121
+ else:
122
+ return []
123
+
124
+
125
+ class ExportModelTest(tf.test.TestCase, parameterized.TestCase):
126
+ """Tests exporting a Transformer Encoder model as a SavedModel.
127
+
128
+ This covers export from an Encoder checkpoint to a SavedModel without
129
+ the .mlm subobject. This is no longer preferred, but still useful
130
+ for models like Electra that are trained without the MLM task.
131
+
132
+ The export code is generic. This test focuses on two main cases
133
+ (the most important ones in practice when this was written in 2020):
134
+ - BERT built from a legacy BertConfig, for use with BertTokenizer.
135
+ - ALBERT built from an EncoderConfig (as a representative of all other
136
+ choices beyond BERT, for use with SentencepieceTokenizer (the one
137
+ alternative to BertTokenizer).
138
+ """
139
+
140
+ @parameterized.named_parameters(
141
+ ("Bert_Legacy", True, None), ("Albert", False, "albert"),
142
+ ("BertEncoder", False, "bert"), ("BertEncoderV2", False, "bert_v2"))
143
+ def test_export_model(self, use_bert, encoder_type):
144
+ # Create the encoder and export it.
145
+ hidden_size = 16
146
+ num_hidden_layers = 1
147
+ bert_config, encoder_config = _get_bert_config_or_encoder_config(
148
+ use_bert,
149
+ hidden_size=hidden_size,
150
+ num_hidden_layers=num_hidden_layers,
151
+ encoder_type=encoder_type)
152
+ bert_model, encoder = export_tfhub_lib._create_model(
153
+ bert_config=bert_config, encoder_config=encoder_config, with_mlm=False)
154
+ self.assertEmpty(
155
+ _find_lambda_layers(bert_model),
156
+ "Lambda layers are non-portable since they serialize Python bytecode.")
157
+ model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")
158
+ checkpoint = tf.train.Checkpoint(encoder=encoder)
159
+ checkpoint.save(os.path.join(model_checkpoint_dir, "test"))
160
+ model_checkpoint_path = tf.train.latest_checkpoint(model_checkpoint_dir)
161
+
162
+ vocab_file, sp_model_file = _get_vocab_or_sp_model_dummy(
163
+ self.get_temp_dir(), use_sp_model=not use_bert)
164
+ export_path = os.path.join(self.get_temp_dir(), "hub")
165
+ export_tfhub_lib.export_model(
166
+ export_path=export_path,
167
+ bert_config=bert_config,
168
+ encoder_config=encoder_config,
169
+ model_checkpoint_path=model_checkpoint_path,
170
+ with_mlm=False,
171
+ vocab_file=vocab_file,
172
+ sp_model_file=sp_model_file,
173
+ do_lower_case=True)
174
+
175
+ # Restore the exported model.
176
+ hub_layer = hub.KerasLayer(export_path, trainable=True)
177
+
178
+ # Check legacy tokenization data.
179
+ if use_bert:
180
+ self.assertTrue(hub_layer.resolved_object.do_lower_case.numpy())
181
+ self.assertEqual("dummy content",
182
+ _read_asset(hub_layer.resolved_object.vocab_file))
183
+ self.assertFalse(hasattr(hub_layer.resolved_object, "sp_model_file"))
184
+ else:
185
+ self.assertFalse(hasattr(hub_layer.resolved_object, "do_lower_case"))
186
+ self.assertFalse(hasattr(hub_layer.resolved_object, "vocab_file"))
187
+ self.assertEqual("dummy content",
188
+ _read_asset(hub_layer.resolved_object.sp_model_file))
189
+
190
+ # Check restored weights.
191
+ self.assertEqual(
192
+ len(bert_model.trainable_weights), len(hub_layer.trainable_weights))
193
+ for source_weight, hub_weight in zip(bert_model.trainable_weights,
194
+ hub_layer.trainable_weights):
195
+ self.assertAllClose(source_weight.numpy(), hub_weight.numpy())
196
+
197
+ # Check computation.
198
+ seq_length = 10
199
+ dummy_ids = np.zeros((2, seq_length), dtype=np.int32)
200
+ input_dict = dict(
201
+ input_word_ids=dummy_ids,
202
+ input_mask=dummy_ids,
203
+ input_type_ids=dummy_ids)
204
+ hub_output = hub_layer(input_dict)
205
+ source_output = bert_model(input_dict)
206
+ encoder_output = encoder(input_dict)
207
+ self.assertEqual(hub_output["pooled_output"].shape, (2, hidden_size))
208
+ self.assertEqual(hub_output["sequence_output"].shape,
209
+ (2, seq_length, hidden_size))
210
+ self.assertLen(hub_output["encoder_outputs"], num_hidden_layers)
211
+
212
+ for key in ("pooled_output", "sequence_output", "encoder_outputs"):
213
+ self.assertAllClose(source_output[key], hub_output[key])
214
+ self.assertAllClose(source_output[key], encoder_output[key])
215
+
216
+ # The "default" output of BERT as a text representation is pooled_output.
217
+ self.assertAllClose(hub_output["pooled_output"], hub_output["default"])
218
+
219
+ # Test that training=True makes a difference (activates dropout).
220
+ def _dropout_mean_stddev(training, num_runs=20):
221
+ input_ids = np.array([[14, 12, 42, 95, 99]], np.int32)
222
+ input_dict = dict(
223
+ input_word_ids=input_ids,
224
+ input_mask=np.ones_like(input_ids),
225
+ input_type_ids=np.zeros_like(input_ids))
226
+ outputs = np.concatenate([
227
+ hub_layer(input_dict, training=training)["pooled_output"]
228
+ for _ in range(num_runs)
229
+ ])
230
+ return np.mean(np.std(outputs, axis=0))
231
+
232
+ self.assertLess(_dropout_mean_stddev(training=False), 1e-6)
233
+ self.assertGreater(_dropout_mean_stddev(training=True), 1e-3)
234
+
235
+ # Test propagation of seq_length in shape inference.
236
+ input_word_ids = tf_keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
237
+ input_mask = tf_keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
238
+ input_type_ids = tf_keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
239
+ input_dict = dict(
240
+ input_word_ids=input_word_ids,
241
+ input_mask=input_mask,
242
+ input_type_ids=input_type_ids)
243
+ output_dict = hub_layer(input_dict)
244
+ pooled_output = output_dict["pooled_output"]
245
+ sequence_output = output_dict["sequence_output"]
246
+ encoder_outputs = output_dict["encoder_outputs"]
247
+
248
+ self.assertEqual(pooled_output.shape.as_list(), [None, hidden_size])
249
+ self.assertEqual(sequence_output.shape.as_list(),
250
+ [None, seq_length, hidden_size])
251
+ self.assertLen(encoder_outputs, num_hidden_layers)
252
+
253
+
254
+ class ExportModelWithMLMTest(tf.test.TestCase, parameterized.TestCase):
255
+ """Tests exporting a Transformer Encoder model as a SavedModel.
256
+
257
+ This covers export from a Pretrainer checkpoint to a SavedModel including
258
+ the .mlm subobject, which is the preferred way since 2020.
259
+
260
+ The export code is generic. This test focuses on two main cases
261
+ (the most important ones in practice when this was written in 2020):
262
+ - BERT built from a legacy BertConfig, for use with BertTokenizer.
263
+ - ALBERT built from an EncoderConfig (as a representative of all other
264
+ choices beyond BERT, for use with SentencepieceTokenizer (the one
265
+ alternative to BertTokenizer).
266
+ """
267
+
268
+ def test_copy_pooler_dense_to_encoder(self):
269
+ encoder_config = encoders.EncoderConfig(
270
+ type="bert",
271
+ bert=encoders.BertEncoderConfig(
272
+ hidden_size=24, intermediate_size=48, num_layers=2))
273
+ cls_heads = [
274
+ layers.ClassificationHead(
275
+ inner_dim=24, num_classes=2, name="next_sentence")
276
+ ]
277
+ encoder = encoders.build_encoder(encoder_config)
278
+ pretrainer = models.BertPretrainerV2(
279
+ encoder_network=encoder,
280
+ classification_heads=cls_heads,
281
+ mlm_activation=tf_utils.get_activation(
282
+ encoder_config.get().hidden_activation))
283
+ # Makes sure the pretrainer variables are created.
284
+ _ = pretrainer(pretrainer.inputs)
285
+ checkpoint = tf.train.Checkpoint(**pretrainer.checkpoint_items)
286
+ model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")
287
+ checkpoint.save(os.path.join(model_checkpoint_dir, "test"))
288
+
289
+ vocab_file, sp_model_file = _get_vocab_or_sp_model_dummy(
290
+ self.get_temp_dir(), use_sp_model=True)
291
+ export_path = os.path.join(self.get_temp_dir(), "hub")
292
+ export_tfhub_lib.export_model(
293
+ export_path=export_path,
294
+ encoder_config=encoder_config,
295
+ model_checkpoint_path=tf.train.latest_checkpoint(model_checkpoint_dir),
296
+ with_mlm=True,
297
+ copy_pooler_dense_to_encoder=True,
298
+ vocab_file=vocab_file,
299
+ sp_model_file=sp_model_file,
300
+ do_lower_case=True)
301
+ # Restores a hub KerasLayer.
302
+ hub_layer = hub.KerasLayer(export_path, trainable=True)
303
+ dummy_ids = np.zeros((2, 10), dtype=np.int32)
304
+ input_dict = dict(
305
+ input_word_ids=dummy_ids,
306
+ input_mask=dummy_ids,
307
+ input_type_ids=dummy_ids)
308
+ hub_pooled_output = hub_layer(input_dict)["pooled_output"]
309
+ encoder_outputs = encoder(input_dict)
310
+ # Verify that hub_layer's pooled_output is the same as the output of next
311
+ # sentence prediction's dense layer.
312
+ pretrained_pooled_output = cls_heads[0].dense(
313
+ (encoder_outputs["sequence_output"][:, 0, :]))
314
+ self.assertAllClose(hub_pooled_output, pretrained_pooled_output)
315
+ # But the pooled_output between encoder and hub_layer are not the same.
316
+ encoder_pooled_output = encoder_outputs["pooled_output"]
317
+ self.assertNotAllClose(hub_pooled_output, encoder_pooled_output)
318
+
319
+ @parameterized.named_parameters(
320
+ ("Bert", True),
321
+ ("Albert", False),
322
+ )
323
+ def test_export_model_with_mlm(self, use_bert):
324
+ # Create the encoder and export it.
325
+ hidden_size = 16
326
+ num_hidden_layers = 2
327
+ bert_config, encoder_config = _get_bert_config_or_encoder_config(
328
+ use_bert, hidden_size, num_hidden_layers)
329
+ bert_model, pretrainer = export_tfhub_lib._create_model(
330
+ bert_config=bert_config, encoder_config=encoder_config, with_mlm=True)
331
+ self.assertEmpty(
332
+ _find_lambda_layers(bert_model),
333
+ "Lambda layers are non-portable since they serialize Python bytecode.")
334
+ bert_model_with_mlm = bert_model.mlm
335
+ model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")
336
+
337
+ checkpoint = tf.train.Checkpoint(**pretrainer.checkpoint_items)
338
+
339
+ checkpoint.save(os.path.join(model_checkpoint_dir, "test"))
340
+ model_checkpoint_path = tf.train.latest_checkpoint(model_checkpoint_dir)
341
+
342
+ vocab_file, sp_model_file = _get_vocab_or_sp_model_dummy(
343
+ self.get_temp_dir(), use_sp_model=not use_bert)
344
+ export_path = os.path.join(self.get_temp_dir(), "hub")
345
+ export_tfhub_lib.export_model(
346
+ export_path=export_path,
347
+ bert_config=bert_config,
348
+ encoder_config=encoder_config,
349
+ model_checkpoint_path=model_checkpoint_path,
350
+ with_mlm=True,
351
+ vocab_file=vocab_file,
352
+ sp_model_file=sp_model_file,
353
+ do_lower_case=True)
354
+
355
+ # Restore the exported model.
356
+ hub_layer = hub.KerasLayer(export_path, trainable=True)
357
+
358
+ # Check legacy tokenization data.
359
+ if use_bert:
360
+ self.assertTrue(hub_layer.resolved_object.do_lower_case.numpy())
361
+ self.assertEqual("dummy content",
362
+ _read_asset(hub_layer.resolved_object.vocab_file))
363
+ self.assertFalse(hasattr(hub_layer.resolved_object, "sp_model_file"))
364
+ else:
365
+ self.assertFalse(hasattr(hub_layer.resolved_object, "do_lower_case"))
366
+ self.assertFalse(hasattr(hub_layer.resolved_object, "vocab_file"))
367
+ self.assertEqual("dummy content",
368
+ _read_asset(hub_layer.resolved_object.sp_model_file))
369
+
370
+ # Check restored weights.
371
+ # Note that we set `_auto_track_sub_layers` to False when exporting the
372
+ # SavedModel, so hub_layer has the same number of weights as bert_model;
373
+ # otherwise, hub_layer will have extra weights from its `mlm` subobject.
374
+ self.assertEqual(
375
+ len(bert_model.trainable_weights), len(hub_layer.trainable_weights))
376
+ for source_weight, hub_weight in zip(bert_model.trainable_weights,
377
+ hub_layer.trainable_weights):
378
+ self.assertAllClose(source_weight, hub_weight)
379
+
380
+ # Check computation.
381
+ seq_length = 10
382
+ dummy_ids = np.zeros((2, seq_length), dtype=np.int32)
383
+ input_dict = dict(
384
+ input_word_ids=dummy_ids,
385
+ input_mask=dummy_ids,
386
+ input_type_ids=dummy_ids)
387
+ hub_outputs_dict = hub_layer(input_dict)
388
+ source_outputs_dict = bert_model(input_dict)
389
+ encoder_outputs_dict = pretrainer.encoder_network(
390
+ [dummy_ids, dummy_ids, dummy_ids])
391
+ self.assertEqual(hub_outputs_dict["pooled_output"].shape, (2, hidden_size))
392
+ self.assertEqual(hub_outputs_dict["sequence_output"].shape,
393
+ (2, seq_length, hidden_size))
394
+ for output_key in ("pooled_output", "sequence_output", "encoder_outputs"):
395
+ self.assertAllClose(source_outputs_dict[output_key],
396
+ hub_outputs_dict[output_key])
397
+ self.assertAllClose(source_outputs_dict[output_key],
398
+ encoder_outputs_dict[output_key])
399
+
400
+ # The "default" output of BERT as a text representation is pooled_output.
401
+ self.assertAllClose(hub_outputs_dict["pooled_output"],
402
+ hub_outputs_dict["default"])
403
+
404
+ # Test that training=True makes a difference (activates dropout).
405
+ def _dropout_mean_stddev(training, num_runs=20):
406
+ input_ids = np.array([[14, 12, 42, 95, 99]], np.int32)
407
+ input_dict = dict(
408
+ input_word_ids=input_ids,
409
+ input_mask=np.ones_like(input_ids),
410
+ input_type_ids=np.zeros_like(input_ids))
411
+ outputs = np.concatenate([
412
+ hub_layer(input_dict, training=training)["pooled_output"]
413
+ for _ in range(num_runs)
414
+ ])
415
+ return np.mean(np.std(outputs, axis=0))
416
+
417
+ self.assertLess(_dropout_mean_stddev(training=False), 1e-6)
418
+ self.assertGreater(_dropout_mean_stddev(training=True), 1e-3)
419
+
420
+ # Checks sub-object `mlm`.
421
+ self.assertTrue(hasattr(hub_layer.resolved_object, "mlm"))
422
+
423
+ self.assertLen(hub_layer.resolved_object.mlm.trainable_variables,
424
+ len(bert_model_with_mlm.trainable_weights))
425
+ self.assertLen(hub_layer.resolved_object.mlm.trainable_variables,
426
+ len(pretrainer.trainable_weights))
427
+ for source_weight, hub_weight, pretrainer_weight in zip(
428
+ bert_model_with_mlm.trainable_weights,
429
+ hub_layer.resolved_object.mlm.trainable_variables,
430
+ pretrainer.trainable_weights):
431
+ self.assertAllClose(source_weight, hub_weight)
432
+ self.assertAllClose(source_weight, pretrainer_weight)
433
+
434
+ max_predictions_per_seq = 4
435
+ mlm_positions = np.zeros((2, max_predictions_per_seq), dtype=np.int32)
436
+ input_dict = dict(
437
+ input_word_ids=dummy_ids,
438
+ input_mask=dummy_ids,
439
+ input_type_ids=dummy_ids,
440
+ masked_lm_positions=mlm_positions)
441
+ hub_mlm_outputs_dict = hub_layer.resolved_object.mlm(input_dict)
442
+ source_mlm_outputs_dict = bert_model_with_mlm(input_dict)
443
+ for output_key in ("pooled_output", "sequence_output", "mlm_logits",
444
+ "encoder_outputs"):
445
+ self.assertAllClose(hub_mlm_outputs_dict[output_key],
446
+ source_mlm_outputs_dict[output_key])
447
+
448
+ pretrainer_mlm_logits_output = pretrainer(input_dict)["mlm_logits"]
449
+ self.assertAllClose(hub_mlm_outputs_dict["mlm_logits"],
450
+ pretrainer_mlm_logits_output)
451
+
452
+ # Test that training=True makes a difference (activates dropout).
453
+ def _dropout_mean_stddev_mlm(training, num_runs=20):
454
+ input_ids = np.array([[14, 12, 42, 95, 99]], np.int32)
455
+ mlm_position_ids = np.array([[1, 2, 3, 4]], np.int32)
456
+ input_dict = dict(
457
+ input_word_ids=input_ids,
458
+ input_mask=np.ones_like(input_ids),
459
+ input_type_ids=np.zeros_like(input_ids),
460
+ masked_lm_positions=mlm_position_ids)
461
+ outputs = np.concatenate([
462
+ hub_layer.resolved_object.mlm(input_dict,
463
+ training=training)["pooled_output"]
464
+ for _ in range(num_runs)
465
+ ])
466
+ return np.mean(np.std(outputs, axis=0))
467
+
468
+ self.assertLess(_dropout_mean_stddev_mlm(training=False), 1e-6)
469
+ self.assertGreater(_dropout_mean_stddev_mlm(training=True), 1e-3)
470
+
471
+ # Test propagation of seq_length in shape inference.
472
+ input_word_ids = tf_keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
473
+ input_mask = tf_keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
474
+ input_type_ids = tf_keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
475
+ input_dict = dict(
476
+ input_word_ids=input_word_ids,
477
+ input_mask=input_mask,
478
+ input_type_ids=input_type_ids)
479
+ hub_outputs_dict = hub_layer(input_dict)
480
+ self.assertEqual(hub_outputs_dict["pooled_output"].shape.as_list(),
481
+ [None, hidden_size])
482
+ self.assertEqual(hub_outputs_dict["sequence_output"].shape.as_list(),
483
+ [None, seq_length, hidden_size])
484
+
485
+
486
+ _STRING_NOT_TO_LEAK = "private_path_component_"
+
+
+ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
+
+   def _make_vocab_file(self, vocab, filename="vocab.txt", add_mask_token=False):
+     """Creates wordpiece vocab file with given words plus special tokens.
+
+     The tokens of the resulting model are, in this order:
+         [PAD], [UNK], [CLS], [SEP], [MASK]*, ...vocab...
+     *=if requested by args.
+
+     This function also accepts wordpieces that start with the ## continuation
+     marker, but avoiding those makes this function interchangeable with
+     _make_sp_model_file(), up to the extra dimension returned by BertTokenizer.
+
+     Args:
+       vocab: a list of strings with the words or wordpieces to put into the
+         model's vocabulary. Do not include special tokens here.
+       filename: Optionally, a filename (relative to the temporary directory
+         created by this function).
+       add_mask_token: an optional bool, whether to include a [MASK] token.
+
+     Returns:
+       The absolute filename of the created vocab file.
+     """
+     full_vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]"
+                  ] + ["[MASK]"] * add_mask_token + vocab
+     path = os.path.join(
+         tempfile.mkdtemp(
+             dir=self.get_temp_dir(),  # New subdir each time.
+             prefix=_STRING_NOT_TO_LEAK),
+         filename)
+     with tf.io.gfile.GFile(path, "w") as f:
+       f.write("\n".join(full_vocab + [""]))
+     return path
+
+   def _make_sp_model_file(self, vocab, prefix="spm", add_mask_token=False):
+     """Creates Sentencepiece word model with given words plus special tokens.
+
+     The tokens of the resulting model are, in this order:
+         <pad>, <unk>, [CLS], [SEP], [MASK]*, ...vocab..., <s>, </s>
+     *=if requested by args.
+
+     The words in the input vocab are plain text, without the whitespace marker.
+     That makes this function interchangeable with _make_vocab_file().
+
+     Args:
+       vocab: a list of strings with the words to put into the model's
+         vocabulary. Do not include special tokens here.
+       prefix: an optional string, to change the filename prefix for the model
+         (relative to the temporary directory created by this function).
+       add_mask_token: an optional bool, whether to include a [MASK] token.
+
+     Returns:
+       The absolute filename of the created Sentencepiece model file.
+     """
+     model_prefix = os.path.join(
+         tempfile.mkdtemp(dir=self.get_temp_dir()),  # New subdir each time.
+         prefix)
+     input_file = model_prefix + "_train_input.txt"
+     # Create input text for training the sp model from the tokens provided.
+     # Repeat tokens, the earlier the more, because they are sorted by frequency.
+     input_text = []
+     for i, token in enumerate(vocab):
+       input_text.append(" ".join([token] * (len(vocab) - i)))
+     with tf.io.gfile.GFile(input_file, "w") as f:
+       f.write("\n".join(input_text + [""]))
+     control_symbols = "[CLS],[SEP]"
+     full_vocab_size = len(vocab) + 6  # <pad>, <unk>, [CLS], [SEP], <s>, </s>.
+     if add_mask_token:
+       control_symbols += ",[MASK]"
+       full_vocab_size += 1
+     flags = dict(
+         model_prefix=model_prefix,
+         model_type="word",
+         input=input_file,
+         pad_id=0,
+         unk_id=1,
+         control_symbols=control_symbols,
+         vocab_size=full_vocab_size,
+         bos_id=full_vocab_size - 2,
+         eos_id=full_vocab_size - 1)
+     SentencePieceTrainer.Train(" ".join(
+         ["--{}={}".format(k, v) for k, v in flags.items()]))
+     return model_prefix + ".model"
+
+   def _do_export(self,
+                  vocab,
+                  do_lower_case,
+                  default_seq_length=128,
+                  tokenize_with_offsets=True,
+                  use_sp_model=False,
+                  experimental_disable_assert=False,
+                  add_mask_token=False):
+     """Runs SavedModel export and returns the export_path."""
+     export_path = tempfile.mkdtemp(dir=self.get_temp_dir())
+     vocab_file = sp_model_file = None
+     if use_sp_model:
+       sp_model_file = self._make_sp_model_file(
+           vocab, add_mask_token=add_mask_token)
+     else:
+       vocab_file = self._make_vocab_file(vocab, add_mask_token=add_mask_token)
+     export_tfhub_lib.export_preprocessing(
+         export_path,
+         vocab_file=vocab_file,
+         sp_model_file=sp_model_file,
+         do_lower_case=do_lower_case,
+         tokenize_with_offsets=tokenize_with_offsets,
+         default_seq_length=default_seq_length,
+         experimental_disable_assert=experimental_disable_assert)
+     # Invalidate the original filename to verify loading from the SavedModel.
+     tf.io.gfile.remove(sp_model_file or vocab_file)
+     return export_path
+
+   def test_no_leaks(self):
+     """Tests not leaking the path to the original vocab file."""
+     path = self._do_export(["d", "ef", "abc", "xy"],
+                            do_lower_case=True,
+                            use_sp_model=False)
+     with tf.io.gfile.GFile(os.path.join(path, "saved_model.pb"), "rb") as f:
+       self.assertFalse(  # pylint: disable=g-generic-assert
+           _STRING_NOT_TO_LEAK.encode("ascii") in f.read())
+
+   @parameterized.named_parameters(("Bert", False), ("Sentencepiece", True))
+   def test_exported_callables(self, use_sp_model):
+     preprocess = tf.saved_model.load(
+         self._do_export(
+             ["d", "ef", "abc", "xy"],
+             do_lower_case=True,
+             # TODO(b/181866850): drop this.
+             tokenize_with_offsets=not use_sp_model,
+             # TODO(b/175369555): drop this.
+             experimental_disable_assert=True,
+             use_sp_model=use_sp_model))
+
+     def fold_dim(rt):
+       """Removes the word/subword distinction of BertTokenizer."""
+       return rt if use_sp_model else rt.merge_dims(1, 2)
+
+     # .tokenize()
+     inputs = tf.constant(["abc d ef", "ABC D EF d"])
+     token_ids = preprocess.tokenize(inputs)
+     self.assertAllEqual(
+         fold_dim(token_ids), tf.ragged.constant([[6, 4, 5], [6, 4, 5, 4]]))
+
+     special_tokens_dict = {
+         k: v.numpy().item()  # Expecting eager Tensor, converting to Python.
+         for k, v in preprocess.tokenize.get_special_tokens_dict().items()
+     }
+     self.assertDictEqual(
+         special_tokens_dict,
+         dict(
+             padding_id=0,
+             start_of_sequence_id=2,
+             end_of_segment_id=3,
+             vocab_size=4 + 6 if use_sp_model else 4 + 4))
+
+     # .tokenize_with_offsets()
+     if use_sp_model:
+       # TODO(b/181866850): Enable tokenize_with_offsets when it works and test.
+       self.assertFalse(hasattr(preprocess, "tokenize_with_offsets"))
+     else:
+       token_ids, start_offsets, limit_offsets = (
+           preprocess.tokenize_with_offsets(inputs))
+       self.assertAllEqual(
+           fold_dim(token_ids), tf.ragged.constant([[6, 4, 5], [6, 4, 5, 4]]))
+       self.assertAllEqual(
+           fold_dim(start_offsets), tf.ragged.constant([[0, 4, 6], [0, 4, 6,
+                                                                    9]]))
+       self.assertAllEqual(
+           fold_dim(limit_offsets), tf.ragged.constant([[3, 5, 8], [3, 5, 8,
+                                                                    10]]))
+       self.assertIs(preprocess.tokenize.get_special_tokens_dict,
+                     preprocess.tokenize_with_offsets.get_special_tokens_dict)
+
+     # Root callable.
+     bert_inputs = preprocess(inputs)
+     self.assertAllEqual(bert_inputs["input_word_ids"].shape.as_list(), [2, 128])
+     self.assertAllEqual(
+         bert_inputs["input_word_ids"][:, :10],
+         tf.constant([[2, 6, 4, 5, 3, 0, 0, 0, 0, 0],
+                      [2, 6, 4, 5, 4, 3, 0, 0, 0, 0]]))
+     self.assertAllEqual(bert_inputs["input_mask"].shape.as_list(), [2, 128])
+     self.assertAllEqual(
+         bert_inputs["input_mask"][:, :10],
+         tf.constant([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
+                      [1, 1, 1, 1, 1, 1, 0, 0, 0, 0]]))
+     self.assertAllEqual(bert_inputs["input_type_ids"].shape.as_list(), [2, 128])
+     self.assertAllEqual(
+         bert_inputs["input_type_ids"][:, :10],
+         tf.constant([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
+
+     # .bert_pack_inputs()
+     inputs_2 = tf.constant(["d xy", "xy abc"])
+     token_ids_2 = preprocess.tokenize(inputs_2)
+     bert_inputs = preprocess.bert_pack_inputs([token_ids, token_ids_2],
+                                               seq_length=256)
+     self.assertAllEqual(bert_inputs["input_word_ids"].shape.as_list(), [2, 256])
+     self.assertAllEqual(
+         bert_inputs["input_word_ids"][:, :10],
+         tf.constant([[2, 6, 4, 5, 3, 4, 7, 3, 0, 0],
+                      [2, 6, 4, 5, 4, 3, 7, 6, 3, 0]]))
+     self.assertAllEqual(bert_inputs["input_mask"].shape.as_list(), [2, 256])
+     self.assertAllEqual(
+         bert_inputs["input_mask"][:, :10],
+         tf.constant([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
+                      [1, 1, 1, 1, 1, 1, 1, 1, 1, 0]]))
+     self.assertAllEqual(bert_inputs["input_type_ids"].shape.as_list(), [2, 256])
+     self.assertAllEqual(
+         bert_inputs["input_type_ids"][:, :10],
+         tf.constant([[0, 0, 0, 0, 0, 1, 1, 1, 0, 0],
+                      [0, 0, 0, 0, 0, 0, 1, 1, 1, 0]]))
+
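The callables exercised above (`tokenize`, `get_special_tokens_dict`, the root signature, and `bert_pack_inputs`) are the whole consumer-facing API of the preprocessor. A hedged usage sketch, with a hypothetical export path:

```
import tensorflow as tf

preprocess = tf.saved_model.load("/tmp/bert_preprocessing")  # hypothetical path

# Single segments: the root callable packs to the exported default_seq_length.
encoder_inputs = preprocess(tf.constant(["abc d ef"]))

# Segment pairs: tokenize each side, then pack to a custom seq_length.
first = preprocess.tokenize(tf.constant(["abc d"]))
second = preprocess.tokenize(tf.constant(["ef xy"]))
pair_inputs = preprocess.bert_pack_inputs([first, second], seq_length=64)
```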
+   # For BertTokenizer only: repeat relevant parts for do_lower_case=False,
+   # default_seq_length=10, experimental_disable_assert=False,
+   # tokenize_with_offsets=False, and without folding the word/subword dimension.
+   def test_cased_length10(self):
+     preprocess = tf.saved_model.load(
+         self._do_export(["d", "##ef", "abc", "ABC"],
+                         do_lower_case=False,
+                         default_seq_length=10,
+                         tokenize_with_offsets=False,
+                         use_sp_model=False,
+                         experimental_disable_assert=False))
+     inputs = tf.constant(["abc def", "ABC DEF"])
+     token_ids = preprocess.tokenize(inputs)
+     self.assertAllEqual(token_ids,
+                         tf.ragged.constant([[[6], [4, 5]], [[7], [1]]]))
+
+     self.assertFalse(hasattr(preprocess, "tokenize_with_offsets"))
+
+     bert_inputs = preprocess(inputs)
+     self.assertAllEqual(
+         bert_inputs["input_word_ids"],
+         tf.constant([[2, 6, 4, 5, 3, 0, 0, 0, 0, 0],
+                      [2, 7, 1, 3, 0, 0, 0, 0, 0, 0]]))
+     self.assertAllEqual(
+         bert_inputs["input_mask"],
+         tf.constant([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
+                      [1, 1, 1, 1, 0, 0, 0, 0, 0, 0]]))
+     self.assertAllEqual(
+         bert_inputs["input_type_ids"],
+         tf.constant([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
+
+     inputs_2 = tf.constant(["d ABC", "ABC abc"])
+     token_ids_2 = preprocess.tokenize(inputs_2)
+     bert_inputs = preprocess.bert_pack_inputs([token_ids, token_ids_2])
+     # Test default seq_length=10.
+     self.assertAllEqual(
+         bert_inputs["input_word_ids"],
+         tf.constant([[2, 6, 4, 5, 3, 4, 7, 3, 0, 0],
+                      [2, 7, 1, 3, 7, 6, 3, 0, 0, 0]]))
+     self.assertAllEqual(
+         bert_inputs["input_mask"],
+         tf.constant([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
+                      [1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]))
+     self.assertAllEqual(
+         bert_inputs["input_type_ids"],
+         tf.constant([[0, 0, 0, 0, 0, 1, 1, 1, 0, 0],
+                      [0, 0, 0, 0, 1, 1, 1, 0, 0, 0]]))
+
+   # XLA requires fixed shapes for tensors found in graph mode.
+   # Statically known shapes in Python are a particularly firm way to
+   # guarantee that, and they are generally more convenient to work with.
+   # We test that the exported SavedModel plays well with TF's shape
+   # inference when applied to fully or partially known input shapes.
+   @parameterized.named_parameters(("Bert", False), ("Sentencepiece", True))
+   def test_shapes(self, use_sp_model):
+     preprocess = tf.saved_model.load(
+         self._do_export(
+             ["abc", "def"],
+             do_lower_case=True,
+             # TODO(b/181866850): drop this.
+             tokenize_with_offsets=not use_sp_model,
+             # TODO(b/175369555): drop this.
+             experimental_disable_assert=True,
+             use_sp_model=use_sp_model))
+
+     def expected_bert_input_shapes(batch_size, seq_length):
+       return dict(
+           input_word_ids=[batch_size, seq_length],
+           input_mask=[batch_size, seq_length],
+           input_type_ids=[batch_size, seq_length])
+
+     for batch_size in [7, None]:
+       if use_sp_model:
+         token_out_shape = [batch_size, None]  # No word/subword distinction.
+       else:
+         token_out_shape = [batch_size, None, None]
+       self.assertEqual(
+           _result_shapes_in_tf_function(preprocess.tokenize,
+                                         tf.TensorSpec([batch_size], tf.string)),
+           token_out_shape, "with batch_size=%s" % batch_size)
+       # TODO(b/181866850): Enable tokenize_with_offsets when it works and test.
+       if use_sp_model:
+         self.assertFalse(hasattr(preprocess, "tokenize_with_offsets"))
+       else:
+         self.assertEqual(
+             _result_shapes_in_tf_function(
+                 preprocess.tokenize_with_offsets,
+                 tf.TensorSpec([batch_size], tf.string)), [token_out_shape] * 3,
+             "with batch_size=%s" % batch_size)
+       self.assertEqual(
+           _result_shapes_in_tf_function(
+               preprocess.bert_pack_inputs,
+               [tf.RaggedTensorSpec([batch_size, None, None], tf.int32)] * 2,
+               seq_length=256), expected_bert_input_shapes(batch_size, 256),
+           "with batch_size=%s" % batch_size)
+       self.assertEqual(
+           _result_shapes_in_tf_function(preprocess,
+                                         tf.TensorSpec([batch_size], tf.string)),
+           expected_bert_input_shapes(batch_size, 128),
+           "with batch_size=%s" % batch_size)
+
+   @parameterized.named_parameters(("Bert", False), ("Sentencepiece", True))
+   def test_reexport(self, use_sp_model):
+     """Test that preprocess keeps working after another save/load cycle."""
+     path1 = self._do_export(
+         ["d", "ef", "abc", "xy"],
+         do_lower_case=True,
+         default_seq_length=10,
+         tokenize_with_offsets=False,
+         experimental_disable_assert=True,  # TODO(b/175369555): drop this.
+         use_sp_model=use_sp_model)
+     path2 = path1.rstrip("/") + ".2"
+     model1 = tf.saved_model.load(path1)
+     tf.saved_model.save(model1, path2)
+     # Delete the first SavedModel to test that the second one loads by itself.
+     # https://github.com/tensorflow/tensorflow/issues/46456 reports such a
+     # failure case for BertTokenizer.
+     tf.io.gfile.rmtree(path1)
+     model2 = tf.saved_model.load(path2)
+
+     inputs = tf.constant(["abc d ef", "ABC D EF d"])
+     bert_inputs = model2(inputs)
+     self.assertAllEqual(
+         bert_inputs["input_word_ids"],
+         tf.constant([[2, 6, 4, 5, 3, 0, 0, 0, 0, 0],
+                      [2, 6, 4, 5, 4, 3, 0, 0, 0, 0]]))
+     self.assertAllEqual(
+         bert_inputs["input_mask"],
+         tf.constant([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
+                      [1, 1, 1, 1, 1, 1, 0, 0, 0, 0]]))
+     self.assertAllEqual(
+         bert_inputs["input_type_ids"],
+         tf.constant([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
+
+   @parameterized.named_parameters(("Bert", True), ("Albert", False))
+   def test_preprocessing_for_mlm(self, use_bert):
+     """Combines both SavedModel types and TF.text helpers for MLM."""
+     # Create the preprocessing SavedModel with a [MASK] token.
+     non_special_tokens = [
+         "hello", "world", "nice", "movie", "great", "actors", "quick", "fox",
+         "lazy", "dog"
+     ]
+
+     preprocess = tf.saved_model.load(
+         self._do_export(
+             non_special_tokens,
+             do_lower_case=True,
+             tokenize_with_offsets=use_bert,  # TODO(b/181866850): drop this.
+             experimental_disable_assert=True,  # TODO(b/175369555): drop this.
+             add_mask_token=True,
+             use_sp_model=not use_bert))
+     vocab_size = len(non_special_tokens) + (5 if use_bert else 7)
+
+     # Create the encoder SavedModel with an .mlm subobject.
+     hidden_size = 16
+     num_hidden_layers = 2
+     bert_config, encoder_config = _get_bert_config_or_encoder_config(
+         use_bert_config=use_bert,
+         hidden_size=hidden_size,
+         num_hidden_layers=num_hidden_layers,
+         vocab_size=vocab_size)
+     _, pretrainer = export_tfhub_lib._create_model(
+         bert_config=bert_config, encoder_config=encoder_config, with_mlm=True)
+     model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")
+     checkpoint = tf.train.Checkpoint(**pretrainer.checkpoint_items)
+     checkpoint.save(os.path.join(model_checkpoint_dir, "test"))
+     model_checkpoint_path = tf.train.latest_checkpoint(model_checkpoint_dir)
+     vocab_file, sp_model_file = _get_vocab_or_sp_model_dummy(  # Not used below.
+         self.get_temp_dir(), use_sp_model=not use_bert)
+     encoder_export_path = os.path.join(self.get_temp_dir(), "encoder_export")
+     export_tfhub_lib.export_model(
+         export_path=encoder_export_path,
+         bert_config=bert_config,
+         encoder_config=encoder_config,
+         model_checkpoint_path=model_checkpoint_path,
+         with_mlm=True,
+         vocab_file=vocab_file,
+         sp_model_file=sp_model_file,
+         do_lower_case=True)
+     encoder = tf.saved_model.load(encoder_export_path)
+
+     # Get special tokens from the vocab (and vocab size).
+     special_tokens_dict = preprocess.tokenize.get_special_tokens_dict()
+     self.assertEqual(int(special_tokens_dict["vocab_size"]), vocab_size)
+     padding_id = int(special_tokens_dict["padding_id"])
+     self.assertEqual(padding_id, 0)
+     start_of_sequence_id = int(special_tokens_dict["start_of_sequence_id"])
+     self.assertEqual(start_of_sequence_id, 2)
+     end_of_segment_id = int(special_tokens_dict["end_of_segment_id"])
+     self.assertEqual(end_of_segment_id, 3)
+     mask_id = int(special_tokens_dict["mask_id"])
+     self.assertEqual(mask_id, 4)
+
+     # A batch of 3 segment pairs.
+     raw_segments = [
+         tf.constant(["hello", "nice movie", "quick fox"]),
+         tf.constant(["world", "great actors", "lazy dog"])
+     ]
+     batch_size = 3
+
+     # Misc hyperparameters.
+     seq_length = 10
+     max_selections_per_seq = 2
+
+     # Tokenize inputs.
+     tokenized_segments = [preprocess.tokenize(s) for s in raw_segments]
+     # Trim inputs to eventually fit seq_length.
+     num_special_tokens = len(raw_segments) + 1
+     trimmed_segments = text.WaterfallTrimmer(
+         seq_length - num_special_tokens).trim(tokenized_segments)
+     # Combine input segments into one input sequence.
+     input_ids, segment_ids = text.combine_segments(
+         trimmed_segments,
+         start_of_sequence_id=start_of_sequence_id,
+         end_of_segment_id=end_of_segment_id)
+     # Apply random masking controlled by policy objects.
+     (masked_input_ids, masked_lm_positions,
+      masked_ids) = text.mask_language_model(
+          input_ids=input_ids,
+          item_selector=text.RandomItemSelector(
+              max_selections_per_seq,
+              selection_rate=0.5,  # Adjusted for the short test examples.
+              unselectable_ids=[start_of_sequence_id, end_of_segment_id]),
+          mask_values_chooser=text.MaskValuesChooser(
+              vocab_size=vocab_size,
+              mask_token=mask_id,
+              # Always put [MASK] to have a predictable result.
+              mask_token_rate=1.0,
+              random_token_rate=0.0))
+     # Pad to fixed-length Transformer encoder inputs.
+     input_word_ids, _ = text.pad_model_inputs(
+         masked_input_ids, seq_length, pad_value=padding_id)
+     input_type_ids, input_mask = text.pad_model_inputs(
+         segment_ids, seq_length, pad_value=0)
+     masked_lm_positions, _ = text.pad_model_inputs(
+         masked_lm_positions, max_selections_per_seq, pad_value=0)
+     masked_lm_positions = tf.cast(masked_lm_positions, tf.int32)
+     num_predictions = int(tf.shape(masked_lm_positions)[1])
+
+     # Test transformer inputs.
+     self.assertEqual(num_predictions, max_selections_per_seq)
+     expected_word_ids = np.array([
+         # [CLS] hello [SEP] world [SEP]
+         [2, 5, 3, 6, 3, 0, 0, 0, 0, 0],
+         # [CLS] nice movie [SEP] great actors [SEP]
+         [2, 7, 8, 3, 9, 10, 3, 0, 0, 0],
+         # [CLS] quick fox [SEP] lazy dog [SEP]
+         [2, 11, 12, 3, 13, 14, 3, 0, 0, 0]
+     ])
+     for i in range(batch_size):
+       for j in range(num_predictions):
+         k = int(masked_lm_positions[i, j])
+         if k != 0:
+           expected_word_ids[i, k] = 4  # [MASK]
+     self.assertAllEqual(input_word_ids, expected_word_ids)
+
+     # Call the MLM head of the Transformer encoder.
+     mlm_inputs = dict(
+         input_word_ids=input_word_ids,
+         input_mask=input_mask,
+         input_type_ids=input_type_ids,
+         masked_lm_positions=masked_lm_positions,
+     )
+     mlm_outputs = encoder.mlm(mlm_inputs)
+     self.assertEqual(mlm_outputs["pooled_output"].shape,
+                      (batch_size, hidden_size))
+     self.assertEqual(mlm_outputs["sequence_output"].shape,
+                      (batch_size, seq_length, hidden_size))
+     self.assertEqual(mlm_outputs["mlm_logits"].shape,
+                      (batch_size, num_predictions, vocab_size))
+     self.assertLen(mlm_outputs["encoder_outputs"], num_hidden_layers)
+
+     # A real trainer would now compute the loss of mlm_logits
+     # trying to predict the masked_ids.
+     del masked_ids  # Unused.
+
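The closing comment of the test above leaves the MLM loss step to the reader. A hedged sketch of that step (not part of the file), reusing the tensors from the test body; the weighted-average formulation is an assumption about how a trainer would ignore padded prediction slots:

```
# Pad the ragged gold ids for the masked positions; the second return value
# marks which prediction slots are real (1) versus padding (0).
masked_ids_dense, masked_weights = text.pad_model_inputs(
    masked_ids, max_selections_per_seq, pad_value=0)
per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=masked_ids_dense, logits=mlm_outputs["mlm_logits"])
masked_weights = tf.cast(masked_weights, per_example_loss.dtype)
loss = (tf.reduce_sum(per_example_loss * masked_weights) /
        tf.reduce_sum(masked_weights))
```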
+   @parameterized.named_parameters(("Bert", False), ("Sentencepiece", True))
+   def test_special_tokens_in_estimator(self, use_sp_model):
+     """Tests getting special tokens without an Eager init context."""
+     preprocess_export_path = self._do_export(["d", "ef", "abc", "xy"],
+                                              do_lower_case=True,
+                                              use_sp_model=use_sp_model,
+                                              tokenize_with_offsets=False)
+
+     def _get_special_tokens_dict(obj):
+       """Returns special tokens of restored tokenizer as Python values."""
+       if tf.executing_eagerly():
+         special_tokens_numpy = {
+             k: v.numpy() for k, v in obj.get_special_tokens_dict().items()
+         }
+       else:
+         with tf.Graph().as_default():
+           # This code expects `get_special_tokens_dict()` to be a tf.function
+           # with no dependencies (bound args) from the context it was loaded in,
+           # and boldly assumes that it can just be called in a different context.
+           special_tokens_tensors = obj.get_special_tokens_dict()
+           with tf.compat.v1.Session() as sess:
+             special_tokens_numpy = sess.run(special_tokens_tensors)
+       return {
+           k: v.item()  # Numpy to Python.
+           for k, v in special_tokens_numpy.items()
+       }
+
+     def input_fn():
+       self.assertFalse(tf.executing_eagerly())
+       # Build a preprocessing Model.
+       sentences = tf_keras.layers.Input(shape=[], dtype=tf.string)
+       preprocess = tf.saved_model.load(preprocess_export_path)
+       tokenize = hub.KerasLayer(preprocess.tokenize)
+       special_tokens_dict = _get_special_tokens_dict(tokenize.resolved_object)
+       for k, v in special_tokens_dict.items():
+         self.assertIsInstance(v, int, "Unexpected type for {}".format(k))
+       tokens = tokenize(sentences)
+       packed_inputs = layers.BertPackInputs(
+           4, special_tokens_dict=special_tokens_dict)(
+               tokens)
+       preprocessing = tf_keras.Model(sentences, packed_inputs)
+       # Map the dataset.
+       ds = tf.data.Dataset.from_tensors(
+           (tf.constant(["abc", "D EF"]), tf.constant([0, 1])))
+       ds = ds.map(lambda features, labels: (preprocessing(features), labels))
+       return ds
+
+     def model_fn(features, labels, mode):
+       del labels  # Unused.
+       return tf_estimator.EstimatorSpec(
+           mode=mode, predictions=features["input_word_ids"])
+
+     estimator = tf_estimator.Estimator(model_fn=model_fn)
+     outputs = list(estimator.predict(input_fn))
+     self.assertAllEqual(outputs, np.array([[2, 6, 3, 0], [2, 4, 5, 3]]))
+
+   # TODO(b/175369555): Remove that code and its test.
+   @parameterized.named_parameters(("Bert", False), ("Sentencepiece", True))
+   def test_check_no_assert(self, use_sp_model):
+     """Tests the self-check during export without assertions."""
+     preprocess_export_path = self._do_export(["d", "ef", "abc", "xy"],
+                                              do_lower_case=True,
+                                              use_sp_model=use_sp_model,
+                                              tokenize_with_offsets=False,
+                                              experimental_disable_assert=False)
+     with self.assertRaisesRegex(AssertionError,
+                                 r"failed to suppress \d+ Assert ops"):
+       export_tfhub_lib._check_no_assert(preprocess_export_path)
+
+
+ def _result_shapes_in_tf_function(fn, *args, **kwargs):
+   """Returns shapes (as lists) observed on the result of `fn`.
+
+   Args:
+     fn: A callable.
+     *args: TensorSpecs for Tensor-valued arguments and actual values for
+       Python-valued arguments to fn.
+     **kwargs: Same for keyword arguments.
+
+   Returns:
+     The nest of partial tensor shapes (as lists) that is statically known
+     inside tf.function(fn)(*args, **kwargs) for the nest of its results.
+   """
+   # Use a captured mutable container for a side output from the wrapper.
+   uninitialized = "uninitialized!"
+   result_shapes_container = [uninitialized]
+   assert result_shapes_container[0] is uninitialized
+
+   @tf.function
+   def shape_reporting_wrapper(*args, **kwargs):
+     result = fn(*args, **kwargs)
+     result_shapes_container[0] = tf.nest.map_structure(
+         lambda x: x.shape.as_list(), result)
+     return result
+
+   shape_reporting_wrapper.get_concrete_function(*args, **kwargs)
+   assert result_shapes_container[0] is not uninitialized
+   return result_shapes_container[0]
+
+
+ if __name__ == "__main__":
+   tf.test.main()
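Taken together, the tests above cover the canonical consumer pattern: chaining the preprocessing and encoder SavedModels into one Keras model. A minimal end-to-end sketch (not part of the commit; both paths are hypothetical):

```
import tensorflow as tf, tf_keras
import tensorflow_hub as hub

# Raw strings in, BERT outputs out.
sentences = tf_keras.layers.Input(shape=(), dtype=tf.string)
encoder_inputs = hub.KerasLayer("/tmp/bert_preprocessing")(sentences)
outputs = hub.KerasLayer("/tmp/bert_model", trainable=True)(encoder_inputs)
embedding_model = tf_keras.Model(sentences, outputs["pooled_output"])
```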
squad_evaluate_v1_1.py ADDED
@@ -0,0 +1,106 @@
+ # Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """Evaluation of SQuAD predictions (version 1.1).
+
+ The functions are copied from
+ https://worksheets.codalab.org/rest/bundles/0xbcd57bee090b421c982906709c8c27e1/contents/blob/.
+
+ The SQuAD dataset is described in this paper:
+ SQuAD: 100,000+ Questions for Machine Comprehension of Text
+ Pranav Rajpurkar, Jian Zhang, Konstantin Lopyrev, Percy Liang
+ https://nlp.stanford.edu/pubs/rajpurkar2016squad.pdf
+ """
+
+ import collections
+ import re
+ import string
+
+ # pylint: disable=g-bad-import-order
+
+ from absl import logging
+ # pylint: enable=g-bad-import-order
+
+
+ def _normalize_answer(s):
+   """Lowers text and removes punctuation, articles and extra whitespace."""
+
+   def remove_articles(text):
+     return re.sub(r"\b(a|an|the)\b", " ", text)
+
+   def white_space_fix(text):
+     return " ".join(text.split())
+
+   def remove_punc(text):
+     exclude = set(string.punctuation)
+     return "".join(ch for ch in text if ch not in exclude)
+
+   def lower(text):
+     return text.lower()
+
+   return white_space_fix(remove_articles(remove_punc(lower(s))))
+
+
+ def _f1_score(prediction, ground_truth):
+   """Computes F1 score by comparing prediction to ground truth."""
+   prediction_tokens = _normalize_answer(prediction).split()
+   ground_truth_tokens = _normalize_answer(ground_truth).split()
+   prediction_counter = collections.Counter(prediction_tokens)
+   ground_truth_counter = collections.Counter(ground_truth_tokens)
+   common = prediction_counter & ground_truth_counter
+   num_same = sum(common.values())
+   if num_same == 0:
+     return 0
+   precision = 1.0 * num_same / len(prediction_tokens)
+   recall = 1.0 * num_same / len(ground_truth_tokens)
+   f1 = (2 * precision * recall) / (precision + recall)
+   return f1
+
+
+ def _exact_match_score(prediction, ground_truth):
+   """Checks if predicted answer exactly matches ground truth answer."""
+   return _normalize_answer(prediction) == _normalize_answer(ground_truth)
+
+
+ def _metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
+   """Computes the max over all metric scores."""
+   scores_for_ground_truths = []
+   for ground_truth in ground_truths:
+     score = metric_fn(prediction, ground_truth)
+     scores_for_ground_truths.append(score)
+   return max(scores_for_ground_truths)
+
+
+ def evaluate(dataset, predictions):
+   """Evaluates predictions for a dataset."""
+   f1 = exact_match = total = 0
+   for article in dataset:
+     for paragraph in article["paragraphs"]:
+       for qa in paragraph["qas"]:
+         total += 1
+         if qa["id"] not in predictions:
+           message = "Unanswered question " + qa["id"] + " will receive score 0."
+           logging.error(message)
+           continue
+         ground_truths = [entry["text"] for entry in qa["answers"]]
+         prediction = predictions[qa["id"]]
+         exact_match += _metric_max_over_ground_truths(_exact_match_score,
+                                                       prediction, ground_truths)
+         f1 += _metric_max_over_ground_truths(_f1_score, prediction,
+                                              ground_truths)
+
+   exact_match = exact_match / total
+   f1 = f1 / total
+
+   return {"exact_match": exact_match, "final_f1": f1}
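A hedged usage sketch for `evaluate` (not part of the file). Here `dataset` is the "data" list of a SQuAD 1.1 JSON file and `predictions` maps question ids to answer strings; the file paths are illustrative. Note that, unlike the original CodaLab script, this variant returns fractions in [0, 1] rather than percentages.

```
import json

with open("dev-v1.1.json") as f:  # illustrative path
  dataset = json.load(f)["data"]
with open("predictions.json") as f:  # {"question_id": "answer text", ...}
  predictions = json.load(f)

metrics = evaluate(dataset, predictions)
print(metrics)  # e.g. {"exact_match": 0.81, "final_f1": 0.88}
```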
squad_evaluate_v2_0.py ADDED
@@ -0,0 +1,249 @@
+ # Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """Evaluation script for SQuAD version 2.0.
+
+ The functions are copied and modified from
+ https://raw.githubusercontent.com/white127/SQUAD-2.0-bidaf/master/evaluate-v2.0.py
+
+ In addition to basic functionality, we also compute additional statistics and
+ plot precision-recall curves if an additional na_prob.json file is provided.
+ This file is expected to map question IDs to the model's predicted probability
+ that a question is unanswerable.
+ """
+
+ import collections
+ import re
+ import string
+
+ from absl import logging
+
+
+ def _make_qid_to_has_ans(dataset):
+   qid_to_has_ans = {}
+   for article in dataset:
+     for p in article['paragraphs']:
+       for qa in p['qas']:
+         qid_to_has_ans[qa['id']] = bool(qa['answers'])
+   return qid_to_has_ans
+
+
+ def _normalize_answer(s):
+   """Lowers text and removes punctuation, articles and extra whitespace."""
+   def remove_articles(text):
+     regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
+     return re.sub(regex, ' ', text)
+   def white_space_fix(text):
+     return ' '.join(text.split())
+   def remove_punc(text):
+     exclude = set(string.punctuation)
+     return ''.join(ch for ch in text if ch not in exclude)
+   def lower(text):
+     return text.lower()
+   return white_space_fix(remove_articles(remove_punc(lower(s))))
+
+
+ def _get_tokens(s):
+   if not s:
+     return []
+   return _normalize_answer(s).split()
+
+
+ def _compute_exact(a_gold, a_pred):
+   return int(_normalize_answer(a_gold) == _normalize_answer(a_pred))
+
+
+ def _compute_f1(a_gold, a_pred):
+   """Compute F1-score."""
+   gold_toks = _get_tokens(a_gold)
+   pred_toks = _get_tokens(a_pred)
+   common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
+   num_same = sum(common.values())
+   if not gold_toks or not pred_toks:
+     # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
+     return int(gold_toks == pred_toks)
+   if num_same == 0:
+     return 0
+   precision = 1.0 * num_same / len(pred_toks)
+   recall = 1.0 * num_same / len(gold_toks)
+   f1 = (2 * precision * recall) / (precision + recall)
+   return f1
+
+
+ def _get_raw_scores(dataset, predictions):
+   """Compute raw scores."""
+   exact_scores = {}
+   f1_scores = {}
+   for article in dataset:
+     for p in article['paragraphs']:
+       for qa in p['qas']:
+         qid = qa['id']
+         gold_answers = [a['text'] for a in qa['answers']
+                         if _normalize_answer(a['text'])]
+         if not gold_answers:
+           # For unanswerable questions, only correct answer is empty string
+           gold_answers = ['']
+         if qid not in predictions:
+           logging.error('Missing prediction for %s', qid)
+           continue
+         a_pred = predictions[qid]
+         # Take max over all gold answers
+         exact_scores[qid] = max(_compute_exact(a, a_pred) for a in gold_answers)
+         f1_scores[qid] = max(_compute_f1(a, a_pred) for a in gold_answers)
+   return exact_scores, f1_scores
+
+
+ def _apply_no_ans_threshold(
+     scores, na_probs, qid_to_has_ans, na_prob_thresh=1.0):
+   new_scores = {}
+   for qid, s in scores.items():
+     pred_na = na_probs[qid] > na_prob_thresh
+     if pred_na:
+       new_scores[qid] = float(not qid_to_has_ans[qid])
+     else:
+       new_scores[qid] = s
+   return new_scores
+
+
+ def _make_eval_dict(exact_scores, f1_scores, qid_list=None):
+   """Make evaluation result dictionary."""
+   if not qid_list:
+     total = len(exact_scores)
+     return collections.OrderedDict([
+         ('exact', 100.0 * sum(exact_scores.values()) / total),
+         ('f1', 100.0 * sum(f1_scores.values()) / total),
+         ('total', total),
+     ])
+   else:
+     total = len(qid_list)
+     return collections.OrderedDict([
+         ('exact', 100.0 * sum(exact_scores[k] for k in qid_list) / total),
+         ('f1', 100.0 * sum(f1_scores[k] for k in qid_list) / total),
+         ('total', total),
+     ])
+
+
+ def _merge_eval(main_eval, new_eval, prefix):
+   for k in new_eval:
+     main_eval['%s_%s' % (prefix, k)] = new_eval[k]
+
+
+ def _make_precision_recall_eval(scores, na_probs, num_true_pos, qid_to_has_ans):
+   """Make evaluation dictionary containing the average precision."""
+   qid_list = sorted(na_probs, key=lambda k: na_probs[k])
+   true_pos = 0.0
+   cur_p = 1.0
+   cur_r = 0.0
+   precisions = [1.0]
+   recalls = [0.0]
+   avg_prec = 0.0
+   for i, qid in enumerate(qid_list):
+     if qid_to_has_ans[qid]:
+       true_pos += scores[qid]
+     cur_p = true_pos / float(i + 1)
+     cur_r = true_pos / float(num_true_pos)
+     if i == len(qid_list) - 1 or na_probs[qid] != na_probs[qid_list[i + 1]]:
+       # i.e., if we can put a threshold after this point
+       avg_prec += cur_p * (cur_r - recalls[-1])
+       precisions.append(cur_p)
+       recalls.append(cur_r)
+   return {'ap': 100.0 * avg_prec}
+
+
+ def _run_precision_recall_analysis(
+     main_eval, exact_raw, f1_raw, na_probs, qid_to_has_ans):
+   """Run precision recall analysis and return result dictionary."""
+   num_true_pos = sum(1 for v in qid_to_has_ans.values() if v)
+   if num_true_pos == 0:
+     return
+   pr_exact = _make_precision_recall_eval(
+       exact_raw, na_probs, num_true_pos, qid_to_has_ans)
+   pr_f1 = _make_precision_recall_eval(
+       f1_raw, na_probs, num_true_pos, qid_to_has_ans)
+   oracle_scores = {k: float(v) for k, v in qid_to_has_ans.items()}
+   pr_oracle = _make_precision_recall_eval(
+       oracle_scores, na_probs, num_true_pos, qid_to_has_ans)
+   _merge_eval(main_eval, pr_exact, 'pr_exact')
+   _merge_eval(main_eval, pr_f1, 'pr_f1')
+   _merge_eval(main_eval, pr_oracle, 'pr_oracle')
+
+
+ def _find_best_thresh(predictions, scores, na_probs, qid_to_has_ans):
+   """Find the best threshold for no answer probability."""
+   num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
+   cur_score = num_no_ans
+   best_score = cur_score
+   best_thresh = 0.0
+   qid_list = sorted(na_probs, key=lambda k: na_probs[k])
+   for qid in qid_list:
+     if qid not in scores:
+       continue
+     if qid_to_has_ans[qid]:
+       diff = scores[qid]
+     else:
+       if predictions[qid]:
+         diff = -1
+       else:
+         diff = 0
+     cur_score += diff
+     if cur_score > best_score:
+       best_score = cur_score
+       best_thresh = na_probs[qid]
+   return 100.0 * best_score / len(scores), best_thresh
+
+
+ def _find_all_best_thresh(
+     main_eval, predictions, exact_raw, f1_raw, na_probs, qid_to_has_ans):
+   best_exact, exact_thresh = _find_best_thresh(
+       predictions, exact_raw, na_probs, qid_to_has_ans)
+   best_f1, f1_thresh = _find_best_thresh(
+       predictions, f1_raw, na_probs, qid_to_has_ans)
+   main_eval['final_exact'] = best_exact
+   main_eval['final_exact_thresh'] = exact_thresh
+   main_eval['final_f1'] = best_f1
+   main_eval['final_f1_thresh'] = f1_thresh
+
+
+ def evaluate(dataset, predictions, na_probs=None):
+   """Evaluate prediction results."""
+   new_orig_data = []
+   for article in dataset:
+     for p in article['paragraphs']:
+       for qa in p['qas']:
+         if qa['id'] in predictions:
+           new_para = {'qas': [qa]}
+           new_article = {'paragraphs': [new_para]}
+           new_orig_data.append(new_article)
+   dataset = new_orig_data
+
+   if na_probs is None:
+     na_probs = {k: 0.0 for k in predictions}
+   qid_to_has_ans = _make_qid_to_has_ans(dataset)  # maps qid to True/False
+   has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
+   no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]
+   exact_raw, f1_raw = _get_raw_scores(dataset, predictions)
+   exact_thresh = _apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans)
+   f1_thresh = _apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans)
+   out_eval = _make_eval_dict(exact_thresh, f1_thresh)
+   if has_ans_qids:
+     has_ans_eval = _make_eval_dict(
+         exact_thresh, f1_thresh, qid_list=has_ans_qids)
+     _merge_eval(out_eval, has_ans_eval, 'HasAns')
+   if no_ans_qids:
+     no_ans_eval = _make_eval_dict(exact_thresh, f1_thresh, qid_list=no_ans_qids)
+     _merge_eval(out_eval, no_ans_eval, 'NoAns')
+
+   _find_all_best_thresh(
+       out_eval, predictions, exact_raw, f1_raw, na_probs, qid_to_has_ans)
+   _run_precision_recall_analysis(
+       out_eval, exact_raw, f1_raw, na_probs, qid_to_has_ans)
+   return out_eval
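A hedged usage sketch for the 2.0 `evaluate` (not part of the file; paths are illustrative). `na_probs` maps each question id to the model's predicted probability that the question is unanswerable, matching the na_prob.json format described in the module docstring.

```
import json

with open("dev-v2.0.json") as f:  # illustrative paths
  dataset = json.load(f)["data"]
with open("predictions.json") as f:
  predictions = json.load(f)
with open("na_prob.json") as f:
  na_probs = json.load(f)

metrics = evaluate(dataset, predictions, na_probs=na_probs)
# Best scores and thresholds over the no-answer probability.
print(metrics["final_f1"], metrics["final_f1_thresh"])
```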
tf1_bert_checkpoint_converter_lib.py ADDED
@@ -0,0 +1,201 @@
+ # Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ r"""Convert checkpoints created by Estimator (tf1) to be Keras compatible."""
+
+ import numpy as np
+ import tensorflow.compat.v1 as tf  # TF 1.x
+
+ # Mapping between old <=> new names. The source pattern in the original
+ # variable name will be replaced by the destination pattern.
+ BERT_NAME_REPLACEMENTS = (
+     ("bert", "bert_model"),
+     ("embeddings/word_embeddings", "word_embeddings/embeddings"),
+     ("embeddings/token_type_embeddings",
+      "embedding_postprocessor/type_embeddings"),
+     ("embeddings/position_embeddings",
+      "embedding_postprocessor/position_embeddings"),
+     ("embeddings/LayerNorm", "embedding_postprocessor/layer_norm"),
+     ("attention/self", "self_attention"),
+     ("attention/output/dense", "self_attention_output"),
+     ("attention/output/LayerNorm", "self_attention_layer_norm"),
+     ("intermediate/dense", "intermediate"),
+     ("output/dense", "output"),
+     ("output/LayerNorm", "output_layer_norm"),
+     ("pooler/dense", "pooler_transform"),
+ )
+
+ BERT_V2_NAME_REPLACEMENTS = (
+     ("bert/", ""),
+     ("encoder", "transformer"),
+     ("embeddings/word_embeddings", "word_embeddings/embeddings"),
+     ("embeddings/token_type_embeddings", "type_embeddings/embeddings"),
+     ("embeddings/position_embeddings", "position_embedding/embeddings"),
+     ("embeddings/LayerNorm", "embeddings/layer_norm"),
+     ("attention/self", "self_attention"),
+     ("attention/output/dense", "self_attention/attention_output"),
+     ("attention/output/LayerNorm", "self_attention_layer_norm"),
+     ("intermediate/dense", "intermediate"),
+     ("output/dense", "output"),
+     ("output/LayerNorm", "output_layer_norm"),
+     ("pooler/dense", "pooler_transform"),
+     ("cls/predictions", "bert/cls/predictions"),
+     ("cls/predictions/output_bias", "cls/predictions/output_bias/bias"),
+     ("cls/seq_relationship/output_bias", "predictions/transform/logits/bias"),
+     ("cls/seq_relationship/output_weights",
+      "predictions/transform/logits/kernel"),
+ )
+
+ BERT_PERMUTATIONS = ()
+
+ BERT_V2_PERMUTATIONS = (("cls/seq_relationship/output_weights", (1, 0)),)
+
+
+ def _bert_name_replacement(var_name, name_replacements):
+   """Gets the variable name replacement."""
+   for src_pattern, tgt_pattern in name_replacements:
+     if src_pattern in var_name:
+       old_var_name = var_name
+       var_name = var_name.replace(src_pattern, tgt_pattern)
+       tf.logging.info("Converted: %s --> %s", old_var_name, var_name)
+   return var_name
+
+
+ def _has_exclude_patterns(name, exclude_patterns):
+   """Checks if a string contains substrings that match patterns to exclude."""
+   for p in exclude_patterns:
+     if p in name:
+       return True
+   return False
+
+
+ def _get_permutation(name, permutations):
+   """Checks whether a variable requires transposition by pattern matching."""
+   for src_pattern, permutation in permutations:
+     if src_pattern in name:
+       tf.logging.info("Permuted: %s --> %s", name, permutation)
+       return permutation
+
+   return None
+
+
+ def _get_new_shape(name, shape, num_heads):
+   """Checks whether a variable requires reshape by pattern matching."""
+   if "self_attention/attention_output/kernel" in name:
+     return tuple([num_heads, shape[0] // num_heads, shape[1]])
+   if "self_attention/attention_output/bias" in name:
+     return shape
+
+   patterns = [
+       "self_attention/query", "self_attention/value", "self_attention/key"
+   ]
+   for pattern in patterns:
+     if pattern in name:
+       if "kernel" in name:
+         return tuple([shape[0], num_heads, shape[1] // num_heads])
+       if "bias" in name:
+         return tuple([num_heads, shape[0] // num_heads])
+   return None
+
+
+ def create_v2_checkpoint(model,
+                          src_checkpoint,
+                          output_path,
+                          checkpoint_model_name="model"):
+   """Converts a name-based matched TF V1 checkpoint to TF V2 checkpoint."""
+   # Uses streaming-restore in eager mode to read V1 name-based checkpoints.
+   model.load_weights(src_checkpoint).assert_existing_objects_matched()
+   if hasattr(model, "checkpoint_items"):
+     checkpoint_items = model.checkpoint_items
+   else:
+     checkpoint_items = {}
+
+   checkpoint_items[checkpoint_model_name] = model
+   checkpoint = tf.train.Checkpoint(**checkpoint_items)
+   checkpoint.save(output_path)
+
+
+ def convert(checkpoint_from_path,
+             checkpoint_to_path,
+             num_heads,
+             name_replacements,
+             permutations,
+             exclude_patterns=None):
+   """Migrates the names of variables within a checkpoint.
+
+   Args:
+     checkpoint_from_path: Path to source checkpoint to be read in.
+     checkpoint_to_path: Path to checkpoint to be written out.
+     num_heads: The number of heads of the model.
+     name_replacements: A list of tuples of the form (match_str, replace_str)
+       describing variable names to adjust.
+     permutations: A list of tuples of the form (match_str, permutation)
+       describing permutations to apply to given variables. Note that match_str
+       should match the original variable name, not the replaced one.
+     exclude_patterns: A list of string patterns to exclude variables from
+       checkpoint conversion.
+
+   Returns:
+     A dictionary that maps the new variable names to the Variable objects.
+     A dictionary that maps the old variable names to the new variable names.
+   """
+   with tf.Graph().as_default():
+     tf.logging.info("Reading checkpoint_from_path %s", checkpoint_from_path)
+     reader = tf.train.NewCheckpointReader(checkpoint_from_path)
+     name_shape_map = reader.get_variable_to_shape_map()
+     new_variable_map = {}
+     conversion_map = {}
+     for var_name in name_shape_map:
+       if exclude_patterns and _has_exclude_patterns(var_name, exclude_patterns):
+         continue
+       # Get the original tensor data.
+       tensor = reader.get_tensor(var_name)
+
+       # Look up the new variable name, if any.
+       new_var_name = _bert_name_replacement(var_name, name_replacements)
+
+       # See if we need to reshape the underlying tensor.
+       new_shape = None
+       if num_heads > 0:
+         new_shape = _get_new_shape(new_var_name, tensor.shape, num_heads)
+       if new_shape:
+         tf.logging.info("Variable %s has a shape change from %s to %s",
+                         var_name, tensor.shape, new_shape)
+         tensor = np.reshape(tensor, new_shape)
+
+       # See if we need to permute the underlying tensor.
+       permutation = _get_permutation(var_name, permutations)
+       if permutation:
+         tensor = np.transpose(tensor, permutation)
+
+       # Create a new variable with the possibly-reshaped or transposed tensor.
+       var = tf.Variable(tensor, name=var_name)
+
+       # Save the variable into the new variable map.
+       new_variable_map[new_var_name] = var
+
+       # Keep a list of converted variables for sanity checking.
+       if new_var_name != var_name:
+         conversion_map[var_name] = new_var_name
+
+     saver = tf.train.Saver(new_variable_map)
+
+     with tf.Session() as sess:
+       sess.run(tf.global_variables_initializer())
+       tf.logging.info("Writing checkpoint_to_path %s", checkpoint_to_path)
+       saver.save(sess, checkpoint_to_path, write_meta_graph=False)
+
+   tf.logging.info("Summary:")
+   tf.logging.info("  Converted %d variable name(s).", len(new_variable_map))
+   tf.logging.info("  Converted: %s", str(conversion_map))
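A hedged sketch of calling this library directly to rename a TF1 BERT checkpoint (not part of the file; the paths and head count are illustrative). It uses only the names defined in the module above:

```
convert(
    checkpoint_from_path="/tmp/bert_v1/bert_model.ckpt",
    checkpoint_to_path="/tmp/bert_v1_renamed/ckpt",
    num_heads=12,  # must match the pretrained model's configuration
    name_replacements=BERT_V2_NAME_REPLACEMENTS,
    permutations=BERT_V2_PERMUTATIONS,
    exclude_patterns=["adam", "Adam"])  # drop optimizer slot variables
```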
tf2_albert_encoder_checkpoint_converter.py ADDED
@@ -0,0 +1,170 @@
+ # Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """A converter from a tf1 ALBERT encoder checkpoint to a tf2 encoder checkpoint.
+
+ The conversion will yield an object-oriented checkpoint that can be used
+ to restore an AlbertEncoder object.
+ """
+ import os
+
+ from absl import app
+ from absl import flags
+
+ import tensorflow as tf, tf_keras
+ from official.legacy.albert import configs
+ from official.modeling import tf_utils
+ from official.nlp.modeling import models
+ from official.nlp.modeling import networks
+ from official.nlp.tools import tf1_bert_checkpoint_converter_lib
+
+ FLAGS = flags.FLAGS
+
+ flags.DEFINE_string("albert_config_file", None,
+                     "Albert configuration file to define core bert layers.")
+ flags.DEFINE_string(
+     "checkpoint_to_convert", None,
+     "Initial checkpoint from a pretrained BERT model core (that is, only the "
+     "BertModel, with no task heads.)")
+ flags.DEFINE_string("converted_checkpoint_path", None,
+                     "Name for the created object-based V2 checkpoint.")
+ flags.DEFINE_string("checkpoint_model_name", "encoder",
+                     "The name of the model when saving the checkpoint, i.e., "
+                     "the checkpoint will be saved using: "
+                     "tf.train.Checkpoint(FLAGS.checkpoint_model_name=model).")
+ flags.DEFINE_enum(
+     "converted_model", "encoder", ["encoder", "pretrainer"],
+     "Whether to convert the checkpoint to an `AlbertEncoder` model or a "
+     "`BertPretrainerV2` model (with mlm but without classification heads).")
+
+
+ ALBERT_NAME_REPLACEMENTS = (
+     ("bert/encoder/", ""),
+     ("bert/", ""),
+     ("embeddings/word_embeddings", "word_embeddings/embeddings"),
+     ("embeddings/position_embeddings", "position_embedding/embeddings"),
+     ("embeddings/token_type_embeddings", "type_embeddings/embeddings"),
+     ("embeddings/LayerNorm", "embeddings/layer_norm"),
+     ("embedding_hidden_mapping_in", "embedding_projection"),
+     ("group_0/inner_group_0/", ""),
+     ("attention_1/self", "self_attention"),
+     ("attention_1/output/dense", "self_attention/attention_output"),
+     ("transformer/LayerNorm/", "transformer/self_attention_layer_norm/"),
+     ("ffn_1/intermediate/dense", "intermediate"),
+     ("ffn_1/intermediate/output/dense", "output"),
+     ("transformer/LayerNorm_1/", "transformer/output_layer_norm/"),
+     ("pooler/dense", "pooler_transform"),
+     ("cls/predictions", "bert/cls/predictions"),
+     ("cls/predictions/output_bias", "cls/predictions/output_bias/bias"),
+     ("cls/seq_relationship/output_bias", "predictions/transform/logits/bias"),
+     ("cls/seq_relationship/output_weights",
+      "predictions/transform/logits/kernel"),
+ )
+
+
+ def _create_albert_model(cfg):
+   """Creates an ALBERT keras core model from ALBERT configuration.
+
+   Args:
+     cfg: An `AlbertConfig` to create the core model.
+
+   Returns:
+     A keras model.
+   """
+   albert_encoder = networks.AlbertEncoder(
+       vocab_size=cfg.vocab_size,
+       hidden_size=cfg.hidden_size,
+       embedding_width=cfg.embedding_size,
+       num_layers=cfg.num_hidden_layers,
+       num_attention_heads=cfg.num_attention_heads,
+       intermediate_size=cfg.intermediate_size,
+       activation=tf_utils.get_activation(cfg.hidden_act),
+       dropout_rate=cfg.hidden_dropout_prob,
+       attention_dropout_rate=cfg.attention_probs_dropout_prob,
+       max_sequence_length=cfg.max_position_embeddings,
+       type_vocab_size=cfg.type_vocab_size,
+       initializer=tf_keras.initializers.TruncatedNormal(
+           stddev=cfg.initializer_range))
+   return albert_encoder
+
+
+ def _create_pretrainer_model(cfg):
+   """Creates a pretrainer with AlbertEncoder from ALBERT configuration.
+
+   Args:
+     cfg: An `AlbertConfig` to create the core model.
+
+   Returns:
+     A BertPretrainerV2 model.
+   """
+   albert_encoder = _create_albert_model(cfg)
+   pretrainer = models.BertPretrainerV2(
+       encoder_network=albert_encoder,
+       mlm_activation=tf_utils.get_activation(cfg.hidden_act),
+       mlm_initializer=tf_keras.initializers.TruncatedNormal(
+           stddev=cfg.initializer_range))
+   # Makes sure masked_lm layer's variables in pretrainer are created.
+   _ = pretrainer(pretrainer.inputs)
+   return pretrainer
+
+
+ def convert_checkpoint(bert_config, output_path, v1_checkpoint,
+                        checkpoint_model_name,
+                        converted_model="encoder"):
+   """Converts a V1 checkpoint into an OO V2 checkpoint."""
+   output_dir, _ = os.path.split(output_path)
+
+   # Create a temporary V1 name-converted checkpoint in the output directory.
+   temporary_checkpoint_dir = os.path.join(output_dir, "temp_v1")
+   temporary_checkpoint = os.path.join(temporary_checkpoint_dir, "ckpt")
+   tf1_bert_checkpoint_converter_lib.convert(
+       checkpoint_from_path=v1_checkpoint,
+       checkpoint_to_path=temporary_checkpoint,
+       num_heads=bert_config.num_attention_heads,
+       name_replacements=ALBERT_NAME_REPLACEMENTS,
+       permutations=tf1_bert_checkpoint_converter_lib.BERT_V2_PERMUTATIONS,
+       exclude_patterns=["adam", "Adam"])
+
+   # Create a V2 checkpoint from the temporary checkpoint.
+   if converted_model == "encoder":
+     model = _create_albert_model(bert_config)
+   elif converted_model == "pretrainer":
+     model = _create_pretrainer_model(bert_config)
+   else:
+     raise ValueError("Unsupported converted_model: %s" % converted_model)
+
+   tf1_bert_checkpoint_converter_lib.create_v2_checkpoint(
+       model, temporary_checkpoint, output_path, checkpoint_model_name)
+
+   # Clean up the temporary checkpoint, if it exists.
+   try:
+     tf.io.gfile.rmtree(temporary_checkpoint_dir)
+   except tf.errors.OpError:
+     # If it doesn't exist, we don't need to clean it up; continue.
+     pass
+
+
+ def main(_):
+   output_path = FLAGS.converted_checkpoint_path
+   v1_checkpoint = FLAGS.checkpoint_to_convert
+   checkpoint_model_name = FLAGS.checkpoint_model_name
+   converted_model = FLAGS.converted_model
+   albert_config = configs.AlbertConfig.from_json_file(FLAGS.albert_config_file)
+   convert_checkpoint(albert_config, output_path, v1_checkpoint,
+                      checkpoint_model_name,
+                      converted_model=converted_model)
+
+
+ if __name__ == "__main__":
+   app.run(main)
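Besides the flag-driven entry point, the converter can be driven from Python. A hedged sketch (paths are hypothetical), calling the functions defined in this file:

```
albert_config = configs.AlbertConfig.from_json_file("/tmp/albert_config.json")
convert_checkpoint(
    bert_config=albert_config,
    output_path="/tmp/albert_v2/ckpt",
    v1_checkpoint="/tmp/albert_v1/model.ckpt",
    checkpoint_model_name="encoder",
    converted_model="pretrainer")  # or "encoder" for just the AlbertEncoder
```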
tf2_bert_encoder_checkpoint_converter.py ADDED
@@ -0,0 +1,160 @@
+ # Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """A converter from a V1 BERT encoder checkpoint to a V2 encoder checkpoint.
+
+ The conversion will yield an object-oriented checkpoint that can be used
+ to restore a BertEncoder or BertPretrainerV2 object (see the
+ `converted_model` flag below).
+ """
+
+ import os
+
+ from absl import app
+ from absl import flags
+
+ import tensorflow as tf, tf_keras
+ from official.legacy.bert import configs
+ from official.modeling import tf_utils
+ from official.nlp.modeling import models
+ from official.nlp.modeling import networks
+ from official.nlp.tools import tf1_bert_checkpoint_converter_lib
+
+ FLAGS = flags.FLAGS
+
+ flags.DEFINE_string("bert_config_file", None,
+                     "Bert configuration file to define core bert layers.")
+ flags.DEFINE_string(
+     "checkpoint_to_convert", None,
+     "Initial checkpoint from a pretrained BERT model core (that is, only the "
+     "BertModel, with no task heads).")
+ flags.DEFINE_string("converted_checkpoint_path", None,
+                     "Name for the created object-based V2 checkpoint.")
+ flags.DEFINE_string("checkpoint_model_name", "encoder",
+                     "The name of the model when saving the checkpoint, i.e., "
+                     "the checkpoint will be saved using: "
+                     "tf.train.Checkpoint(FLAGS.checkpoint_model_name=model).")
+ flags.DEFINE_enum(
+     "converted_model", "encoder", ["encoder", "pretrainer"],
+     "Whether to convert the checkpoint to a `BertEncoder` model or a "
+     "`BertPretrainerV2` model (with mlm but without classification heads).")
+
+
+ def _create_bert_model(cfg):
+   """Creates a BERT keras core model from BERT configuration.
+
+   Args:
+     cfg: A `BertConfig` to create the core model.
+
+   Returns:
+     A BertEncoder network.
+   """
+   bert_encoder = networks.BertEncoder(
+       vocab_size=cfg.vocab_size,
+       hidden_size=cfg.hidden_size,
+       num_layers=cfg.num_hidden_layers,
+       num_attention_heads=cfg.num_attention_heads,
+       intermediate_size=cfg.intermediate_size,
+       activation=tf_utils.get_activation(cfg.hidden_act),
+       dropout_rate=cfg.hidden_dropout_prob,
+       attention_dropout_rate=cfg.attention_probs_dropout_prob,
+       max_sequence_length=cfg.max_position_embeddings,
+       type_vocab_size=cfg.type_vocab_size,
+       initializer=tf_keras.initializers.TruncatedNormal(
+           stddev=cfg.initializer_range),
+       embedding_width=cfg.embedding_size)
+
+   return bert_encoder
+
+
+ def _create_bert_pretrainer_model(cfg):
+   """Creates a BERT pretrainer model from BERT configuration.
+
+   Args:
+     cfg: A `BertConfig` to create the core model.
+
+   Returns:
+     A BertPretrainerV2 model.
+   """
+   bert_encoder = _create_bert_model(cfg)
+   pretrainer = models.BertPretrainerV2(
+       encoder_network=bert_encoder,
+       mlm_activation=tf_utils.get_activation(cfg.hidden_act),
+       mlm_initializer=tf_keras.initializers.TruncatedNormal(
+           stddev=cfg.initializer_range))
+   # Makes sure the pretrainer variables are created.
+   _ = pretrainer(pretrainer.inputs)
+   return pretrainer
+
+
+ def convert_checkpoint(bert_config,
+                        output_path,
+                        v1_checkpoint,
+                        checkpoint_model_name="model",
+                        converted_model="encoder"):
+   """Converts a V1 checkpoint into an OO V2 checkpoint."""
+   output_dir, _ = os.path.split(output_path)
+   tf.io.gfile.makedirs(output_dir)
+
+   # Create a temporary V1 name-converted checkpoint in the output directory.
+   temporary_checkpoint_dir = os.path.join(output_dir, "temp_v1")
+   temporary_checkpoint = os.path.join(temporary_checkpoint_dir, "ckpt")
+
+   tf1_bert_checkpoint_converter_lib.convert(
+       checkpoint_from_path=v1_checkpoint,
+       checkpoint_to_path=temporary_checkpoint,
+       num_heads=bert_config.num_attention_heads,
+       name_replacements=(
+           tf1_bert_checkpoint_converter_lib.BERT_V2_NAME_REPLACEMENTS),
+       permutations=tf1_bert_checkpoint_converter_lib.BERT_V2_PERMUTATIONS,
+       exclude_patterns=["adam", "Adam"])
+
+   if converted_model == "encoder":
+     model = _create_bert_model(bert_config)
+   elif converted_model == "pretrainer":
+     model = _create_bert_pretrainer_model(bert_config)
+   else:
+     raise ValueError("Unsupported converted_model: %s" % converted_model)
+
+   # Create a V2 checkpoint from the temporary checkpoint.
+   tf1_bert_checkpoint_converter_lib.create_v2_checkpoint(
+       model, temporary_checkpoint, output_path, checkpoint_model_name)
+
+   # Clean up the temporary checkpoint, if it exists.
+   try:
+     tf.io.gfile.rmtree(temporary_checkpoint_dir)
+   except tf.errors.OpError:
+     # If it doesn't exist, we don't need to clean it up; continue.
+     pass
+
+
+ def main(argv):
+   if len(argv) > 1:
+     raise app.UsageError("Too many command-line arguments.")
+
+   output_path = FLAGS.converted_checkpoint_path
+   v1_checkpoint = FLAGS.checkpoint_to_convert
+   checkpoint_model_name = FLAGS.checkpoint_model_name
+   converted_model = FLAGS.converted_model
+   bert_config = configs.BertConfig.from_json_file(FLAGS.bert_config_file)
+   convert_checkpoint(
+       bert_config=bert_config,
+       output_path=output_path,
+       v1_checkpoint=v1_checkpoint,
+       checkpoint_model_name=checkpoint_model_name,
+       converted_model=converted_model)
+
+
+ if __name__ == "__main__":
+   app.run(main)
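The `checkpoint_model_name` flag above fixes the attribute under which the model is saved (`tf.train.Checkpoint(<name>=model)`), so a consumer has to restore through the same attribute. A minimal sketch of restoring a converted encoder checkpoint using the helpers defined in this file, assuming the default name and placeholder paths:

```python
# Hypothetical restore of a checkpoint written by this converter with the
# default --checkpoint_model_name="encoder"; both paths are placeholders.
bert_config = configs.BertConfig.from_json_file("/tmp/bert_config.json")
encoder = _create_bert_model(bert_config)
status = tf.train.Checkpoint(encoder=encoder).restore("/tmp/bert_v2/ckpt")
status.assert_existing_objects_matched()  # fails if variables did not line up
```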
tokenization_test.py ADDED
@@ -0,0 +1,156 @@
+ # Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import os
+ import tempfile
+
+ import six
+ import tensorflow as tf, tf_keras
+
+ from official.nlp.tools import tokenization
+
+
+ class TokenizationTest(tf.test.TestCase):
+   """Tokenization test.
+
+   The implementation is forked from
+   https://github.com/google-research/bert/blob/master/tokenization_test.py.
+   """
+
+   def test_full_tokenizer(self):
+     vocab_tokens = [
+         "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un",
+         "runn", "##ing", ","
+     ]
+     with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
+       if six.PY2:
+         vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
+       else:
+         vocab_writer.write(
+             "".join([x + "\n" for x in vocab_tokens]).encode("utf-8"))
+
+       vocab_file = vocab_writer.name
+
+     tokenizer = tokenization.FullTokenizer(vocab_file)
+     os.unlink(vocab_file)
+
+     tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
+     self.assertAllEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
+
+     self.assertAllEqual(
+         tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
+
+   def test_chinese(self):
+     tokenizer = tokenization.BasicTokenizer()
+
+     self.assertAllEqual(
+         tokenizer.tokenize(u"ah\u535A\u63A8zz"),
+         [u"ah", u"\u535A", u"\u63A8", u"zz"])
+
+   def test_basic_tokenizer_lower(self):
+     tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
+
+     self.assertAllEqual(
+         tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "),
+         ["hello", "!", "how", "are", "you", "?"])
+     self.assertAllEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"])
+
+   def test_basic_tokenizer_no_lower(self):
+     tokenizer = tokenization.BasicTokenizer(do_lower_case=False)
+
+     self.assertAllEqual(
+         tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "),
+         ["HeLLo", "!", "how", "Are", "yoU", "?"])
+
+   def test_basic_tokenizer_no_split_on_punc(self):
+     tokenizer = tokenization.BasicTokenizer(
+         do_lower_case=True, split_on_punc=False)
+
+     self.assertAllEqual(
+         tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "),
+         ["hello!how", "are", "you?"])
+
+   def test_wordpiece_tokenizer(self):
+     vocab_tokens = [
+         "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un",
+         "runn", "##ing", "##!", "!"
+     ]
+
+     vocab = {}
+     for (i, token) in enumerate(vocab_tokens):
+       vocab[token] = i
+     tokenizer = tokenization.WordpieceTokenizer(vocab=vocab)
+
+     self.assertAllEqual(tokenizer.tokenize(""), [])
+
+     self.assertAllEqual(
+         tokenizer.tokenize("unwanted running"),
+         ["un", "##want", "##ed", "runn", "##ing"])
+
+     self.assertAllEqual(
+         tokenizer.tokenize("unwanted running !"),
+         ["un", "##want", "##ed", "runn", "##ing", "!"])
+
+     self.assertAllEqual(
+         tokenizer.tokenize("unwanted running!"),
+         ["un", "##want", "##ed", "runn", "##ing", "##!"])
+
+     self.assertAllEqual(
+         tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
+
+   def test_convert_tokens_to_ids(self):
+     vocab_tokens = [
+         "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un",
+         "runn", "##ing"
+     ]
+
+     vocab = {}
+     for (i, token) in enumerate(vocab_tokens):
+       vocab[token] = i
+
+     self.assertAllEqual(
+         tokenization.convert_tokens_to_ids(
+             vocab, ["un", "##want", "##ed", "runn", "##ing"]), [7, 4, 5, 8, 9])
+
+   def test_is_whitespace(self):
+     self.assertTrue(tokenization._is_whitespace(u" "))
+     self.assertTrue(tokenization._is_whitespace(u"\t"))
+     self.assertTrue(tokenization._is_whitespace(u"\r"))
+     self.assertTrue(tokenization._is_whitespace(u"\n"))
+     self.assertTrue(tokenization._is_whitespace(u"\u00A0"))
+
+     self.assertFalse(tokenization._is_whitespace(u"A"))
+     self.assertFalse(tokenization._is_whitespace(u"-"))
+
+   def test_is_control(self):
+     self.assertTrue(tokenization._is_control(u"\u0005"))
+
+     self.assertFalse(tokenization._is_control(u"A"))
+     self.assertFalse(tokenization._is_control(u" "))
+     self.assertFalse(tokenization._is_control(u"\t"))
+     self.assertFalse(tokenization._is_control(u"\r"))
+     self.assertFalse(tokenization._is_control(u"\U0001F4A9"))
+
+   def test_is_punctuation(self):
+     self.assertTrue(tokenization._is_punctuation(u"-"))
+     self.assertTrue(tokenization._is_punctuation(u"$"))
+     self.assertTrue(tokenization._is_punctuation(u"`"))
+     self.assertTrue(tokenization._is_punctuation(u"."))
+
+     self.assertFalse(tokenization._is_punctuation(u"A"))
+     self.assertFalse(tokenization._is_punctuation(u" "))
+
+
+ if __name__ == "__main__":
+   tf.test.main()
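These tests also double as usage documentation for the tokenizer API: `FullTokenizer` chains `BasicTokenizer` (case folding, punctuation splitting) with `WordpieceTokenizer` (greedy longest-match into `##`-prefixed subwords), and `convert_tokens_to_ids` maps the pieces into the vocabulary. A minimal usage sketch, assuming a real BERT vocabulary file at a placeholder path:

```python
# Illustrative round-trip with FullTokenizer; /tmp/vocab.txt is a
# placeholder for an actual one-token-per-line BERT vocabulary file.
from official.nlp.tools import tokenization

tokenizer = tokenization.FullTokenizer("/tmp/vocab.txt", do_lower_case=True)
tokens = tokenizer.tokenize("unwanted running")  # wordpieces, e.g. ["un", "##want", ...]
ids = tokenizer.convert_tokens_to_ids(tokens)    # vocabulary indices
```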