demo
- __pycache__/tasks.cpython-38.pyc +0 -0
- longt5/__pycache__/preprocessors.cpython-38.pyc +0 -0
- longt5/preprocessors.py +202 -0
- longt5_1_1_base.gin +61 -0
- tasks.py +1 -12
- train_long_base.sh +9 -0
__pycache__/tasks.cpython-38.pyc
CHANGED
Binary files a/__pycache__/tasks.cpython-38.pyc and b/__pycache__/tasks.cpython-38.pyc differ
longt5/__pycache__/preprocessors.cpython-38.pyc
ADDED
Binary file (4.8 kB).
longt5/preprocessors.py
ADDED
@@ -0,0 +1,202 @@
+# Copyright 2022 The LongT5 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Copyright 2022 Google LLC.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Preprocessors for long T5."""
+
+from pegasus.data import parsers
+import seqio
+import t5.data
+import tensorflow.compat.v2 as tf
+
+
+def _string_join(lst):
+  # Join on space, but collapse consecutive spaces.
+  out = tf.strings.join(lst, separator=' ')
+  return tf.strings.regex_replace(out, r'\s+', ' ')
+
+
+def _normalize_text(text):
+  """Lowercase and remove quotes from a TensorFlow string."""
+  text = tf.strings.lower(text)
+  text = tf.strings.regex_replace(text, "'(.*)'", r'\1')
+  return text
+
+
+@seqio.map_over_dataset
+def nq(x):
+  """Convert NQ TF examples to a text2text pair.
+
+  NQ produces examples with this form:
+    {'id_': <id>, 'title': <title>, 'context': <article>,
+     'question': <question>, 'answer': <answer>}
+  This function will return examples of the format:
+    {'inputs': 'question: <question> context: <article>',
+     'targets': '<answer>',
+     'id': <id>, 'question': <question>, 'context': <context>,
+     'answers': [<n answers>]}
+
+  Args:
+    x: an example to process.
+
+  Returns:
+    A preprocessed example with the format listed above.
+  """
+  inputs = _string_join(['question:', x['question'], 'context:', x['context']])
+
+  return {
+      'inputs': inputs,
+      'targets': x['answer'],
+      'id': x['id_'],
+      'context': x['context'],
+      'question': x['question'],
+      'answers': [x['answer']]
+  }
+
+
+@seqio.map_over_dataset
+def triviaqa(x, ignore_web=True, include_title=True):
+  """Convert TriviaQA TF examples to a text2text pair.
+
+  TriviaQA produces examples with this form:
+    {'entity_pages': {dict of wiki entities},
+     'search_results': <dict of web search results>,
+     'answer': {dict of all answers}, 'question': <question>,
+     'question_id': <question_id>, 'question_source': <question_source>}
+
+  This function will return examples of the format:
+    {'inputs': 'question: <question> context: <article>',
+     'targets': '<answer>',
+     'id': <id>, 'question': <question>, 'context': <context>,
+     'answers': [<n answers>]}
+
+  Args:
+    x: an example to process.
+    ignore_web: whether to ignore the web context
+    include_title: whether to include the title
+
+  Returns:
+    A preprocessed example with the format listed above.
+  """
+
+  question = _normalize_text(x['question'])
+
+  wiki_context = [_normalize_text(x['entity_pages']['wiki_context'])]
+  if include_title:
+    # Prepend the title to each context.
+    wiki_context = [_normalize_text(x['entity_pages']['title'])] + wiki_context
+  wiki_context = tf.transpose(tf.stack(wiki_context))
+  wiki_context = tf.strings.reduce_join(wiki_context, separator=' ')
+  context = wiki_context
+
+  if not ignore_web:
+    web_context = [_normalize_text(x['search_results']['search_context'])]
+    if include_title:
+      # Prepend the title to each context.
+      web_context = [_normalize_text(x['search_results']['title'])
+                    ] + web_context
+    web_context = tf.transpose(tf.stack(web_context))
+    web_context = tf.strings.reduce_join(web_context, separator=' ')
+    context = _string_join([wiki_context, web_context])
+
+  inputs = _string_join(['question:', question, 'context:', context])
+  targets = _normalize_text(x['answer']['value'])
+
+  return {
+      'inputs': inputs,
+      'targets': targets,
+      'id': x['question_id'],
+      'context': context,
+      'question': question,
+      'answers': x['answer']['aliases']
+  }
+
+
+# Preprocessor for PEGASUS type pretraining.
+# Sentences/words are masked/replaced with different strategies. Details at
+# https://arxiv.org/abs/1912.08777
+pegasus_parser, _ = parsers.string_features_for_pretraining_parser(
+    vocab_filename='gs://t5-data/vocabs/cc_all.32000.100extra/sentencepiece.model',
+    encoder_type='sentencepiece_noshift',  # Matches tokenizer used by T5.
+    max_input_len=4096,
+    max_target_len=910,
+    max_total_words=0,
+    parser_strategy='dynamic_rouge',
+    parser_masked_sentence_ratio=0.2,
+    parser_masked_words_ratio=0,
+    parser_mask_word_option_prob=[0.8, 0.1, 0.1],
+    parser_mask_sentence_option_prob=[.9, 0, .1, 0],
+    parser_rouge_ngrams_size=1,
+    parser_rouge_metric_type='F',
+    parser_rouge_compute_option='standard',
+    # The stopwords file used is here: https://gist.github.com/sebleier/554280
+    parser_rouge_stopwords_filename='',
+    shift_special_token_id=t5.data.DEFAULT_EXTRA_IDS - 2,  # 2 is for eos and pad.
+    mode='',
+    parser_rouge_noise_ratio=.2,
+    parser_dynamic_mask_min_ratio=.33,
+    input_feature='inputs',
+    pretrain_target_filter_min=0)
+
+
+@seqio.map_over_dataset
+def pegasus_parse(x):
+  """Parses an example with the Pegasus parser.
+
+  As input, this method receives:
+  {
+      'inputs': '<sent1> <sent2> .... <sentn>',
+      'targets': None
+  }
+  This function will return examples of the format:
+  {
+      'inputs': '<sent1> <mask> .... <sentn>',
+      'targets': '<sent2>'
+  }
+  though the returned example will have been tokenized with SPM and will
+  contain an EOS id at the end of both inputs and targets (as is also done in
+  T5).
+
+  Args:
+    x: an example to process.
+
+  Returns:
+    A preprocessed example, where some of the input is masked and copied to the
+    target. These values will have been tokenized with SPM.
+  """
+
+  # Add key 'supervised' as required by the Pegasus parser.
+  x['supervised'] = tf.constant(False, dtype=tf.bool)
+  # Parse the input. The Pegasus parser will return with some of the input
+  # masked and copied to the target (all having been tokenized).
+  parsed = pegasus_parser(x)
+  # Adjust outputs from the Pegasus parser to work with T5. This involves
+  # taking the elements at index 0 (to get the right shape needed) and casting
+  # from int64 to int32.
+  return {
+      'inputs': tf.cast(parsed['inputs'][0], tf.int32),
+      'targets': tf.cast(parsed['targets'][0], tf.int32)
+  }
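
The preprocessors above only map raw examples to text2text fields; to be usable for training they still need to be registered as seqio tasks. Below is a minimal sketch (not part of this commit) of how the new `nq` preprocessor might be wired up; the task name and TFDS source are placeholders, and the source is assumed to already yield the 'id_', 'title', 'context', 'question' and 'answer' keys that `nq` expects.

    import seqio

    from longt5 import preprocessors as longt5_preprocessors  # file added above
    from tasks import DEFAULT_OUTPUT_FEATURES  # defined in this repo's tasks.py

    seqio.TaskRegistry.add(
        'nq_text2text_demo',  # hypothetical task name
        source=seqio.TfdsDataSource(
            tfds_name='my_nq_variant:1.0.0',  # placeholder dataset with the fields above
            splits=['train', 'validation']),
        preprocessors=[
            longt5_preprocessors.nq,            # builds 'question: ... context: ...'
            seqio.preprocessors.tokenize,       # SentencePiece-tokenizes inputs/targets
            seqio.preprocessors.append_eos_after_trim,
        ],
        output_features=DEFAULT_OUTPUT_FEATURES,
        metric_fns=[])

The same pattern would apply to `triviaqa`. `pegasus_parse` already returns tokenized int32 ids, so a task built on it would omit the tokenize step.
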
longt5_1_1_base.gin
ADDED
@@ -0,0 +1,61 @@
+# LongT5 Base model. Config based on T5.1.1 Base model.
+# Provides MODEL
+from __gin__ import dynamic_registration
+
+import seqio
+from t5x import adafactor
+from t5x import models
+import tasks
+
+ARCHITECTURE = %gin.REQUIRED
+
+include 'flaxformer/t5x/configs/longt5/architectures/longt5_1_1_flaxformer.gin'
+
+include 't5x/configs/runs/pretrain.gin'
+#include 'pretrain_cont.gin'
+
+MIXTURE_OR_TASK_NAME = "ncc_scandinavian_span_corruption_stream"
+TASK_FEATURE_LENGTHS = {"inputs": 4048, "targets": 910}
+# CORRECT IS 128!!
+BATCH_SIZE = 32
+TRAIN_STEPS = 1_000_000
+DROPOUT_RATE = 0.0  # Changed from the default since T5-1.1 recommends this.
+#INITIAL_CHECKPOINT_PATH = "gs://nb-t5x-us-central2/norwegian_NCC_plus_English_t5x_base/checkpoint_1500000"
+#PjitPartitioner.num_partitions = 1
+
+
+# Architecture overrides
+NUM_HEADS = 12
+NUM_ENCODER_LAYERS = 12
+NUM_DECODER_LAYERS = 12
+HEAD_DIM = 64
+EMBED_DIM = 768
+MLP_DIM = 2048
+
+# Loss HParam defaults
+Z_LOSS = 0.0001
+LABEL_SMOOTHING = 0.0
+LOSS_NORMALIZING_FACTOR = None
+
+# Vocabulary (shared by encoder and decoder)
+VOCABULARY = @seqio.SentencePieceVocabulary()
+seqio.SentencePieceVocabulary.sentencepiece_model_file = "gs://t5-data/vocabs/cc_all.32000.100extra/sentencepiece.model"
+NUM_EMBEDDINGS = 32128  # vocab size rounded to a multiple of 128 for TPU efficiency
+
+# Optimizer
+# `learning_rate` is set by `Trainer.learning_rate_fn`.
+OPTIMIZER = @adafactor.Adafactor()
+adafactor.Adafactor:
+  decay_rate = 0.8
+  step_offset = 0
+
+# Model
+MODEL = @models.EncoderDecoderModel()
+models.EncoderDecoderModel:
+  module = %ARCHITECTURE  # provided by longt5_flaxformer
+  input_vocabulary = %VOCABULARY
+  output_vocabulary = %VOCABULARY
+  optimizer_def = %OPTIMIZER
+  z_loss = %Z_LOSS
+  label_smoothing = %LABEL_SMOOTHING
+  loss_normalizing_factor = %LOSS_NORMALIZING_FACTOR
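
Two values above are worth noting. The `import tasks` statement pulls in this repo's tasks.py, which presumably registers the "ncc_scandinavian_span_corruption_stream" mixture named by MIXTURE_OR_TASK_NAME. And BATCH_SIZE is deliberately lowered to 32 for this demo even though the comment marks 128 as the intended value; with t5x it could be restored at launch time instead of editing the file, using the same --gin.<PARAM> override style that train_long_base.sh (added below) already uses for MODEL_DIR. An illustrative command, not part of this commit:

    python3 ${T5X_DIR}/t5x/train.py \
      --gin_search_paths=${PROJECT_DIR} \
      --gin_file="longt5_1_1_base.gin" \
      --gin.BATCH_SIZE=128 \
      --gin.MODEL_DIR="'${MODEL_DIR}'"
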
tasks.py
CHANGED
@@ -1,5 +1,4 @@
 import functools
-
 import seqio
 import tensorflow as tf
 import t5.data
@@ -10,9 +9,7 @@ from t5.evaluation import metrics
 from seqio import FunctionDataSource, utils
 
 TaskRegistry = seqio.TaskRegistry
-
-vocabulary = seqio.SentencePieceVocabulary('gs://t5-data/vocabs/mc4.250000.100extra/sentencepiece.model', extra_ids=0)
-byt5_vocabulary = t5.data.ByteVocabulary()
+vocabulary=seqio.SentencePieceVocabulary('gs://t5-data/vocabs/cc_all.32000.100extra/sentencepiece.model', extra_ids=0)
 
 DEFAULT_OUTPUT_FEATURES = {
     "inputs": seqio.Feature(
@@ -22,14 +19,6 @@ DEFAULT_OUTPUT_FEATURES = {
         vocabulary=vocabulary, add_eos=True)
 }
 
-BYT5_DEFAULT_OUTPUT_FEATURES = {
-    "inputs": seqio.Feature(
-        vocabulary=byt5_vocabulary, add_eos=True,
-        required=False),
-    "targets": seqio.Feature(
-        vocabulary=byt5_vocabulary, add_eos=True)
-}
-
 
 def gen_dataset(split, shuffle=False, seed=None, column="text", dataset_params=None):
     dataset = load_dataset(**dataset_params)
train_long_base.sh
ADDED
@@ -0,0 +1,9 @@
+PROJECT_DIR=${HOME}"/models/long-t5x"
+T5X_DIR="../../t5x"  # directory where t5x is cloned.
+MODEL_DIR="gs://nb-t5x-us-central2/long_test_t5x_base"
+export PYTHONPATH=${PROJECT_DIR}
+
+python3 ${T5X_DIR}/t5x/train.py \
+  --gin_search_paths=${PROJECT_DIR} \
+  --gin_file="longt5_1_1_base.gin" \
+  --gin.MODEL_DIR="'${MODEL_DIR}'" \
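
For this script to work as written, PROJECT_DIR must contain both tasks.py (made importable through PYTHONPATH so the gin's `import tasks` resolves) and longt5_1_1_base.gin (found through --gin_search_paths), and a t5x checkout must sit at the relative path ../../t5x. The trailing backslash on the last line suggests additional flags are appended when the job is actually launched; they are not included in this commit.
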