pere committed on
Commit
d3986eb
1 Parent(s): b032b6f

First attempt

Files changed (6)
  1. README.md +16 -0
  2. base_wmt_infer.gin +23 -0
  3. finetune_mt5_sentencefix.gin +41 -0
  4. interference.sh +16 -0
  5. tasks.py +65 -0
  6. train.sh +12 -0
README.md ADDED
@@ -0,0 +1,16 @@
+ ---
+ license: cc
+ ---
+ # Multi-Lingual DeUnCaser - Base mT5 Version
+ The output from Automatic Speech Recognition software is usually uncased and without any punctuation. This does not make for very readable text.
+
+ The DeUnCaser is a sequence-to-sequence model that reverses this process. It adds punctuation and capitalises the correct words. In some languages this means adding capital letters at the start of sentences and on all proper nouns; in other languages, like German, it means capitalising the first letter of all nouns. It also attempts to add hyphens and parentheses where these make the meaning clearer.
+
+ It is based on the multi-lingual T5 model (mT5) and is finetuned for 100,000 steps. The finetuning is based on 100,000 training examples from each of the 44 Latin-alphabet languages that are part of both OSCAR and the mT5 training set: Afrikaans, Albanian, Basque, Catalan, Cebuano, Czech, Danish, Dutch, English, Esperanto, Estonian, Finnish, French, Galician, German, Haitian Creole, Hungarian, Icelandic, Indonesian, Irish, Italian, Kurdish, Latin, Latvian, Lithuanian, Luxembourgish, Malagasy, Malay, Maltese, Norwegian Bokmål, Norwegian Nynorsk, Polish, Portuguese, Romanian, Slovak, Spanish, Sundanese, Swahili, Swedish, Turkish, Uzbek, Vietnamese, Welsh, West Frisian.
+
+ A Notebook for creating the training corpus is available [here](https://colab.research.google.com/drive/1bkH94z-0wIQP8Pz0qXFndhoQsokU-78x?usp=sharing).
+
+
+
+
+
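To illustrate the mapping the model is trained to learn, here is a made-up Norwegian example pair (hypothetical strings, not actual model output), using the `source`/`target` field names from `tasks.py` below:

```python
# Hypothetical example of the DeUnCaser's intended mapping (invented strings, not model output):
example = {
    # uncased, unpunctuated ASR-style input
    "source": "det er en fin dag i oslo i dag sier hun",
    # restored casing and punctuation
    "target": "Det er en fin dag i Oslo i dag, sier hun.",
}
```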
base_wmt_infer.gin ADDED
@@ -0,0 +1,23 @@
+ from __gin__ import dynamic_registration
+ import tasks
+
+ import __main__ as infer_script
+ from t5.data import mixtures
+ from t5x import partitioning
+ from t5x import utils
+
+ include "t5x/examples/t5/mt5/base.gin"
+ include "t5x/configs/runs/infer.gin"
+
+ DROPOUT_RATE = 0.0  # unused but needs to be specified
+ MIXTURE_OR_TASK_NAME = "sentencefix"
+ TASK_FEATURE_LENGTHS = {"inputs": 256, "targets": 256}
+
+ infer_script.infer:
+   partitioner = @partitioning.ModelBasedPjitPartitioner()
+
+ partitioning.ModelBasedPjitPartitioner.num_partitions = 1
+
+ utils.DatasetConfig:
+   split = "test"
+   batch_size = 32
finetune_mt5_sentencefix.gin ADDED
@@ -0,0 +1,41 @@
+ from __gin__ import dynamic_registration
+ import tasks
+
+ import __main__ as train_script
+ from t5.data import mixtures
+ from t5x import models
+ from t5x import partitioning
+ from t5x import utils
+
+ include "t5x/examples/t5/mt5/base.gin"
+ include "t5x/configs/runs/finetune.gin"
+
+ MIXTURE_OR_TASK_NAME = "sentencefix"
+ TASK_FEATURE_LENGTHS = {"inputs": 256, "targets": 256}
+ TRAIN_STEPS = 1_050_000  # 1,000,000 pre-trained steps + 50,000 fine-tuning steps.
+ USE_CACHED_TASKS = False
+ DROPOUT_RATE = 0.0
+ RANDOM_SEED = 0
+
+ # `LOSS_NORMALIZING_FACTOR`: When fine-tuning a model that was pre-trained
+ # using Mesh Tensorflow (e.g. the public T5 / mT5 / ByT5 models), this should be
+ # set to `pretraining batch_size` * `target_token_length`. For T5 and T5.1.1:
+ # `2048 * 114`. For mT5: `1024 * 229`. For ByT5: `1024 * 189`.
+ LOSS_NORMALIZING_FACTOR = 234496
+ INITIAL_CHECKPOINT_PATH = "gs://t5-data/pretrained_models/t5x/mt5_base/checkpoint_1000000"
+
+ train_script.train:
+   eval_period = 500
+   partitioner = @partitioning.ModelBasedPjitPartitioner()
+
+ # `num_decodes` is equivalent to a beam size in a beam search decoding.
+ models.EncoderDecoderModel.predict_batch_with_aux.num_decodes = 4
+
+ partitioning.ModelBasedPjitPartitioner.num_partitions = 2
+
+
+ #from t5.models import mesh_transformer
+ #import t5.models
+ #mesh_transformer.learning_rate_schedules.constant_learning_rate.learning_rate = 0.0005
+ #run.learning_rate_schedule = @learning_rate_schedules.constant_learning_rate
+
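The two derived constants above can be cross-checked with simple arithmetic; the batch size and target length come straight from the gin comment, so this is purely illustrative:

```python
# Illustrative cross-check of the constants in finetune_mt5_sentencefix.gin.
pretrain_batch_size = 1024    # mT5 pre-training batch size (from the comment above)
target_token_length = 229     # mT5 average target length (from the comment above)
assert pretrain_batch_size * target_token_length == 234_496   # LOSS_NORMALIZING_FACTOR

pretrained_steps = 1_000_000  # steps already in the mT5 base checkpoint
finetune_steps = 50_000       # additional fine-tuning steps
assert pretrained_steps + finetune_steps == 1_050_000         # TRAIN_STEPS
```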
interference.sh ADDED
@@ -0,0 +1,16 @@
+ INFER_OUTPUT_DIR="output"  # directory to write infer output
+ T5X_DIR="../../t5x"  # directory where t5x is cloned, e.g. ${HOME}"/t5x".
+ TFDS_DATA_DIR="gs://nb-t5x/corpus_multi_sentencefix_mt5/"
+ CHECKPOINT_PATH="gs://nb-t5x/corpus_multi_sentencefix_mt5/checkpoint_1100000"
+ PROJECT_DIR=${HOME}"/mymodel/multi_sentencefix_mt5"
+ export PYTHONPATH=${PROJECT_DIR}
+
+ python3 ${T5X_DIR}/t5x/infer.py \
+   --gin_search_paths=${PROJECT_DIR} \
+   --gin_file="base_wmt_infer.gin" \
+   --gin.CHECKPOINT_PATH=\"${CHECKPOINT_PATH}\" \
+   --gin.INFER_OUTPUT_DIR=\"${INFER_OUTPUT_DIR}\" \
+   --tfds_data_dir=${TFDS_DATA_DIR}
+
+
+
tasks.py ADDED
@@ -0,0 +1,65 @@
+ # /home/perk/mymodel/sentencefix/tasks.py
+
+ import functools
+ import seqio
+ import tensorflow_datasets as tfds
+ from t5.evaluation import metrics
+ from t5.data import preprocessors
+ import t5
+ import tensorflow.compat.v1 as tf
+
+ tsv_path = {
+     "train": "gs://nb-t5x/corpus/train/train.tsv",
+     "validation": "gs://nb-t5x/corpus/eval/eval.tsv",
+     "test": "gs://nb-t5x/corpus/test/test.tsv"
+ }
+
+
+ vocabulary = t5.data.ByteVocabulary()
+
+ DEFAULT_OUTPUT_FEATURES = {
+     "inputs":
+         seqio.Feature(
+             vocabulary=vocabulary, add_eos=True),
+     "targets":
+         seqio.Feature(
+             vocabulary=vocabulary, add_eos=True)
+ }
+
+ def sentencefix_preprocessor(ds):
+   def normalize_text(text):
+     """Remove enclosing single quotes from a TensorFlow string."""
+     text = tf.strings.regex_replace(text, "'(.*)'", r"\1")
+     return text
+
+   def to_inputs_and_targets(ex):
+     """Map {"source": ..., "target": ...} -> {"inputs": ..., "targets": ...}."""
+     return {
+         "inputs":
+             tf.strings.join(
+                 [normalize_text(ex["source"])]),
+         "targets":
+             tf.strings.join(
+                 [normalize_text(ex["target"])]),
+     }
+   return ds.map(to_inputs_and_targets,
+                 num_parallel_calls=tf.data.experimental.AUTOTUNE)
+
+
+ seqio.TaskRegistry.add(
+     "sentencefix",
+     source=seqio.TextLineDataSource(
+         split_to_filepattern=tsv_path,
+         #num_input_examples=num_nq_examples
+     ),
+     preprocessors=[
+         functools.partial(
+             t5.data.preprocessors.parse_tsv,
+             field_names=["source", "target"]),
+         sentencefix_preprocessor,
+         seqio.preprocessors.tokenize_and_append_eos,
+     ],
+     #metric_fns=[t5.evaluation.metrics.bleu],
+     output_features=DEFAULT_OUTPUT_FEATURES,
+ )
+
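Assuming tasks.py is on PYTHONPATH and the TSV files are readable, the registered task can be inspected with a short sketch like the one below. The expected TSV layout, per `parse_tsv` above, is two tab-separated columns (source and target):

```python
# Minimal sketch: pull one preprocessed example from the registered "sentencefix" task.
import seqio
import tasks  # noqa: F401 -- importing registers the task

task = seqio.TaskRegistry.get("sentencefix")
ds = task.get_dataset(
    sequence_length={"inputs": 256, "targets": 256},  # mirrors TASK_FEATURE_LENGTHS
    split="validation",
    shuffle=False)
for ex in ds.take(1):
    print({k: (v.dtype, v.shape) for k, v in ex.items()})
```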
train.sh ADDED
@@ -0,0 +1,12 @@
+ PROJECT_DIR=${HOME}"/mymodel/multi-sentencefix-mt5"
+ T5X_DIR="../../t5x"  # directory where t5x is cloned.
+ TFDS_DATA_DIR="gs://nb-t5x/corpus_multi_sentencefix_mt5"
+ MODEL_DIR="gs://nb-t5x/model_multi_sentencefix_mt5"
+ export PYTHONPATH=${PROJECT_DIR}
+
+ python3 ${T5X_DIR}/t5x/train.py \
+   --gin_search_paths=${PROJECT_DIR} \
+   --gin_file="finetune_mt5_sentencefix.gin" \
+   --gin.MODEL_DIR="'${MODEL_DIR}'" \
+   --tfds_data_dir=${TFDS_DATA_DIR}
+