pere committed on
Commit 61f309d
1 Parent(s): c7fbd30

first test

batch_finetune_eu_jav.sh ADDED
@@ -0,0 +1,11 @@
+ PROJECT_DIR=${HOME}"/models/eu-jav-categorisation"
+ export PYTHONPATH=${PROJECT_DIR}
+ INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/mt5_base/checkpoint_1000000\"
+ TRAIN_STEPS=1005000
+
+ python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin.TRAIN_STEPS=${TRAIN_STEPS} --gin_file="finetune_classification_base.gin" --gin.INITIAL_CHECKPOINT_PATH=${INITIAL_CHECKPOINT_PATH} --gin.MIXTURE_OR_TASK_NAME=\"classify_tweets\" --gin.MODEL_DIR=\"gs://eu-jav-t5x/finetuned/italian_tweets/classify_tweets_base_v1\" &&
+ python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin.TRAIN_STEPS=${TRAIN_STEPS} --gin_file="finetune_classification_base.gin" --gin.INITIAL_CHECKPOINT_PATH=${INITIAL_CHECKPOINT_PATH} --gin.MIXTURE_OR_TASK_NAME=\"classify_tweets\" --gin.MODEL_DIR=\"gs://eu-jav-t5x/finetuned/italian_tweets/classify_tweets_base_v2\" &&
+ python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin.TRAIN_STEPS=${TRAIN_STEPS} --gin_file="finetune_classification_base.gin" --gin.INITIAL_CHECKPOINT_PATH=${INITIAL_CHECKPOINT_PATH} --gin.MIXTURE_OR_TASK_NAME=\"classify_tweets\" --gin.MODEL_DIR=\"gs://eu-jav-t5x/finetuned/italian_tweets/classify_tweets_base_v3\" &&
+ python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin.TRAIN_STEPS=${TRAIN_STEPS} --gin_file="finetune_classification_base.gin" --gin.INITIAL_CHECKPOINT_PATH=${INITIAL_CHECKPOINT_PATH} --gin.MIXTURE_OR_TASK_NAME=\"classify_tweets\" --gin.MODEL_DIR=\"gs://eu-jav-t5x/finetuned/italian_tweets/classify_tweets_base_v4\" &&
+ python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin.TRAIN_STEPS=${TRAIN_STEPS} --gin_file="finetune_classification_base.gin" --gin.INITIAL_CHECKPOINT_PATH=${INITIAL_CHECKPOINT_PATH} --gin.MIXTURE_OR_TASK_NAME=\"classify_tweets\" --gin.MODEL_DIR=\"gs://eu-jav-t5x/finetuned/italian_tweets/classify_tweets_base_v5\"
+
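Note on the quoting: gin string overrides must reach train.py with literal quotes around the value, which is why the script escapes them as \" instead of letting the shell strip them. For reference, the five chained runs above amount to a loop; the sketch below is a hypothetical Python rendering of the same sequence (not part of the commit), assuming the same working directory and gin file, and it stops on the first failure just like the && chain.

# Hypothetical Python equivalent of batch_finetune_eu_jav.sh (illustration only).
import subprocess

CHECKPOINT = "gs://t5-data/pretrained_models/t5x/mt5_base/checkpoint_1000000"
TRAIN_STEPS = 1005000

for version in range(1, 6):
    model_dir = f"gs://eu-jav-t5x/finetuned/italian_tweets/classify_tweets_base_v{version}"
    subprocess.run(
        [
            "python3", "../../t5x/t5x/train.py",
            "--gin_search_paths=./",
            f"--gin.TRAIN_STEPS={TRAIN_STEPS}",
            "--gin_file=finetune_classification_base.gin",
            # gin string values keep their surrounding quotes, hence the embedded '"'.
            f'--gin.INITIAL_CHECKPOINT_PATH="{CHECKPOINT}"',
            '--gin.MIXTURE_OR_TASK_NAME="classify_tweets"',
            f'--gin.MODEL_DIR="{model_dir}"',
        ],
        check=True,  # abort the batch on the first failed run, like `&&`
    )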
finetuning_categorisation.gin → finetune_classification_base.gin RENAMED
@@ -10,30 +10,30 @@ from t5x import utils
  include "t5x/examples/t5/mt5/base.gin"
  include "t5x/configs/runs/finetune.gin"

- MIXTURE_OR_TASK_NAME = "categorise"
- TASK_FEATURE_LENGTHS = {"inputs": 256, "targets": 256}
- TRAIN_STEPS = 1_010_000 # 1000000 pre-trained steps + 10000 fine-tuning steps.
+ MIXTURE_OR_TASK_NAME = %gin.REQUIRED
+ TASK_FEATURE_LENGTHS = {"inputs": 512, "targets": 512}
+ INITIAL_CHECKPOINT_PATH = %gin.REQUIRED
+ TRAIN_STEPS = %gin.REQUIRED # 1000000 pre-trained steps + 10000 fine-tuning steps.
  USE_CACHED_TASKS = False
- DROPOUT_RATE = 0.0
+ DROPOUT_RATE = 0.1
  RANDOM_SEED = 0

+ #Fixing a small error
+ infer_eval/utils.DatasetConfig:
+   task_feature_lengths = %TASK_FEATURE_LENGTHS
+
+ #Saving every 1000 steps
+ utils.SaveCheckpointConfig:
+   period = 1000
+
+
  # Pere: Only necessary if we load a t5 model. We can start with an t5x model here
  # `LOSS_NORMALIZING_FACTOR`: When fine-tuning a model that was pre-trained
  # using Mesh Tensorflow (e.g. the public T5 / mT5 / ByT5 models), this should be
  # set to `pretraining batch_size` * `target_token_length`. For T5 and T5.1.1:
  # `2048 * 114`. For mT5: `1024 * 229`. For ByT5: `1024 * 189`.
- #LOSS_NORMALIZING_FACTOR = 234496
+ # LOSS_NORMALIZING_FACTOR = 234496

- INITIAL_CHECKPOINT_PATH = "gs://t5-data/pretrained_models/t5x/mt5_base/checkpoint_1000000"
-
- #train_script.train:
- # eval_period = 500
- # partitioner = @partitioning.ModelBasedPjitPartitioner()
+ # Might have to be changed based on architecture
  # partitioning.PjitPartitioner.num_partitions = 1

- # `num_decodes` is equivalent to a beam size in a beam search decoding.
- models.EncoderDecoderModel.predict_batch_with_aux.num_decodes = 4
-
- #mesh_transformer.learning_rate_schedules.constant_learning_rate.learning_rate = 0.0005
- #run.learning_rate_schedule = @learning_rate_schedules.constant_learning_rate
-
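The commented-out LOSS_NORMALIZING_FACTOR = 234496 follows directly from the rule stated in the comment: for mT5, `pretraining batch_size` * `target_token_length` is 1024 * 229. A one-line check (illustration only, not part of the config):

# mT5 rule from the gin comment: pretraining batch_size * target_token_length.
assert 1024 * 229 == 234496  # the value quoted for LOSS_NORMALIZING_FACTOR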
finetuning_categorisation_large.gin → finetune_classification_large.gin RENAMED
@@ -10,31 +10,31 @@ from t5x import utils
  include "t5x/examples/t5/mt5/large.gin"
  include "t5x/configs/runs/finetune.gin"

- MIXTURE_OR_TASK_NAME = "categorise"
- TASK_FEATURE_LENGTHS = {"inputs": 256, "targets": 256}
- TRAIN_STEPS = 1_010_000 # 1000000 pre-trained steps + 10000 fine-tuning steps.
+ MIXTURE_OR_TASK_NAME = %gin.REQUIRED
+ TASK_FEATURE_LENGTHS = {"inputs": 512, "targets": 512}
+ INITIAL_CHECKPOINT_PATH = %gin.REQUIRED
+ TRAIN_STEPS = %gin.REQUIRED # 1000000 pre-trained steps + 10000 fine-tuning steps.
  USE_CACHED_TASKS = False
- DROPOUT_RATE = 0.0
+ DROPOUT_RATE = 0.1
  RANDOM_SEED = 0
+ BATCH_SIZE = 32
+
+ #Fixing a small error
+ infer_eval/utils.DatasetConfig:
+   task_feature_lengths = %TASK_FEATURE_LENGTHS
+
+ #Saving every 1000 steps
+ utils.SaveCheckpointConfig:
+   period = 1000
+

  # Pere: Only necessary if we load a t5 model. We can start with an t5x model here
  # `LOSS_NORMALIZING_FACTOR`: When fine-tuning a model that was pre-trained
  # using Mesh Tensorflow (e.g. the public T5 / mT5 / ByT5 models), this should be
  # set to `pretraining batch_size` * `target_token_length`. For T5 and T5.1.1:
  # `2048 * 114`. For mT5: `1024 * 229`. For ByT5: `1024 * 189`.
- #LOSS_NORMALIZING_FACTOR = 234496
-
- INITIAL_CHECKPOINT_PATH = "gs://t5-data/pretrained_models/t5x/mt5_large/checkpoint_1000000"
- # BATCH_SIZE = 64
-
- #train_script.train:
- # eval_period = 500
- # partitioner = @partitioning.ModelBasedPjitPartitioner()
- # partitioning.ModelBasedPjitPartitioner.num_partitions = 2
-
- # `num_decodes` is equivalent to a beam size in a beam search decoding.
- models.EncoderDecoderModel.predict_batch_with_aux.num_decodes = 4
+ # LOSS_NORMALIZING_FACTOR = 234496

- #mesh_transformer.learning_rate_schedules.constant_learning_rate.learning_rate = 0.0005
- #run.learning_rate_schedule = @learning_rate_schedules.constant_learning_rate
+ # Might have to be changed based on architecture
+ # partitioning.PjitPartitioner.num_partitions = 1

finetune_classification_small.gin ADDED
@@ -0,0 +1,39 @@
+ from __gin__ import dynamic_registration
+ import tasks
+
+ import __main__ as train_script
+ from t5.data import mixtures
+ from t5x import models
+ from t5x import partitioning
+ from t5x import utils
+
+ include "t5x/examples/t5/mt5/small.gin"
+ include "t5x/configs/runs/finetune.gin"
+
+ MIXTURE_OR_TASK_NAME = %gin.REQUIRED
+ TASK_FEATURE_LENGTHS = {"inputs": 512, "targets": 512}
+ INITIAL_CHECKPOINT_PATH = %gin.REQUIRED
+ TRAIN_STEPS = %gin.REQUIRED # 1000000 pre-trained steps + 10000 fine-tuning steps.
+ USE_CACHED_TASKS = False
+ DROPOUT_RATE = 0.1
+ RANDOM_SEED = 0
+
+ #Fixing a small error
+ infer_eval/utils.DatasetConfig:
+   task_feature_lengths = %TASK_FEATURE_LENGTHS
+
+ #Saving every 1000 steps
+ utils.SaveCheckpointConfig:
+   period = 1000
+
+
+ # Pere: Only necessary if we load a t5 model. We can start with an t5x model here
+ # `LOSS_NORMALIZING_FACTOR`: When fine-tuning a model that was pre-trained
+ # using Mesh Tensorflow (e.g. the public T5 / mT5 / ByT5 models), this should be
+ # set to `pretraining batch_size` * `target_token_length`. For T5 and T5.1.1:
+ # `2048 * 114`. For mT5: `1024 * 229`. For ByT5: `1024 * 189`.
+ # LOSS_NORMALIZING_FACTOR = 234496
+
+ # Might have to be changed based on architecture
+ # partitioning.PjitPartitioner.num_partitions = 4
+
finetune_classification_xl.gin ADDED
@@ -0,0 +1,40 @@
+ from __gin__ import dynamic_registration
+ import tasks
+
+ import __main__ as train_script
+ from t5.data import mixtures
+ from t5x import models
+ from t5x import partitioning
+ from t5x import utils
+
+ include "t5x/examples/t5/mt5/xl.gin"
+ include "t5x/configs/runs/finetune.gin"
+
+ MIXTURE_OR_TASK_NAME = %gin.REQUIRED
+ TASK_FEATURE_LENGTHS = {"inputs": 512, "targets": 512}
+ INITIAL_CHECKPOINT_PATH = %gin.REQUIRED
+ TRAIN_STEPS = %gin.REQUIRED # 1000000 pre-trained steps + 10000 fine-tuning steps.
+ USE_CACHED_TASKS = False
+ DROPOUT_RATE = 0.1
+ RANDOM_SEED = 0
+ BATCH_SIZE = 32
+
+ #Fixing a small error
+ infer_eval/utils.DatasetConfig:
+   task_feature_lengths = %TASK_FEATURE_LENGTHS
+
+ #Saving every 1000 steps
+ utils.SaveCheckpointConfig:
+   period = 1000
+
+
+ # Pere: Only necessary if we load a t5 model. We can start with an t5x model here
+ # `LOSS_NORMALIZING_FACTOR`: When fine-tuning a model that was pre-trained
+ # using Mesh Tensorflow (e.g. the public T5 / mT5 / ByT5 models), this should be
+ # set to `pretraining batch_size` * `target_token_length`. For T5 and T5.1.1:
+ # `2048 * 114`. For mT5: `1024 * 229`. For ByT5: `1024 * 189`.
+ # LOSS_NORMALIZING_FACTOR = 234496
+
+ # Might have to be changed based on architecture
+ # partitioning.PjitPartitioner.num_partitions = 1
+
tasks.py CHANGED
@@ -9,9 +9,9 @@ import t5
  import tensorflow.compat.v1 as tf

  tsv_path = {
-     "train": "gs://peregilcloud/italian_tweets/train3.tsv",
-     "validation": "gs://peregilcloud/italian_tweets/dev.tsv",
-     "test": "gs://peregilcloud/italian_tweets/test.tsv"
+     "train": "gs://eu-jav-t5x/corpus/labeled/datasetA_train_3categories.tsv",
+     "validation": "gs://eu-jav-t5x/corpus/labeled/datasetA_dev_3categories.tsv",
+     "test": "gs://eu-jav-t5x/corpus/labeled/datasetA_test_3categories.tsv"
  }

  vocabulary = seqio.SentencePieceVocabulary(
@@ -47,7 +47,7 @@ def categorise_preprocessor(ds):


  seqio.TaskRegistry.add(
-     "categorise",
+     "categorise_tweets",
      source=seqio.TextLineDataSource(
          split_to_filepattern=tsv_path,
          #num_input_examples=num_nq_examples
@@ -55,7 +55,7 @@ seqio.TaskRegistry.add(
      preprocessors=[
          functools.partial(
              t5.data.preprocessors.parse_tsv,
-             field_names=["target","source"]),
+             field_names=["annotator1","annotator2","annotator3","target","source","id"]),
          categorise_preprocessor,
          seqio.preprocessors.tokenize_and_append_eos,
      ],
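Note that batch_finetune_eu_jav.sh passes --gin.MIXTURE_OR_TASK_NAME=\"classify_tweets\", while tasks.py now registers the task as "categorise_tweets"; the two names have to match for t5x to resolve the task. The snippet below is a minimal, hypothetical sanity check (not part of the commit) that the registration works and yields examples; it assumes tasks.py is importable (PYTHONPATH set as in the scripts) and that the GCS paths are readable.

# Hypothetical sanity check for the registered seqio task (illustration only).
import seqio
import tasks  # importing tasks.py runs the seqio.TaskRegistry.add(...) call above

task = seqio.get_mixture_or_task("categorise_tweets")
ds = task.get_dataset(
    sequence_length={"inputs": 512, "targets": 512},  # matches TASK_FEATURE_LENGTHS
    split="validation",
    shuffle=False,
)
for example in ds.take(1):
    print({k: v.numpy() for k, v in example.items()})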
train_base.sh DELETED
@@ -1,11 +0,0 @@
- PROJECT_DIR=${HOME}"/models/eu-jav-categorisation"
- T5X_DIR="../../t5x" # directory where the t5x is cloned.
- #Needs to be updated when moving to tpu-v4 it should then be in another zone
- MODEL_DIR="gs://nb-t5x/eujav_base"
- export PYTHONPATH=${PROJECT_DIR}
-
- python3 ${T5X_DIR}/t5x/train.py \
-   --gin_search_paths=${PROJECT_DIR} \
-   --gin_file="finetuning_categorisation.gin" \
-   --gin.MODEL_DIR="'${MODEL_DIR}'"
-
train_large.sh DELETED
@@ -1,11 +0,0 @@
- PROJECT_DIR=${HOME}"/models/eu-jav-categorisation"
- T5X_DIR="../../t5x" # directory where the t5x is cloned.
- #Needs to be updated when moving to tpu-v4 it should then be in another zone
- MODEL_DIR="gs://nb-t5x-us-central2/eujav_large3"
- export PYTHONPATH=${PROJECT_DIR}
-
- python3 ${T5X_DIR}/t5x/train.py \
-   --gin_search_paths=${PROJECT_DIR} \
-   --gin_file="finetuning_categorisation_large.gin" \
-   --gin.MODEL_DIR="'${MODEL_DIR}'"
-