pere committed on
Commit 61f309d
1 Parent(s): c7fbd30

first test

batch_finetune_eu_jav.sh ADDED
@@ -0,0 +1,11 @@
+ PROJECT_DIR=${HOME}"/models/eu-jav-categorisation"
+ export PYTHONPATH=${PROJECT_DIR}
+ INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/mt5_base/checkpoint_1000000\"
+ TRAIN_STEPS=1005000
+
+ python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin.TRAIN_STEPS=${TRAIN_STEPS} --gin_file="finetune_classification_base.gin" --gin.INITIAL_CHECKPOINT_PATH=${INITIAL_CHECKPOINT_PATH} --gin.MIXTURE_OR_TASK_NAME=\"classify_tweets\" --gin.MODEL_DIR=\"gs://eu-jav-t5x/finetuned/italian_tweets/classify_tweets_base_v1\" &&
+ python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin.TRAIN_STEPS=${TRAIN_STEPS} --gin_file="finetune_classification_base.gin" --gin.INITIAL_CHECKPOINT_PATH=${INITIAL_CHECKPOINT_PATH} --gin.MIXTURE_OR_TASK_NAME=\"classify_tweets\" --gin.MODEL_DIR=\"gs://eu-jav-t5x/finetuned/italian_tweets/classify_tweets_base_v2\" &&
+ python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin.TRAIN_STEPS=${TRAIN_STEPS} --gin_file="finetune_classification_base.gin" --gin.INITIAL_CHECKPOINT_PATH=${INITIAL_CHECKPOINT_PATH} --gin.MIXTURE_OR_TASK_NAME=\"classify_tweets\" --gin.MODEL_DIR=\"gs://eu-jav-t5x/finetuned/italian_tweets/classify_tweets_base_v3\" &&
+ python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin.TRAIN_STEPS=${TRAIN_STEPS} --gin_file="finetune_classification_base.gin" --gin.INITIAL_CHECKPOINT_PATH=${INITIAL_CHECKPOINT_PATH} --gin.MIXTURE_OR_TASK_NAME=\"classify_tweets\" --gin.MODEL_DIR=\"gs://eu-jav-t5x/finetuned/italian_tweets/classify_tweets_base_v4\" &&
+ python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin.TRAIN_STEPS=${TRAIN_STEPS} --gin_file="finetune_classification_base.gin" --gin.INITIAL_CHECKPOINT_PATH=${INITIAL_CHECKPOINT_PATH} --gin.MIXTURE_OR_TASK_NAME=\"classify_tweets\" --gin.MODEL_DIR=\"gs://eu-jav-t5x/finetuned/italian_tweets/classify_tweets_base_v5\"
+
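Note on the quoting: gin string overrides must reach train.py with literal quotes around the value, which is why the script escapes them as \" instead of letting the shell strip them. For reference, the five chained runs above amount to a loop; the sketch below is a hypothetical Python rendering of the same sequence (not part of the commit), assuming the same working directory and gin file, and it stops on the first failure just like the && chain.

# Hypothetical Python equivalent of batch_finetune_eu_jav.sh (illustration only).
import subprocess

CHECKPOINT = "gs://t5-data/pretrained_models/t5x/mt5_base/checkpoint_1000000"
TRAIN_STEPS = 1005000

for version in range(1, 6):
    model_dir = f"gs://eu-jav-t5x/finetuned/italian_tweets/classify_tweets_base_v{version}"
    subprocess.run(
        [
            "python3", "../../t5x/t5x/train.py",
            "--gin_search_paths=./",
            f"--gin.TRAIN_STEPS={TRAIN_STEPS}",
            "--gin_file=finetune_classification_base.gin",
            # gin string values keep their surrounding quotes, hence the embedded '"'.
            f'--gin.INITIAL_CHECKPOINT_PATH="{CHECKPOINT}"',
            '--gin.MIXTURE_OR_TASK_NAME="classify_tweets"',
            f'--gin.MODEL_DIR="{model_dir}"',
        ],
        check=True,  # abort the batch on the first failed run, like `&&`
    )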
finetuning_categorisation.gin → finetune_classification_base.gin RENAMED
@@ -10,30 +10,30 @@ from t5x import utils
  include "t5x/examples/t5/mt5/base.gin"
  include "t5x/configs/runs/finetune.gin"

- MIXTURE_OR_TASK_NAME = "categorise"
- TASK_FEATURE_LENGTHS = {"inputs": 256, "targets": 256}
- TRAIN_STEPS = 1_010_000 # 1000000 pre-trained steps + 10000 fine-tuning steps.
+ MIXTURE_OR_TASK_NAME = %gin.REQUIRED
+ TASK_FEATURE_LENGTHS = {"inputs": 512, "targets": 512}
+ INITIAL_CHECKPOINT_PATH = %gin.REQUIRED
+ TRAIN_STEPS = %gin.REQUIRED # 1000000 pre-trained steps + 10000 fine-tuning steps.
  USE_CACHED_TASKS = False
- DROPOUT_RATE = 0.0
+ DROPOUT_RATE = 0.1
  RANDOM_SEED = 0

+ #Fixing a small error
+ infer_eval/utils.DatasetConfig:
+   task_feature_lengths = %TASK_FEATURE_LENGTHS
+
+ #Saving every 1000 steps
+ utils.SaveCheckpointConfig:
+   period = 1000
+
+
  # Pere: Only necessary if we load a t5 model. We can start with an t5x model here
  # `LOSS_NORMALIZING_FACTOR`: When fine-tuning a model that was pre-trained
  # using Mesh Tensorflow (e.g. the public T5 / mT5 / ByT5 models), this should be
  # set to `pretraining batch_size` * `target_token_length`. For T5 and T5.1.1:
  # `2048 * 114`. For mT5: `1024 * 229`. For ByT5: `1024 * 189`.
- #LOSS_NORMALIZING_FACTOR = 234496
+ # LOSS_NORMALIZING_FACTOR = 234496

- INITIAL_CHECKPOINT_PATH = "gs://t5-data/pretrained_models/t5x/mt5_base/checkpoint_1000000"
-
- #train_script.train:
- # eval_period = 500
- # partitioner = @partitioning.ModelBasedPjitPartitioner()
+ # Might have to be changed based on architecture
  # partitioning.PjitPartitioner.num_partitions = 1

- # `num_decodes` is equivalent to a beam size in a beam search decoding.
- models.EncoderDecoderModel.predict_batch_with_aux.num_decodes = 4
-
- #mesh_transformer.learning_rate_schedules.constant_learning_rate.learning_rate = 0.0005
- #run.learning_rate_schedule = @learning_rate_schedules.constant_learning_rate
-
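The commented-out LOSS_NORMALIZING_FACTOR = 234496 follows directly from the rule stated in the comment: for mT5, `pretraining batch_size` * `target_token_length` is 1024 * 229. A one-line check (illustration only, not part of the config):

# mT5 rule from the gin comment: pretraining batch_size * target_token_length.
assert 1024 * 229 == 234496  # the value quoted for LOSS_NORMALIZING_FACTOR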
finetuning_categorisation_large.gin → finetune_classification_large.gin RENAMED
@@ -10,31 +10,31 @@ from t5x import utils
  include "t5x/examples/t5/mt5/large.gin"
  include "t5x/configs/runs/finetune.gin"

- MIXTURE_OR_TASK_NAME = "categorise"
- TASK_FEATURE_LENGTHS = {"inputs": 256, "targets": 256}
- TRAIN_STEPS = 1_010_000 # 1000000 pre-trained steps + 10000 fine-tuning steps.
+ MIXTURE_OR_TASK_NAME = %gin.REQUIRED
+ TASK_FEATURE_LENGTHS = {"inputs": 512, "targets": 512}
+ INITIAL_CHECKPOINT_PATH = %gin.REQUIRED
+ TRAIN_STEPS = %gin.REQUIRED # 1000000 pre-trained steps + 10000 fine-tuning steps.
  USE_CACHED_TASKS = False
- DROPOUT_RATE = 0.0
+ DROPOUT_RATE = 0.1
  RANDOM_SEED = 0
+ BATCH_SIZE = 32
+
+ #Fixing a small error
+ infer_eval/utils.DatasetConfig:
+   task_feature_lengths = %TASK_FEATURE_LENGTHS
+
+ #Saving every 1000 steps
+ utils.SaveCheckpointConfig:
+   period = 1000
+

  # Pere: Only necessary if we load a t5 model. We can start with an t5x model here
  # `LOSS_NORMALIZING_FACTOR`: When fine-tuning a model that was pre-trained
  # using Mesh Tensorflow (e.g. the public T5 / mT5 / ByT5 models), this should be
  # set to `pretraining batch_size` * `target_token_length`. For T5 and T5.1.1:
  # `2048 * 114`. For mT5: `1024 * 229`. For ByT5: `1024 * 189`.
- #LOSS_NORMALIZING_FACTOR = 234496
-
- INITIAL_CHECKPOINT_PATH = "gs://t5-data/pretrained_models/t5x/mt5_large/checkpoint_1000000"
- # BATCH_SIZE = 64
-
- #train_script.train:
- # eval_period = 500
- # partitioner = @partitioning.ModelBasedPjitPartitioner()
- # partitioning.ModelBasedPjitPartitioner.num_partitions = 2
-
- # `num_decodes` is equivalent to a beam size in a beam search decoding.
- models.EncoderDecoderModel.predict_batch_with_aux.num_decodes = 4
+ # LOSS_NORMALIZING_FACTOR = 234496

- #mesh_transformer.learning_rate_schedules.constant_learning_rate.learning_rate = 0.0005
- #run.learning_rate_schedule = @learning_rate_schedules.constant_learning_rate
+ # Might have to be changed based on architecture
+ # partitioning.PjitPartitioner.num_partitions = 1

finetune_classification_small.gin ADDED
@@ -0,0 +1,39 @@
+ from __gin__ import dynamic_registration
+ import tasks
+
+ import __main__ as train_script
+ from t5.data import mixtures
+ from t5x import models
+ from t5x import partitioning
+ from t5x import utils
+
+ include "t5x/examples/t5/mt5/small.gin"
+ include "t5x/configs/runs/finetune.gin"
+
+ MIXTURE_OR_TASK_NAME = %gin.REQUIRED
+ TASK_FEATURE_LENGTHS = {"inputs": 512, "targets": 512}
+ INITIAL_CHECKPOINT_PATH = %gin.REQUIRED
+ TRAIN_STEPS = %gin.REQUIRED # 1000000 pre-trained steps + 10000 fine-tuning steps.
+ USE_CACHED_TASKS = False
+ DROPOUT_RATE = 0.1
+ RANDOM_SEED = 0
+
+ #Fixing a small error
+ infer_eval/utils.DatasetConfig:
+   task_feature_lengths = %TASK_FEATURE_LENGTHS
+
+ #Saving every 1000 steps
+ utils.SaveCheckpointConfig:
+   period = 1000
+
+
+ # Pere: Only necessary if we load a t5 model. We can start with an t5x model here
+ # `LOSS_NORMALIZING_FACTOR`: When fine-tuning a model that was pre-trained
+ # using Mesh Tensorflow (e.g. the public T5 / mT5 / ByT5 models), this should be
+ # set to `pretraining batch_size` * `target_token_length`. For T5 and T5.1.1:
+ # `2048 * 114`. For mT5: `1024 * 229`. For ByT5: `1024 * 189`.
+ # LOSS_NORMALIZING_FACTOR = 234496
+
+ # Might have to be changed based on architecture
+ # partitioning.PjitPartitioner.num_partitions = 4
+
finetune_classification_xl.gin ADDED
@@ -0,0 +1,40 @@
+ from __gin__ import dynamic_registration
+ import tasks
+
+ import __main__ as train_script
+ from t5.data import mixtures
+ from t5x import models
+ from t5x import partitioning
+ from t5x import utils
+
+ include "t5x/examples/t5/mt5/xl.gin"
+ include "t5x/configs/runs/finetune.gin"
+
+ MIXTURE_OR_TASK_NAME = %gin.REQUIRED
+ TASK_FEATURE_LENGTHS = {"inputs": 512, "targets": 512}
+ INITIAL_CHECKPOINT_PATH = %gin.REQUIRED
+ TRAIN_STEPS = %gin.REQUIRED # 1000000 pre-trained steps + 10000 fine-tuning steps.
+ USE_CACHED_TASKS = False
+ DROPOUT_RATE = 0.1
+ RANDOM_SEED = 0
+ BATCH_SIZE = 32
+
+ #Fixing a small error
+ infer_eval/utils.DatasetConfig:
+   task_feature_lengths = %TASK_FEATURE_LENGTHS
+
+ #Saving every 1000 steps
+ utils.SaveCheckpointConfig:
+   period = 1000
+
+
+ # Pere: Only necessary if we load a t5 model. We can start with an t5x model here
+ # `LOSS_NORMALIZING_FACTOR`: When fine-tuning a model that was pre-trained
+ # using Mesh Tensorflow (e.g. the public T5 / mT5 / ByT5 models), this should be
+ # set to `pretraining batch_size` * `target_token_length`. For T5 and T5.1.1:
+ # `2048 * 114`. For mT5: `1024 * 229`. For ByT5: `1024 * 189`.
+ # LOSS_NORMALIZING_FACTOR = 234496
+
+ # Might have to be changed based on architecture
+ # partitioning.PjitPartitioner.num_partitions = 1
+
tasks.py CHANGED
@@ -9,9 +9,9 @@ import t5
  import tensorflow.compat.v1 as tf

  tsv_path = {
-     "train": "gs://peregilcloud/italian_tweets/train3.tsv",
-     "validation": "gs://peregilcloud/italian_tweets/dev.tsv",
-     "test": "gs://peregilcloud/italian_tweets/test.tsv"
+     "train": "gs://eu-jav-t5x/corpus/labeled/datasetA_train_3categories.tsv",
+     "validation": "gs://eu-jav-t5x/corpus/labeled/datasetA_dev_3categories.tsv",
+     "test": "gs://eu-jav-t5x/corpus/labeled/datasetA_test_3categories.tsv"
  }

  vocabulary = seqio.SentencePieceVocabulary(
@@ -47,7 +47,7 @@ def categorise_preprocessor(ds):


  seqio.TaskRegistry.add(
-     "categorise",
+     "categorise_tweets",
      source=seqio.TextLineDataSource(
          split_to_filepattern=tsv_path,
          #num_input_examples=num_nq_examples
@@ -55,7 +55,7 @@ seqio.TaskRegistry.add(
      preprocessors=[
          functools.partial(
              t5.data.preprocessors.parse_tsv,
-             field_names=["target","source"]),
+             field_names=["annotator1","annotator2","annotator3","target","source","id"]),
          categorise_preprocessor,
          seqio.preprocessors.tokenize_and_append_eos,
      ],
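Note that batch_finetune_eu_jav.sh passes --gin.MIXTURE_OR_TASK_NAME=\"classify_tweets\", while tasks.py now registers the task as "categorise_tweets"; the two names have to match for t5x to resolve the task. The snippet below is a minimal, hypothetical sanity check (not part of the commit) that the registration works and yields examples; it assumes tasks.py is importable (PYTHONPATH set as in the scripts) and that the GCS paths are readable.

# Hypothetical sanity check for the registered seqio task (illustration only).
import seqio
import tasks  # importing tasks.py runs the seqio.TaskRegistry.add(...) call above

task = seqio.get_mixture_or_task("categorise_tweets")
ds = task.get_dataset(
    sequence_length={"inputs": 512, "targets": 512},  # matches TASK_FEATURE_LENGTHS
    split="validation",
    shuffle=False,
)
for example in ds.take(1):
    print({k: v.numpy() for k, v in example.items()})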
train_base.sh DELETED
@@ -1,11 +0,0 @@
- PROJECT_DIR=${HOME}"/models/eu-jav-categorisation"
- T5X_DIR="../../t5x" # directory where the t5x is cloned.
- #Needs to be updated when moving to tpu-v4 it should then be in another zone
- MODEL_DIR="gs://nb-t5x/eujav_base"
- export PYTHONPATH=${PROJECT_DIR}
-
- python3 ${T5X_DIR}/t5x/train.py \
-   --gin_search_paths=${PROJECT_DIR} \
-   --gin_file="finetuning_categorisation.gin" \
-   --gin.MODEL_DIR="'${MODEL_DIR}'"
-
train_large.sh DELETED
@@ -1,11 +0,0 @@
- PROJECT_DIR=${HOME}"/models/eu-jav-categorisation"
- T5X_DIR="../../t5x" # directory where the t5x is cloned.
- #Needs to be updated when moving to tpu-v4 it should then be in another zone
- MODEL_DIR="gs://nb-t5x-us-central2/eujav_large3"
- export PYTHONPATH=${PROJECT_DIR}
-
- python3 ${T5X_DIR}/t5x/train.py \
-   --gin_search_paths=${PROJECT_DIR} \
-   --gin_file="finetuning_categorisation_large.gin" \
-   --gin.MODEL_DIR="'${MODEL_DIR}'"
-