pere committed on
Commit 97d299e
1 Parent(s): ac636e2
README.md ADDED
@@ -0,0 +1,3 @@
+ ---
+ license: apache-2.0
+ ---
finetune_summary_base.gin ADDED
@@ -0,0 +1,39 @@
+ from __gin__ import dynamic_registration
+ import tasks
+
+ import __main__ as train_script
+ from t5.data import mixtures
+ from t5x import models
+ from t5x import partitioning
+ from t5x import utils
+
+ include "t5x/examples/t5/mt5/base.gin"
+ include "t5x/configs/runs/finetune.gin"
+
+ MIXTURE_OR_TASK_NAME = %gin.REQUIRED
+ TASK_FEATURE_LENGTHS = {"inputs": 512, "targets": 128}
+ INITIAL_CHECKPOINT_PATH = %gin.REQUIRED
+ TRAIN_STEPS = %gin.REQUIRED  # 1000000 pre-trained steps + 10000 fine-tuning steps.
+ USE_CACHED_TASKS = False
+ DROPOUT_RATE = 0.1
+ RANDOM_SEED = 0
+
+ # Fixing a small error
+ infer_eval/utils.DatasetConfig:
+   task_feature_lengths = %TASK_FEATURE_LENGTHS
+
+ # Saving every 1000 steps
+ utils.SaveCheckpointConfig:
+   period = 1000
+
+
+ # Pere: Only necessary if we load a T5 model. We can start with a t5x model here.
+ # `LOSS_NORMALIZING_FACTOR`: When fine-tuning a model that was pre-trained
+ # using Mesh Tensorflow (e.g. the public T5 / mT5 / ByT5 models), this should be
+ # set to `pretraining batch_size` * `target_token_length`. For T5 and T5.1.1:
+ # `2048 * 114`. For mT5: `1024 * 229`. For ByT5: `1024 * 189`.
+ # LOSS_NORMALIZING_FACTOR = 234496
+
+ # Might have to be changed based on the architecture.
+ # partitioning.PjitPartitioner.num_partitions = 1
+
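The commented-out `LOSS_NORMALIZING_FACTOR` above is just the product described in the comment: pretraining batch size times target token length, so the mT5 value is 1024 * 229 = 234496. A minimal sketch of that arithmetic (the helper name is hypothetical and not part of this commit):

```python
# Hypothetical helper mirroring the comment in the gin files: for checkpoints
# pre-trained with Mesh TensorFlow, LOSS_NORMALIZING_FACTOR should equal the
# pretraining batch size multiplied by the target token length.
def loss_normalizing_factor(pretrain_batch_size: int, target_token_length: int) -> int:
    return pretrain_batch_size * target_token_length

assert loss_normalizing_factor(2048, 114) == 233472  # T5 / T5.1.1
assert loss_normalizing_factor(1024, 229) == 234496  # mT5, matches the commented value
assert loss_normalizing_factor(1024, 189) == 193536  # ByT5
```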
finetune_summary_large.gin ADDED
@@ -0,0 +1,40 @@
+ from __gin__ import dynamic_registration
+ import tasks
+
+ import __main__ as train_script
+ from t5.data import mixtures
+ from t5x import models
+ from t5x import partitioning
+ from t5x import utils
+
+ include "t5x/examples/t5/mt5/large.gin"
+ include "t5x/configs/runs/finetune.gin"
+
+ MIXTURE_OR_TASK_NAME = %gin.REQUIRED
+ TASK_FEATURE_LENGTHS = {"inputs": 512, "targets": 128}
+ INITIAL_CHECKPOINT_PATH = %gin.REQUIRED
+ TRAIN_STEPS = %gin.REQUIRED  # 1000000 pre-trained steps + 10000 fine-tuning steps.
+ USE_CACHED_TASKS = False
+ DROPOUT_RATE = 0.1
+ RANDOM_SEED = 0
+ # BATCH_SIZE = 32
+
+ # Fixing a small error
+ infer_eval/utils.DatasetConfig:
+   task_feature_lengths = %TASK_FEATURE_LENGTHS
+
+ # Saving every 1000 steps
+ utils.SaveCheckpointConfig:
+   period = 1000
+
+
+ # Pere: Only necessary if we load a T5 model. We can start with a t5x model here.
+ # `LOSS_NORMALIZING_FACTOR`: When fine-tuning a model that was pre-trained
+ # using Mesh Tensorflow (e.g. the public T5 / mT5 / ByT5 models), this should be
+ # set to `pretraining batch_size` * `target_token_length`. For T5 and T5.1.1:
+ # `2048 * 114`. For mT5: `1024 * 229`. For ByT5: `1024 * 189`.
+ # LOSS_NORMALIZING_FACTOR = 234496
+
+ # Might have to be changed based on the architecture.
+ # partitioning.PjitPartitioner.num_partitions = 1
+
finetune_summary_xl.gin ADDED
@@ -0,0 +1,40 @@
+ from __gin__ import dynamic_registration
+ import tasks
+
+ import __main__ as train_script
+ from t5.data import mixtures
+ from t5x import models
+ from t5x import partitioning
+ from t5x import utils
+
+ include "t5x/examples/t5/mt5/xl.gin"
+ include "t5x/configs/runs/finetune.gin"
+
+ MIXTURE_OR_TASK_NAME = %gin.REQUIRED
+ TASK_FEATURE_LENGTHS = {"inputs": 512, "targets": 128}
+ INITIAL_CHECKPOINT_PATH = %gin.REQUIRED
+ TRAIN_STEPS = %gin.REQUIRED  # 1000000 pre-trained steps + 10000 fine-tuning steps.
+ USE_CACHED_TASKS = False
+ DROPOUT_RATE = 0.1
+ RANDOM_SEED = 0
+ # BATCH_SIZE = 32
+
+ # Fixing a small error
+ infer_eval/utils.DatasetConfig:
+   task_feature_lengths = %TASK_FEATURE_LENGTHS
+
+ # Saving every 5000 steps
+ utils.SaveCheckpointConfig:
+   period = 5000
+
+
+ # Pere: Only necessary if we load a T5 model. We can start with a t5x model here.
+ # `LOSS_NORMALIZING_FACTOR`: When fine-tuning a model that was pre-trained
+ # using Mesh Tensorflow (e.g. the public T5 / mT5 / ByT5 models), this should be
+ # set to `pretraining batch_size` * `target_token_length`. For T5 and T5.1.1:
+ # `2048 * 114`. For mT5: `1024 * 229`. For ByT5: `1024 * 189`.
+ # LOSS_NORMALIZING_FACTOR = 234496
+
+ # Might have to be changed based on the architecture.
+ # partitioning.PjitPartitioner.num_partitions = 2
+
my_metrics.py ADDED
@@ -0,0 +1,7 @@
+ import sklearn.metrics
+ import numpy as np
+
+ def f1_macro(targets, predictions):
+     targets, predictions = np.asarray(targets).astype(str), np.asarray(predictions).astype(str)
+     return {"f1_macro": 100 * sklearn.metrics.f1_score(targets, predictions, average='macro')}
+
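seqio metric functions of this form receive parallel lists of decoded target and prediction strings for an eval split and return a dict of named scores. A small usage sketch (the labels are invented for illustration):

```python
# f1_macro expects parallel lists of decoded strings and returns the
# macro-averaged F1 scaled to a percentage.
import my_metrics

scores = my_metrics.f1_macro(
    targets=["positive", "negative", "positive", "negative"],
    predictions=["positive", "negative", "negative", "negative"],
)
print(scores)  # -> {'f1_macro': 73.33...} for this toy example
```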
summary_base.sh ADDED
@@ -0,0 +1,14 @@
+ PROJECT_DIR=${HOME}"/models/t5-engelsk-oversetter"
+ export PYTHONPATH=${PROJECT_DIR}
+ INITIAL_CHECKPOINT_PATH=\"gs://nb-t5x-us-central2/scandinavian3k_solo_t5x_base/checkpoint_3000000\"
+ TRAIN_STEPS=3030000
+
+
+
+ python3 ../../t5x/t5x/train.py \
+   --gin_search_paths="./" \
+   --gin.TRAIN_STEPS=${TRAIN_STEPS} \
+   --gin_file="finetune_summary_base.gin" \
+   --gin.MIXTURE_OR_TASK_NAME=\"summary\" \
+   --gin.MODEL_DIR=\"gs://nb-t5x-us-central2/finetuned/summary_scandinavian3k_solo_base\" \
+   --gin.INITIAL_CHECKPOINT_PATH=${INITIAL_CHECKPOINT_PATH} \
summary_large.sh ADDED
@@ -0,0 +1,14 @@
+ PROJECT_DIR=${HOME}"/models/t5-engelsk-oversetter"
+ export PYTHONPATH=${PROJECT_DIR}
+ INITIAL_CHECKPOINT_PATH=\"gs://nb-t5x-us-central2/scandinavian3k_solo_t5x_large/checkpoint_3000000\"
+ TRAIN_STEPS=3030000
+
+
+
+ python3 ../../t5x/t5x/train.py \
+   --gin_search_paths="./" \
+   --gin.TRAIN_STEPS=${TRAIN_STEPS} \
+   --gin_file="finetune_summary_large.gin" \
+   --gin.MIXTURE_OR_TASK_NAME=\"summary\" \
+   --gin.MODEL_DIR=\"gs://nb-t5x-us-central2/finetuned/summary_scandinavian3k_solo_large\" \
+   --gin.INITIAL_CHECKPOINT_PATH=${INITIAL_CHECKPOINT_PATH} \
summary_xl.sh ADDED
@@ -0,0 +1,12 @@
+ PROJECT_DIR=${HOME}"/models/t5-engelsk-oversetter"
+ export PYTHONPATH=${PROJECT_DIR}
+ INITIAL_CHECKPOINT_PATH=\"gs://nb-t5x-us-central2/scandinavian3k_solo_t5x_xl/checkpoint_3000000\"
+ TRAIN_STEPS=3030000
+
+ python3 ../../t5x/t5x/train.py \
+   --gin_search_paths="./" \
+   --gin.TRAIN_STEPS=${TRAIN_STEPS} \
+   --gin_file="finetune_summary_xl.gin" \
+   --gin.MIXTURE_OR_TASK_NAME=\"summary\" \
+   --gin.MODEL_DIR=\"gs://nb-t5x-us-central2/finetuned/summary_scandinavian3k_solo_xl\" \
+   --gin.INITIAL_CHECKPOINT_PATH=${INITIAL_CHECKPOINT_PATH} \
tasks.py ADDED
@@ -0,0 +1,191 @@
+ # /home/perk/mymodel/categorisation-mt5x/tasks.py
+
+
+ import functools
+ import seqio
+ import my_metrics
+ import tensorflow_datasets as tfds
+ from t5.evaluation import metrics
+ from t5.data import preprocessors
+ # import my_preprocessors
+ import t5
+ import tensorflow.compat.v1 as tf
+
+
+
+ tsv_parliament_path = {
+     "train": "gs://notram-public/finetune_datasets/parliament_speeches_1998_2016_frp_or_sv/train.tsv",
+     "validation": "gs://notram-public/finetune_datasets/parliament_speeches_1998_2016_frp_or_sv/dev.tsv",
+     "test": "gs://notram-public/finetune_datasets/parliament_speeches_1998_2016_frp_or_sv/test.tsv"
+ }
+
+ summary_path = {
+     "train": "gs://north-t5x/corpus/summary_test/all_train.tsv",
+     "validation": "gs://north-t5x/corpus/summary_test/nrk_test.tsv",
+     "test": "gs://north-t5x/corpus/summary_test/nrk_test.tsv"
+ }
+
+ tsv_translate_path = {
+     "train": "gs://nb-t5x-us-central2/corpus_en_no/train.tsv",
+     "validation": "gs://nb-t5x-us-central2/corpus_en_no/dev.tsv",
+     "test": "gs://nb-t5x-us-central2/corpus_en_no/test.tsv"
+ }
+
+
+ tsv_sentiment_path = {
+     "train": "gs://notram-public/finetune_datasets/norec_sentiment/train.tsv",
+     "validation": "gs://notram-public/finetune_datasets/norec_sentiment/dev.tsv",
+     "test": "gs://notram-public/finetune_datasets/norec_sentiment/test.tsv"
+ }
+
+ json_angry_tweets_path = {
+     "train": "gs://notram-public/finetune_datasets/angry_tweets/train.jsonl",
+     "validation": "gs://notram-public/finetune_datasets/angry_tweets/test.jsonl",
+     "test": "gs://notram-public/finetune_datasets/angry_tweets/test.jsonl"
+ }
+
+ tsv_angry_tweets_path = {
+     "train": "gs://notram-public/finetune_datasets/angry_tweets/train.tsv",
+     "validation": "gs://notram-public/finetune_datasets/angry_tweets/test.tsv",
+     "test": "gs://notram-public/finetune_datasets/angry_tweets/test.tsv"
+ }
+
+
+ tsv_dane_path = {
+     "train": "gs://notram-public/finetune_datasets/dane/train.tsv",
+     "validation": "gs://notram-public/finetune_datasets/dane/test.tsv",
+     "test": "gs://notram-public/finetune_datasets/dane/test.tsv"
+ }
+
+ tsv_dane_tokens_path = {
+     "train": "gs://notram-public/finetune_datasets/dane/train_tokens.tsv",
+     "validation": "gs://notram-public/finetune_datasets/dane/test_tokens.tsv",
+     "test": "gs://notram-public/finetune_datasets/dane/test_tokens.tsv"
+ }
+
+
+ tsv_dane_long_tokens_path = {
+     "train": "gs://notram-public/finetune_datasets/dane/train_long_tokens.tsv",
+     "validation": "gs://notram-public/finetune_datasets/dane/test_long_tokens.tsv",
+     "test": "gs://notram-public/finetune_datasets/dane/test_long_tokens.tsv"
+ }
+
+
+ vocabulary = seqio.SentencePieceVocabulary(
+     'gs://t5-data/vocabs/mc4.250000.100extra/sentencepiece.model', extra_ids=0)
+
+ DEFAULT_OUTPUT_FEATURES = {
+     "inputs":
+         seqio.Feature(
+             vocabulary=vocabulary, add_eos=True),
+     "targets":
+         seqio.Feature(
+             vocabulary=vocabulary, add_eos=True)
+ }
+
+ def categorise_preprocessor(ds):
+     def normalize_text(text):
+         """Lowercase and remove quotes from a TensorFlow string."""
+         # text = tf.strings.regex_replace(text, "'(.*)'", r"\1")
+         ...
+         return text
+
+     def to_inputs_and_targets(ex):
+         """Map {"source": ..., "target": ...} -> {"inputs": ..., "targets": ...}."""
+         return {
+             "inputs":
+                 tf.strings.join(
+                     [normalize_text(ex["source"])]),
+             "targets":
+                 tf.strings.join(
+                     [normalize_text(ex["target"])]),
+         }
+     return ds.map(to_inputs_and_targets,
+                   num_parallel_calls=tf.data.experimental.AUTOTUNE)
+
+
+ seqio.TaskRegistry.add(
+     "parliament",
+     source=seqio.TextLineDataSource(
+         split_to_filepattern=tsv_parliament_path,
+         # num_input_examples=num_nq_examples
+     ),
+     preprocessors=[
+         functools.partial(
+             t5.data.preprocessors.parse_tsv,
+             field_names=["target", "source"]),
+         categorise_preprocessor,
+         seqio.preprocessors.tokenize_and_append_eos,
+     ],
+     metric_fns=[metrics.accuracy, my_metrics.f1_macro],
+     output_features=DEFAULT_OUTPUT_FEATURES,
+ )
+
+ seqio.TaskRegistry.add(
+     "sentiment",
+     source=seqio.TextLineDataSource(
+         split_to_filepattern=tsv_sentiment_path,
+         # num_input_examples=num_nq_examples
+     ),
+     preprocessors=[
+         functools.partial(
+             t5.data.preprocessors.parse_tsv,
+             field_names=["target", "source"]),
+         categorise_preprocessor,
+         seqio.preprocessors.tokenize_and_append_eos,
+     ],
+     metric_fns=[metrics.accuracy, my_metrics.f1_macro],
+     output_features=DEFAULT_OUTPUT_FEATURES,
+ )
+
+ seqio.TaskRegistry.add(
+     "angry_tweets",
+     source=seqio.TextLineDataSource(
+         split_to_filepattern=tsv_angry_tweets_path,
+         # num_input_examples=num_nq_examples
+     ),
+     preprocessors=[
+         functools.partial(
+             t5.data.preprocessors.parse_tsv,
+             field_names=["target", "source"]),
+         categorise_preprocessor,
+         seqio.preprocessors.tokenize_and_append_eos,
+     ],
+     metric_fns=[metrics.accuracy, my_metrics.f1_macro],
+     output_features=DEFAULT_OUTPUT_FEATURES,
+ )
+
+ seqio.TaskRegistry.add(
+     "dane",
+     source=seqio.TextLineDataSource(
+         split_to_filepattern=tsv_dane_long_tokens_path,
+         # num_input_examples=num_nq_examples
+     ),
+     preprocessors=[
+         functools.partial(
+             t5.data.preprocessors.parse_tsv,
+             field_names=["placeholder1", "placeholder2", "placeholder3", "target", "source"]),
+         categorise_preprocessor,
+         seqio.preprocessors.tokenize_and_append_eos,
+     ],
+     metric_fns=[metrics.accuracy, my_metrics.f1_macro],
+     output_features=DEFAULT_OUTPUT_FEATURES,
+ )
+
+ seqio.TaskRegistry.add(
+     "summary",
+     source=seqio.TextLineDataSource(
+         split_to_filepattern=tsv_translate_path,
+         # num_input_examples=num_nq_examples
+     ),
+     preprocessors=[
+         functools.partial(
+             t5.data.preprocessors.parse_tsv,
+             field_names=["source", "target"]),
+         categorise_preprocessor,
+         seqio.preprocessors.tokenize_and_append_eos,
+     ],
+     metric_fns=[metrics.accuracy, my_metrics.f1_macro, metrics.bleu, metrics.rouge],
+     output_features=DEFAULT_OUTPUT_FEATURES,
+ )
+
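As a quick sanity check that the registrations above work, the tasks can be pulled through seqio directly. This is a hedged sketch rather than part of the commit; it assumes the GCS paths above are readable and reuses the feature lengths from the gin configs:

```python
# Smoke-test the "summary" task: build a small validation dataset and decode
# one example with the same SentencePiece vocabulary registered above.
import seqio
import tasks  # noqa: F401  (importing registers the tasks as a side effect)

task = seqio.get_mixture_or_task("summary")
ds = task.get_dataset(
    sequence_length={"inputs": 512, "targets": 128},
    split="validation",
    shuffle=False,
)
for ex in ds.take(1):
    vocab = task.output_features["inputs"].vocabulary
    print(vocab.decode(ex["inputs"].numpy().tolist()))
```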