pere committed
Commit e957cc1 (parent: ad029a3)

updated dataset
__pycache__/my_metrics.cpython-38.pyc ADDED
    Binary file (459 Bytes)

__pycache__/tasks.cpython-38.pyc ADDED
    Binary file (4.79 kB)
finetune_base.sh ADDED
@@ -0,0 +1,11 @@
+PROJECT_DIR=${HOME}"/models/north-t5-base-deuncaser"
+export PYTHONPATH=${PROJECT_DIR}
+INITIAL_CHECKPOINT_PATH=\"gs://north-t5x/pretrained_models/base/norwegian_NCC_plus_English_pluss200k_balanced_bokmaal_nynorsk_t5x_base/checkpoint_1700000\"
+TRAIN_STEPS=1800000
+python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin.TRAIN_STEPS=${TRAIN_STEPS} --gin_file="finetune_deuncaser_base.gin" --gin.INITIAL_CHECKPOINT_PATH=${INITIAL_CHECKPOINT_PATH} --gin.MIXTURE_OR_TASK_NAME=\"deuncaser\" --gin.MODEL_DIR=\"gs://eu-jav-t5x/finetuned/deuncaser/deuncaser_base_v1\"
+
+#python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin.TRAIN_STEPS=${TRAIN_STEPS} --gin_file="finetune_classification_base.gin" --gin.INITIAL_CHECKPOINT_PATH=${INITIAL_CHECKPOINT_PATH} --gin.MIXTURE_OR_TASK_NAME=\"classify_tweets\" --gin.MODEL_DIR=\"gs://eu-jav-t5x/finetuned/italian_tweets/ts20_classify_tweets_base_v2\" &&
+#python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin.TRAIN_STEPS=${TRAIN_STEPS} --gin_file="finetune_classification_base.gin" --gin.INITIAL_CHECKPOINT_PATH=${INITIAL_CHECKPOINT_PATH} --gin.MIXTURE_OR_TASK_NAME=\"classify_tweets\" --gin.MODEL_DIR=\"gs://eu-jav-t5x/finetuned/italian_tweets/ts20_classify_tweets_base_v3\" &&
+#python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin.TRAIN_STEPS=${TRAIN_STEPS} --gin_file="finetune_classification_base.gin" --gin.INITIAL_CHECKPOINT_PATH=${INITIAL_CHECKPOINT_PATH} --gin.MIXTURE_OR_TASK_NAME=\"classify_tweets\" --gin.MODEL_DIR=\"gs://eu-jav-t5x/finetuned/italian_tweets/ts20_classify_tweets_base_v4\" &&
+#python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin.TRAIN_STEPS=${TRAIN_STEPS} --gin_file="finetune_classification_base.gin" --gin.INITIAL_CHECKPOINT_PATH=${INITIAL_CHECKPOINT_PATH} --gin.MIXTURE_OR_TASK_NAME=\"classify_tweets\" --gin.MODEL_DIR=\"gs://eu-jav-t5x/finetuned/italian_tweets/ts_20classify_tweets_base_v5\"
+
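Note: the script exports PYTHONPATH=${PROJECT_DIR}, presumably so that tasks.py (which registers the "deuncaser" task referenced by --gin.MIXTURE_OR_TASK_NAME) is importable when t5x starts. Below is a minimal, hypothetical sketch for checking that the task resolves before launching training; the sequence lengths are made up and the GCS paths must be readable.

    # Sketch only (not part of this commit): confirm that the "deuncaser"
    # task registered in tasks.py resolves and yields examples.
    import seqio
    import tasks  # noqa: F401  (side effect: seqio.TaskRegistry.add("deuncaser", ...))

    task = seqio.TaskRegistry.get("deuncaser")
    ds = task.get_dataset(
        sequence_length={"inputs": 512, "targets": 512},  # hypothetical lengths
        split="validation",
        shuffle=False,
    )
    for ex in ds.take(1).as_numpy_iterator():
        print(ex["inputs"][:20], ex["targets"][:20])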
finetune_deuncaser_base.gin CHANGED
@@ -22,9 +22,9 @@ RANDOM_SEED = 0
 infer_eval/utils.DatasetConfig:
   task_feature_lengths = %TASK_FEATURE_LENGTHS
 
-#Saving every 1000 steps
+#Saving every 10000 steps
 utils.SaveCheckpointConfig:
-  period = 1000
+  period = 10000
 
 
 # Pere: Only necessary if we load a t5 model. We can start with an t5x model here
tasks.py CHANGED
@@ -10,9 +10,9 @@ import t5
 import tensorflow.compat.v1 as tf
 
 tsv_path = {
-    "train": "gs://eu-jav-t5x/corpus/labeled/datasetA_train_3categories.tsv",
-    "validation": "gs://eu-jav-t5x/corpus/labeled/datasetA_dev_3categories.tsv",
-    "test": "gs://eu-jav-t5x/corpus/labeled/ datasetA_test_3categories.tsv"
+    "train": "gs://north-t5x/corpus/deuncaser/norwegian/train.tsv",
+    "validation": "gs://north-t5x/corpus/deuncaser/norwegian/validation.tsv",
+    "test": "gs://north-t5x/corpus/deuncaser/norwegian/validation.tsv"
 }
 
 vocabulary = seqio.SentencePieceVocabulary(
@@ -138,7 +138,7 @@ def categorise_binary_preprocessor(ds):
 
 
 seqio.TaskRegistry.add(
-    "classify_tweets",
+    "deuncaser",
     source=seqio.TextLineDataSource(
        split_to_filepattern=tsv_path,
        #num_input_examples=num_nq_examples
@@ -146,11 +146,11 @@ seqio.TaskRegistry.add(
     preprocessors=[
       functools.partial(
           t5.data.preprocessors.parse_tsv,
-          field_names=["annotator1","annotator2","annotator3","target","source","id"]),
+          field_names=["id","methods","source","target"]),
       categorise_preprocessor,
       seqio.preprocessors.tokenize_and_append_eos,
     ],
-    metric_fns=[metrics.accuracy,my_metrics.f1_macro],
+    metric_fns=[metrics.accuracy,metrics.bleu],
     output_features=DEFAULT_OUTPUT_FEATURES,
 )
 
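Note: the task keeps the existing categorise_preprocessor (its body is not shown in this diff) and only swaps the TSV column order and the metrics. The sketch below illustrates the kind of mapping such a preprocessor would perform for the deuncaser rows, assuming "source" holds the lower-cased/de-punctuated text and "target" the restored sentence; the function name is hypothetical.

    # Sketch only: the real categorise_preprocessor is not part of this diff.
    # Maps the parsed TSV fields (id, methods, source, target) onto the
    # "inputs"/"targets" features that tokenize_and_append_eos expects.
    import tensorflow.compat.v1 as tf

    def deuncaser_preprocessor(ds):
        def to_inputs_and_targets(ex):
            return {
                "inputs": ex["source"],   # assumed: un-cased, de-punctuated input text
                "targets": ex["target"],  # assumed: correctly cased target text
            }
        return ds.map(to_inputs_and_targets,
                      num_parallel_calls=tf.data.experimental.AUTOTUNE)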