pere committed
Commit 0173bf7
Parent: 9258b0c

more small changes

corpus/angry_tweets/jsonlines2tsv.py ADDED
@@ -0,0 +1,66 @@
+import csv
+import ujson as json
+import gzip
+import sys
+from tqdm import tqdm
+
+
+def validate_to_set(x):
+    if x is None:
+        return set()
+    elif isinstance(x, (tuple, list)):
+        return set(x)
+    elif isinstance(x, str):
+        return set([x])
+    return -1
+
+
+def main(in_path, out_path, delim='\t', keep_fields=None, skip_fields=None):
+    """
+
+    :param str in_path:
+    :param str out_path:
+    :param str delim:
+    :param list|str keep_fields:
+    :param list|str skip_fields:
+    """
+    keep_fields = validate_to_set(keep_fields)
+    if keep_fields == -1:
+        return
+    skip_fields = validate_to_set(skip_fields)
+    if skip_fields == -1:
+        return
+
+    fmt = in_path.split('.')[-1]
+    if fmt == 'gz':
+        open_to_use = gzip.open
+    else:
+        open_to_use = open
+
+    # Read the file once to get a list of all keep fields;
+    # skip this pass if a set of keep fields is already defined
+    line_count = None
+    if len(keep_fields) == 0:
+        line_count = 0
+        for line in tqdm(open_to_use(in_path)):
+            keep_fields.update(list(json.loads(line).keys()))
+            line_count += 1
+
+    keep_fields.difference_update(skip_fields)
+
+    # force alphabetization
+    keep_list = sorted(keep_fields)
+
+    with open(out_path, 'w') as outfile:
+        writer = csv.writer(outfile, delimiter=delim)
+        #writer.writerow(keep_list)
+        for line in tqdm(open_to_use(in_path), total=line_count):
+            jsn = json.loads(line)
+            writer.writerow([jsn[x].replace("\n", " ").replace("\t", " ") if x in jsn else '' for x in keep_list])
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 3:
+        print('Usage: python jsonlines2tsv.py <in_file> <out_file>')
+        sys.exit(0)
+    main(sys.argv[1], sys.argv[2], skip_fields=['content'])
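
Note: as a quick sanity check of the converter above, a minimal sketch. The sample record is invented; running it assumes ujson and tqdm are installed and that jsonlines2tsv.py is in the working directory.

import json
import subprocess

# One made-up record; 'content' is dropped because skip_fields=['content']
# is hard-coded in the __main__ block above.
record = {"text": "Sikke en dag!", "label": "neutral", "content": "dropped"}
with open("sample.jsonl", "w") as f:
    f.write(json.dumps(record) + "\n")

subprocess.run(["python", "jsonlines2tsv.py", "sample.jsonl", "sample.tsv"],
               check=True)
# The surviving keys are written in sorted order (label, text), so the row
# should read: neutral<TAB>Sikke en dag!
print(open("sample.tsv").read())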
corpus/angry_tweets/test.csv ADDED
The diff for this file is too large to render. See raw diff
 
corpus/angry_tweets/test.tsv ADDED
The diff for this file is too large to render. See raw diff
 
corpus/angry_tweets/test2.tsv ADDED
The diff for this file is too large to render. See raw diff
 
corpus/angry_tweets/train.csv ADDED
The diff for this file is too large to render. See raw diff
 
corpus/angry_tweets/train.tsv ADDED
The diff for this file is too large to render. See raw diff
 
corpus/dane/jsonlines2tsv.py ADDED
@@ -0,0 +1,69 @@
+import csv
+import ujson as json
+import gzip
+import sys
+from tqdm import tqdm
+
+
+def validate_to_set(x):
+    if x is None:
+        return set()
+    elif isinstance(x, (tuple, list)):
+        return set(x)
+    elif isinstance(x, str):
+        return set([x])
+    return -1
+
+
+def main(in_path, out_path, delim='\t', keep_fields=None, skip_fields=None):
+    """
+
+    :param str in_path:
+    :param str out_path:
+    :param str delim:
+    :param list|str keep_fields:
+    :param list|str skip_fields:
+    """
+    keep_fields = validate_to_set(keep_fields)
+    if keep_fields == -1:
+        return
+    skip_fields = validate_to_set(skip_fields)
+    if skip_fields == -1:
+        return
+
+    fmt = in_path.split('.')[-1]
+    if fmt == 'gz':
+        open_to_use = gzip.open
+    else:
+        open_to_use = open
+
+    # Read the file once to get a list of all keep fields;
+    # skip this pass if a set of keep fields is already defined
+    line_count = None
+    if len(keep_fields) == 0:
+        line_count = 0
+        for line in tqdm(open_to_use(in_path)):
+            keep_fields.update(list(json.loads(line).keys()))
+            line_count += 1
+
+    keep_fields.difference_update(skip_fields)
+
+    # force alphabetization
+    keep_list = sorted(keep_fields)
+    keep_list.append("combined")
+
+    with open(out_path, 'w') as outfile:
+        writer = csv.writer(outfile, delimiter=delim)
+        #writer.writerow(keep_list)
+        for line in tqdm(open_to_use(in_path), total=line_count):
+            jsn = json.loads(line)
+            jsn['combined'] = dict(zip(jsn['tokens'], jsn['ner_tags']))
+            #writer.writerow([jsn[x].replace("\n", " ").replace("\t", " ") if x in jsn else '' for x in keep_list])
+            writer.writerow([jsn[x] if x in jsn else '' for x in keep_list])
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 3:
+        print('Usage: python jsonlines2tsv.py <in_file> <out_file>')
+        sys.exit(0)
+    main(sys.argv[1], sys.argv[2], keep_fields=['doc','tokens','ner_tags'])
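
Note: this dane variant differs from the angry_tweets one mainly through the synthetic "combined" column built from dict(zip(jsn['tokens'], jsn['ner_tags'])). A small illustration of what that produces (token and tag values are made up):

tokens = ["Peter", "bor", "i", "København", "i", "Danmark"]
ner_tags = ["B-PER", "O", "O", "B-LOC", "O", "B-LOC"]

# dict keys are unique, so the repeated token "i" appears only once in the
# result; duplicate tokens silently lose their positional tags.
combined = dict(zip(tokens, ner_tags))
print(combined)
# {'Peter': 'B-PER', 'bor': 'O', 'i': 'O', 'København': 'B-LOC', 'Danmark': 'B-LOC'}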
corpus/dane/test.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
corpus/dane/test.tsv ADDED
The diff for this file is too large to render. See raw diff
 
corpus/dane/test_tokens.tsv ADDED
The diff for this file is too large to render. See raw diff
 
corpus/dane/train.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
corpus/dane/train.tsv ADDED
The diff for this file is too large to render. See raw diff
 
corpus/dane/train_tokens.tsv ADDED
The diff for this file is too large to render. See raw diff
 
finetune_categorisation_large.gin CHANGED
@@ -10,9 +10,9 @@ from t5x import utils
 include "t5x/examples/t5/mt5/large.gin"
 include "t5x/configs/runs/finetune.gin"
 
-MIXTURE_OR_TASK_NAME = "angry_tweets"
-TASK_FEATURE_LENGTHS = {"inputs": 512, "targets": 8}
-TRAIN_STEPS = 1_005_000  # 1000000 pre-trained steps + 5000 fine-tuning steps.
+MIXTURE_OR_TASK_NAME = "dane"
+TASK_FEATURE_LENGTHS = {"inputs": 256, "targets": 512}
+TRAIN_STEPS = 1_720_000  # 1700000 pre-trained steps + 20000 fine-tuning steps.
 USE_CACHED_TASKS = False
 DROPOUT_RATE = 0.1
 RANDOM_SEED = 0
@@ -24,17 +24,24 @@ RANDOM_SEED = 0
 # `2048 * 114`. For mT5: `1024 * 229`. For ByT5: `1024 * 189`.
 #LOSS_NORMALIZING_FACTOR = 234496
 
-#INITIAL_CHECKPOINT_PATH = "gs://t5-data/pretrained_models/t5x/mt5_base/checkpoint_1000000"
-#INITIAL_CHECKPOINT_PATH = "gs://t5-data/pretrained_models/t5x/mt5_large/checkpoint_1000000"
-#INITIAL_CHECKPOINT_PATH = "gs://nb-t5x-us-central2/pk_nb_t5x_base_run1/checkpoint_1100000"
 INITIAL_CHECKPOINT_PATH = "gs://nb-t5x-us-central2/norwegian_NCC_plus_English_pluss200k_scandinavian_t5x_large/checkpoint_1700000"
 
 
+#Fixing a small error
+infer_eval/utils.DatasetConfig:
+  task_feature_lengths = %TASK_FEATURE_LENGTHS
+
+#Saving every 2000 steps
+utils.SaveCheckpointConfig:
+  period = 2000
+
+
+
 
 #train_script.train:
 #  eval_period = 500
 #  partitioner = @partitioning.ModelBasedPjitPartitioner()
-#  partitioning.PjitPartitioner.num_partitions = 1
+partitioning.PjitPartitioner.num_partitions = 1
 
 # `num_decodes` is equivalent to a beam size in a beam search decoding.
 # models.EncoderDecoderModel.predict_batch_with_aux.num_decodes = 1
finetune_large.sh CHANGED
@@ -1,7 +1,7 @@
 PROJECT_DIR=${HOME}"/models/t5-parliament-categorisation"
 T5X_DIR="../../t5x" # directory where the t5x is cloned.
 #Needs to be updated when moving to tpu-v4 it should then be in another zone
-MODEL_DIR="gs://nb-t5x-us-central2/finetuned/v1_eval_angry_tweets_scandinavian_large"
+MODEL_DIR="gs://nb-t5x-us-central2/finetuned/v3_eval_dane_scandinavian_large"
 export PYTHONPATH=${PROJECT_DIR}
 
 python3 ${T5X_DIR}/t5x/train.py \
log/angry_tweets-1705000.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
log/angry_tweets-metrics.jsonl ADDED
@@ -0,0 +1,3 @@
+{"step": 1703000, "accuracy": 70.96466093600765, "f1_macro": 70.6282173005562}
+{"step": 1704000, "accuracy": 71.82425978987584, "f1_macro": 71.53409993558694}
+{"step": 1705000, "accuracy": 71.25119388729703, "f1_macro": 71.00190678021416}
my_metrics.py CHANGED
@@ -2,6 +2,6 @@ import sklearn.metrics
 import numpy as np
 
 def f1_macro(targets, predictions):
-    targets, predictions = np.asarray(targets).astype(int), np.asarray(predictions).astype(int)
+    targets, predictions = np.asarray(targets).astype(str), np.asarray(predictions).astype(str)
     return {"f1_macro": 100*sklearn.metrics.f1_score(targets, predictions, average='macro')}
 
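Note: the change above casts targets and predictions to str instead of int, so text labels produced by the model can be scored directly; sklearn.metrics.f1_score accepts string class labels. A minimal sketch with invented labels:

import numpy as np
import sklearn.metrics

targets = ["positiv", "negativ", "neutral", "negativ"]
predictions = ["positiv", "neutral", "neutral", "negativ"]

# Same cast as in f1_macro above.
targets, predictions = np.asarray(targets).astype(str), np.asarray(predictions).astype(str)
# Per-class F1: positiv 1.0, negativ 2/3, neutral 2/3; macro mean ~77.8
print(100 * sklearn.metrics.f1_score(targets, predictions, average='macro'))
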
my_preprocessors.py CHANGED
@@ -36,6 +36,7 @@ def parse_tsv(line, field_names=None, field_delim='\t'):
   Returns:
     A feature dict mapping field name to string value.
   """
+  breakpoint()
   field_names = field_names or ['inputs', 'targets']
   return dict(
       zip(field_names,
tasks.py CHANGED
@@ -5,8 +5,8 @@ import seqio
 import my_metrics
 import tensorflow_datasets as tfds
 from t5.evaluation import metrics
-#from t5.data import preprocessors
-import my_preprocessors
+from t5.data import preprocessors
+#import my_preprocessors
 import t5
 import tensorflow.compat.v1 as tf
 
@@ -28,6 +28,25 @@ json_angry_tweets_path = {
     "test": "gs://notram-public/finetune_datasets/angry_tweets/test.jsonl"
 }
 
+tsv_angry_tweets_path = {
+    "train": "gs://notram-public/finetune_datasets/angry_tweets/train.tsv",
+    "validation": "gs://notram-public/finetune_datasets/angry_tweets/test.tsv",
+    "test": "gs://notram-public/finetune_datasets/angry_tweets/test.tsv"
+}
+
+
+tsv_dane_path = {
+    "train": "gs://notram-public/finetune_datasets/dane/train.tsv",
+    "validation": "gs://notram-public/finetune_datasets/dane/test.tsv",
+    "test": "gs://notram-public/finetune_datasets/dane/test.tsv"
+}
+
+tsv_dane_tokens_path = {
+    "train": "gs://notram-public/finetune_datasets/dane/train_tokens.tsv",
+    "validation": "gs://notram-public/finetune_datasets/dane/test_tokens.tsv",
+    "test": "gs://notram-public/finetune_datasets/dane/test_tokens.tsv"
+}
+
 
 vocabulary = seqio.SentencePieceVocabulary(
     'gs://t5-data/vocabs/mc4.250000.100extra/sentencepiece.model', extra_ids=0)
@@ -44,7 +63,8 @@ DEFAULT_OUTPUT_FEATURES = {
 def categorise_preprocessor(ds):
   def normalize_text(text):
     """Lowercase and remove quotes from a TensorFlow string."""
-    text = tf.strings.regex_replace(text,"'(.*)'", r"\1")
+    #text = tf.strings.regex_replace(text,"'(.*)'", r"\1")
+    ...
     return text
 
   def to_inputs_and_targets(ex):
@@ -60,25 +80,6 @@ def categorise_preprocessor(ds):
   return ds.map(to_inputs_and_targets,
                 num_parallel_calls=tf.data.experimental.AUTOTUNE)
 
-def scandeval_preprocessor(ds):
-  def normalize_text(text):
-    """Lowercase and remove quotes from a TensorFlow string."""
-    text = tf.strings.regex_replace(text,"'(.*)'", r"\1")
-    return text
-
-  def to_inputs_and_targets(ex):
-    """Map {"source": ..., "source": ...}->{"target": ..., "target": ...}."""
-    return {
-        "inputs":
-            tf.strings.join(
-                [normalize_text(ex["text"])]),
-        "targets":
-            tf.strings.join(
-                [normalize_text(ex["label"])]),
-    }
-  return ds.map(to_inputs_and_targets,
-                num_parallel_calls=tf.data.experimental.AUTOTUNE)
-
 
 seqio.TaskRegistry.add(
     "parliament",
@@ -117,13 +118,31 @@ seqio.TaskRegistry.add(
 seqio.TaskRegistry.add(
     "angry_tweets",
     source=seqio.TextLineDataSource(
-        split_to_filepattern=json_angry_tweets_path,
+        split_to_filepattern=tsv_angry_tweets_path,
         #num_input_examples=num_nq_examples
     ),
     preprocessors=[
         functools.partial(
-            my_preprocessors.parse_tsv),
-        scandeval_preprocessor,
+            t5.data.preprocessors.parse_tsv,
+            field_names=["target","source"]),
+        categorise_preprocessor,
+        seqio.preprocessors.tokenize_and_append_eos,
+    ],
+    metric_fns=[metrics.accuracy,my_metrics.f1_macro],
+    output_features=DEFAULT_OUTPUT_FEATURES,
+)
+
+seqio.TaskRegistry.add(
+    "dane",
+    source=seqio.TextLineDataSource(
+        split_to_filepattern=tsv_dane_tokens_path,
+        #num_input_examples=num_nq_examples
+    ),
+    preprocessors=[
+        functools.partial(
+            t5.data.preprocessors.parse_tsv,
+            field_names=["source","placeholder1","placeholder2","target"]),
+        categorise_preprocessor,
         seqio.preprocessors.tokenize_and_append_eos,
     ],
     metric_fns=[metrics.accuracy,my_metrics.f1_macro],
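
Note: for the "dane" task, the field_names line up with the column order the dane converter writes (sorted keep fields doc, ner_tags, tokens, then the appended combined), so "source" receives the raw text and "target" the token-to-tag dict. A rough sketch of the parse step, reimplementing t5.data.preprocessors.parse_tsv's decode_csv logic under those assumptions; the sample line is invented:

import tensorflow.compat.v1 as tf

line = tf.constant(
    "Peter bor i København\t"                    # doc      -> source
    "['B-PER', 'O', 'O', 'B-LOC']\t"             # ner_tags -> placeholder1
    "['Peter', 'bor', 'i', 'København']\t"       # tokens   -> placeholder2
    "{'Peter': 'B-PER', 'København': 'B-LOC'}")  # combined -> target
field_names = ["source", "placeholder1", "placeholder2", "target"]

# parse_tsv zips the field names with the decoded columns into a feature
# dict; categorise_preprocessor then builds "inputs"/"targets" from it.
features = dict(
    zip(field_names,
        tf.io.decode_csv(line, record_defaults=[''] * len(field_names),
                         field_delim='\t', use_quote_delim=False)))
print(features["source"], features["target"])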