aapot committed on
Commit 42db976
Parent: edc0c11

Add 1M train step model

.gitattributes CHANGED
@@ -25,3 +25,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zstandard filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.ckpt* filter=lfs diff=lfs merge=lfs -text
+ *.pbtxt* filter=lfs diff=lfs merge=lfs -text
build_data.sh ADDED
@@ -0,0 +1 @@
+ python3 build_pretraining_dataset.py --corpus-dir /researchdisk/training_dataset_sentences/train_splitted/ --vocab-file /researchdisk/convbert-base-finnish/vocab.txt --output-dir /researchdisk/training_dataset_sentences/train_tokenized_512 --max-seq-length 512 --num-processes 64 --no-lower-case --no-strip-accents
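Note that --corpus-dir points at train_splitted/ while the dataset_to_sentences.py script added below writes a single train.txt, so the sentence file was presumably sharded into several files beforehand. A minimal sketch of such a split, assuming file names, shard count, and paths that are not part of this commit:

# Hypothetical sharding step: split train.txt into N roughly equal files so that
# build_pretraining_dataset.py can distribute them across --num-processes workers.
from pathlib import Path

src = Path("/researchdisk/training_dataset_sentences/train.txt")
out_dir = Path("/researchdisk/training_dataset_sentences/train_splitted")
out_dir.mkdir(parents=True, exist_ok=True)

num_shards = 64  # assumption: one shard per worker process
shards = [open(out_dir / f"train_{i:02d}.txt", "w", encoding="utf-8") for i in range(num_shards)]

doc_idx = 0
with src.open(encoding="utf-8") as f:
    for line in f:
        # Write the line to the current shard; switch shards only at blank lines
        # so that whole documents stay together.
        shards[doc_idx % num_shards].write(line)
        if line.strip() == "":
            doc_idx += 1

for s in shards:
    s.close()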
config.json ADDED
@@ -0,0 +1,29 @@
+ {
+   "_name_or_path": "/researchdisk/convbert-base-finnish",
+   "architectures": [
+     "ConvBertModel"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "classifier_dropout": null,
+   "conv_kernel_size": 9,
+   "embedding_size": 768,
+   "eos_token_id": 2,
+   "head_ratio": 2,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "convbert",
+   "num_attention_heads": 12,
+   "num_groups": 1,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "torch_dtype": "float32",
+   "transformers_version": "4.13.0.dev0",
+   "type_vocab_size": 2,
+   "vocab_size": 50265
+ }
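For reference, a checkpoint with this config loads through the standard transformers classes. A minimal sketch, assuming the local path from "_name_or_path" (a Hub repo id for this model would work the same way) and an illustrative Finnish sentence:

from transformers import AutoTokenizer, ConvBertModel

model_dir = "/researchdisk/convbert-base-finnish"  # assumed local path; a Hub repo id also works

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = ConvBertModel.from_pretrained(model_dir)

inputs = tokenizer("Tämä on testilause.", return_tensors="pt")
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # (1, sequence_length, 768), matching hidden_size above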
dataset_to_sentences.py ADDED
@@ -0,0 +1,23 @@
+ import csv
+ import datasets
+ import nltk
+ nltk.download('punkt')
+ from nltk.tokenize import sent_tokenize
+ import numpy as np
+
+
+ dataset = datasets.load_from_disk("/researchdisk/training_dataset_full_deduplicated")
+
+ def tokenize_sentences(example):
+     sentences = sent_tokenize(example["text"], "finnish")
+     sentences = [sentence for sentence in sentences if len(sentence.split()) >= 5]
+     sentences.append("")
+     example["text"] = "\n".join(sentences)
+     return example
+
+
+ dataset["train"] = dataset["train"].map(tokenize_sentences, num_proc=64, batched=False, writer_batch_size=100000)
+ dataset["validation"] = dataset["validation"].map(tokenize_sentences, num_proc=64, batched=False, writer_batch_size=100000)
+
+ np.savetxt('/researchdisk/training_dataset_sentences/train.txt', dataset["train"].to_pandas().values, fmt = "%s")
+ np.savetxt('/researchdisk/training_dataset_sentences/validation.txt', dataset["validation"].to_pandas().values, fmt = "%s")
events.out.tfevents.1641404286.t1v-n-8eba1090-w-0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:56f18026a3d535676fb6d84a6dfbdfa5a7ae3556f144627e2fd5d1daf2b0b580
+ size 48032940
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4dec5473cc9fed3c62d06e25247ccffdeac12c8903f9f63d22f718f0b2977964
+ size 483483789
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
tf_model.h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bd2dab1e8334107ef6e3a0a35a2283d38cf26e93a8ed7ed4cbea52849bddb2e8
+ size 483731808
tf_rename_checkpoint_variables.py ADDED
@@ -0,0 +1,135 @@
+ # Adapted from https://gist.github.com/qqaatw/82b47c2b3da602fa1df604167bfcb9b0
+
+ import getopt
+ import sys
+ import re
+
+ import tensorflow.compat.v1 as tf
+
+
+ usage_str = ('python tensorflow_rename_variables.py '
+              '--checkpoint_dir=path/to/dir/ --replace_from=substr '
+              '--replace_to=substr --add_prefix=abc --dry_run')
+ find_usage_str = ('python tensorflow_rename_variables.py '
+                   '--checkpoint_dir=path/to/dir/ --find_str=[\'!\']substr')
+ comp_usage_str = ('python tensorflow_rename_variables.py '
+                   '--checkpoint_dir=path/to/dir/ '
+                   '--checkpoint_dir2=path/to/dir/')
+
+
+ def print_usage_str():
+     print('Please specify a checkpoint_dir. Usage:')
+     print('%s\nor\n%s\nor\n%s' % (usage_str, find_usage_str, comp_usage_str))
+     print('Note: checkpoint_dir should be a *DIR*, not a file')
+
+
+ def compare(checkpoint_dir, checkpoint_dir2):
+     import difflib
+     with tf.Session():
+         list1 = [el1 for (el1, el2) in
+                  tf.train.list_variables(checkpoint_dir)]
+         list2 = [el1 for (el1, el2) in
+                  tf.train.list_variables(checkpoint_dir2)]
+         for k1 in list1:
+             if k1 in list2:
+                 continue
+             else:
+                 print('{} close matches: {}'.format(
+                     k1, difflib.get_close_matches(k1, list2)))
+
+
+ def find(checkpoint_dir, find_str):
+     with tf.Session():
+         negate = find_str.startswith('!')
+         if negate:
+             find_str = find_str[1:]
+         for var_name, _ in tf.train.list_variables(checkpoint_dir):
+             if negate and find_str not in var_name:
+                 print('%s missing from %s.' % (find_str, var_name))
+             if not negate and find_str in var_name:
+                 print('Found %s in %s.' % (find_str, var_name))
+
+
+ def rename(checkpoint_dir, replace_from, replace_to, add_prefix, dry_run):
+     checkpoint = tf.train.get_checkpoint_state(checkpoint_dir)
+     print('print: ', checkpoint)
+     with tf.Session() as sess:
+         for var_name, _ in tf.train.list_variables(checkpoint_dir):
+             # Load the variable
+
+             var = tf.train.load_variable(checkpoint_dir, var_name)
+
+             # Set the new name
+             if None not in [replace_from, replace_to]:
+                 new_name = re.sub(replace_from, replace_to, var_name)
+             if add_prefix:
+                 new_name = add_prefix + new_name
+             if dry_run:
+                 print('%s would be renamed to %s.' % (var_name,
+                                                       new_name))
+             else:
+                 if var_name != new_name:
+                     print('Renaming %s to %s.' % (var_name, new_name))
+                 # Create the variable, potentially renaming it
+                 var = tf.Variable(var, name=new_name)
+
+         if not dry_run:
+             # Save the variables
+             saver = tf.train.Saver()
+             sess.run(tf.global_variables_initializer())
+             #saver.save(sess, checkpoint.model_checkpoint_path)
+             saver.save(sess, "renamed-model.ckpt")
+
+
+ def main(argv):
+     checkpoint_dir = None
+     checkpoint_dir2 = None
+     replace_from = None
+     replace_to = None
+     add_prefix = None
+     dry_run = False
+     find_str = None
+
+     try:
+         opts, args = getopt.getopt(argv, 'h', ['help=', 'checkpoint_dir=',
+                                                'replace_from=', 'replace_to=',
+                                                'add_prefix=', 'dry_run',
+                                                'find_str=',
+                                                'checkpoint_dir2='])
+     except getopt.GetoptError as e:
+         print(e)
+         print_usage_str()
+         sys.exit(2)
+     for opt, arg in opts:
+         if opt in ('-h', '--help'):
+             print(usage_str)
+             sys.exit()
+         elif opt == '--checkpoint_dir':
+             checkpoint_dir = arg
+         elif opt == '--checkpoint_dir2':
+             checkpoint_dir2 = arg
+         elif opt == '--replace_from':
+             replace_from = arg
+         elif opt == '--replace_to':
+             replace_to = arg
+         elif opt == '--add_prefix':
+             add_prefix = arg
+         elif opt == '--dry_run':
+             dry_run = True
+         elif opt == '--find_str':
+             find_str = arg
+
+     if not checkpoint_dir:
+         print_usage_str()
+         sys.exit(2)
+
+     if checkpoint_dir2:
+         compare(checkpoint_dir, checkpoint_dir2)
+     elif find_str:
+         find(checkpoint_dir, find_str)
+     else:
+         rename(checkpoint_dir, replace_from, replace_to, add_prefix, dry_run)
+
+
+ if __name__ == '__main__':
+     main(sys.argv[1:])
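As a usage sketch, the script can be driven through its getopt interface; the checkpoint path and the substring pair below are placeholders, not values taken from this commit:

# Hypothetical dry run: preview how checkpoint variables would be renamed without saving anything.
import tf_rename_checkpoint_variables as renamer

renamer.main([
    "--checkpoint_dir=/path/to/checkpoint_dir/",
    "--replace_from=old_scope",
    "--replace_to=new_scope",
    "--dry_run",
])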
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"do_lower_case": false, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "bert-base-cased", "tokenizer_class": "BertTokenizer"}
train_tokenizer.py ADDED
@@ -0,0 +1,15 @@
+ from datasets import load_from_disk
+ from transformers import AutoTokenizer
+
+ dataset = load_from_disk("/researchdisk/training_dataset_full_deduplicated")
+ dataset = dataset["train"]
+
+ # We train on batches of texts, 1000 at a time.
+ batch_size = 1000
+ corpus = (dataset[i : i + batch_size]["text"] for i in range(0, len(dataset), batch_size))
+
+ # ConvBERT uses the BERT tokenizer
+ tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+ # Use the same vocab size as in Finnish-NLP/roberta-large-finnish-v2, which is also very close to TurkuNLP/bert-base-finnish-cased-v1
+ new_tokenizer = tokenizer.train_new_from_iterator(corpus, vocab_size=50265)
+ new_tokenizer.save_pretrained("./")
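A quick sanity check of the saved tokenizer might look like this; the sample sentence is illustrative, and the path is simply wherever save_pretrained("./") wrote the files:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./")  # directory written by train_tokenizer.py above

print(tokenizer.tokenize("Tämä on esimerkkilause suomeksi."))
print(tokenizer("Tämä on esimerkkilause suomeksi.")["input_ids"])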
vocab.txt ADDED
The diff for this file is too large to render. See raw diff