aapot committed
Commit 42db976
1 Parent(s): edc0c11
Add 1M train step model
Files changed:
- .gitattributes +2 -0
- build_data.sh +1 -0
- config.json +29 -0
- dataset_to_sentences.py +23 -0
- events.out.tfevents.1641404286.t1v-n-8eba1090-w-0 +3 -0
- pytorch_model.bin +3 -0
- special_tokens_map.json +1 -0
- tf_model.h5 +3 -0
- tf_rename_checkpoint_variables.py +135 -0
- tokenizer.json +0 -0
- tokenizer_config.json +1 -0
- train_tokenizer.py +15 -0
- vocab.txt +0 -0
.gitattributes
CHANGED
@@ -25,3 +25,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zstandard filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.ckpt* filter=lfs diff=lfs merge=lfs -text
+*.pbtxt* filter=lfs diff=lfs merge=lfs -text
build_data.sh
ADDED
@@ -0,0 +1 @@
+python3 build_pretraining_dataset.py --corpus-dir /researchdisk/training_dataset_sentences/train_splitted/ --vocab-file /researchdisk/convbert-base-finnish/vocab.txt --output-dir /researchdisk/training_dataset_sentences/train_tokenized_512 --max-seq-length 512 --num-processes 64 --no-lower-case --no-strip-accents
config.json
ADDED
@@ -0,0 +1,29 @@
+{
+  "_name_or_path": "/researchdisk/convbert-base-finnish",
+  "architectures": [
+    "ConvBertModel"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": 0,
+  "classifier_dropout": null,
+  "conv_kernel_size": 9,
+  "embedding_size": 768,
+  "eos_token_id": 2,
+  "head_ratio": 2,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "convbert",
+  "num_attention_heads": 12,
+  "num_groups": 1,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "torch_dtype": "float32",
+  "transformers_version": "4.13.0.dev0",
+  "type_vocab_size": 2,
+  "vocab_size": 50265
+}
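For reference, a minimal sketch of how the model files added in this commit could be loaded with the transformers library; it assumes the files live under the local directory /researchdisk/convbert-base-finnish (the _name_or_path recorded in the config above), but any directory containing them would do:

from transformers import BertTokenizer, ConvBertModel

# Assumed local directory containing config.json, pytorch_model.bin,
# vocab.txt, tokenizer_config.json and special_tokens_map.json from this commit.
model_dir = "/researchdisk/convbert-base-finnish"

tokenizer = BertTokenizer.from_pretrained(model_dir)  # ConvBERT reuses the BERT tokenizer
model = ConvBertModel.from_pretrained(model_dir)

inputs = tokenizer("Tämä on esimerkkilause.", return_tensors="pt")
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # torch.Size([1, seq_len, 768]), matching hidden_size above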
dataset_to_sentences.py
ADDED
@@ -0,0 +1,23 @@
+import csv
+import datasets
+import nltk
+nltk.download('punkt')
+from nltk.tokenize import sent_tokenize
+import numpy as np
+
+
+dataset = datasets.load_from_disk("/researchdisk/training_dataset_full_deduplicated")
+
+def tokenize_sentences(example):
+    sentences = sent_tokenize(example["text"], "finnish")
+    sentences = [sentence for sentence in sentences if len(sentence.split()) >= 5]
+    sentences.append("")
+    example["text"] = "\n".join(sentences)
+    return example
+
+
+dataset["train"] = dataset["train"].map(tokenize_sentences, num_proc=64, batched=False, writer_batch_size=100000)
+dataset["validation"] = dataset["validation"].map(tokenize_sentences, num_proc=64, batched=False, writer_batch_size=100000)
+
+np.savetxt('/researchdisk/training_dataset_sentences/train.txt', dataset["train"].to_pandas().values, fmt = "%s")
+np.savetxt('/researchdisk/training_dataset_sentences/validation.txt', dataset["validation"].to_pandas().values, fmt = "%s")
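As a small illustration of the sentence-splitting step above, the following sketch applies the same filtering logic to one made-up record (it assumes the NLTK punkt model is available, as in the script):

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

# Same logic as tokenize_sentences above: sentences shorter than 5 words are
# dropped, and the trailing empty string adds a blank line at the end of the
# record, which downstream text-to-pretraining-data tools can treat as a
# document boundary.
text = ("Tämä on ensimmäinen riittävän pitkä virke tässä dokumentissa. "
        "Liian lyhyt virke. "
        "Tässä on vielä toinen riittävän pitkä esimerkkivirke.")
sentences = [s for s in sent_tokenize(text, "finnish") if len(s.split()) >= 5]
sentences.append("")
print("\n".join(sentences))  # the short middle sentence is filtered out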
events.out.tfevents.1641404286.t1v-n-8eba1090-w-0
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:56f18026a3d535676fb6d84a6dfbdfa5a7ae3556f144627e2fd5d1daf2b0b580
+size 48032940
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4dec5473cc9fed3c62d06e25247ccffdeac12c8903f9f63d22f718f0b2977964
+size 483483789
special_tokens_map.json
ADDED
@@ -0,0 +1 @@
+{"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
tf_model.h5
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bd2dab1e8334107ef6e3a0a35a2283d38cf26e93a8ed7ed4cbea52849bddb2e8
+size 483731808
tf_rename_checkpoint_variables.py
ADDED
@@ -0,0 +1,135 @@
+# Adapted from https://gist.github.com/qqaatw/82b47c2b3da602fa1df604167bfcb9b0
+
+import getopt
+import sys
+import re
+
+import tensorflow.compat.v1 as tf
+
+
+usage_str = ('python tensorflow_rename_variables.py '
+             '--checkpoint_dir=path/to/dir/ --replace_from=substr '
+             '--replace_to=substr --add_prefix=abc --dry_run')
+find_usage_str = ('python tensorflow_rename_variables.py '
+                  '--checkpoint_dir=path/to/dir/ --find_str=[\'!\']substr')
+comp_usage_str = ('python tensorflow_rename_variables.py '
+                  '--checkpoint_dir=path/to/dir/ '
+                  '--checkpoint_dir2=path/to/dir/')
+
+
+def print_usage_str():
+    print('Please specify a checkpoint_dir. Usage:')
+    print('%s\nor\n%s\nor\n%s' % (usage_str, find_usage_str, comp_usage_str))
+    print('Note: checkpoint_dir should be a *DIR*, not a file')
+
+
+def compare(checkpoint_dir, checkpoint_dir2):
+    import difflib
+    with tf.Session():
+        list1 = [el1 for (el1, el2) in
+                 tf.train.list_variables(checkpoint_dir)]
+        list2 = [el1 for (el1, el2) in
+                 tf.train.list_variables(checkpoint_dir2)]
+        for k1 in list1:
+            if k1 in list2:
+                continue
+            else:
+                print('{} close matches: {}'.format(
+                    k1, difflib.get_close_matches(k1, list2)))
+
+
+def find(checkpoint_dir, find_str):
+    with tf.Session():
+        negate = find_str.startswith('!')
+        if negate:
+            find_str = find_str[1:]
+        for var_name, _ in tf.train.list_variables(checkpoint_dir):
+            if negate and find_str not in var_name:
+                print('%s missing from %s.' % (find_str, var_name))
+            if not negate and find_str in var_name:
+                print('Found %s in %s.' % (find_str, var_name))
+
+
+def rename(checkpoint_dir, replace_from, replace_to, add_prefix, dry_run):
+    checkpoint = tf.train.get_checkpoint_state(checkpoint_dir)
+    print('print: ', checkpoint)
+    with tf.Session() as sess:
+        for var_name, _ in tf.train.list_variables(checkpoint_dir):
+            # Load the variable
+
+            var = tf.train.load_variable(checkpoint_dir, var_name)
+
+            # Set the new name
+            if None not in [replace_from, replace_to]:
+                new_name = re.sub(replace_from, replace_to, var_name)
+                if add_prefix:
+                    new_name = add_prefix + new_name
+                if dry_run:
+                    print('%s would be renamed to %s.' % (var_name,
+                                                          new_name))
+                else:
+                    if var_name != new_name:
+                        print('Renaming %s to %s.' % (var_name, new_name))
+                    # Create the variable, potentially renaming it
+                    var = tf.Variable(var, name=new_name)
+
+        if not dry_run:
+            # Save the variables
+            saver = tf.train.Saver()
+            sess.run(tf.global_variables_initializer())
+            #saver.save(sess, checkpoint.model_checkpoint_path)
+            saver.save(sess, "renamed-model.ckpt")
+
+
+def main(argv):
+    checkpoint_dir = None
+    checkpoint_dir2 = None
+    replace_from = None
+    replace_to = None
+    add_prefix = None
+    dry_run = False
+    find_str = None
+
+    try:
+        opts, args = getopt.getopt(argv, 'h', ['help=', 'checkpoint_dir=',
+                                               'replace_from=', 'replace_to=',
+                                               'add_prefix=', 'dry_run',
+                                               'find_str=',
+                                               'checkpoint_dir2='])
+    except getopt.GetoptError as e:
+        print(e)
+        print_usage_str()
+        sys.exit(2)
+    for opt, arg in opts:
+        if opt in ('-h', '--help'):
+            print(usage_str)
+            sys.exit()
+        elif opt == '--checkpoint_dir':
+            checkpoint_dir = arg
+        elif opt == '--checkpoint_dir2':
+            checkpoint_dir2 = arg
+        elif opt == '--replace_from':
+            replace_from = arg
+        elif opt == '--replace_to':
+            replace_to = arg
+        elif opt == '--add_prefix':
+            add_prefix = arg
+        elif opt == '--dry_run':
+            dry_run = True
+        elif opt == '--find_str':
+            find_str = arg
+
+    if not checkpoint_dir:
+        print_usage_str()
+        sys.exit(2)
+
+    if checkpoint_dir2:
+        compare(checkpoint_dir, checkpoint_dir2)
+    elif find_str:
+        find(checkpoint_dir, find_str)
+    else:
+        rename(checkpoint_dir, replace_from, replace_to, add_prefix, dry_run)
+
+
+if __name__ == '__main__':
+    main(sys.argv[1:])
tokenizer.json
ADDED
The diff for this file is too large to render.
tokenizer_config.json
ADDED
@@ -0,0 +1 @@
+{"do_lower_case": false, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "bert-base-cased", "tokenizer_class": "BertTokenizer"}
train_tokenizer.py
ADDED
@@ -0,0 +1,15 @@
+from datasets import load_from_disk
+from transformers import AutoTokenizer
+
+dataset = load_from_disk("/researchdisk/training_dataset_full_deduplicated")
+dataset = dataset["train"]
+
+# We train on batches of texts, 1000 at a time here.
+batch_size = 1000
+corpus = (dataset[i : i + batch_size]["text"] for i in range(0, len(dataset), batch_size))
+
+# ConvBERT uses the BERT tokenizer
+tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+# Let's use the same vocab size as in Finnish-NLP/roberta-large-finnish-v2, which is also very close to TurkuNLP/bert-base-finnish-cased-v1
+new_tokenizer = tokenizer.train_new_from_iterator(corpus, vocab_size=50265)
+new_tokenizer.save_pretrained("./")
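As a quick sanity check, a sketch of reloading the newly trained tokenizer (assuming the script above has already written its output files, including vocab.txt and tokenizer_config.json, to the current directory):

from transformers import AutoTokenizer

# Reload the tokenizer saved by save_pretrained("./") above.
new_tokenizer = AutoTokenizer.from_pretrained("./")
print(new_tokenizer.vocab_size)  # expected to match the requested 50265
print(new_tokenizer.tokenize("Tämä on esimerkkilause suomeksi."))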
vocab.txt
ADDED
The diff for this file is too large to render.