# bert/tokenizer.py
import os
from pathlib import Path

import tqdm
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer

from data import get_data

# load utterance pairs from the movie-dialogue corpus
pairs = get_data('datasets/movie_conversations.txt', 'datasets/movie_lines.txt')
# WordPiece tokenizer
### Save the samples as plain-text shards for tokenizer training
os.makedirs('data', exist_ok=True)
text_data = []
file_count = 0
for sample in tqdm.tqdm([x[0] for x in pairs]):
    text_data.append(sample)
    # once we hit the 10K mark, save to file
    if len(text_data) == 10000:
        with open(f'data/text_{file_count}.txt', 'w', encoding='utf-8') as fp:
            fp.write('\n'.join(text_data))
        text_data = []
        file_count += 1

# flush any remaining samples that didn't fill a full 10K shard
if text_data:
    with open(f'data/text_{file_count}.txt', 'w', encoding='utf-8') as fp:
        fp.write('\n'.join(text_data))
# collect the paths of all saved text shards
paths = [str(x) for x in Path('data').glob('**/*.txt')]
### Train our own tokenizer
tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    handle_chinese_chars=False,
    strip_accents=False,
    lowercase=True
)
tokenizer.train(
    files=paths,
    min_frequency=5,
    limit_alphabet=1000,
    wordpieces_prefix="##",
    special_tokens=["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"]
)
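
# Quick sanity check (a minimal sketch, not part of the original script): encode an
# illustrative placeholder sentence with the freshly trained tokenizer and inspect
# the WordPiece output. Exact pieces depend on the learned vocabulary.
encoding = tokenizer.encode("how are you doing today?")
print(encoding.tokens)  # learned WordPiece pieces, e.g. ['how', 'are', ...]
print(encoding.ids)     # corresponding vocabulary ids
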
os.makedirs("bert-it-1", exist_ok=True)
tokenizer.save_model("bert-it-1", "bert-it")  # write the learned WordPiece vocab to disk
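
# A minimal reload sketch (an assumption, not part of the original script): it expects
# the call above to have written the vocab to 'bert-it-1/bert-it-vocab.txt'; adjust the
# path if your tokenizers version names the file differently. This wraps the trained
# vocab in the Hugging Face BertTokenizer imported above for use with transformers.
hf_tokenizer = BertTokenizer.from_pretrained('bert-it-1/bert-it-vocab.txt', local_files_only=True)
print(hf_tokenizer("how are you doing today?"))  # input_ids / token_type_ids / attention_mask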