import os
import math
import re
from pathlib import Path

import numpy as np
import tqdm
import datasets
import transformers
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer
from torch.optim import Adam

from data import get_data

# Load utterance pairs from the Cornell Movie-Dialogs corpus files.
pairs = get_data('datasets/movie_conversations.txt', 'datasets/movie_lines.txt')

# Dump the raw text to disk in chunks so the tokenizer can train from plain-text files.
os.makedirs('data', exist_ok=True)
text_data = []
file_count = 0

# Stream the first utterance of every pair into numbered chunk files,
# 10,000 lines per file.
for sample in tqdm.tqdm([x[0] for x in pairs]):
    text_data.append(sample)

    # Once a chunk is full, write it out and start a new one.
    if len(text_data) == 10000:
        with open(f'data/text_{file_count}.txt', 'w', encoding='utf-8') as fp:
            fp.write('\n'.join(text_data))
        text_data = []
        file_count += 1

# Write out whatever is left over after the last full chunk.
if text_data:
    with open(f'data/text_{file_count}.txt', 'w', encoding='utf-8') as fp:
        fp.write('\n'.join(text_data))

# Collect all chunk files for tokenizer training.
paths = [str(x) for x in Path('data').glob('**/*.txt')]

# Uncased WordPiece tokenizer: lowercases input, keeps accents,
# and does not treat Chinese characters specially.
tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    handle_chinese_chars=False,
    strip_accents=False,
    lowercase=True
)

# Train the WordPiece vocabulary on the chunk files. Tokens must appear at
# least 5 times to enter the vocabulary, the base alphabet is capped at
# 1,000 characters, and subword continuations are prefixed with "##".
tokenizer.train(
    files=paths,
    min_frequency=5,
    limit_alphabet=1000,
    wordpieces_prefix="##",
    special_tokens=["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"]
)
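
# Optional sanity check (illustrative, not part of the original pipeline):
# encode one sentence with the freshly trained tokenizer and inspect the
# wordpieces. Rare or unseen words should come back as "##"-prefixed
# continuation pieces; the exact output depends on the trained vocabulary.
sample_encoding = tokenizer.encode("where do you want to go tonight?")
print(sample_encoding.tokens)
print(sample_encoding.ids)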

# Persist the trained vocabulary; this writes a "bert-it"-prefixed vocab file
# into the bert-it-1/ directory.
os.makedirs("bert-it-1", exist_ok=True)
tokenizer.save_model("bert-it-1", "bert-it")
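
# A minimal sketch (not in the original script) of how the saved vocabulary
# could be loaded back into a Hugging Face tokenizer for later pipeline steps.
# It assumes the save above produced bert-it-1/bert-it-vocab.txt, following
# the "<prefix>-vocab.txt" naming used by the WordPiece save_model call.
loaded_tokenizer = BertTokenizer('bert-it-1/bert-it-vocab.txt', do_lower_case=True)
print(loaded_tokenizer.tokenize("where do you want to go tonight?"))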