from datasets import load_dataset
from models.bpe_trainer import BpeTrainer
from tqdm import tqdm

# Raw and phonemized corpora (parquet), each loaded as the 'train' split.
raw_ds = load_dataset("parquet", data_files={'train': 'data/culturaX_bnhi_500Kx2.parquet'})
raw_ds = raw_ds['train']

phn_ds = load_dataset("parquet", data_files={'train': 'data/culturaX_bnhi_500Kx2_phonemized.parquet'})
phn_ds = phn_ds['train']
# Train one raw-text and one phonemized BPE tokenizer per vocab size.
vocab_sizes = [16000]

for vocab_size in tqdm(vocab_sizes):
    BpeTrainer(dataset=raw_ds, vocab_size=vocab_size, batch_size=50000,
               output_dir=f"trained_tokenizers/multi/multi_raw_bnhi_bpetokenizer_{vocab_size//1000}K")
    BpeTrainer(dataset=phn_ds, vocab_size=vocab_size, batch_size=50000,
               output_dir=f"trained_tokenizers/multi/multi_phn_bnhi_bpetokenizer_{vocab_size//1000}K")
'''
Benchmarking how much time phonemization takes:
    NUM_SAMPLES = 50,000
    Convert the samples to text, then run the phonemization script under time:

        time command_for_script

    Per-sample cost = measured time / 50,000; estimate for the full
    500,000-sample corpus = per-sample cost * 500,000.
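    Worked example (hypothetical numbers): if the 50,000-sample run takes
    600 s, the per-sample cost is 600 / 50,000 = 12 ms, so the full
    500,000-sample corpus should take roughly 12 ms * 500,000 = 6,000 s
    (~100 minutes).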

------------------------------------------------

Data prep:
    Native script = taken directly from Sangraha
    Phonemization:

    HF dataset --> convert to text files and store in a dir --> phonemization
    script --> phonemized text files --> convert back to an HF dataset
    (parquet format). A round-trip sketch is given after these notes.

------------------------------------------------

1st exp:
    Hi, Phn_Hi --> plot FS across vocab sizes 4K to 16K. Train 12 tokenizers.
    Ur, Phn_Ur --> plot FS across vocab sizes 4K to 16K.

2nd exp:
    HiUr, Phn_HiUr --> plot FS across vocab sizes 8K to 16K. 8 tokenizers in
    total. (A sketch of the FS computation follows these notes.)
'''
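# Round-trip sketch for the prep pipeline in the notes above (HF dataset ->
# text files -> external phonemization script -> parquet). The "text" column
# name, the one-sample-per-line layout, and the directory names are
# illustrative assumptions, not fixed by this repo.
import os

from datasets import Dataset


def dump_to_text(ds, out_dir, shard_size=50000):
    """Write the dataset out as plain-text shards, one sample per line."""
    os.makedirs(out_dir, exist_ok=True)
    for start in range(0, len(ds), shard_size):
        shard = ds.select(range(start, min(start + shard_size, len(ds))))
        path = os.path.join(out_dir, f"shard_{start // shard_size:05d}.txt")
        with open(path, "w", encoding="utf-8") as f:
            for row in shard:
                f.write(row["text"].replace("\n", " ") + "\n")


def load_from_text(in_dir):
    """Read phonemized text shards back into an HF dataset."""
    lines = []
    for name in sorted(os.listdir(in_dir)):
        if name.endswith(".txt"):
            with open(os.path.join(in_dir, name), encoding="utf-8") as f:
                lines.extend(line.rstrip("\n") for line in f)
    return Dataset.from_dict({"text": lines})


# Usage (the phonemization script itself runs externally between these calls):
# dump_to_text(raw_ds, "phonemize_io/in")
# ... run the phonemization script: phonemize_io/in -> phonemize_io/out ...
# load_from_text("phonemize_io/out").to_parquet(
#     "data/culturaX_bnhi_500Kx2_phonemized.parquet")


# Sketch for the FS plots above, assuming "FS" means fertility score (mean
# tokens per whitespace-separated word), a common tokenizer metric; the
# tokenizer.json path and "text" column are again assumptions.
def fertility(tokenizer, texts):
    """Average number of tokens produced per whitespace word."""
    n_tokens = sum(len(tokenizer.encode(t).tokens) for t in texts)
    n_words = sum(len(t.split()) for t in texts)
    return n_tokens / n_words


# e.g.:
# from tokenizers import Tokenizer
# tok = Tokenizer.from_file(
#     "trained_tokenizers/multi/multi_raw_bnhi_bpetokenizer_16K/tokenizer.json")
# print(fertility(tok, raw_ds["text"][:1000]))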