Spaces:

hoang1007
/

spelling-correction

Runtime error

App Files Files Community

hoang1007 committed on Oct 22, 2023

Commit

44db343

•

1 Parent(s): 5ada341

Upload 69 files

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitignore +3 -0
__init__.py +0 -0
__pycache__/params.cpython-310.pyc +0 -0
app.py +62 -0
correct.py +69 -0
data/binhvq/binhvq.vocab.pkl +3 -0
data/binhvq/sentences.txt +0 -0
data/checkpoints/tfmwtr/binhvq.weights.pth +3 -0
dataset/__init__.py +0 -0
dataset/__pycache__/__init__.cpython-310.pyc +0 -0
dataset/__pycache__/autocorrect_dataset.cpython-310.pyc +0 -0
dataset/__pycache__/noise.cpython-310.pyc +0 -0
dataset/__pycache__/vocab.cpython-310.pyc +0 -0
dataset/autocorrect_dataset.py +16 -0
dataset/cleandata.sh +28 -0
dataset/data_generation/all-vietnamese-syllables.txt +0 -0
dataset/data_generation/all_nguyen_am_ba.py +44 -0
dataset/data_generation/all_nguyen_am_don.py +26 -0
dataset/data_generation/all_nguyen_am_hai.py +47 -0
dataset/data_generation/all_phu_am_daucuoi.py +78 -0
dataset/data_generation/common-vietnamese-syllables.txt +7184 -0
dataset/data_generation/confusion_set.py +262 -0
dataset/data_generation/keyboard_neighbor.py +79 -0
dataset/data_generation/normalize.py +183 -0
dataset/data_generation/typing_error_gen.py +13 -0
dataset/log/prepare_data.log +0 -0
dataset/noise.py +655 -0
dataset/noising_resources/accents.json +498 -0
dataset/noising_resources/confusion_set.json +0 -0
dataset/noising_resources/homo_leter.json +27 -0
dataset/noising_resources/kieu_go_dau_cu_moi.txt +78 -0
dataset/noising_resources/typo.json +650 -0
dataset/prepare_dataset.py +310 -0
dataset/prepare_vsec.py +46 -0
dataset/util.py +128 -0
dataset/vocab.py +188 -0
models/__init__.py +0 -0
models/__pycache__/__init__.cpython-310.pyc +0 -0
models/__pycache__/collator.cpython-310.pyc +0 -0
models/__pycache__/corrector.cpython-310.pyc +0 -0
models/__pycache__/model.cpython-310.pyc +0 -0
models/__pycache__/sampler.cpython-310.pyc +0 -0
models/__pycache__/tokenizer.cpython-310.pyc +0 -0
models/__pycache__/transformer.cpython-310.pyc +0 -0
models/__pycache__/util.cpython-310.pyc +0 -0
models/collator.py +78 -0
models/corrector.py +170 -0
models/model.py +22 -0
models/sampler.py +99 -0
models/tokenizer.py +55 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,3 @@

+data
+__pycache__
+log

__init__.py ADDED Viewed

File without changes

__pycache__/params.cpython-310.pyc ADDED Viewed

Binary file (539 Bytes). View file

app.py ADDED Viewed

	@@ -0,0 +1,62 @@

+from typing import Union
+import sys
+sys.path.append("..")
+from params import *
+from dataset.vocab import Vocab
+from models.corrector import Corrector
+from models.model import ModelWrapper
+from models.util import load_weights
+from dataset.noise import SynthesizeData
+from utils.api_utils import correctFunction, postprocessing_result
+# Inference configuration: the architecture name handed to ModelWrapper and the
+# corpus name used to locate the vocabulary and checkpoint files.
+model_name = "tfmwtr"
+dataset = "binhvq"
+vocab_path = f'data/{dataset}/{dataset}.vocab.pkl'
+# Build the checkpoint directory from model_name so the path cannot drift away
+# from the architecture that is instantiated below.
+weight_path = f'data/checkpoints/{model_name}/{dataset}.weights.pth'
+vocab = Vocab("vi")
+vocab.load_vocab_dict(vocab_path)
+# NOTE(review): noiser is not used by the code visible in this file; it is kept
+# because other modules may import it from here.
+noiser = SynthesizeData(vocab)
+model_wrapper = ModelWrapper(model_name, vocab)
+corrector = Corrector(model_wrapper)
+load_weights(corrector.model, weight_path)
+def correct(string: str):
+    """Run the corrector on ``string`` and shape the result for display.
+
+    Args:
+        string: Text entered by the user.
+
+    Returns:
+        A list of ``(text, label)`` tuples. For every post-processed entry the
+        text is its first string part; the label is its second string part
+        when the entry holds exactly two string parts, otherwise ``None``.
+        Consecutive entries are separated by ``(" ", None)``. An empty list is
+        returned when there is nothing to show.
+    """
+    out = correctFunction(string, corrector)
+    result = postprocessing_result(out)
+    ret = []
+    for r in result:
+        # Only the string parts of an entry are displayed.
+        parts = [s.strip() for s in r if isinstance(s, str)]
+        if not parts:
+            # No text in this entry; indexing parts[0] would raise IndexError.
+            continue
+        if ret:
+            # Put the separator *between* entries instead of appending one
+            # after each entry and popping the last, which fails on no entries.
+            ret.append((" ", None))
+        label = parts[1] if len(parts) == 2 else None
+        ret.append((parts[0], label))
+    print(ret, "RET")
+    return ret
+import gradio as gr
+if __name__ == "__main__":
+    # Stylesheet passed to the interface; it targets the element whose id is
+    # "output" (the elem_id given to the highlighted-text component below).
+    css = """
+    #output {
+        .label {
+            background-color: green !important;
+        }
+    }
+    """
+    # Build the two widgets first, then wire them to correct() and serve.
+    input_box = gr.Textbox(label="Input", placeholder="Enter text to be corrected here...")
+    output_view = gr.HighlightedText(
+        label="Output",
+        combine_adjacent=True,
+        show_label=True,
+        elem_id="output",
+    )
+    demo = gr.Interface(
+        correct,
+        inputs=input_box,
+        outputs=output_view,
+        theme=gr.themes.Default(),
+        css=css,
+    )
+    demo.launch()

correct.py ADDED Viewed

	@@ -0,0 +1,69 @@

+import os
+from params import *
+from dataset.vocab import Vocab
+from dataset.util import load_dataset, load_vsec_dataset
+if __name__ == "__main__":
+    # Command-line evaluation: load a test split, restore a checkpoint and
+    # run Corrector.evaluate on it.
+    import argparse
+    description = '''
+        Corrector:
+        Usage: python correct.py --model tfmwtr --data_path ./data --dataset binhvq
+        Params:
+            --model
+                    tfmwtr - Transformer with Tokenization Repair
+            --data_path:    default to ./data
+            --dataset:      default to 'binhvq'
+    '''
+    # RawDescriptionHelpFormatter keeps the hand-formatted layout above; the
+    # default formatter would re-wrap it into one paragraph in --help.
+    parser = argparse.ArgumentParser(
+        description=description,
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    parser.add_argument('--model', type=str, default='tfmwtr')
+    parser.add_argument('--data_path', type=str, default='./data')
+    parser.add_argument('--dataset', type=str, default='binhvq')
+    parser.add_argument('--test_dataset', type=str, default='binhvq')
+    parser.add_argument("--beams", type=int, default=2)
+    parser.add_argument("--fraction", type=float, default=1.0)
+    parser.add_argument('--text', type=str, default='Bình mnh ơi day ch ưa, café xáng vớitôi dược không?')
+    args = parser.parse_args()
+    # Test files live under <data_path>/<test_dataset>/.
+    dataset_path = os.path.join(args.data_path, args.test_dataset)
+    weight_ext = 'pth'
+    # The checkpoint is selected by the training corpus (--dataset).
+    checkpoint_dir = os.path.join(args.data_path, f'checkpoints/{args.model}')
+    weight_path = os.path.join(checkpoint_dir, f'{args.dataset}.weights.{weight_ext}')
+    # NOTE(review): the vocabulary path is fixed to binhvq regardless of
+    # --dataset; confirm whether it should follow args.dataset.
+    vocab_path = os.path.join(args.data_path, 'binhvq/binhvq.vocab.pkl')
+    correct_file = f'{args.test_dataset}.test'
+    incorrect_file = f'{args.test_dataset}.test.noise'
+    # The length file is read from the test corpus directory like the two
+    # files above, so it must be named after the test corpus as well.
+    length_file = f'{args.test_dataset}.length.test'
+    if args.test_dataset != "vsec":
+        test_data = load_dataset(base_path=dataset_path, corr_file=correct_file, incorr_file=incorrect_file,
+                              length_file=length_file)
+    else:
+        test_data = load_vsec_dataset(base_path=dataset_path, corr_file=correct_file, incorr_file=incorrect_file)
+    # Keep only the leading fraction of the test data.
+    length_of_data = len(test_data)
+    test_data = test_data[0 : int(args.fraction * length_of_data)]
+    vocab = Vocab()
+    vocab.load_vocab_dict(vocab_path)
+    from dataset.autocorrect_dataset import SpellCorrectDataset
+    from models.corrector import Corrector
+    from models.model import ModelWrapper
+    from models.util import load_weights
+    test_dataset = SpellCorrectDataset(dataset=test_data)
+    model_wrapper = ModelWrapper(args.model, vocab)
+    corrector = Corrector(model_wrapper)
+    load_weights(corrector.model, weight_path)
+    corrector.evaluate(test_dataset, beams=args.beams)

data/binhvq/binhvq.vocab.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3f2d12d2fe63b67c7f8138c5a1e2a19c90c06bc8f0df4da127a3d1b1a0a5bece
+size 2155566

data/binhvq/sentences.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

data/checkpoints/tfmwtr/binhvq.weights.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a64a21d18d2faf349e84cea71ea045d9698a6520bf5fdee6aefb559848693401
+size 600250423

dataset/__init__.py ADDED Viewed

File without changes

dataset/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (163 Bytes). View file

dataset/__pycache__/autocorrect_dataset.cpython-310.pyc ADDED Viewed

Binary file (1.2 kB). View file

dataset/__pycache__/noise.cpython-310.pyc ADDED Viewed

Binary file (18.1 kB). View file

dataset/__pycache__/vocab.cpython-310.pyc ADDED Viewed

Binary file (6.53 kB). View file

dataset/autocorrect_dataset.py ADDED Viewed

	@@ -0,0 +1,16 @@

+import torch
+import numpy as np
+class SpellCorrectDataset(torch.utils.data.Dataset):
+    """Dataset wrapper that exposes an indexable collection to torch."""
+    def __init__(self, dataset):
+        # Kept as-is; items are handed back exactly as the collection yields them.
+        self.dataset = dataset
+    def __len__(self):
+        """Return the number of items in the wrapped collection."""
+        return len(self.dataset)
+    def __getitem__(self, idx):
+        """Return the item stored at position ``idx``."""
+        return self.dataset[idx]
+    def take(self, n = 1):
+        """Return a list of ``n`` items drawn at random positions.
+
+        Positions come from ``np.random.choice`` with its default settings,
+        so the same item can be drawn more than once.
+        """
+        total = len(self.dataset)
+        positions = np.random.choice(total, n)
+        return [self.dataset[pos] for pos in positions]

dataset/cleandata.sh ADDED Viewed

	@@ -0,0 +1,28 @@

+#!/bin/bash
+# Merge the numbered shard files of a corpus (<corpus>.train0, <corpus>.train1, ...)
+# into one file per split and delete the shards afterwards.
+# Usage: cleandata.sh <corpus>    (works on ../data/<corpus>/)
+if [ -z "$1" ]; then
+    echo "Usage: $0 <corpus>" >&2
+    exit 1
+fi
+corpus=$1
+root=../data/$1/
+echo "Clean corpus $1"
+# Without nullglob an unmatched pattern reaches cat literally: cat fails, but
+# the redirection has already truncated the merged file from an earlier run.
+shopt -s nullglob
+suffixes=(
+    train test
+    train.noise test.noise
+    length.train length.test
+    valid.noise length.valid valid
+    onehot.test onehot.train onehot.valid
+)
+for suffix in "${suffixes[@]}"; do
+    shards=("$root$corpus.$suffix"[0-9]*)
+    if [ "${#shards[@]}" -eq 0 ]; then
+        echo "No shards found for $corpus.$suffix, skipping" >&2
+        continue
+    fi
+    # Remove the shards only after they have been merged successfully.
+    cat "${shards[@]}" > "$root$corpus.$suffix" && rm -r "${shards[@]}"
+done

dataset/data_generation/all-vietnamese-syllables.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

dataset/data_generation/all_nguyen_am_ba.py ADDED Viewed

	@@ -0,0 +1,44 @@

+import re
+from normalize import chuan_hoa_dau_tu_tieng_viet
+import numpy as np
+from keyboard_neighbor import getKeyboardNeighbors
+# Print every three-letter vowel group (plus keyboard-neighbour variants of
+# one of its letters) that occurs inside at least one known syllable.
+#
+# The syllable list holds accented Vietnamese text, so decode it explicitly as
+# UTF-8 instead of relying on the platform's default encoding.
+with open("common-vietnamese-syllables.txt", "r", encoding="utf-8") as file:
+    vi_syllables = [line.strip("\n") for line in file]
+vi_syllables_new = [chuan_hoa_dau_tu_tieng_viet(syllable) for syllable in vi_syllables]
+nguyen_am_ba = 'oai|oao|uao|oeo|iêu|yêu|uôi|ươu|uyu|uyê|ươi|oay|uây|ươi|uya'
+keyboardNeighbors = getKeyboardNeighbors()
+# Reduce each entry to the element of its first sequence found at the position
+# where its second sequence is largest.
+for key in keyboardNeighbors.keys():
+    keyboardNeighbors[key] = keyboardNeighbors[key][0][np.argmax(keyboardNeighbors[key][1])]
+result = set()
+for am_ba in nguyen_am_ba.split("|"):
+    result.add(am_ba)
+    if am_ba == "uyê":
+        # For "uyê" the last letter is varied ...
+        for candidate in keyboardNeighbors[am_ba[2]]:
+            result.add(am_ba[0] + am_ba[1] + candidate)
+    else:
+        # ... for every other group the middle letter is varied.
+        for candidate in keyboardNeighbors[am_ba[1]]:
+            result.add(am_ba[0] + candidate + am_ba[2])
+# Discard candidates that are not a substring of any known syllable. any() is
+# used instead of inspecting the loop index after a break, because the index
+# cannot tell "matched the last syllable" apart from "matched nothing" and is
+# undefined when the syllable list is empty.
+remove_list = {
+    syllable for syllable in result
+    if not any(syllable in known for known in vi_syllables_new)
+}
+result -= remove_list
+print("|".join(result))

dataset/data_generation/all_nguyen_am_don.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import re
+from normalize import chuan_hoa_dau_tu_tieng_viet
+from keyboard_neighbor import getKeyboardNeighbors
+import numpy as np
+# Print every single vowel together with the keyboard-neighbour candidates
+# looked up for it, joined with "|".
+with open("common-vietnamese-syllables.txt", "r") as file:
+    raw_lines = file.readlines()
+    vi_syllables = [text.strip("\n") for text in raw_lines]
+# Normalised copy of the syllable list (not used further in this script).
+vi_syllables_new = [chuan_hoa_dau_tu_tieng_viet(item) for item in vi_syllables]
+nguyen_am_don = 'a|ă|â|e|ê|i|y|o|ô|ơ|u|ư'
+keyboardNeighbors = getKeyboardNeighbors()
+# Reduce each entry to the element of its first sequence found at the position
+# where its second sequence is largest.
+for key in keyboardNeighbors.keys():
+    entry = keyboardNeighbors[key]
+    best = np.argmax(entry[1])
+    keyboardNeighbors[key] = entry[0][best]
+result = set()
+for am_don in nguyen_am_don.split("|"):
+    result.add(am_don)
+    result.update(keyboardNeighbors[am_don])
+print("|".join(result))

dataset/data_generation/all_nguyen_am_hai.py ADDED Viewed

	@@ -0,0 +1,47 @@

+import re
+from normalize import chuan_hoa_dau_tu_tieng_viet
+from keyboard_neighbor import getKeyboardNeighbors
+import numpy as np
+# Print every two-letter vowel group (plus keyboard-neighbour variants of its
+# letters) that occurs inside at least one known syllable.
+#
+# The syllable list holds accented Vietnamese text, so decode it explicitly as
+# UTF-8 instead of relying on the platform's default encoding.
+with open("common-vietnamese-syllables.txt", "r", encoding="utf-8") as file:
+    vi_syllables = [line.strip("\n") for line in file]
+vi_syllables_new = [chuan_hoa_dau_tu_tieng_viet(syllable) for syllable in vi_syllables]
+nguyen_am_doi = 'ai|ao|au|ay|âu|ây|êu|eo|ia|iê|yê|iu|oă|oa|oi|oe|oo|ôô|ơi|uă|uâ|ue|ua|ui|ưi|uo|ươ|ưu|uơ|uy|ưa|ôi|uô|uê'
+no_end_phu_am = 'ưu|ưi|ui|ưa|ơi|ôi|oi|iu|ia|êu|eo|ây|ay|âu|au|ao|ai'
+must_end_phu_am = "yê|ươ|uô|uâ|iê|â"
+# Membership is tested against the individual groups; for the two-letter
+# groups handled here this gives the same answer as a substring test on the
+# pipe-joined strings, but states the intent directly.
+no_end_groups = set(no_end_phu_am.split("|"))
+must_end_groups = set(must_end_phu_am.split("|"))
+keyboardNeighbors = getKeyboardNeighbors()
+# Reduce each entry to the element of its first sequence found at the position
+# where its second sequence is largest.
+for key in keyboardNeighbors.keys():
+    keyboardNeighbors[key] = keyboardNeighbors[key][0][np.argmax(keyboardNeighbors[key][1])]
+result = set()
+for am_doi in nguyen_am_doi.split("|"):
+    result.add(am_doi)
+    if am_doi not in must_end_groups:
+        # Vary the first letter.
+        for candidate in keyboardNeighbors[am_doi[0]]:
+            result.add(candidate + am_doi[1])
+    if am_doi not in no_end_groups:
+        # Vary the second letter.
+        for candidate in keyboardNeighbors[am_doi[1]]:
+            result.add(am_doi[0] + candidate)
+# Discard candidates that are not a substring of any known syllable. any() is
+# used instead of inspecting the loop index after a break, because the index
+# cannot tell "matched the last syllable" apart from "matched nothing" and is
+# undefined when the syllable list is empty.
+remove_list = {
+    syllable for syllable in result
+    if not any(syllable in known for known in vi_syllables_new)
+}
+result -= remove_list
+print("|".join(result))

dataset/data_generation/all_phu_am_daucuoi.py ADDED Viewed

	@@ -0,0 +1,78 @@

+import re
+from normalize import chuan_hoa_dau_tu_tieng_viet
+import numpy as np
+# Split every syllable of common-vietnamese-syllables.txt into
+# initial consonant (phu_am_dau) + vowel group (nguyen_am) + final consonant
+# (phu_am_cuoi), and print the sets of initial and final parts that were seen.
+with open("common-vietnamese-syllables.txt", "r") as file:
+    vi_syllables = [line.strip("\n") for line in file.readlines()]
+# Normalise each syllable with chuan_hoa_dau_tu_tieng_viet before analysing it.
+vi_syllables_new = []
+for syllable in vi_syllables:
+    normalized = chuan_hoa_dau_tu_tieng_viet(syllable)
+    vi_syllables_new.append(normalized)
+# Alternation patterns for three-letter, two-letter and single-letter vowel
+# groups. They are tried in that order below, longest group first.
+regex_am_ba = "ười|oeo|uyễ|uồi|ươi|uyê|uôi|ướu|oải|uyệ|oẹo|ưới|iễu|uối|yếu|oại|ưỡi|iêu|ưởi|oèo|uya|oáy|uổi|uỷu|uyế|uyể|ượu|uội|uao|uầy|uào|uẫy|ươu|yểu|oai|uyề|oài|uậy|iều|uỵu|iếu|oay|yều|uấy|oái|iểu|uẩy|yêu|uỗi|iệu|uây|ượi"
+regex_am_hai = "áo|ay|ùy|ại|ậu|ỡi|èo|ọi|ào|ao|uấ|ãy|uề|uy|ảu|oạ|iê|ái|ảy|ội|ựa|ẻo|ời|ôi|iệ|oỏ|ủi|ía|oẻ|uệ|ọe|ẫy|ơi|ồi|uẹ|ũy|ấy|ủa|ùa|ỗi|ượ|uý|eo|ấu|ễu|iề|ướ|ưu|ụi|ụy|iễ|uỗ|âu|uồ|ửi|uã|ạo|ây|ia|ìa|àu|ểu|uả|oả|oo|ếu|ĩa|ué|ẽo|oà|uộ|ue|oẹ|uâ|ịu|uố|íu|yể|òe|uằ|uẳ|ùi|au|uo|iu|ựu|iể|uẽ|uở|õi|éo|ão|ới|uậ|uỹ|ìu|yệ|oặ|ui|ầy|yế|áu|óa|yê|ợi|oe|oè|ẫu|uơ|oó|uá|ửu|úa|uầ|ưở|ỏe|ĩu|oé|uể|ậy|úi|ỏi|uà|ủy|oằ|ữa|oã|ửa|uớ|oă|ổi|oò|uă|uắ|uờ|ườ|úy|ữu|ối|uó|oi|ừu|oá|ởi|ừa|ũi|ải|yề|ỉa|uặ|ưa|òa|òi|ệu|ạy|uổ|ịa|uê|ạu|ụa|ãi|oọ|ài|oẳ|uỷ|ưỡ|ẩy|uỳ|iế|ọa|uế|ua|ũa|óe|uẩ|oắ|ẩu|uẻ|ai|ỉu|ói|ầu|ươ|uè|ều|ảo|yễ|êu|uẫ|oa|ứu|ày|uỵ|oẵ|áy|ứa|ỏa|uô|õa|uạ|ẹo"
+regex_am_don = "ề|e|a|ầ|è|ơ|ồ|ú|ỵ|ả|ắ|ỷ|ố|ẩ|ặ|ừ|ữ|ủ|ụ|é|ợ|ằ|á|ỉ|ỗ|ê|ờ|ạ|õ|o|y|ì|ỳ|ự|ấ|ế|ý|ẽ|ó|u|ể|ễ|i|â|ẻ|ẹ|ỏ|ớ|ẳ|ẵ|ỹ|à|ẫ|ị|ù|ư|ứ|ở|ộ|ỡ|ũ|ô|í|ổ|ệ|ò|ĩ|ọ|ã|ậ|ử|ă"
+all_phu_am_dau = set()
+all_phu_am_cuoi = set()
+# Syllables starting with "qu" or "gi" are set aside and handled separately
+# in the second loop.
+special_list = set()
+for syllable in vi_syllables_new:
+    if syllable[0:2] in ["qu", "gi"]:
+        special_list.add(syllable)
+        continue
+    # Take the first match of the longest vowel-group pattern that matches.
+    if len(result:=re.findall(regex_am_ba, syllable)) != 0:
+        nguyen_am = result[0]
+    elif len(result:=re.findall(regex_am_hai, syllable)) != 0:
+        nguyen_am = result[0]
+    elif len(result:=re.findall(regex_am_don, syllable)) != 0:
+        nguyen_am = result[0]
+    else:
+        # The message means "no vowel".
+        raise Exception("Khong co nguyen am")
+    phu_am_dau, phu_am_cuoi = "", ""
+    # Text captured before / after the vowel group; stays "" when nothing
+    # precedes or follows it.
+    if len(result:=re.findall(f"(.+){nguyen_am}", syllable)) !=0 :
+        phu_am_dau = result[0]
+    if len(result:=re.findall(f"{nguyen_am}(.+)", syllable)) !=0 :
+        phu_am_cuoi = result[0]
+    all_phu_am_dau.add(phu_am_dau)
+    all_phu_am_cuoi.add(phu_am_cuoi)
+    # Sanity check: the three parts must rebuild the syllable exactly.
+    assert "".join([phu_am_dau, nguyen_am, phu_am_cuoi]) == syllable
+for syllable in special_list:
+    # With more than one single-vowel match in the syllable, the first two
+    # letters are taken as the initial; otherwise only the first letter is.
+    if len(result:=re.findall(regex_am_don, syllable)) > 1:
+        phu_am_dau = syllable[0:2]
+        remained = syllable[2:]
+    else:
+        phu_am_dau = syllable[0]
+        remained = syllable[1:]
+    # Same longest-first vowel-group search as above, on the remainder only.
+    if len(result:=re.findall(regex_am_ba, remained)) != 0:
+        nguyen_am = result[0]
+    elif len(result:=re.findall(regex_am_hai, remained)) != 0:
+        nguyen_am = result[0]
+    elif len(result:=re.findall(regex_am_don, remained)) != 0:
+        nguyen_am = result[0]
+    else:
+        # No vowel group in the remainder.
+        nguyen_am, phu_am_cuoi = "", ""
+    # Reset unconditionally; the assignment in the else branch above is
+    # therefore redundant for phu_am_cuoi.
+    phu_am_cuoi = ""
+    if nguyen_am != "" and len(result:=re.findall(f"{nguyen_am}(.+)", remained)) !=0 :
+        phu_am_cuoi = result[0]
+    all_phu_am_dau.add(phu_am_dau)
+    all_phu_am_cuoi.add(phu_am_cuoi)
+    # Sanity check: the three parts must rebuild the syllable exactly.
+    assert "".join([phu_am_dau, nguyen_am, phu_am_cuoi]) == syllable
+# The labels mean "All initial consonants" and "All final consonants".
+print("Tất cả phụ âm đầu: ")
+print(all_phu_am_dau)
+print("Tất cả phụ âm cuối: ")
+print(all_phu_am_cuoi)

dataset/data_generation/common-vietnamese-syllables.txt ADDED Viewed

	@@ -0,0 +1,7184 @@

+và
+của
+có
+các
+là
+được
+trong
+cho
+không
+người
+với
+một
+đã
+công
+để
+những
+khi
+đến
+về
+này
+tại
+ở
+cũng
+tôi
+ra
+năm
+nhiều
+từ
+việc
+đồng
+nhà
+làm
+đó
+hiện
+ông
+vào
+học
+bị
+trên
+thể
+theo
+trường
+như
+ngày
+anh
+đầu
+nước
+phải
+thành
+định
+bộ
+nhân
+sẽ
+gia
+quan
+sự
+nam
+lại
+chỉ
+số
+hàng
+con
+sinh
+động
+sau
+điều
+chính
+dân
+cơ
+nhưng
+việt
+đi
+quốc
+thì
+còn
+biết
+hội
+hơn
+thời
+thông
+an
+trung
+vụ
+giá
+viên
+thực
+lý
+phát
+nên
+nhận
+hành
+nhất
+chủ
+hợp
+rất
+mình
+đang
+qua
+xe
+văn
+trước
+do
+cao
+mới
+trình
+cùng
+mà
+đại
+vì
+bạn
+thế
+thị
+sản
+em
+đây
+tế
+đường
+cả
+đối
+bệnh
+hai
+án
+nói
+thi
+tiếp
+chức
+tư
+hình
+nghiệp
+nội
+tình
+hà
+nguyễn
+tiền
+dự
+lượng
+lên
+tin
+điểm
+bình
+cấp
+báo
+kinh
+đề
+tác
+dụng
+bảo
+xã
+tâm
+xuất
+tỉnh
+cô
+nay
+thanh
+bà
+tài
+kết
+tuổi
+cách
+vẫn
+thu
+khác
+đình
+cầu
+tăng
+toàn
+năng
+phương
+phòng
+chúng
+thấy
+tra
+tháng
+doanh
+giải
+cần
+khách
+thương
+tự
+bản
+thường
+chị
+chưa
+ảnh
+ngoài
+tới
+sở
+quy
+giao
+lớn
+diễn
+tổ
+ý
+yêu
+liên
+lực
+pháp
+ăn
+gian
+tập
+khu
+ban
+cuộc
+sống
+quyết
+phạm
+sĩ
+hoá
+mặt
+triển
+triệu
+nào
+phần
+trẻ
+hay
+lần
+bằng
+chất
+minh
+độ
+nếu
+trưởng
+rằng
+giới
+tạo
+quả
+nhiên
+trọng
+vị
+quá
+mẹ
+bán
+thủ
+trị
+địa
+đưa
+khoảng
+họ
+đạo
+tục
+tổng
+tiêu
+ty
+thức
+viện
+tham
+điện
+tính
+sử
+mua
+gần
+cảm
+huyện
+hiệu
+phẩm
+cảnh
+hệ
+bên
+luật
+máy
+sáng
+kỳ
+nguyên
+cứu
+vực
+giáo
+giờ
+mỹ
+hoạt
+bắt
+vậy
+kiến
+kiểm
+đổi
+xây
+đất
+vừa
+sát
+khó
+nghệ
+tỷ
+trở
+gây
+hoàn
+vấn
+tuy
+đơn
+khai
+tốt
+mạnh
+giảm
+biệt
+nhiệm
+dựng
+thống
+lúc
+bất
+trang
+vi
+thứ
+rồi
+phố
+nghị
+tiếng
+đều
+đặc
+chồng
+cáo
+hồ
+mức
+chí
+chế
+xử
+tượng
+mỗi
+nhau
+ta
+gì
+giúp
+nữ
+chuyển
+thêm
+đánh
+loại
+trí
+tiến
+khiến
+chi
+tìm
+muốn
+phụ
+cá
+thân
+chuyện
+đoàn
+quyền
+vợ
+quản
+đông
+bố
+chia
+hoặc
+sách
+tích
+phim
+mang
+sức
+hoa
+lời
+dùng
+ngân
+chương
+giám
+nhập
+ngành
+từng
+nạn
+hết
+diện
+chuyên
+tay
+tịch
+ngay
+nơi
+khoa
+dịch
+lập
+giữ
+lợi
+chứng
+hải
+hộ
+thiết
+hướng
+phó
+tiên
+phục
+mọi
+bao
+xét
+dẫn
+truyền
+biểu
+phí
+ca
+biển
+thư
+bé
+bỏ
+lịch
+trần
+chung
+xác
+vật
+rõ
+giữa
+giả
+bài
+sao
+cái
+y
+du
+ứng
+tử
+đẹp
+xem
+hoàng
+hoà
+dù
+trả
+sẻ
+chiếc
+đủ
+dài
+kiện
+cổ
+vàng
+thay
+đạt
+thuộc
+kế
+gái
+trợ
+lê
+ba
+nhỏ
+ký
+chọn
+chiến
+câu
+thuật
+sơn
+mất
+hỏi
+gặp
+thái
+chiều
+biến
+lấy
+vệ
+bàn
+luôn
+tên
+phủ
+xảy
+danh
+quận
+đức
+đúng
+thích
+dục
+đảm
+bởi
+ấy
+tiết
+bác
+hạn
+hậu
+đời
+quân
+ai
+hưởng
+cây
+quảng
+tranh
+so
+sang
+đội
+tố
+cửa
+vùng
+kể
+nguồn
+trạng
+vốn
+nhóm
+căn
+phân
+xuống
+cuối
+tất
+cứ
+bay
+nghiệm
+thí
+chuẩn
+cố
+tàu
+lao
+mở
+liệu
+nữa
+tướng
+tối
+uỷ
+lưu
+đăng
+ít
+nhạc
+mắt
+lãnh
+ngọc
+đoạn
+xin
+đốc
+trực
+đặt
+trách
+bắc
+tuyển
+vận
+dương
+riêng
+ngoại
+luận
+mạng
+cụ
+sơ
+đa
+phía
+đóng
+tương
+nông
+yếu
+ninh
+tội
+khăn
+xuân
+mục
+nghĩa
+cạnh
+cháu
+lớp
+đào
+hoạch
+khẩu
+long
+phong
+vũ
+phú
+nghĩ
+môi
+thiếu
+thật
+hiểu
+nổi
+chạy
+trái
+cán
+thuốc
+kỹ
+hữu
+cục
+áp
+cộng
+thuận
+tinh
+nhật
+chân
+lòng
+càng
+dưới
+nó
+nhìn
+đêm
+quang
+nhanh
+nghiên
+gửi
+hỗ
+châu
+khá
+phúc
+phép
+trai
+tải
+đảng
+chơi
+chết
+gọi
+đàn
+môn
+tuần
+lễ
+dung
+góp
+huy
+tân
+khả
+hồi
+hôm
+biên
+lan
+độc
+linh
+cung
+toán
+giấy
+cường
+đáng
+tai
+tuyến
+nằm
+gồm
+xúc
+duy
+sắc
+giang
+trao
+lo
+bí
+buổi
+thần
+sân
+đồ
+thuỷ
+cử
+tây
+khoẻ
+nghe
+rộng
+vui
+toà
+hồng
+dành
+phối
+kim
+khoản
+tấn
+vai
+chống
+lệ
+khí
+vô
+lương
+dễ
+đầy
+sư
+nặng
+mai
+trò
+hương
+nguy
+hại
+thảo
+đảo
+thiện
+nuôi
+ghi
+quán
+chịu
+tưởng
+phường
+niên
+tết
+mong
+lộ
+cực
+vài
+bước
+đô
+nghiêm
+đà
+chấp
+thuế
+thoại
+đứng
+đôi
+di
+uống
+phản
+nghề
+vọng
+thấp
+âm
+tiện
+lĩnh
+thẩm
+dầu
+khám
+màu
+hát
+đau
+xa
+hiểm
+lâu
+mô
+kiếm
+mặc
+nghỉ
+viết
+phù
+nhằm
+xuyên
+kỷ
+ngờ
+yên
+mưa
+hạ
+nghi
+kéo
+khởi
+ương
+nâng
+khẳng
+cư
+mẫu
+trương
+quý
+mại
+phá
+chú
+sai
+ô
+chăm
+hùng
+suất
+sâu
+tuấn
+bức
+đọc
+lâm
+nợ
+khỏi
+phạt
+tín
+vượt
+đấu
+thiên
+thưởng
+sông
+sớm
+dạy
+cha
+bổ
+cải
+tuyên
+chỗ
+áo
+hạnh
+tầng
+da
+tạm
+đẩy
+dưỡng
+chữa
+vòng
+phiên
+đá
+mùa
+vay
+hôn
+đâu
+chứ
+xanh
+quê
+hút
+máu
+hiệp
+nhờ
+thăm
+trú
+thuê
+tránh
+khán
+chẳng
+tiểu
+ánh
+quay
+soát
+nhiệt
+ngôi
+thắng
+dũng
+món
+thừa
+ung
+cháy
+khắc
+ngồi
+phê
+họp
+trì
+tô
+quen
+vinh
+miền
+chợ
+nhu
+chắc
+nền
+giống
+nga
+nóng
+thôn
+phút
+giai
+ổn
+bầu
+ưu
+thầy
+nhớ
+mắc
+dấu
+sóc
+quanh
+trời
+mối
+cà
+tỉ
+thậm
+chờ
+chỉnh
+khánh
+ma
+thúc
+lai
+ngủ
+siêu
+thịt
+chiếm
+tức
+đứa
+may
+nẵng
+thơ
+bày
+song
+nhiễm
+hãy
+đón
+ấn
+chuyến
+khảo
+phóng
+can
+bánh
+suy
+cân
+đáp
+vẻ
+khoá
+lựa
+hưng
+trải
+truy
+lái
+sợ
+trồng
+thú
+kịp
+dòng
+ích
+lạc
+tỏ
+hàn
+khối
+tặng
+suốt
+khuyến
+đổ
+á
+đỏ
+phan
+rút
+tái
+mấy
+cũ
+rau
+nghèo
+mời
+gắn
+thụ
+cầm
+tờ
+tốc
+buộc
+cam
+ước
+dư
+biện
+quần
+tim
+phi
+thất
+thai
+vong
+bạc
+lãi
+tắc
+the
+khúc
+sài
+giác
+ngữ
+cắt
+hài
+bè
+phiếu
+niệm
+sửa
+dừng
+la
+thiệt
+xếp
+tận
+tĩnh
+sung
+thử
+đài
+thẳng
+to
+dịp
+thôi
+trăm
+chở
+vĩnh
+nhẹ
+sữa
+nguyện
+nêu
+trắng
+sạch
+tộc
+cận
+bóng
+chục
+ly
+dần
+phổ
+vé
+thác
+sỹ
+xong
+lỗi
+chóng
+niềm
+tri
+gốc
+đỡ
+nhi
+thoát
+chặt
+tù
+a
+sẵn
+hãng
+làng
+loạt
+gió
+cười
+vân
+dạng
+gòn
+cưới
+vững
+lá
+dây
+hấp
+nhắc
+nộp
+coi
+liệt
+nối
+đỗ
+kém
+võ
+luyện
+lẽ
+sắp
+bến
+rừng
+nhấn
+lạnh
+hầu
+tôn
+huynh
+cậu
+cấu
+chứa
+cập
+đám
+xấu
+kiên
+gà
+thiệu
+lắng
+lạ
+lắm
+sa
+lửa
+dao
+sắt
+sóng
+tạp
+đột
+mê
+chụp
+kê
+giảng
+gắng
+hoạ
+cánh
+ngăn
+giàu
+trụ
+huyết
+quỹ
+đương
+cuốn
+tầm
+ngại
+tá
+giản
+đòi
+xăng
+cướp
+lộc
+thượng
+kích
+mật
+kịch
+ủng
+rơi
+tuý
+cặp
+núi
+dinh
+xung
+liền
+tồn
+trấn
+bào
+đích
+tuyệt
+buồn
+bồi
+hạng
+giết
+ngắn
+đợt
+vườn
+chắn
+thải
+cấm
+miễn
+phước
+quà
+trận
+nắng
+mái
+mạch
+bạch
+bãi
+trà
+mơ
+nắm
+chữ
+phẫu
+rượu
+buôn
+phận
+hơi
+hy
+hào
+đem
+nửa
+trưng
+mừng
+khổ
+tươi
+trinh
+khấu
+ngã
+băng
+bây
+đen
+nổ
+huế
+xế
+trúng
+hoang
+kêu
+bữa
+tùng
+vết
+dõi
+bờ
+chiếu
+lượt
+lào
+thăng
+chút
+cơm
+kháng
+tấm
+chậm
+thoả
+trộm
+sàng
+lũ
+ơn
+ngụ
+nghìn
+sạn
+hằng
+nỗi
+ngô
+âu
+gương
+chàng
+trúc
+vương
+ngược
+cơn
+kẻ
+bật
+đinh
+ngập
+khóc
+nỗ
+hạt
+đoạt
+ngàn
+thao
+kính
+ràng
+hè
+chào
+bò
+nhuận
+huỳnh
+mã
+cảng
+hoài
+bùi
+thọ
+trạm
+đừng
+đâm
+cát
+hiếu
+lừa
+lược
+mộ
+kiểu
+tệ
+khắp
+giam
+kia
+đàm
+trùng
+bậc
+túi
+mãi
+màn
+đẳng
+xóm
+giật
+đuổi
+khủng
+kênh
+tóc
+đậu
+đợi
+lẫn
+ngũ
+thách
+kiều
+giỏi
+dị
+ví
+dựa
+đoán
+kiệm
+viêm
+thuyết
+bão
+trốn
+tang
+gấp
+binh
+ngang
+rối
+não
+khẩn
+nhiêu
+ngư
+ấm
+vỡ
+lành
+thận
+tường
+gói
+xu
+khiển
+ngọt
+ho
+dàng
+gạo
+giọng
+hẹn
+sốt
+già
+ngon
+mùi
+hiến
+chảy
+thẻ
+hiền
+hề
+lệnh
+duyên
+trại
+rác
+trưa
+thuyền
+chánh
+bụng
+mổ
+giành
+ngôn
+ha
+lối
+chấn
+gỗ
+xương
+sứ
+quên
+mắn
+nấu
+loạn
+ga
+chấm
+nha
+trừ
+ruột
+trật
+dáng
+mẽ
+súng
+mũi
+uy
+bảng
+kèm
+đặng
+dám
+dược
+lẻ
+miệng
+rời
+hứng
+lục
+in
+chu
+nhũng
+đập
+ép
+chặn
+hung
+thở
+đèn
+béo
+túc
+cương
+hàm
+huyền
+huỷ
+khích
+bền
+khô
+dụ
+duyệt
+phức
+sàn
+sổ
+khen
+chúc
+thầu
+non
+thuý
+dâu
+bá
+tuỳ
+nai
+khuôn
+le
+bia
+dữ
+hiếm
+đãi
+ôm
+tật
+làn
+tiêm
+trông
+bốn
+điển
+lỗ
+gỡ
+kín
+trứng
+phán
+mềm
+tú
+dâm
+dày
+thập
+chăn
+bối
+ngừng
+rẻ
+thiểu
+nhắn
+nhảy
+dứt
+tổn
+phấn
+rào
+quỳnh
+góc
+táo
+ống
+mệt
+mạc
+dậy
+nhánh
+chạm
+mát
+hư
+nét
+gan
+căng
+lô
+kho
+hẳn
+thùng
+toả
+tiềm
+bột
+thang
+đồn
+cụm
+hỏng
+đam
+hảo
+khuyên
+rửa
+khoán
+tràn
+xinh
+tung
+van
+hô
+thù
+ẩn
+ghế
+thổ
+rà
+bó
+tông
+chẽ
+bếp
+đáo
+đấy
+trịnh
+doạ
+tụ
+tháo
+trữ
+lãng
+tắm
+cãi
+xưa
+lúa
+mưu
+tôm
+ngoái
+canh
+hứa
+ái
+lắp
+thạch
+sắm
+xứ
+trầm
+mỏi
+loan
+bổng
+cựu
+đỉnh
+bắn
+cai
+tán
+đua
+thịnh
+xâm
+thảm
+tốn
+mỡ
+thự
+hộp
+tran
+ẩm
+thép
+tiếc
+tam
+yến
+trào
+bù
+ngưỡng
+tan
+răng
+nàng
+nhãn
+khiếu
+hoảng
+mực
+giấu
+chối
+loài
+tàng
+cống
+chùa
+đền
+cổng
+chốt
+hoan
+lọc
+hoả
+giường
+hầm
+my
+cột
+bốc
+chín
+điệu
+đắt
+than
+thuẫn
+ro
+đe
+thạnh
+gợi
+rủi
+thơm
+niêm
+va
+mâu
+vũng
+củ
+dạ
+khống
+hồn
+thói
+sôi
+ven
+lưới
+trích
+tháp
+cản
+lang
+miếng
+cất
+lặng
+đạp
+điệp
+đốt
+pha
+bạo
+vỏ
+mầm
+khung
+cắp
+thờ
+treo
+huống
+chừng
+oan
+vả
+trọ
+lãm
+thoải
+lệch
+ấp
+lưng
+co
+bơi
+u
+ân
+mảnh
+ghép
+dàn
+lợn
+chai
+giấc
+cờ
+cẩm
+thua
+giáp
+xả
+đắk
+muối
+khuẩn
+váy
+lít
+huấn
+chủng
+trôi
+say
+heo
+ngực
+hối
+mau
+vẽ
+hâm
+ngừa
+khoe
+vất
+bỏng
+bớt
+i
+tuyết
+bận
+sót
+bát
+phỏng
+bông
+bang
+tuân
+lò
+che
+khát
+chém
+đậm
+bại
+tiệc
+nhịp
+mệnh
+đống
+mặn
+săn
+bích
+đo
+mây
+móc
+xứng
+hong
+viễn
+chua
+thả
+lam
+đẻ
+cứng
+sáu
+mì
+xoá
+vắng
+mãn
+triều
+cốt
+su
+giây
+nhung
+chó
+tiệm
+tròn
+khâu
+lậu
+chuỗi
+kiệt
+bám
+lạm
+bom
+bê
+trân
+hưu
+vải
+trăng
+đựng
+dọc
+gánh
+cỏ
+thuỳ
+nghiện
+vở
+đắc
+ngộ
+tre
+nở
+ngãi
+cẩn
+vĩ
+lầm
+tả
+trục
+dọn
+vội
+thiệp
+đai
+triệt
+sốc
+liêm
+mượn
+tắt
+tu
+bách
+truyện
+trống
+bọn
+cú
+phượng
+dưa
+ngõ
+tủ
+dập
+muộn
+vời
+chẩn
+buýt
+huệ
+vu
+ùn
+tụng
+mét
+vang
+tiễn
+yết
+gọn
+sập
+nại
+chim
+hôi
+phổi
+thuỵ
+xô
+nhàng
+vẹn
+vua
+giận
+chìm
+nhựa
+dường
+rạng
+ngọn
+chinh
+lở
+nề
+ngầm
+trộn
+pháo
+lân
+nhan
+tha
+nhậu
+nồng
+thợ
+pham
+trọn
+lăng
+thưa
+khuyết
+sánh
+mắm
+nhầm
+đạn
+chênh
+bảy
+lọt
+khôi
+xoay
+nụ
+hang
+toạ
+tách
+vướng
+múa
+bi
+e
+lão
+kì
+gò
+hi
+giày
+hẹp
+kẹt
+ốc
+ảo
+đắp
+dính
+nén
+ác
+hiển
+bề
+lồng
+mốc
+móng
+mạn
+ức
+suối
+uý
+lí
+cưỡng
+nướng
+bụi
+khói
+đói
+dĩ
+kiêm
+mến
+nhơn
+tuệ
+mù
+đeo
+khớp
+gãy
+vạn
+thước
+lấn
+trắc
+rèn
+xách
+điền
+mạo
+bơm
+sen
+phu
+khoáng
+gìn
+tàn
+nhạy
+nhượng
+nhẫn
+chán
+đớn
+lạng
+leo
+xá
+tráng
+ném
+thoái
+soạn
+nảy
+khê
+lạt
+lây
+thuần
+hò
+phật
+bàng
+com
+dỡ
+diệt
+ruộng
+diệu
+rạp
+phùng
+đĩa
+thô
+thắc
+tám
+tràng
+hổ
+quyên
+rét
+đành
+tảng
+thánh
+thuy
+nhé
+lấp
+vỉa
+mồ
+xạ
+lứa
+vươn
+mày
+xi
+sợi
+nhở
+bùng
+liều
+nguyệt
+tứ
+thạc
+san
+bế
+chiêu
+ám
+giãn
+dời
+mỏ
+đan
+dã
+trâu
+thoáng
+oanh
+son
+thâm
+ngắm
+vịnh
+út
+liêu
+dắt
+rộn
+úc
+xưởng
+khổng
+khanh
+bẩn
+dông
+nát
+ngạc
+bể
+khang
+kem
+bút
+phái
+ôn
+bón
+rãi
+ngoan
+luồng
+lông
+tạ
+rẫy
+luân
+vấp
+sạt
+hắn
+rủ
+chửi
+mộc
+im
+đằng
+sâm
+thoảng
+nút
+gũi
+nấm
+ngâm
+lồ
+bội
+dừa
+chúa
+cua
+măng
+xơ
+quế
+củng
+o
+quyến
+mũ
+mảng
+chả
+ngào
+tím
+gom
+ốm
+đầm
+khơi
+băn
+men
+vây
+địch
+khoăn
+sưu
+hụt
+lắk
+bồ
+dở
+bì
+trượt
+đùa
+ngạch
+dốc
+nhã
+gạch
+man
+mông
+bái
+bỗng
+soi
+mười
+khứ
+sương
+rũ
+khéo
+dồn
+lộn
+diễm
+lỏng
+khoảnh
+ổ
+cỡ
+gối
+manh
+ngơi
+cước
+bún
+hán
+hòn
+côn
+màng
+dâng
+ơi
+hận
+mẻ
+hở
+tựu
+dạo
+ngột
+hiếp
+hỗn
+lính
+thắn
+thắt
+trùm
+cay
+luỹ
+chanh
+đạm
+đồi
+ranh
+xao
+tạng
+chuột
+bọc
+ưa
+hố
+thầm
+voi
+cốc
+thấu
+mọc
+rể
+nhát
+ngón
+nhì
+sụt
+dịu
+dang
+phiền
+rạch
+thằng
+rắn
+lôi
+rỡ
+gắt
+xông
+ghen
+tê
+hoại
+đứt
+mò
+bắp
+nhí
+mờ
+chuyền
+dệt
+hãi
+má
+cắm
+nồi
+dối
+rải
+ớt
+ham
+chè
+cúng
+phụng
+nôn
+xót
+ngợi
+khoai
+chuối
+vành
+hoành
+nhặt
+nhĩ
+dò
+đán
+cám
+trạch
+na
+lùi
+chê
+chôn
+lề
+cạn
+nếp
+đới
+đuôi
+dội
+vy
+đáy
+hân
+bưu
+rồng
+nho
+ồn
+no
+kíp
+mộng
+khe
+nhiếp
+mỏng
+kĩ
+nã
+ong
+lật
+lường
+giáng
+uyên
+ạ
+phun
+vắc
+xích
+vịt
+thỉnh
+khải
+cự
+dán
+quãng
+đúc
+hước
+phế
+lát
+hái
+lưỡng
+lăn
+cởi
+vú
+tẩy
+ao
+bưởi
+be
+vóc
+quét
+vã
+buồng
+hoãn
+châm
+chặng
+cửu
+xát
+tần
+gã
+rịa
+cồn
+mi
+cậy
+cúc
+bồn
+lắc
+khoan
+thấm
+ghé
+buông
+nhạt
+đế
+dan
+vớt
+xà
+phở
+chuộng
+dẹp
+vặt
+chiên
+chậu
+gục
+gay
+điên
+chật
+lùng
+rực
+xôi
+khiêm
+lặn
+ngất
+giọt
+chép
+giặt
+vượng
+chăng
+đôn
+kẹo
+khép
+láng
+kí
+đọng
+dại
+đạc
+quái
+lụt
+đắn
+trễ
+cong
+kề
+xôn
+muỗi
+bộc
+dải
+ngưng
+thổi
+lỡ
+tỏi
+cáp
+phơi
+họng
+ào
+xỉ
+vứt
+bấy
+rắc
+cọc
+tui
+huân
+gác
+cóc
+hẻm
+ngựa
+giếng
+trăn
+nứt
+tưới
+cháo
+nốt
+tống
+dỗ
+khuất
+lữ
+mùng
+cành
+đuối
+qui
+sếp
+lận
+cẩu
+khuê
+văng
+khôn
+lau
+lốc
+khương
+mụn
+sướng
+quầy
+nhục
+quát
+diệp
+chui
+bấm
+giàn
+chen
+hóc
+cắn
+rẽ
+ê
+lơ
+trội
+thắm
+sáp
+khái
+dãy
+lội
+nhàn
+dặn
+đùi
+đếm
+me
+khuya
+nhị
+thối
+táng
+chiết
+lặp
+đắng
+kon
+bùn
+mòn
+nương
+cò
+pa
+mèo
+lưỡi
+ập
+khoái
+súc
+ngỡ
+sút
+bôi
+phì
+trâm
+đê
+xí
+thiêng
+hét
+lạp
+tóm
+xoài
+rong
+nghiêng
+túng
+mường
+tao
+gián
+đè
+khoác
+tum
+loa
+thắp
+dai
+đoan
+sọ
+nghịch
+léo
+xào
+lách
+dồi
+xen
+đặn
+mắng
+khao
+mác
+tò
+đính
+chay
+bỉ
+xì
+luộc
+bóc
+nhức
+nhắm
+rao
+dạn
+chèo
+nhường
+vo
+mía
+cuồng
+bẫy
+chau
+tấu
+sụp
+nạp
+chén
+bóp
+ướt
+đèo
+thám
+thiêu
+cấy
+hốt
+bú
+viếng
+gấu
+tước
+tẩu
+thèm
+ghét
+dong
+bơ
+li
+eo
+xài
+báu
+xổ
+giông
+điêu
+lột
+kỉ
+hao
+thoa
+tựa
+nhọn
+tấp
+ben
+ráo
+lót
+đệ
+mẫn
+rùa
+bẩm
+khoang
+sưng
+bính
+kè
+ngần
+khan
+hạch
+lầu
+xưng
+bạt
+quyển
+mâm
+chìa
+chuồng
+chiêm
+xuôi
+ti
+choáng
+tát
+bướu
+nghẹn
+chì
+đụng
+tơ
+bui
+tia
+quỵ
+rộ
+né
+dê
+thửa
+bực
+gầy
+gai
+đục
+đệm
+rung
+thục
+ủ
+gậy
+giòn
+xăm
+chốn
+mồi
+đun
+ngặt
+sườn
+liễu
+dép
+quất
+de
+sành
+neo
+ủi
+ráp
+nghiệt
+nành
+ứ
+xay
+rụng
+nàn
+ngà
+gầm
+thới
+chiểu
+à
+rước
+rớt
+hoạn
+còi
+kiềm
+vạch
+mịn
+thìa
+vác
+lõi
+nản
+nhộn
+hoán
+nâu
+quỳ
+hường
+tảo
+nắp
+rách
+tom
+nang
+ướp
+phanh
+thản
+sầm
+nới
+bới
+khốc
+tụi
+vuông
+chưng
+bạ
+nan
+lọ
+nhịn
+đòn
+siết
+dẻo
+nghênh
+thòi
+mập
+khiếp
+ngài
+khỉ
+triết
+toa
+tuyền
+tịnh
+trơn
+sỏi
+ngo
+nô
+ồ
+quạt
+nhăn
+tho
+run
+ngậm
+han
+xẻ
+vòi
+thuyên
+pin
+hấu
+cù
+nọ
+gạt
+lún
+toan
+rốt
+hoi
+nấy
+kiêng
+ngạt
+dưng
+rệt
+rỉ
+tà
+tụt
+khiêu
+chuông
+tí
+nhược
+lui
+quách
+bong
+bìa
+mí
+thính
+râu
+thiều
+mài
+xắn
+nức
+xướng
+đắm
+thon
+háo
+êm
+bã
+thuột
+khốn
+vỗ
+sẹo
+quynh
+náo
+nuốt
+răn
+sun
+ròng
+côi
+lung
+rang
+nuối
+vọt
+ve
+óc
+bịt
+xoáy
+cúm
+nhủ
+cưng
+tồi
+thốn
+khoả
+cài
+doãn
+giò
+chích
+quật
+rễ
+tem
+lánh
+hanh
+biếu
+khía
+lẩn
+ạt
+ngây
+liêng
+lụa
+xoa
+nhuộm
+doan
+rảnh
+ki
+hít
+gừng
+ngẫu
+trêu
+ngỏ
+miếu
+ngứa
+lũng
+chọc
+nhồi
+mươi
+diễu
+rụi
+sét
+mít
+tòng
+sầu
+hông
+keo
+lộng
+nêm
+quấy
+hức
+khiếm
+dì
+chọi
+chải
+vách
+thủng
+đàng
+sim
+đũa
+vắt
+cớ
+cày
+thốt
+phẳng
+dào
+trừng
+nhiễu
+oai
+cào
+tặc
+mạ
+úng
+mậu
+cuộn
+miện
+bén
+vờ
+ất
+kha
+kẽ
+sam
+xê
+nể
+té
+bung
+trãi
+hé
+xấp
+đò
+mương
+đu
+mìn
+dạt
+muôn
+đốn
+chảo
+võng
+miên
+lén
+ưng
+vỹ
+gặt
+dầm
+cọ
+xé
+phô
+rã
+nám
+trèo
+tẩm
+lười
+lanh
+phàn
+sừng
+ngàng
+chéo
+nhọc
+nghẽn
+lóc
+nhái
+ván
+cỗ
+xịt
+loét
+luỵ
+bồng
+nóc
+lẫy
+cưa
+nần
+ghe
+phác
+cúi
+phôi
+lì
+gốm
+ẩu
+thạo
+đả
+ni
+gang
+len
+bênh
+ầm
+séc
+thà
+mỳ
+tủi
+nguội
+ngoạn
+rạn
+phiêu
+khiết
+miêu
+ngửa
+ách
+gieo
+gõ
+uốn
+nạ
+vét
+dẫu
+kìm
+két
+hủ
+tràm
+tý
+kệ
+phao
+nạo
+xỉu
+tăm
+quấn
+búa
+sấu
+sào
+mu
+hen
+sờ
+loãng
+chèn
+day
+nón
+rập
+đấm
+am
+cụt
+xốp
+ư
+lướt
+miệt
+cừ
+chạp
+mận
+gam
+đái
+vôi
+tạt
+chùm
+lờ
+tốp
+tuỵ
+giềng
+ruồi
+giỏ
+bọt
+ghê
+ỏi
+giã
+khắt
+hiên
+giằng
+gân
+giỗ
+áng
+rán
+hả
+vun
+nhai
+xáo
+thiêm
+rưỡi
+rầm
+trù
+chợt
+cu
+hăng
+muỗng
+xảo
+dè
+đăk
+xám
+ngả
+rỗi
+rơm
+tuỷ
+trói
+nhốt
+thuở
+cau
+vùi
+chan
+khoanh
+mốt
+roi
+bẻ
+lúng
+niệu
+ngán
+lẩu
+vung
+ả
+diều
+ế
+nạc
+phả
+uất
+nhang
+sô
+go
+phiến
+váng
+nhuyễn
+lựu
+quýt
+thâu
+gật
+lốp
+dằn
+trĩ
+on
+trán
+khử
+phà
+rót
+méo
+ngấm
+nếm
+ngóng
+xíu
+nem
+mứt
+sấy
+khoát
+mỉ
+rổ
+ấu
+nhào
+ngự
+thề
+uông
+vụn
+ten
+đao
+bọ
+nhổ
+gắm
+gươm
+ngùi
+chà
+trau
+kẽm
+lay
+hinh
+suýt
+mải
+trút
+mỉm
+năn
+súp
+xiếc
+thềm
+ray
+chót
+bầm
+nhét
+bừa
+gót
+cộ
+quỷ
+lu
+mãnh
+cối
+mủ
+trót
+bứt
+kẹp
+níu
+que
+tiệp
+sắn
+sà
+sanh
+lích
+khâm
+cội
+sòng
+giơ
+sảnh
+trặc
+ngơ
+chập
+sum
+lán
+cúp
+mép
+ngắt
+chững
+ghềnh
+ươm
+toản
+đố
+giục
+nộ
+cuống
+quây
+nguỵ
+lon
+chốc
+si
+lượn
+thiền
+lừng
+hới
+bừng
+thung
+hãm
+giun
+muống
+tị
+rò
+nập
+phồng
+trứ
+rưng
+kiếp
+tạc
+xước
+búp
+quẩn
+luyến
+vẫy
+ngừ
+chư
+trổ
+điểu
+lửng
+vá
+kiêu
+củi
+sạp
+ngẫm
+chuộc
+gội
+cấn
+tuýp
+bo
+nặc
+kép
+tuông
+nhàm
+noi
+bèo
+nến
+ôi
+vàn
+pu
+se
+loay
+phượt
+hoay
+điếu
+giêng
+phẫn
+lều
+xua
+quăng
+nà
+úp
+chòi
+toát
+băm
+sê
+hất
+đãng
+ói
+duẩn
+đẫm
+teo
+lùm
+cáu
+gu
+chắt
+bửu
+hoằng
+giàng
+min
+nhạ
+xuồng
+tiếu
+mướn
+pho
+duệ
+giặc
+lươn
+cõi
+then
+ngó
+tỉa
+sưa
+ngoặt
+tòi
+gạc
+gào
+phách
+nghẹt
+oà
+kỵ
+chớp
+thêu
+rát
+chát
+trơ
+nòng
+gọt
+biếng
+phai
+hệt
+hun
+bịch
+húc
+cồng
+vạ
+điềm
+ia
+trệ
+mẩn
+rô
+vãn
+dí
+ơ
+ngao
+trảng
+ếch
+nạt
+gàng
+tro
+nhích
+ru
+phe
+múc
+sưởi
+hãn
+rỗng
+đậy
+tản
+thy
+dặm
+xuyến
+xoang
+cạo
+nhóc
+nỉ
+cược
+ngổn
+vòm
+sảng
+huyên
+chằng
+quyện
+rãnh
+bin
+diên
+cang
+rốn
+gáy
+trọt
+khinh
+vơ
+lư
+kiểng
+suôn
+rượt
+tỵ
+bon
+xiết
+phình
+mịt
+chước
+ngọ
+sỉ
+rằm
+còng
+tẻ
+chướng
+xùm
+diêm
+xộn
+ngách
+nậm
+cằm
+tớ
+xỉn
+ron
+nhậm
+ngời
+găng
+đực
+quậy
+mấu
+tít
+yếm
+lợp
+ngoãn
+sỡ
+thỏ
+nhúng
+nghiền
+ngan
+don
+chài
+tưng
+bục
+dìu
+nhôm
+rồ
+hắt
+nhỉ
+nghé
+hờ
+hễ
+dứa
+tơi
+trỗi
+mồng
+khay
+dụm
+kèn
+bỉnh
+ri
+vựa
+tuột
+sụn
+xệ
+sặc
+mẹo
+thệ
+sò
+trũng
+khuấy
+sáo
+quẹt
+hổng
+nung
+ngói
+chắp
+hãnh
+sùng
+mách
+riết
+triền
+ngủi
+liu
+xít
+mượt
+dột
+quí
+ngáo
+cẳng
+ngùng
+hắc
+mớ
+khăng
+lã
+sởi
+lim
+khóm
+đong
+chảnh
+bệ
+sửu
+xù
+khét
+hằn
+rai
+chôm
+nấc
+mè
+vựng
+vãi
+dậu
+gồng
+dăm
+dãi
+ngút
+ngu
+gượng
+bậy
+vuốt
+ổi
+hạm
+ngục
+trệt
+oán
+cặn
+ren
+dấn
+vò
+nhâm
+dấy
+gắp
+vặn
+rêu
+túm
+gởi
+reo
+bai
+thê
+rơ
+chàm
+sục
+đản
+cõng
+vữa
+nách
+nhằn
+bời
+rành
+hốc
+lút
+vầy
+cốp
+thẳm
+rùng
+chao
+thét
+phen
+kiệu
+dơi
+cuốc
+lầy
+nham
+vợt
+sán
+múi
+kén
+bưng
+thừng
+hoai
+vơi
+trầy
+vảy
+ngạn
+sá
+lem
+mênh
+tuồng
+giăng
+gớm
+giấm
+ráng
+dỏm
+đùng
+thán
+cừu
+tắn
+nhựt
+mẩu
+mồm
+hũ
+sùi
+phông
+hờn
+nhuần
+rình
+tã
+ốt
+đoòng
+quai
+sảo
+von
+tanh
+lìa
+khuynh
+vụng
+thoai
+bộn
+rê
+bỉm
+đẽ
+thìn
+mon
+nhẹn
+nhện
+nắn
+lượm
+lẻo
+vệt
+bàu
+nguyền
+ngòi
+dẫm
+mướt
+sạm
+độn
+giẽ
+rầu
+ruốc
+tróc
+hẫng
+nhưỡng
+nguôi
+tẩn
+chang
+đốm
+gập
+chỉn
+xịn
+xoắn
+nom
+sạc
+tày
+nhờn
+khoắn
+bịa
+vâng
+ngót
+đút
+miến
+tép
+kềnh
+oi
+ngửi
+át
+by
+giở
+chội
+nôi
+chần
+trá
+điếc
+quặng
+dẻ
+chơn
+giầy
+buốt
+hụi
+đỗi
+nhoáng
+lùn
+lẫm
+vạt
+sảy
+hạc
+cộm
+rợn
+lênh
+găm
+phớt
+thun
+thỏi
+gio
+buồm
+sả
+cóp
+ngợp
+cạp
+ỷ
+nhớt
+hòm
+hăm
+lõm
+bỡ
+hổi
+rậm
+láo
+vỏn
+hàu
+thóc
+bẹp
+thớt
+hợi
+trớ
+râm
+xiêm
+cằn
+gỏi
+hùm
+chói
+xói
+sói
+rôn
+nheo
+nghia
+bấp
+khế
+siu
+nặn
+lức
+đóm
+căm
+uyển
+chông
+câm
+lăk
+vừng
+quàng
+núp
+đoá
+héo
+ắp
+lốt
+vớ
+nhút
+nhối
+trẻo
+thuấn
+thong
+ĩ
+hòng
+vén
+nong
+tột
+trọc
+nhác
+bợ
+bần
+gi
+trợn
+hon
+hởi
+bùa
+bẩy
+vần
+gộp
+hời
+dỗi
+nhuệ
+ngượng
+vụt
+bít
+nhỡ
+nháo
+sộ
+mích
+den
+lịm
+lăm
+rúng
+nghè
+ì
+nín
+phung
+tào
+luống
+sững
+tễ
+chổi
+đom
+ná
+dẳng
+bô
+ngốc
+bầy
+thỏm
+lạch
+hèn
+gỉ
+nhấp
+lậy
+vênh
+khoét
+giùm
+mào
+cút
+moi
+tuồn
+tợn
+ngẩn
+mị
+dũ
+mão
+he
+thinh
+trầu
+nhõm
+trồ
+giồng
+sủa
+phất
+chầu
+rườm
+cứa
+niu
+chừ
+ngưu
+seo
+nhũ
+nỡ
+gióng
+mó
+vin
+phồn
+suyễn
+miu
+ảm
+cưỡi
+tè
+choàng
+bổn
+đạ
+huyệt
+dùi
+tráo
+vãng
+vía
+ối
+nhấc
+tằm
+lép
+bim
+chạnh
+ốp
+nẻo
+bặm
+bướm
+nơ
+ngớt
+xẩm
+trĩu
+sệt
+hỷ
+gao
+nhầy
+tua
+gạ
+xui
+xén
+nôm
+mễ
+chiêng
+nậu
+vít
+cạy
+bành
+trác
+dượng
+ngợm
+ngậy
+xoè
+sần
+thiếp
+phím
+nhẹt
+hớn
+nao
+đéc
+nhọ
+sửng
+ghẹ
+lóng
+đấng
+nệm
+giời
+trướng
+vẹo
+tèo
+xở
+nghén
+dặt
+mĩ
+nghĩnh
+éo
+bói
+rông
+hoóc
+chít
+hỉnh
+hớt
+thẹo
+nhô
+ải
+sải
+phộng
+khít
+pia
+lẳng
+lả
+giãi
+bủa
+chiềng
+hóm
+chùi
+đùm
+thoi
+hến
+mửa
+nghêu
+choi
+bươn
+om
+mút
+đổng
+đờn
+lỏi
+hột
+nháy
+lèo
+khơ
+ken
+trụi
+ùa
+lồi
+tuất
+vồng
+hoè
+vút
+bả
+thoạt
+lẻn
+lác
+én
+óng
+vế
+sẫm
+thía
+tấc
+mọng
+tram
+bảnh
+tề
+đay
+đát
+ngáy
+sú
+mao
+thúng
+chồn
+viền
+vẹt
+lõng
+xới
+gấm
+chớ
+tong
+dát
+tì
+luồn
+kèo
+xoan
+mướp
+mo
+cốm
+tỳ
+khuân
+loang
+xiêu
+xắt
+khẽ
+phèn
+ngoai
+thiu
+trắm
+xiên
+vót
+rè
+khỉnh
+chây
+rụt
+vàm
+phết
+nhảm
+giẫm
+phiện
+xoăn
+nấp
+cọng
+diết
+quẫn
+khiêng
+càn
+hót
+ỉ
+nực
+tươm
+gán
+cựa
+xèo
+nhếch
+nhả
+phào
+khờ
+rít
+ngác
+nài
+chuốt
+thụt
+cuông
+rôm
+chày
+sàm
+nhão
+toái
+um
+pô
+ny
+mỗ
+đờm
+giũ
+bở
+lấm
+chun
+lùa
+tậu
+kháu
+mầu
+đần
+khùng
+cơi
+hươu
+sọc
+ẵm
+ruổi
+huê
+thò
+loà
+suông
+pan
+nhoà
+đáu
+bèn
+nhói
+răm
+oải
+ngẩm
+móp
+đẩu
+tròng
+chệch
+chạc
+sến
+quáng
+nải
+rèm
+cật
+ừ
+phăng
+giỡn
+chùng
+mả
+lố
+bêu
+vỉ
+lạy
+khoé
+nhừ
+cữu
+húng
+chuồn
+lặt
+nghiến
+truỵ
+lin
+tiệt
+tét
+quới
+giếm
+dơ
+dĩnh
+phìn
+nhợt
+luông
+un
+nghê
+nhún
+bốt
+mãng
+ngông
+lọi
+duỗi
+ngưởng
+kham
+hoe
+dĩa
+soái
+goá
+bệt
+thảnh
+ổng
+sứa
+ton
+khò
+hẻo
+đằm
+thót
+mú
+rẩy
+víu
+xối
+toé
+rứa
+ngải
+dìm
+riêu
+thơi
+ngớ
+ky
+vạy
+há
+tía
+phui
+lài
+moóc
+thau
+nết
+đờ
+lâng
+xoong
+tuôn
+dấm
+him
+sủ
+ran
+nớt
+tóp
+gành
+bướng
+lặc
+cui
+bềnh
+miết
+lẽo
+náu
+gù
+cáng
+banh
+rả
+quệt
+ngạo
+chằm
+quệ
+hú
+chực
+hiu
+re
+khiên
+nọc
+bôn
+bết
+rú
+bét
+mẩy
+loát
+chĩa
+nhá
+liếng
+sấp
+rinh
+hoắc
+ắt
+vẩy
+khất
+nấng
+nái
+nhẩm
+lảng
+sin
+cữ
+rục
+rìa
+nhăng
+liếc
+dền
+núm
+kẻo
+xúi
+mạt
+cưu
+khuỷu
+búi
+hích
+bu
+chíp
+nháp
+kỉnh
+đểu
+chong
+chịt
+vằn
+sấm
+nhổn
+nhen
+lừ
+nhỉnh
+chiu
+xóc
+rủa
+nộm
+phốt
+ngát
+héc
+đanh
+mia
+vồ
+dốt
+đềm
+sẩy
+lụng
+nẻ
+ganh
+chầm
+nơm
+mọt
+ụ
+riềng
+nịnh
+nhẽo
+dôi
+chừa
+tùm
+ngóc
+mún
+xỏ
+vát
+vẩn
+thặng
+hám
+gài
+loi
+chưởng
+nhại
+muội
+rạ
+hớ
+xẻo
+nòi
+luẩn
+líu
+xy
+máng
+tấy
+nhảnh
+gổ
+cụi
+ắng
+trê
+rớm
+hóng
+ú
+trỏ
+nép
+lụi
+cùi
+rứt
+pôn
+đỉa
+ố
+đận
+choạng
+ngốn
+đoái
+diệm
+mền
+hợt
+siêng
+tánh
+nhấm
+đọ
+rựa
+ghẹo
+đênh
+sứt
+nít
+gẫy
+cỗi
+bịu
+loanh
+mánh
+lỉnh
+khư
+rết
+dế
+xâu
+nứa
+nhẽ
+diệc
+đác
+cợt
+nhím
+toại
+bặt
+cỏi
+xúm
+sính
+mói
+đùn
+tru
+đày
+xuýt
+thóp
+veo
+dúm
+cặm
+lỳ
+giẻ
+thỉ
+rịch
+phin
+mếu
+lự
+thố
+săm
+khảm
+pơ
+ghẻ
+hỉ
+chợp
+thảy
+quặc
+núc
+vạc
+thui
+thoắt
+ngỗng
+nớp
+mằn
+thuyển
+rọi
+trìu
+thẫn
+gặng
+khăm
+gianh
+guồng
+biếm
+nhúc
+khấn
+hấn
+thẹn
+quầng
+lù
+bập
+ắc
+loạng
+thào
+ngò
+tiều
+phom
+mỵ
+chum
+gút
+xéo
+ứa
+tuộc
+trạc
+phùn
+phàng
+rúp
+ngoi
+chẻ
+nhọt
+inh
+hí
+giễu
+rạc
+mịch
+lết
+cồ
+náy
+au
+đượm
+rầy
+phàm
+hửng
+ngầu
+nẹp
+phũ
+hù
+quặn
+nán
+rộp
+ợ
+hững
+tành
+mân
+xập
+ù
+trồi
+trịa
+ngùn
+kìa
+xiển
+hên
+sậm
+mụ
+gặm
+dả
+ngụt
+mui
+bống
+dầy
+chườm
+sới
+ngẩng
+áy
+đĩnh
+lủi
+en
+giuộc
+rợ
+lẹ
+hẹ
+gáp
+phiệt
+ngước
+chòm
+pù
+kình
+khàn
+nục
+mố
+trừu
+mỉa
+gả
+cọp
+ngấn
+nãy
+háng
+tót
+cộc
+chẵn
+beng
+vịn
+phuy
+cuồn
+táp
+nhứt
+miễu
+lắt
+khem
+cạm
+hùn
+lằn
+chùn
+bẹn
+xụp
+mề
+nua
+hục
+đẽo
+thổn
+hực
+nuông
+nu
+sình
+nhuế
+xém
+cheo
+viêng
+diêu
+hoen
+bẹ
+ngấy
+mé
+xộc
+rượi
+hừng
+xuể
+thình
+gồ
+xẻng
+cún
+trám
+thướt
+oái
+vánh
+típ
+sy
+rươi
+nướu
+mơn
+móm
+chãi
+tộ
+thớ
+nhạn
+ngai
+chấu
+hì
+rợp
+nhót
+ke
+chồi
+voọc
+uổng
+phiếm
+phèo
+đoài
+gãi
+uế
+khấm
+hộc
+xo
+mảy
+gấc
+ỳ
+lòi
+xoà
+tắp
+vông
+trôm
+nin
+hê
+bòn
+nhẵn
+chửa
+đước
+khui
+chới
+vèo
+thỉu
+te
+sên
+nhòm
+đìa
+dam
+phích
+đơ
+biền
+bẽ
+ó
+hạo
+chớm
+vón
+rói
+đuốc
+hau
+dó
+thủi
+nhằng
+ạch
+bụt
+tửu
+dà
+uẩn
+toác
+quỵt
+phờ
+bọng
+uể
+ráy
+đúp
+dúi
+díu
+vù
+khè
+chóc
+beo
+tuốt
+hùa
+ngụm
+thày
+lụp
+chình
+soài
+rên
+bịp
+boong
+rìu
+nhản
+chóp
+cầy
+nhạo
+khoi
+giũa
+tĩu
+nhàu
+nghiễm
+nếnh
+nùng
+mót
+chột
+vắn
+mỏm
+bua
+ém
+rưới
+báng
+xốt
+rởm
+nườm
+niềng
+nghịt
+rảo
+lỵ
+bìu
+vừ
+truân
+quẩy
+nượp
+dẹt
+sắng
+rỗ
+rần
+vầng
+lể
+gờ
+đuông
+dấp
+tuýt
+them
+thẫm
+quẹo
+thít
+lổ
+yểu
+ló
+lãn
+khoải
+loáng
+kịt
+bờm
+xẩy
+xẹp
+khập
+quằn
+mần
+loá
+nhoi
+diếp
+tăn
+kiển
+rặng
+quyệt
+sám
+sạ
+giặm
+bẵng
+sủi
+mống
+chử
+tầu
+tằn
+quánh
+nhộng
+nê
+đoản
+ăng
+tếu
+muồi
+vái
+trát
+bấu
+lơi
+lẵng
+hách
+nhơ
+liếm
+toanh
+nhốn
+ngoa
+ròi
+biếc
+úa
+tưởi
+xó
+ửng
+lợm
+lềnh
+tể
+ngươi
+yểm
+rì
+nầm
+gẫm
+cảo
+sọt
+nhám
+nằng
+gáo
+chường
+ớn
+nhãng
+lủng
+túp
+nuột
+xốc
+thênh
+khiễng
+điếm
+đẵng
+chuân
+bun
+hui
+hói
+chộp
+chơ
+po
+loè
+hua
+gàu
+nhuốm
+nhoài
+khuếch
+quắt
+phảng
+luy
+húp
+cuội
+bẫm
+dật
+nhặn
+pao
+xáng
+khước
+đoạ
+bằm
+thủa
+quại
+lình
+huề
+voan
+nghinh
+vó
+pay
+nhoè
+dằng
+bĩnh
+trấu
+nựng
+dau
+nè
+vại
+ớ
+niêu
+lợ
+lêu
+rim
+ram
+oằn
+loăng
+dãn
+tởm
+nanh
+đơm
+búng
+hạp
+ghém
+ngáp
+giãy
+láy
+nồm
+lạn
+nhem
+nhài
+chồm
+bụ
+xép
+oăm
+gờm
+trề
+máo
+chờn
+trằn
+lũi
+liềm
+lải
+khạc
+gộc
+chọt
+ngọng
+khua
+xom
+ria
+ngòm
+ngoảnh
+ngoặc
+nẹt
+sìn
+chuốc
+toang
+sua
+nghía
+lẩm
+trịch
+quắp
+dượt
+tau
+ghề
+sủng
+dua
+tạnh
+sực
+bự
+tíu
+còm
+cạch
+mun
+tủa
+lứt
+kiềng
+gùi
+sề
+phập
+mùn
+loã
+guốc
+phúng
+hủi
+hỡi
+chác
+pi
+pà
+mõm
+nhè
+mũm
+mĩm
+lẹt
+bõm
+tuế
+ké
+gien
+quạnh
+hác
+thiếc
+riu
+nưa
+thảng
+noãn
+bâng
+soán
+nịch
+nhó
+thăn
+nũng
+giụa
+moong
+khêu
+kếch
+dòi
+nhũn
+vẻn
+nhữ
+cuỗm
+chửng
+ngăm
+boa
+ươn
+mớm
+quơ
+giương
+xổm
+lìm
+lèn
+điếng
+phớ
+gẫu
+phỉ
+ngổ
+loé
+diếm
+dậm
+đăm
+chin
+nhúm
+kị
+guy
+dửng
+trượng
+thẩn
+nhuỵ
+giập
+cói
+bỉu
+xơi
+thoăn
+phẩy
+nhiếc
+duật
+ríu
+hênh
+truồng
+nhum
+nhom
+nia
+gợn
+soa
+giòi
+tửng
+khuây
+kiếng
+dụi
+nệ
+liệm
+rọ
+nhú
+hiềm
+ghim
+thiển
+đồm
+khom
+dùm
+vối
+lè
+khứa
+èo
+doi
+đoa
+mạp
+địu
+cở
+vạm
+thó
+tời
+miều
+lọng
+khèn
+rua
+quạ
+thược
+rắm
+ngụp
+trườn
+thím
+cộp
+thồ
+gằn
+bấn
+xị
+rớ
+huổi
+cùm
+phủi
+ngoằn
+khà
+hủa
+sổng
+peng
+nghèn
+xuề
+nghét
+lới
+khố
+xệch
+rữa
+nghếch
+rúm
+ngoạ
+quềnh
+mẫm
+khong
+ăm
+phò
+mấp
+kiết
+xình
+lé
+gàn
+đảnh
+ậm
+trối
+nhăm
+nạng
+dòm
+chề
+xỉa
+rắp
+viển
+uột
+mợ
+bủn
+thằn
+phăn
+oánh
+lững
+lém
+gôn
+ang
+nạm
+kiền
+bân
+rĩ
+ngoe
+lổng
+hơ
+vượn
+thoang
+bõ
+thòng
+som
+giậm
+réo
+nịu
+muốt
+lia
+ruồng
+hĩnh
+xum
+tráp
+toàng
+mành
+mòi
+lúm
+gí
+xuyết
+phới
+chúi
+tàm
+sũng
+phên
+nhử
+rỏi
+hu
+trảm
+hập
+nhe
+meo
+gọng
+nhễ
+truông
+seng
+rủng
+vạng
+sền
+phầm
+xổi
+cũi
+trọi
+phia
+loẹt
+truất
+suý
+mởn
+giốc
+bứng
+vuột
+phay
+ờ
+khứu
+kệch
+xoã
+rỉnh
+nhép
+khản
+dổi
+chạng
+bẵm
+xồng
+phây
+mom
+lẽn
+bẽn
+vanh
+pen
+nhông
+nhẹm
+leng
+kền
+dề
+vè
+lẩy
+rộc
+nỏ
+lởm
+tụm
+nõn
+ngố
+vẻo
+rích
+nhẻm
+ghiền
+đở
+cóng
+rặn
+rám
+oang
+nhành
+chứt
+xú
+thốc
+thãi
+rường
+luộm
+ghì
+phẩn
+pá
+nhèm
+thiểm
+liễn
+choán
+bớp
+tớp
+sớ
+mủi
+đọt
+cườm
+chỏng
+toi
+tịt
+guộc
+đìu
+đên
+đốp
+khuy
+chiền
+xúng
+vê
+vẳng
+tréo
+keng
+hão
+cheng
+í
+dom
+thuộm
+sỏ
+sậy
+lú
+hớp
+sém
+sác
+nhạnh
+ngộp
+mọ
+gun
+xoàn
+vằng
+toạc
+rí
+rằn
+ngái
+xuý
+xính
+tuốc
+thây
+qúa
+đẫy
+trớn
+hòi
+sờn
+tuềnh
+gột
+lét
+rin
+mọn
+tâng
+mẹt
+húi
+buột
+tray
+quẳng
+ngâu
+duân
+thom
+ngỗ
+nghệch
+xôm
+phang
+ọp
+đon
+toáng
+thoan
+sái
+nhởn
+lúi
+rọc
+gạn
+ẹp
+dững
+trụng
+thều
+rạo
+nịt
+ngoèo
+mèn
+dực
+dõng
+làu
+kin
+duyện
+đung
+ngoạc
+tút
+trờ
+sì
+cót
+vợi
+nhao
+nhặng
+chởm
+bươu
+xang
+vược
+vam
+ngấp
+xuê
+nhố
+bìm
+véo
+vảng
+thàng
+sồi
+queo
+khuâng
+ươi
+ngụa
+ngoáy
+ỉm
+hậm
+gằm
+túa
+sẩm
+rọt
+khuông
+huyến
+xắc
+soạng
+piêu
+đật
+đẫn
+tủm
+rủn
+choẹ
+xiêng
+viềng
+tằng
+pó
+xoe
+ngắc
+lởi
+hom
+ã
+sồng
+rụm
+páo
+nhảu
+khựng
+kháp
+hoác
+thọc
+rắt
+giắt
+đỏng
+tỉm
+lói
+khau
+trùn
+trạo
+soóc
+ới
+kẻng
+sượt
+qủa
+hỏn
+chẹt
+bện
+téc
+sõng
+rén
+nhùng
+gầu
+dặc
+vức
+phơ
+nhõng
+kềm
+rền
+nguệch
+khưu
+khấp
+khảnh
+hựu
+tược
+sia
+đin
+sượng
+kháo
+xẹt
+trẩy
+rón
+quẫy
+tênh
+iu
+rịn
+phạc
+pác
+đuống
+quết
+phấp
+pê
+lớ
+hâu
+bòng
+thin
+sui
+quắm
+phiu
+tiêng
+róc
+rận
+muông
+muồng
+gui
+gong
+đúm
+bẳn
+xốn
+loe
+bẽo
+vấy
+truật
+sõi
+nẩy
+xút
+toe
+tẹo
+sật
+rây
+nui
+lấu
+dạc
+sếu
+phắt
+lèng
+khuỵu
+cắc
+ruộm
+kheo
+hóp
+gièm
+bốp
+bĩ
+xín
+rưởi
+phạ
+nhổm
+lống
+gú
+gá
+vưởng
+rúc
+quặt
+đủi
+chấy
+chạn
+són
+giát
+dềnh
+trét
+roe
+panh
+lến
+lằng
+dộng
+đít
+sển
+lọn
+khuyển
+đủng
+cợn
+choai
+bươm
+nhớp
+gỏng
+giựt
+càu
+tuẫn
+tẹt
+ré
+nhíp
+chụm
+vờn
+sụa
+moan
+choé
+chỏ
+cham
+chá
+nhùn
+lất
+dèm
+vích
+thút
+rum
+nhời
+khoắng
+heng
+xeo
+xè
+rá
+phử
+phiêng
+ních
+nghịu
+lum
+lữa
+gụ
+chểnh
+chái
+búc
+rệu
+rái
+quèn
+ngấu
+gon
+đừ
+đĩ
+chộn
+tồng
+núng
+nót
+lua
+khạo
+giót
+đòng
+đẹt
+trĩnh
+phễu
+nầy
+khểnh
+giê
+cụng
+xầm
+lơn
+thuồng
+thoá
+síp
+phau
+pắc
+nhẩn
+nghỉm
+hẩm
+bợm
+vồn
+sín
+ry
+phơn
+pe
+ngạnh
+nạnh
+lườm
+lảm
+xuyền
+trui
+quýnh
+bấc
+nâm
+chuôi
+bụp
+rờn
+què
+nhắt
+mèm
+láu
+choang
+phổng
+nhụt
+nện
+lẹm
+khênh
+trẽn
+sướt
+lảo
+hoanh
+bịn
+bậu
+vùn
+trư
+quít
+ngoắt
+ỉn
+chành
+thếp
+nhuốc
+ngoằng
+ngheo
+cáy
+buy
+lờn
+toét
+nhoẻn
+kít
+huyễn
+hem
+gía
+cùa
+ướm
+quao
+nhim
+gô
+đưng
+bĩu
+sẩn
+hin
+sây
+lỏm
+giầu
+xuyệt
+vãnh
+nhón
+lườn
+khú
+ưỡn
+ruỗng
+pủa
+chếch
+thuân
+nọng
+nèo
+hiều
+dúa
+xìu
+rướn
+quàn
+rùm
+ngồn
+lìn
+khoáy
+chúm
+bơn
+khốm
+cỏn
+choảng
+chạch
+váp
+sãi
+háu
+chuý
+chím
+yêm
+xán
+thũng
+tễnh
+khính
+gở
+đễ
+bía
+sụ
+quị
+nhít
+mán
+huôi
+huơ
+hắng
+ghìm
+dôm
+chon
+bẹt
+xạc
+ủa
+truyển
+sấn
+pò
+nhiêm
+mồn
+lưa
+líp
+gày
+đớp
+đì
+xịch
+tuận
+phụt
+oẹ
+oách
+nặm
+m��
+mội
+hển
+hằm
+duẫn
+dũa
+dô
+diếc
+dến
+cọt
+bổi
+vể
+tó
+ình
+dim
+mủn
+khướt
+khoeo
+khệ
+chễ
+cạ
+sốp
+rốc
+nhẹp
+gin
+chòng
+chỏm
+xường
+vập
+sệ
+ruy
+rức
+quẻ
+nghen
+khon
+huýt
+hổn
+xức
+quĩ
+miêng
+khẩm
+eng
+chã
+bệch
+xúp
+roa
+rẻo
+ngoải
+mấn
+chẩm
+thoong
+nhíu
+muôi
+mõ
+lịnh
+ghè
+vao
+ui
+nẫu
+mưng
+khuỵ
+thừ
+thia
+ngáng
+lom
+dáo
+dánh
+phừng
+nguậy
+hý
+huyn
+dể
+chổm
+thõng
+lũa
+khoen
+khều
+giạ
+dẹo
+sởn
+rem
+huồi
+gông
+cũn
+vởn
+trợt
+thết
+ne
+nau
+nấn
+hẵng
+gừa
+giăm
+chếnh
+rương
+nhon
+lếch
+ghẽ
+đễnh
+bớ
+bạng
+thiềng
+tệp
+sộp
+sẻn
+phướng
+phôn
+păng
+luyên
+lịa
+cỡm
+bâu
+tạch
+sít
+lởn
+giảo
+dơn
+ché
+xố
+xắp
+tiển
+thếch
+rờ
+pang
+nhương
+nguýt
+diềm
+súa
+ngợ
+ngằn
+mén
+lử
+loong
+khẩy
+chèm
+xởi
+liếp
+dởm
+bum
+xạo
+vốc
+tron
+triện
+rề
+lảnh
+khiu
+hống
+try
+liến
+ị
+hum
+xiềng
+rười
+lúp
+hõm
+dạm
+tiệu
+thững
+tém
+nốc
+khảng
+cùn
+phẩu
+giáy
+đụn
+đét
+chò
+xược
+vẩu
+rù
+nhày
+mòng
+đoảng
+dia
+cụp
+chổng
+biều
+xồm
+vẹm
+tứa
+thùa
+nĩa
+gâm
+é
+vói
+soản
+rệp
+phệ
+ngộn
+xoàng
+tiễu
+rặt
+ngoắc
+dy
+chạ
+sã
+oa
+nhắng
+ngoạm
+mủng
+mím
+mệ
+hụ
+dịnh
+càm
+bịnh
+xếch
+trố
+tiếm
+thịch
+quở
+pả
+mớn
+lồm
+chĩnh
+chẽm
+tun
+thểu
+pồ
+khén
+híp
+đúa
+dác
+chẹn
+luốc
+khù
+cồm
+chổ
+chét
+chêm
+chễm
+chệ
+toẹt
+nhéo
+lổn
+hia
+đển
+phỉnh
+phắc
+ngồng
+lốm
+xèng
+xấc
+sồ
+niết
+nhiểu
+nạy
+liệp
+huẩn
+hoáy
+xửa
+xẻn
+tuyn
+mém
+láp
+din
+bứa
+ục
+trươi
+tở
+suồng
+nọt
+khàng
+hoản
+dụa
+vư
+nhụa
+nhoẹt
+mứa
+giong
+dái
+chù
+chõng
+suê
+sũ
+nhiện
+loằng
+è
+coen
+bùm
+vục
+thuôn
+thụng
+liệng
+hếch
+gường
+dỏ
+chũ
+bỡn
+bím
+tưa
+trành
+thoàn
+sù
+rày
+nguẩy
+choạc
+boe
+triễn
+thượt
+rùi
+nhẩy
+giội
+duối
+dóc
+yều
+xái
+tòm
+táu
+hoạnh
+hiêu
+deng
+triêm
+thúi
+pí
+gừ
+chía
+xửng
+váo
+trạnh
+tọt
+thòm
+rướm
+ót
+nhèo
+néo
+hoắt
+ẻo
+choe
+biễn
+vặc
+truyên
+tồ
+rom
+riếu
+quơn
+phạn
+nghẽo
+khọt
+hoắm
+gum
+đười
+đót
+diến
+boi
+báy
+xẩu
+vầu
+truồi
+tậm
+nhín
+lị
+khạp
+đỏi
+dợ
+đia
+chiển
+xuẩn
+úm
+ời
+nhuân
+nhiệp
+meng
+khắn
+ẹo
+deo
+dằm
+xổng
+xoạc
+xiu
+rống
+ró
+mẩm
+lỏn
+lếp
+kím
+gôm
+dỏng
+vỳ
+thụp
+sỗ
+nộn
+loàn
+hặc
+diu
+bải
+ại
+xoảng
+vống
+toong
+sòm
+sậu
+rui
+ràm
+pheng
+noé
+nhin
+lếu
+khoàng
+ỉa
+hịch
+bường
+thiềm
+táy
+sịt
+pờ
+nhách
+nẫng
+đoàng
+chỉa
+trập
+sểnh
+rếu
+pì
+phét
+méc
+hiềng
+cun
+chiện
+ùm
+thạp
+quắc
+pênh
+nhong
+nhiễn
+nghễ
+xía
+ùng
+tọc
+sèo
+ngươn
+kía
+hợm
+hoải
+xênh
+triêu
+rộm
+nghệp
+nẫm
+loảng
+dem
+cửi
+bụa
+ực
+thuyến
+thum
+sừ
+pẩu
+đíp
+biêu
+ảng
+vỵ
+vố
+véc
+só
+rạm
+pai
+nớ
+nắc
+loai
+hoẹ
+đọi
+cỡn
+chồ
+ụp
+quạch
+phọt
+oặt
+luyn
+huý
+đù
+dồ
+chặp
+ầu
+thướng
+pán
+giắc
+coong
+sớt
+pông
+nim
+lôm
+hoảnh
+xùa
+xoành
+xoạch
+xộ
+xan
+tuynh
+tuôi
+ròn
+py
+pú
+phướn
+phịu
+pheo
+noong
+nhiệu
+ngoã
+ỉu
+huỵch
+huých
+hó
+doe
+choè
+bựa
+xạm
+vểnh
+tỏng
+thòn
+sòn
+quờ
+khuổi
+hùi
+diểm
+chạo
+chằn
+cảu
+via
+vện
+triên
+trẹo
+trảo
+tâu
+ruệ
+nhột
+gau
+đợ
+dìn
+chặc
+xồ
+thuẩn
+sại
+phéng
+ộp
+nhỏm
+nguyến
+lìu
+bíp
+thé
+sạo
+quạng
+púng
+poe
+pam
+lạo
+kiếu
+huênh
+choẹt
+chõ
+chẩy
+bép
+bau
+xàm
+trum
+thớm
+múp
+lỷ
+khoèo
+gơ
+dợt
+coóng
+vệnh
+thưng
+thèn
+thát
+rơn
+nọi
+luynh
+khum
+hoạc
+đẻn
+bam
+xự
+voóc
+tuối
+tực
+thườn
+thoắng
+rịt
+phịch
+phăm
+oe
+oẳn
+nhoạng
+ngòn
+liềng
+gioi
+giâm
+gảy
+dúng
+dồng
+chượp
+bôm
+xềnh
+truyến
+trôn
+trấp
+teng
+quăn
+pín
+nhủi
+nhẫy
+ngủn
+nả
+mung
+khì
+hây
+goi
+giôn
+chẩu
+cẫng
+xam
+thếnh
+tềnh
+sẵng
+noe
+hiêng
+giấp
+tren
+tiu
+rế
+nhiền
+duỵ
+dớn
+cúa
+cồi
+chẫm
+ành
+yêng
+xề
+uyn
+tủn
+trẹm
+tổi
+thọt
+rấm
+qu
+pui
+nhinh
+mừ
+lựng
+khun
+ghiếc
+ên
+đũi
+cọn
+chuỳ
+chô
+chiếng
+xoẹt
+tiểng
+rảng
+quện
+ồm
+nhia
+nghiêu
+mơi
+mầy
+kếp
+cum
+vưu
+voa
+tòn
+soan
+rều
+pồn
+khé
+huỳ
+giậu
+ễnh
+đạng
+cờn
+bẫng
+xên
+ừng
+tễu
+sún
+siên
+siêm
+sẳn
+quặp
+pua
+õng
+măm
+liện
+khào
+đổm
+đàu
+bằn
+xiều
+vẳn
+triêng
+thoã
+thiến
+thè
+ỏn
+ỡm
+ọc
+lun
+khịt
+khắng
+giu
+gải
+doàn
+điễn
+dịa
+chẹo
+bửng
+bẳng
+vôn
+uân
+tré
+thíp
+soạt
+rổn
+rỏ
+riệu
+phum
+oát
+nuy
+nì
+nhồng
+ngưởi
+ngầy
+khin
+gườm
+dình
+dìa
+đểm
+choong
+chản
+biêng
+béc
+ỹ
+vim
+ường
+thuyện
+thẻn
+sột
+sếnh
+rỉa
+phóc
+ồng
+ỏ
+nhều
+nhẳng
+nháng
+giau
+dóng
+đén
+bộng
+xuông
+xun
+tuồi
+tõm
+tịu
+thuổng
+rụp
+ràn
+pố
+phảm
+niễn
+ngườm
+huội
+ghèn
+doa
+dờ
+chọ
+bặc
+ua
+quảy
+pong
+phính
+ngóp
+náng
+moọc
+mếch
+hoằn
+gịn
+dăng
+côm
+chẻo
+bíu
+ây
+thạt
+sùm
+sồn
+sặt
+ỏm
+niền
+nác
+mản
+lụ
+lộp
+khuỷ
+hoong
+hổm
+hảng
+ét
+éc
+đũng
+đuề
+đú
+đôm
+cháng
+bộp
+quắn
+phám
+nơn
+nhiếu
+nhẫm
+mý
+lọp
+lòm
+loãn
+lèm
+kiễng
+khúm
+ịch
+hử
+hiệt
+gioăng
+dẫy
+dáy
+đân
+chận
+xừ
+tuếch
+trỉa
+thàn
+sựt
+rúi
+pốt
+phoi
+phềnh
+noa
+lẹo
+léng
+hướm
+dừ
+củm
+têm
+sơm
+sạnh
+săng
+pom
+ón
+oắc
+nhiển
+khọm
+kẽo
+hiệm
+đui
+dớp
+choọng
+bắng
+xìn
+vọp
+vình
+viểm
+rôi
+rạnh
+qúi
+pon
+khoằm
+hảnh
+gụi
+đỏm
+dẽ
+chụt
+chỏi
+chẹp
+bỗ
+biễu
+uỳnh
+tũn
+trự
+trệu
+tìn
+têu
+tẽn
+tạn
+sộc
+sẹc
+pía
+nhiểm
+ngứ
+mum
+măn
+lôn
+lòn
+hự
+gúp
+gòi
+dòn
+đị
+chựng
+chầy
+biêm
+xoi
+thồng
+rẹt
+quì
+ơm
+nủ
+khoắt
+kẹ
+đụp
+dử
+điến
+bỏi
+béng
+vọn
+thởi
+thẳn
+tếch
+sụng
+sằn
+rài
+páp
+nình
+nhoét
+nhoáy
+nhẩu
+nghin
+nghiễn
+nãi
+muỗm
+khộp
+khấy
+khằm
+đưới
+cạc
+bảm
+xớt
+vường
+trược
+trảy
+trẫm
+tiễng
+sỉa
+rũa
+roong
+riệp
+phài
+pách
+nhò
+nhịt
+lớt
+lớm
+lổm
+liếu
+khịa
+giút
+ẹ
+dun
+đõ
+cón
+chẽn
+chảng
+cặt
+biềm
+xeng
+trính
+tộp
+sằng
+oản
+nhội
+nhãi
+ngộc
+lỹ
+kều
+hỵ
+điêm
+dếnh
+dận
+chiểm
+bủng
+ần
+ròm
+poọng
+phốc
+ợt
+nuốc
+nởi
+ngam
+mạy
+luổng
+lóp
+liễng
+kiệp
+goa
+gâu
+duôi
+chượt
+chẳn
+ẳng
+xếnh
+vơn
+vổ
+tọng
+thài
+quẽ
+pim
+phon
+phiểu
+phến
+phéc
+pết
+pạc
+ọ
+nự
+nị
+nhoay
+nhải
+mự
+mên
+mế
+loóng
+khìn
+khìa
+khẹt
+khạng
+hoát
+hiết
+gới
+đượng
+độp
+dộ
+đeng
+dất
+đạch
+cớm
+choa
+bương
+bìn
+úi
+tếnh
+tén
+suyển
+soọng
+phưởng
+phuông
+phưng
+nú
+nhây
+nẩu
+mằng
+khành
+gớp
+dên
+dẩu
+củn
+cõn
+chuệch
+choá
+bợt
+bẹo
+bẩu
+bạnh
+tủng
+trằm
+thềnh
+thẩu
+thạ
+rạt
+qun
+pỉnh
+phỏm
+phèng
+pèng
+nhậy
+ngúm
+lủ
+liểng
+khiệm
+khễnh
+hầy
+gọ
+dỵ
+đư
+dợn
+cõ
+chũm
+chệnh
+cấc
+xư
+vửa
+vồm
+vên
+ự
+trơi
+tởn
+thẩy
+sổi
+quớ
+péo
+pềnh
+pạ
+oong
+nhuy
+nhúa
+nhự
+ngớn
+ngởi
+nằn
+moa
+mìu
+mín
+lủn
+loàng
+liễm
+khướu
+khừa
+khị
+hừ
+hón
+giúi
+duyền
+duông
+dứ
+choắt
+chậy
+xướt
+trúm
+tría
+trỉ
+sơi
+sỉn
+sem
+pun
+pốc
+phốp
+nen
+khiểu
+khệnh
+húa
+hể
+hế
+gười
+gẩm
+dủ
+đớt
+chũng
+chón
+chéng
+bưởng
+biểm
+xai
+thấn
+tảy
+quài
+pứa
+pư
+pịa
+phời
+pém
+nưng
+nhướn
+nhê
+ngoạt
+nghỉu
+nậy
+mựng
+mịnh
+lụn
+lỉu
+kheng
+hèm
+hẩy
+giẫy
+cổn
+chìu
+buối
+bếnh
+báp
+vắp
+ún
+từa
+tóng
+rân
+phạch
+nhịu
+ngúng
+ngảo
+mem
+mảo
+luồi
+lũm
+lợt
+khồng
+khẹc
+kên
+goong
+giô
+gấy
+dốp
+đóp
+coe
+bót
+bển
+ản
+xia
+veng
+vảo
+ướng
+ử
+tròm
+troi
+trển
+trèm
+tràu
+tiểm
+thá
+sịa
+sênh
+rửng
+roam
+qùa
+phực
+phè
+ỏng
+nõ
+nhưn
+nhợ
+ngẩy
+mừu
+mụi
+miểng
+lừu
+khươi
+khừng
+khoòng
+khim
+khày
+khẳm
+hượu
+hởn
+hoét
+giay
+gẩy
+dút
+dướng
+đứ
+xùi
+xẹo
+xân
+truỷ
+triến
+tiềng
+tểnh
+rụa
+quèo
+phinh
+phấm
+oét
+nười
+nống
+niêng
+ní
+nhừng
+nhay
+ngoác
+nèn
+mính
+loỏng
+khưn
+ía
+hỳ
+hềnh
+hệch
+gim
+gié
+diểu
+điếp
+đệp
+chốp
+chốm
+chom
+xim
+xiểng
+ượng
+trỗ
+tèm
+tắng
+rốp
+rổi
+quạu
+pớn
+phộc
+nường
+nhúp
+nhuật
+nguốn
+ngưới
+nguầy
+nân
+mốn
+mởi
+lốn
+lọm
+khiện
+khặng
+họt
+gìa
+dỹ
+dum
+dín
+dắc
+chược
+chời
+chiệc
+chềnh
+bụm
+biệc
+xoọng
+xon
+ườn
+ủn
+tuyễn
+trụa
+trây
+toá
+thuông
+thên
+sềnh
+sất
+rịp
+phuộc
+phoóc
+pè
+pài
+óp
+nủa
+nhôn
+nhiềm
+nghéo
+luý
+lụm
+khuống
+hỗng
+gữa
+goòng
+gám
+êu
+dức
+diển
+ầng
+xêm
+vự
+ược
+trũi
+thoán
+thín
+thìm
+tèn
+súi
+soọc
+rẳng
+quặm
+phậm
+ò
+nùa
+nư
+nốp
+níp
+niệp
+nhùi
+nhóp
+nhợi
+nhìu
+ngốt
+ngồ
+nảo
+miềng
+mâng
+lẳn
+lậm
+khường
+huần
+hía
+háp
+gậm
+ềnh
+duyển
+dớt
+doang
+doai
+điệm
+đêu
+chuổng
+chẳm
+bươi
+ay
+xỉnh
+xiếu
+vèn
+trốc
+thoản
+soong
+sím
+pút
+púa
+phâng
+pếu
+pét
+pau
+pấc
+nun
+nóp
+nính
+nhụi
+nhốm
+ngượi
+nghẹ
+ngau
+ngãng
+ngàm
+nật
+mim
+lưn
+hị
+giạt
+ghéc
+đún
+dú
+dôn
+dọi
+dít
+diếu
+coa
+chưn
+chịa
+chão
+bư
+bợn
+xọp
+xện
+vùa
+trủng
+troóc
+trỏng
+tợi
+tỉu
+tều
+tếp
+tẹc
+tãng
+sỳ
+sịch
+sè
+rột
+roát
+rầng
+quóc
+quịt
+quéo
+piêng
+phôm
+phành
+nột
+nhứng
+nhồm
+nhoàm
+nhền
+ngong
+míp
+lệt
+lế
+khấc
+hũng
+háy
+hản
+gỉa
+dười
+dọ
+dích
+dãnh
+dắm
+cũa
+cọi
+chiếp
+chem
+buỗi
+xờm
+tướt
+trun
+trộc
+thuỳnh
+thựu
+thửng
+thôm
+síu
+sảm
+rứ
+piềng
+phâu
+nhù
+nhệch
+nguyển
+ngữa
+ngổm
+nghép
+nganh
+mủm
+luyệt
+lẻm
+lày
+khẻm
+hưởn
+hín
+gờn
+giộp
+gát
+dợm
+đém
+dáu
+cứt
+choóng
+chôi
+chắm
+chám
+chậc
+buôi
+bèm
+ậc
+xủn
+xiệc
+xằng
+xăn
+vởi
+vọ
+vảnh
+ười
+trụt
+trâng
+tói
+thọn
+thờm
+séng
+são
+ruộc
+roẹt
+riệt
+quày
+pớ
+pít
+phún
+phư
+phóp
+phoong
+phoanh
+phếch
+pản
+pằn
+oạp
+nững
+noan
+nhiu
+nhìa
+nguây
+ngỗn
+nghệm
+nghế
+ngật
+nềnh
+mợi
+mõi
+mẹc
+luệ
+lỉ
+khỏ
+khảy
+kẹm
+hươn
+huối
+huận
+hừa
+hữ
+hính
+hện
+ghếch
+ệch
+đưỡng
+đườn
+doái
+dẩn
+cời
+chuôn
+chươi
+chớn
+câng
+cặng
+bứu
+bưa
+xững
+xáp
+vủ
+voòng
+vâm
+trới
+trến
+tọ
+thẽ
+tặn
+sừn
+sôn
+riều
+pung
+phảo
+pát
+oem
+num
+noọc
+nhoong
+ngừm
+nghẻ
+luốt
+lẻng
+khoài
+khếnh
+khề
+khầu
+hẹo
+hẩu
+ẻm
+đươi
+dún
+dỡn
+doãng
+điêng
+đếu
+dão
+dách
+cuộng
+chừn
+choay
+cẳm
+buội
+biếp
+bẻo
+ạnh
+yểng
+xụi
+xục
+xờ
+xĩnh
+xéc
+xẵng
+vều
+uýnh
+uỵch
+uôn
+tuyện
+truột
+trức
+trớt
+trổng
+trịt
+tria
+tọp
+thủm
+thim
+théo
+tháy
+sý
+sáy
+rún
+roóng
+róm
+rến
+rấy
+quyn
+poọc
+phỗng
+phồ
+phặng
+pé
+pầng
+păn
+nuýp
+nò
+nhõn
+nhoạm
+ngoẹo
+nghín
+nghì
+mứ
+mọp
+mềnh
+mam
+lưởng
+luấn
+lỡm
+lóm
+lản
+khôm
+khật
+khằng
+hụp
+hủm
+hén
+hẵn
+gủi
+gách
+ẻn
+đụt
+dủng
+dừm
+dũi
+duấn
+doong
+đễn
+dèn
+dảm
+cươn
+cuổi
+coỏng
+chũi
+chuếnh
+chều
+xỏn
+vọc
+vạnh
+trẹt
+thẻo
+tẽ
+sổm
+sâng
+roòng
+roàng
+quớt
+quộc
+quào
+pưn
+pùa
+phưỡng
+peo
+pành
+pàng
+pàn
+núa
+ngum
+nghển
+ngạ
+muých
+miểu
+lộm
+khòm
+khặn
+hủng
+hoẵng
+hiêm
+hép
+ghị
+duyết
+dữu
+duế
+duần
+đự
+dộp
+đợn
+doạnh
+đèm
+đẩm
+đách
+cới
+choã
+chiễn
+chẹm
+bỉa
+xoàm
+xớ
+xáy
+xản
+xạch
+ưởng
+tụa
+truyệt
+trốt
+trép
+trể
+trằng
+trài
+tìa
+thứu
+thùm
+thóng
+thão
+sộng
+soàn
+rỹ
+rươm
+riệc
+reng
+rêm
+rấn
+quấc
+quác
+poan
+piu
+phùa
+phiềng
+păm
+ờn
+nuôn
+nừng
+nùi
+noọng
+nỏng
+nờ
+niện
+nhụng
+nhóng
+nhể
+nghể
+ngàu
+mụt
+mươn
+muổi
+lỗng
+khức
+khục
+khụ
+khoách
+khảu
+hún
+hứ
+hạy
+hày
+hăn
+guyên
+gôi
+giớ
+giào
+giằm
+gấn
+đướng
+dót
+dộc
+dớ
+diểng
+diềng
+diền
+dều
+dảo
+dạnh
+cươi
+coóc
+chủa
+chíu
+chếu
+chể
+chặm
+cạng
+buốc
+boăn
+âng
+àm
+xượt
+xuốt
+xườn
+xựng
+xuấn
+xợt
+xơm
+xốm
+xòm
+xoèn
+xoanh
+xẹc
+xậy
+xảnh
+uộng
+uôm
+ừa
+tưu
+tứng
+truốt
+trũ
+trợi
+tró
+triểu
+treng
+trẩu
+toòng
+toạn
+toài
+thươn
+thưn
+thơn
+thỏn
+thoạn
+thịu
+thén
+thém
+thặt
+tạu
+tấng
+sụm
+sội
+sịp
+sị
+sí
+sết
+sẻng
+sện
+rỳ
+rược
+rự
+rỡn
+roãn
+rép
+rẹc
+rật
+quố
+quo
+quin
+pọng
+piên
+phươn
+phọ
+phiếp
+phéo
+phâm
+pây
+õn
+ộ
+nhộp
+nhoóng
+nhoằng
+nhịa
+nhạp
+ngựu
+nguồi
+ngủng
+nênh
+nàu
+nẳng
+nẫn
+mớp
+mẳn
+mậm
+luýnh
+lươm
+loắt
+lền
+kíu
+khụm
+khủa
+khừ
+khủ
+khoong
+khỏng
+khộng
+khỏn
+khoàn
+khẳn
+khắm
+khài
+kẹn
+kến
+kếm
+ín
+huýnh
+hưỡng
+huổng
+huếch
+hưa
+họi
+hợ
+hèo
+hánh
+gung
+gữ
+giửa
+gioan
+giỡ
+giạm
+ẹt
+ết
+ển
+ẹc
+dước
+dứng
+dui
+đựa
+dờn
+đởm
+đóc
+đoanh
+đoà
+đớ
+diêng
+đía
+đết
+đặp
+dảnh
+cừa
+coan
+chuôm
+chuổi
+chuẫn
+chơm
+chếm
+bửa
+àng
+àn

dataset/data_generation/confusion_set.py ADDED Viewed

	@@ -0,0 +1,262 @@

+# coding: utf8
+import re
+from normalize import chuan_hoa_dau_tu_tieng_viet
+import numpy as np
+from tqdm import tqdm
+import textdistance
+import json
+from copy import copy
+with open("common-vietnamese-syllables.txt", "r", encoding="utf-8") as file:
+    vi_syllables = [line.strip("\n") for line in file.readlines()]
+vi_syllables_new = []
+for syllable in vi_syllables:
+    normalized = chuan_hoa_dau_tu_tieng_viet(syllable)
+    vi_syllables_new.append(normalized)
+regex_nguyen_am_don = "ộ|ặ|ằ|ụ|ầ|a|ũ|á|ể|ỡ|ủ|y|ở|ế|ẵ|ệ|é|ẹ|â|ề|ê|ọ|ờ|ẳ|ợ|ỷ|ữ|ị|e|u|ò|ẫ|i|ỉ|ẩ|ẽ|õ|ỹ|ô|ỵ|ồ|ú|í|ó|ỗ|ã|ẻ|ù|ă|ơ|ứ|ậ|ử|ừ|à|ĩ|ả|ố|ớ|ự|ắ|o|ý|ỳ|ư|ấ|ễ|ạ|ỏ|ổ|è|ì"
+regex_nguyen_am_doi = "uằ|iê|ấu|ượ|ùy|ạy|uỹ|ươ|ỗi|yệ|ụy|ẫy|oà|ái|ói|uồ|uỷ|oỏ|ệu|ue|oi|ậu|oè|uã|ãi|òi|ơi|ựa|ụi|iể|oá|ìa|ĩu|uẹ|ìu|ầu|ỏe|ối|uẳ|ịa|òe|ai|ọe|yể|ày|ỉu|uỵ|uể|óe|ỉa|ũa|ườ|uè|êu|ẹo|uá|ỏi|uấ|ưỡ|ội|au|iề|ửu|ọi|ảu|uẽ|ầy|ẻo|ao|yế|uẻ|uơ|ưở|iế|uở|ịu|ủa|ẫu|uặ|oằ|oò|ạu|uỳ|ạo|oọ|ưa|oẹ|ui|uậ|ủi|áo|óa|ẩu|ảy|oẵ|áu|ựu|uô|ửa|ễu|uâ|oạ|uổ|uê|ùi|ếu|ời|iu|uo|oé|yễ|oẳ|uớ|ay|iễ|ủy|ướ|oó|eo|ũi|oả|ua|ỏa|ấy|uố|èo|oo|úy|ẩy|ồi|yề|ẽo|uẫ|ứu|ãy|ổi|ía|ảo|ué|uờ|ùa|ia|ều|oa|iệ|àu|õa|oắ|uắ|uả|ứa|ởi|ụa|ũy|òa|íu|éo|oã|uă|uộ|ữu|úa|ải|ỡi|ừu|ểu|oe|õi|ọa|ừa|uệ|uý|uó|ào|uà|ây|oă|uạ|ữa|oặ|uy|ợi|uẩ|uỗ|ão|uế|ưu|ửi|ại|âu|ới|uầ|ĩa|úi|oẻ|ôi|ài|uề|yê|ậy|áy"
+regex_nguyen_am_ba = "uỷu|uây|ươu|iệu|yếu|yểu|uyế|uyệ|uyề|ưỡi|uôi|ượi|uổi|oay|uào|iễu|oeo|oèo|uỗi|oai|uấy|oái|uỵu|uyể|uồi|oáy|yều|oẹo|uẫy|ưởi|iểu|uầy|iêu|uối|uyễ|ưới|iều|oài|uao|ươi|yêu|ười|uya|oải|ướu|uội|oại|iếu|ượu|uẩy|uyê|uậy"
+all_phu_am_dau = {'', 'gh', 'q', 'kh', 'p', 'm', 'qu', 'n', 'b', 'g', 't', 'ch', 'th', 'k', 'đ', 'r', 'ph', 'ngh', 'gi', 'tr', 's', 'l', 'h', 'nh', 'c', 'ng', 'd', 'v', 'x'}
+all_phu_am_cuoi = {'', 'ng', 'nh', 't', 'ch', 'c', 'p', 'm', 'k', 'n'}
+all_nguyen_am_don = "ộ|ặ|ằ|ụ|ầ|a|ũ|á|ể|ỡ|ủ|y|ở|ế|ẵ|ệ|é|ẹ|â|ề|ê|ọ|ờ|ẳ|ợ|ỷ|ữ|ị|e|u|ò|ẫ|i|ỉ|ẩ|ẽ|õ|ỹ|ô|ỵ|ồ|ú|í|ó|ỗ|ã|ẻ|ù|ă|ơ|ứ|ậ|ử|ừ|à|ĩ|ả|ố|ớ|ự|ắ|o|ý|ỳ|ư|ấ|ễ|ạ|ỏ|ổ|è|ì".split("|")
+all_nguyen_am_doi = "uằ|iê|ấu|ượ|ùy|ạy|uỹ|ươ|ỗi|yệ|ụy|ẫy|oà|ái|ói|uồ|uỷ|oỏ|ệu|ue|oi|ậu|oè|uã|ãi|òi|ơi|ựa|ụi|iể|oá|ìa|ĩu|uẹ|ìu|ầu|ỏe|ối|uẳ|ịa|òe|ai|ọe|yể|ày|ỉu|uỵ|uể|óe|ỉa|ũa|ườ|uè|êu|ẹo|uá|ỏi|uấ|ưỡ|ội|au|iề|ửu|ọi|ảu|uẽ|ầy|ẻo|ao|yế|uẻ|uơ|ưở|iế|uở|ịu|ủa|ẫu|uặ|oằ|oò|ạu|uỳ|ạo|oọ|ưa|oẹ|ui|uậ|ủi|áo|óa|ẩu|ảy|oẵ|áu|ựu|uô|ửa|ễu|uâ|oạ|uổ|uê|ùi|ếu|ời|iu|uo|oé|yễ|oẳ|uớ|ay|iễ|ủy|ướ|oó|eo|ũi|oả|ua|ỏa|ấy|uố|èo|oo|úy|ẩy|ồi|yề|ẽo|uẫ|ứu|ãy|ổi|ía|ảo|ué|uờ|ùa|ia|ều|oa|iệ|àu|õa|oắ|uắ|uả|ứa|ởi|ụa|ũy|òa|íu|éo|oã|uă|uộ|ữu|úa|ải|ỡi|ừu|ểu|oe|õi|ọa|ừa|uệ|uý|uó|ào|uà|ây|oă|uạ|ữa|oặ|uy|ợi|uẩ|uỗ|ão|uế|ưu|ửi|ại|âu|ới|uầ|ĩa|úi|oẻ|ôi|ài|uề|yê|ậy|áy".split("|")
+all_nguyen_am_ba = "uỷu|uây|ươu|iệu|yếu|yểu|uyế|uyệ|uyề|ưỡi|uôi|ượi|uổi|oay|uào|iễu|oeo|oèo|uỗi|oai|uấy|oái|uỵu|uyể|uồi|oáy|yều|oẹo|uẫy|ưởi|iểu|uầy|iêu|uối|uyễ|ưới|iều|oài|uao|ươi|yêu|ười|uya|oải|ướu|uội|oại|iếu|ượu|uẩy|uyê|uậy".split("|")
+confusion_set = dict()
+special_list = set()
+for syllable in tqdm(vi_syllables_new):
+    # print(syllable)
+    if syllable[0:2] in ["qu", "gi"]:
+        special_list.add(syllable)
+        # print(f"Ignore {syllable}")
+        continue
+    confusion_set[syllable] = dict()
+    syllable_candidates = confusion_set[syllable]
+    syllable_candidates['phu_am_dau'] = set()
+    syllable_candidates['nguyen_am'] = set()
+    syllable_candidates['phu_am_cuoi'] = set()
+    if len(re.findall(regex_nguyen_am_ba, syllable)) != 0:
+        result = re.findall(regex_nguyen_am_ba, syllable)
+        nguyen_am = result[0]
+    elif len(re.findall(regex_nguyen_am_doi, syllable)) != 0:
+        result = re.findall(regex_nguyen_am_doi, syllable)
+        nguyen_am = result[0]
+    elif len(re.findall(regex_nguyen_am_don, syllable)) != 0:
+        result = re.findall(regex_nguyen_am_don, syllable)
+        nguyen_am = result[0]
+    else:
+        raise Exception("Khong co nguyen am")
+    phu_am_dau, phu_am_cuoi = "", ""
+    if len(re.findall(f"(.+){nguyen_am}", syllable)) !=0 :
+        result = re.findall(f"(.+){nguyen_am}", syllable)
+        phu_am_dau = result[0]
+    if len(re.findall(f"{nguyen_am}(.+)", syllable)) !=0 :
+        result = re.findall(f"{nguyen_am}(.+)", syllable)
+        phu_am_cuoi = result[0]
+    ### Error thay đổi phụ âm đầu
+    for candidate in all_phu_am_dau:
+        if "".join([candidate, nguyen_am, phu_am_cuoi]) in vi_syllables_new:
+            syllable_candidates['phu_am_dau'].add("".join([candidate, nguyen_am, phu_am_cuoi]))
+    ### Error thay đổi nguyên âm
+    all_nguyen_am = all_nguyen_am_don + all_nguyen_am_doi + all_nguyen_am_ba
+    for candidate in all_nguyen_am:
+        if "".join([phu_am_dau, candidate, phu_am_cuoi]) in vi_syllables_new:
+            syllable_candidates['nguyen_am'].add("".join([phu_am_dau, candidate, phu_am_cuoi]))
+    ### Error thay đổi phụ âm cuối
+    for candidate in all_phu_am_cuoi:
+        if "".join([phu_am_dau, nguyen_am, candidate]) in vi_syllables_new:
+            syllable_candidates['phu_am_cuoi'].add("".join([phu_am_dau, nguyen_am, candidate]))
+for syllable in tqdm(special_list):
+    if len(re.findall(regex_nguyen_am_don, syllable)) > 1:
+        phu_am_dau = syllable[0:2]
+        remained = syllable[2:]
+    else:
+        phu_am_dau = syllable[0]
+        remained = syllable[1:]
+    confusion_set[syllable] = dict()
+    syllable_candidates = confusion_set[syllable]
+    syllable_candidates['phu_am_dau'] = set()
+    syllable_candidates['nguyen_am'] = set()
+    syllable_candidates['phu_am_cuoi'] = set()
+    if len(re.findall(regex_nguyen_am_ba, remained)) != 0:
+        result = re.findall(regex_nguyen_am_ba, remained)
+        nguyen_am = result[0]
+    elif len(re.findall(regex_nguyen_am_doi, remained)) != 0:
+        result = re.findall(regex_nguyen_am_doi, remained)
+        nguyen_am = result[0]
+    elif len(re.findall(regex_nguyen_am_don, remained)) != 0:
+        result = re.findall(regex_nguyen_am_don, remained)
+        nguyen_am = result[0]
+    else:
+        nguyen_am, phu_am_cuoi = "", ""
+    phu_am_cuoi = ""
+    if nguyen_am != "" and len(re.findall(f"{nguyen_am}(.+)", remained)) !=0 :
+        result = re.findall(f"{nguyen_am}(.+)", remained)
+        phu_am_cuoi = result[0]
+    ### Error thay đổi phụ âm đầu
+    for candidate in all_phu_am_dau:
+        if "".join([candidate, nguyen_am, phu_am_cuoi]) in vi_syllables_new:
+            syllable_candidates['phu_am_dau'].add("".join([candidate, nguyen_am, phu_am_cuoi]))
+    ### Error thay đổi nguyên âm
+    all_nguyen_am = all_nguyen_am_don + all_nguyen_am_doi + all_nguyen_am_ba
+    for candidate in all_nguyen_am:
+        if "".join([phu_am_dau, candidate, phu_am_cuoi]) in vi_syllables_new:
+            syllable_candidates['nguyen_am'].add("".join([phu_am_dau, candidate, phu_am_cuoi]))
+    ### Error thay đổi phụ âm cuối
+    for candidate in all_phu_am_cuoi:
+        if "".join([phu_am_dau, nguyen_am, candidate]) in vi_syllables_new:
+            syllable_candidates['phu_am_cuoi'].add("".join([phu_am_dau, nguyen_am, candidate]))
+for key in tqdm(confusion_set.keys()):
+    for key_2_level in confusion_set[key].keys():
+        try:
+            confusion_set[key][key_2_level].remove(key)
+        except:
+            pass
+for key in tqdm(confusion_set.keys()):
+    for key_2_level in confusion_set[key].keys():
+        candidates_to_remove = []
+        for candidate in confusion_set[key][key_2_level]:
+            similarity = textdistance.damerau_levenshtein.normalized_similarity(key, candidate)
+            if similarity < 0.5:
+                candidates_to_remove.append(candidate)
+        for candidate in candidates_to_remove:
+            confusion_set[key][key_2_level].remove(candidate)
+# For every Vietnamese vowel form (bare or carrying a tone mark) this table
+# lists the five other forms of the same base vowel, i.e. the variants that
+# differ only by tone mark. Despite the name, the values are tone variants,
+# not physically adjacent keys.
+keyboard_neighbor = {'a': 'áàảãạ',
+ 'ă': 'ắằẳẵặ',
+ 'â': 'ấầẩẫậ',
+ 'á': 'aàảãạ',
+ 'à': 'aáảãạ',
+ 'ả': 'aáàãạ',
+ 'ã': 'aáàảạ',
+ 'ạ': 'aáàảã',
+ 'ắ': 'ăằẳẵặ',
+ 'ằ': 'ăắẳẵặ',
+ 'ẳ': 'ăắằẵặ',
+ 'ặ': 'ăắằẳẵ',
+ 'ẵ': 'ăắằẳặ',
+ 'ấ': 'âầẩẫậ',
+ 'ầ': 'âấẩẫậ',
+ 'ẩ': 'âấầẫậ',
+ 'ẫ': 'âấầẩậ',
+ 'ậ': 'âấầẩẫ',
+ 'e': 'èéẻẽẹ',
+ 'é': 'eèẻẽẹ',
+ 'è': 'eéẻẽẹ',
+ 'ẻ': 'eéèẽẹ',
+ 'ẽ': 'eéèẻẹ',
+ 'ẹ': 'eéèẻẽ',
+ 'ê': 'ếềểễệ',
+ 'ế': 'êềểễệ',
+ 'ề': 'êếểễệ',
+ 'ể': 'êếềễệ',
+ 'ễ': 'êếềểệ',
+ 'ệ': 'êếềểễ',
+ 'i': 'íìỉĩị',
+ 'í': 'iìỉĩị',
+ 'ì': 'iíỉĩị',
+ 'ỉ': 'iíìĩị',
+ 'ĩ': 'iíìỉị',
+ 'ị': 'iíìỉĩ',
+ 'o': 'òóỏọõ',
+ 'ó': 'oòỏọõ',
+ 'ò': 'oóỏọõ',
+ 'ỏ': 'oóòọõ',
+ 'õ': 'oóòỏọ',
+ 'ọ': 'oóòỏõ',
+ 'ô': 'ốồổỗộ',
+ 'ố': 'ôồổỗộ',
+ 'ồ': 'ôốổỗộ',
+ 'ổ': 'ôốồỗộ',
+ 'ộ': 'ôốồổỗ',
+ 'ỗ': 'ôốồổộ',
+ 'ơ': 'ớờởợỡ',
+ 'ớ': 'ơờởợỡ',
+ 'ờ': 'ơớởợỡ',
+ 'ở': 'ơớờợỡ',
+ 'ợ': 'ơớờởỡ',
+ 'ỡ': 'ơớờởợ',
+ 'u': 'úùủũụ',
+ 'ú': 'uùủũụ',
+ 'ù': 'uúủũụ',
+ 'ủ': 'uúùũụ',
+ 'ũ': 'uúùủụ',
+ 'ụ': 'uúùủũ',
+ 'ư': 'ứừữửự',
+ 'ứ': 'ưừữửự',
+ 'ừ': 'ưứữửự',
+ 'ử': 'ưứừữự',
+ 'ữ': 'ưứừửự',
+ 'ự': 'ưứừữử',
+ 'y': 'ýỳỷỵỹ',
+ 'ý': 'yỳỷỵỹ',
+ 'ỳ': 'yýỷỵỹ',
+ 'ỷ': 'yýỳỵỹ',
+ 'ỵ': 'yýỳỷỹ',
+ 'ỹ': 'yýỳỷỵ'}
+# Regex with one capturing group that matches any single character which is a
+# key of keyboard_neighbor.
+pattern = "(" + "|".join(keyboard_neighbor.keys()) + "){1}"
+def make_accent_change_candidates(text):
+    result = re.findall(pattern, text)
+    candidates =  []
+    for candidate in result:
+        [candidates.append(text.replace(candidate, x)) for x in keyboard_neighbor[candidate]]
+    return set(candidates)
+typo = json.load(open("../noising_resources/typo.json", "r", encoding="utf-8"))
+typo_pattern = "(" + "|".join(typo.keys()) + "){1}"
+accent_pattern = "(s|f|r|x|j|1|2|3|4|5){1}"
+def convert_to_non_telex(text):
+    word = copy(text)
+    candidates = re.findall(typo_pattern, text)
+    for candidate in candidates:
+        replaced = typo[candidate][0]
+            # Move accent to the end of text
+        if len(re.findall(accent_pattern, replaced)) != 0:
+            word = re.sub(candidate, replaced[0:-1], word)
+            word += replaced[-1]
+        else:
+            word = re.sub(candidate, replaced, word)
+    return word
+def keep_1_distance_candidates(text, nguyen_am_errors : set):
+    nguyen_am_errors = list(nguyen_am_errors)
+    text = convert_to_non_telex(text)
+    distances = [textdistance.damerau_levenshtein(text, convert_to_non_telex(error)) for error in nguyen_am_errors]
+    indies_to_keep = np.where(np.array(distances) <= 1)[0]
+    return set([nguyen_am_errors[i] for i in indies_to_keep])
+for key in tqdm(confusion_set.keys()):
+    candidates = make_accent_change_candidates(key)
+    one_distance_candidates = keep_1_distance_candidates(key, confusion_set[key]['nguyen_am'])
+    candidates = candidates.union(one_distance_candidates)
+    high_probs_list = candidates.intersection(confusion_set[key]['nguyen_am'])
+    lower_probs_list = confusion_set[key]['nguyen_am'].difference(high_probs_list)
+    confusion_set[key]['nguyen_am'] = [high_probs_list, lower_probs_list]
+for key in tqdm(confusion_set.keys()):
+    confusion_set[key]['nguyen_am'] = [list(confusion_set[key]['nguyen_am'][0]), list(confusion_set[key]['nguyen_am'][1])]
+    confusion_set[key]['phu_am_dau'] = list(confusion_set[key]['phu_am_dau'])
+    confusion_set[key]['phu_am_cuoi'] = list(confusion_set[key]['phu_am_cuoi'])
+with open("../noising_resources/confusion_set.json", "w+", encoding="utf-8") as outfile:
+    print(confusion_set, file = outfile)

dataset/data_generation/keyboard_neighbor.py ADDED Viewed

	@@ -0,0 +1,79 @@

+def getKeyboardNeighbors():
+            """Build the weighted vowel-neighbour table.
+
+            Returns a dict keyed by Vietnamese vowel form. Each value is a
+            two-item list ``[groups, weights]``: ``groups`` is a list of
+            strings (the "|"-separated groups below, split apart), each string
+            being a run of related vowel forms, and ``weights`` is a list of
+            floats with one weight per group, in the same order. The weights of
+            an entry sum to 1 (presumably sampling probabilities -- TODO
+            confirm against the caller).
+            """
+            keyboardNeighbors = {}
+            keyboardNeighbors['a'] = ["ắằẳẵặă|âấầẩẫậ|áàảãạ", [0.15, 0.15, 0.7]]
+            keyboardNeighbors['ă'] = ["ắằẳẵặ|âấầẩẫậ|aáàảãạ", [0.7, 0.15, 0.15]]
+            keyboardNeighbors['â'] = ["ấầẩẫậ|aáàảãạ|ăắằẳẵặ", [0.7, 0.15, 0.15]]
+            keyboardNeighbors['á'] = ["aàảãạ|ăắằẳẵặ|âấầẩẫậ", [0.7, 0.15, 0.15]]
+            keyboardNeighbors['à'] = ["aáảãạ|ăắằẳẵặ|âấầẩẫậ", [0.7, 0.15, 0.15]]
+            keyboardNeighbors['ả'] = ["aáàãạ|ăắằẳẵặ|âấầẩẫậ", [0.7, 0.15, 0.15]]
+            keyboardNeighbors['ã'] = ["aáàảạ|ăắằẳẵặ|âấầẩẫậ", [0.7, 0.15, 0.15]]
+            keyboardNeighbors['ạ'] = ["aáàảã|ăắằẳẵặ|âấầẩẫậ", [0.7, 0.15, 0.15]]
+            keyboardNeighbors['ắ'] = ["ăằẳẵặ|aáàảãạ|âấầẩẫậ", [0.7, 0.15, 0.15]]
+            keyboardNeighbors['ằ'] = ["ăắẳẵặ|aáàảãạ|âấầẩẫậ", [0.7, 0.15, 0.15]]
+            keyboardNeighbors['ẳ'] = ["ăắằẵặ|aáàảãạ|âấầẩẫậ", [0.7, 0.15, 0.15]]
+            keyboardNeighbors['ặ'] = ["ăắằẳẵ|aáàảãạ|âấầẩẫậ", [0.7, 0.15, 0.15]]
+            keyboardNeighbors['ẵ'] = ["ăắằẳặ|aáàảãạ|âấầẩẫậ", [0.7, 0.15, 0.15]]
+            keyboardNeighbors['ấ'] = ["âầẩẫậ|aáàảãạ|ăắằẳẵặ", [0.7, 0.15, 0.15]]
+            keyboardNeighbors['ầ'] = ["âấẩẫậ|aáàảãạ|ăắằẳẵặ", [0.7, 0.15, 0.15]]
+            keyboardNeighbors['ẩ'] = ["âấầẫậ|aáàảãạ|ăắằẳẵặ", [0.7, 0.15, 0.15]]
+            keyboardNeighbors['ẫ'] = ["âấầẩậ|aáàảãạ|ăắằẳẵặ", [0.7, 0.15, 0.15]]
+            keyboardNeighbors['ậ'] = ["âấầẩẫ|aáàảãạ|ăắằẳẵặ", [0.7, 0.15, 0.15]]
+            keyboardNeighbors['e'] = ["èéẻẽẹ|êếềểễệ", [0.7, 0.3]]
+            keyboardNeighbors['é'] = ["eèẻẽẹ|êếềểễệ", [0.7, 0.3]]
+            keyboardNeighbors['è'] = ["eéẻẽẹ|êếềểễệ", [0.7, 0.3]]
+            keyboardNeighbors['ẻ'] = ["eéèẽẹ|êếềểễệ", [0.7, 0.3]]
+            keyboardNeighbors['ẽ'] = ["eéèẻẹ|êếềểễệ", [0.7, 0.3]]
+            keyboardNeighbors['ẹ'] = ["eéèẻẽ|êếềểễệ", [0.7, 0.3]]
+            keyboardNeighbors['ê'] = ["eéèẻẽẹ|ếềểễệ", [0.3, 0.7]]
+            keyboardNeighbors['ế'] = ["eéèẻẽẹ|êềểễệ", [0.3, 0.7]]
+            keyboardNeighbors['ề'] = ["eéèẻẽẹ|êếểễệ", [0.3, 0.7]]
+            keyboardNeighbors['ể'] = ["eéèẻẽẹ|êếềễệ", [0.3, 0.7]]
+            keyboardNeighbors['ễ'] = ["eéèẻẽẹ|êếềểệ", [0.3, 0.7]]
+            keyboardNeighbors['ệ'] = ["eéèẻẽẹ|êếềểễ", [0.3, 0.7]]
+            keyboardNeighbors['i'] = ["íìỉĩị|ýỳỷỹỵy", [0.7, 0.3]]
+            keyboardNeighbors['í'] = ["iìỉĩị|ýỳỷỹỵy", [0.7, 0.3]]
+            keyboardNeighbors['ì'] = ["iíỉĩị|ýỳỷỹỵy", [0.7, 0.3]]
+            keyboardNeighbors['ỉ'] = ["iíìĩị|ýỳỷỹỵy", [0.7, 0.3]]
+            keyboardNeighbors['ĩ'] = ["iíìỉị|ýỳỷỹỵy", [0.7, 0.3]]
+            keyboardNeighbors['ị'] = ["iíìỉĩ|ýỳỷỹỵy", [0.7, 0.3]]
+            keyboardNeighbors['o'] = ["òóỏọõ|ôốồổỗộ|ơớờởợỡ", [0.7, 0.15, 0.15]]
+            keyboardNeighbors['ó'] = ["oòỏọõ|ôốồổỗộ|ơớờởợỡ", [0.7, 0.15, 0.15]]
+            keyboardNeighbors['ò'] = ["oóỏọõ|ôốồổỗộ|ơớờởợỡ", [0.7, 0.15, 0.15]]
+            keyboardNeighbors['ỏ'] = ["oóòọõ|ôốồổỗộ|ơớờởợỡ", [0.7, 0.15, 0.15]]
+            keyboardNeighbors['õ'] = ["oóòỏọ|ôốồổỗộ|ơớờởợỡ", [0.7, 0.15, 0.15]]
+            keyboardNeighbors['ọ'] = ["oóòỏõ|ôốồổỗộ|ơớờởợỡ", [0.7, 0.15, 0.15]]
+            keyboardNeighbors['ô'] = ["oóòỏọõ|ốồổỗộ|ơớờởợỡ", [0.15, 0.7, 0.15]]
+            keyboardNeighbors['ố'] = ["oóòỏọõ|ôồổỗộ|ơớờởợỡ", [0.15, 0.7, 0.15]]
+            keyboardNeighbors['ồ'] = ["oóòỏọõ|ôốổỗộ|ơớờởợỡ", [0.15, 0.7, 0.15]]
+            keyboardNeighbors['ổ'] = ["oóòỏọõ|ôốồỗộ|ơớờởợỡ", [0.15, 0.7, 0.15]]
+            keyboardNeighbors['ộ'] = ["oóòỏọõ|ôốồổỗ|ơớờởợỡ", [0.15, 0.7, 0.15]]
+            keyboardNeighbors['ỗ'] = ["oóòỏọõ|ôốồổộ|ơớờởợỡ", [0.15, 0.7, 0.15]]
+            keyboardNeighbors['ơ'] = ["oóòỏọõ|ôốồổỗộ|ớờởợỡ", [0.15, 0.15, 0.7]]
+            keyboardNeighbors['ớ'] = ["oóòỏọõ|ôốồổỗộ|ơờởợỡ", [0.15, 0.15, 0.7]]
+            keyboardNeighbors['ờ'] = ["oóòỏọõ|ôốồổỗộ|ơớởợỡ", [0.15, 0.15, 0.7]]
+            keyboardNeighbors['ở'] = ["oóòỏọõ|ôốồổỗộ|ơớờợỡ", [0.15, 0.15, 0.7]]
+            keyboardNeighbors['ợ'] = ["oóòỏọõ|ôốồổỗộ|ơớờởỡ", [0.15, 0.15, 0.7]]
+            keyboardNeighbors['ỡ'] = ["oóòỏọõ|ôốồổỗộ|ơớờởợ", [0.15, 0.15, 0.7]]
+            keyboardNeighbors['u'] = ["úùủũụ|ưứừữửự", [0.7, 0.3]]
+            keyboardNeighbors['ú'] = ["uùủũụ|ưứừữửự", [0.7, 0.3]]
+            keyboardNeighbors['ù'] = ["uúủũụ|ưứừữửự", [0.7, 0.3]]
+            keyboardNeighbors['ủ'] = ["uúùũụ|ưứừữửự", [0.7, 0.3]]
+            keyboardNeighbors['ũ'] = ["uúùủụ|ưứừữửự", [0.7, 0.3]]
+            keyboardNeighbors['ụ'] = ["uúùủũ|ưứừữửự", [0.7, 0.3]]
+            keyboardNeighbors['ư'] = ["uúùủũụ|ứừữửự", [0.3, 0.7]]
+            keyboardNeighbors['ứ'] = ["uúùủũụ|ưừữửự", [0.3, 0.7]]
+            keyboardNeighbors['ừ'] = ["uúùủũụ|ưứữửự", [0.3, 0.7]]
+            keyboardNeighbors['ử'] = ["uúùủũụ|ưứừữự", [0.3, 0.7]]
+            keyboardNeighbors['ữ'] = ["uúùủũụ|ưứừửự", [0.3, 0.7]]
+            keyboardNeighbors['ự'] = ["uúùủũụ|ưứừữử", [0.3, 0.7]]
+            keyboardNeighbors['y'] = ["ýỳỷỵỹ|iíìỉĩị", [0.7, 0.3]]
+            keyboardNeighbors['ý'] = ["yỳỷỵỹ|iíìỉĩị", [0.7, 0.3]]
+            keyboardNeighbors['ỳ'] = ["yýỷỵỹ|iíìỉĩị", [0.7, 0.3]]
+            keyboardNeighbors['ỷ'] = ["yýỳỵỹ|iíìỉĩị", [0.7, 0.3]]
+            keyboardNeighbors['ỵ'] = ["yýỳỷỹ|iíìỉĩị", [0.7, 0.3]]
+            keyboardNeighbors['ỹ'] = ["yýỳỷỵ|iíìỉĩị", [0.7, 0.3]]
+            # Split each "|"-separated group string into a list; the weights stay as they are.
+            for key in keyboardNeighbors.keys():
+                keyboardNeighbors[key] = [keyboardNeighbors[key][0].split("|"), keyboardNeighbors[key][1]]
+            return keyboardNeighbors

dataset/data_generation/normalize.py ADDED Viewed

	@@ -0,0 +1,183 @@

+"""
+  Copyright @ nguyenvanhieu.vn
+  This Python code does not preserve lower/upper case.
+  To be updated when time permits.
+ """
+import re
+# Accented Vietnamese characters (lower case, then upper case) and, in
+# unsignChars, their unaccented ASCII counterparts.
+# NOTE(review): the upper-case "I" run in unsignChars looks shorter than the
+# five accented forms it should pair with in uniChars -- verify that the two
+# strings are index-aligned before relying on them.
+uniChars = "àáảãạâầấẩẫậăằắẳẵặèéẻẽẹêềếểễệđìíỉĩịòóỏõọôồốổỗộơờớởỡợùúủũụưừứửữựỳýỷỹỵÀÁẢÃẠÂẦẤẨẪẬĂẰẮẲẴẶÈÉẺẼẸÊỀẾỂỄỆĐÌÍỈĨỊÒÓỎÕỌÔỒỐỔỖỘƠỜỚỞỠỢÙÚỦŨỤƯỪỨỬỮỰỲÝỶỸỴÂĂĐÔƠƯ"
+unsignChars = "aaaaaaaaaaaaaaaaaeeeeeeeeeeediiiiiooooooooooooooooouuuuuuuuuuuyyyyyAAAAAAAAAAAAAAAAAEEEEEEEEEEEDIIIOOOOOOOOOOOOOOOOOOOUUUUUUUUUUUYYYYYAADOOU"
+def loaddicchar():
+    """Build the character map used by ``convertwindown1525toutf8``.
+
+    Pairs the i-th entry of ``char1252`` with the i-th entry of ``charutf8``.
+    NOTE(review): as displayed the two lists look identical; presumably
+    ``char1252`` holds decomposed (base letter + combining mark) forms and
+    ``charutf8`` the precomposed ones -- confirm, and do not re-type or
+    normalize these literals.
+
+    :return: dict mapping each ``char1252`` entry to its ``charutf8`` entry.
+    """
+    dic = {}
+    char1252 = 'à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ'.split(
+        '|')
+    charutf8 = "à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ".split(
+        '|')
+    # The two lists are parallel: same length, same order.
+    for i in range(len(char1252)):
+        dic[char1252[i]] = charutf8[i]
+    return dic
+dicchar = loaddicchar()
+def convertwindown1525toutf8(txt):
+    """Replace every match of the alternation below by its ``dicchar`` entry.
+
+    The regex alternatives are expected to be exactly the keys of ``dicchar``;
+    a match that is not a key would raise KeyError in the lambda.
+    NOTE(review): presumably the alternatives are decomposed character forms
+    being mapped to precomposed UTF-8 -- confirm; do not re-type the literal.
+
+    :param txt: input string.
+    :return: ``txt`` with every matched sequence substituted.
+    """
+    return re.sub(
+        r'à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ',
+        lambda x: dicchar[x.group()], txt)
+"""
+    Start section: Convert a sentence to the Telex keystrokes typed when Unikey is not enabled
+    Example: thủy = thuyr, tượng = tuwowngj
+"""
+# Vowel table: one row per base vowel. Columns 0-5 hold the vowel without a
+# tone mark followed by its five tone-marked forms; the last column holds the
+# Telex keystrokes for the bare vowel.
+bang_nguyen_am = [['a', 'à', 'á', 'ả', 'ã', 'ạ', 'a'],
+                  ['ă', 'ằ', 'ắ', 'ẳ', 'ẵ', 'ặ', 'aw'],
+                  ['â', 'ầ', 'ấ', 'ẩ', 'ẫ', 'ậ', 'aa'],
+                  ['e', 'è', 'é', 'ẻ', 'ẽ', 'ẹ', 'e'],
+                  ['ê', 'ề', 'ế', 'ể', 'ễ', 'ệ', 'ee'],
+                  ['i', 'ì', 'í', 'ỉ', 'ĩ', 'ị', 'i'],
+                  ['o', 'ò', 'ó', 'ỏ', 'õ', 'ọ', 'o'],
+                  ['ô', 'ồ', 'ố', 'ổ', 'ỗ', 'ộ', 'oo'],
+                  ['ơ', 'ờ', 'ớ', 'ở', 'ỡ', 'ợ', 'ow'],
+                  ['u', 'ù', 'ú', 'ủ', 'ũ', 'ụ', 'u'],
+                  ['ư', 'ừ', 'ứ', 'ử', 'ữ', 'ự', 'uw'],
+                  ['y', 'ỳ', 'ý', 'ỷ', 'ỹ', 'ỵ', 'y']]
+# Telex tone keys, indexed by the tone column of bang_nguyen_am (index 0 = no tone).
+bang_ky_tu_dau = ['', 'f', 's', 'r', 'x', 'j']
+# Reverse lookup: vowel character -> (row, column) in bang_nguyen_am.
+nguyen_am_to_ids = {}
+for i in range(len(bang_nguyen_am)):
+    # The last column (Telex keystrokes) is skipped: it is not a vowel form.
+    for j in range(len(bang_nguyen_am[i]) - 1):
+        nguyen_am_to_ids[bang_nguyen_am[i][j]] = (i, j)
+def vn_word_to_telex_type(word):
+    dau_cau = 0
+    new_word = ''
+    for char in word:
+        x, y = nguyen_am_to_ids.get(char, (-1, -1))
+        if x == -1:
+            new_word += char
+            continue
+        if y != 0:
+            dau_cau = y
+        new_word += bang_nguyen_am[x][-1]
+    new_word += bang_ky_tu_dau[dau_cau]
+    return new_word
+def vn_sentence_to_telex_type(sentence):
+    """
+    Chuyển câu tiếng việt có dấu về kiểu gõ telex.
+    :param sentence:
+    :return:
+    """
+    words = sentence.split()
+    for index, word in enumerate(words):
+        words[index] = vn_word_to_telex_type(word)
+    return ' '.join(words)
+"""
+    Start section: Normalize a sentence to the old-style tone-mark placement: use òa, úy instead of oà, uý
+    See: https://vi.wikipedia.org/wiki/Quy_t%E1%BA%AFc_%C4%91%E1%BA%B7t_d%E1%BA%A5u_thanh_trong_ch%E1%BB%AF_qu%E1%BB%91c_ng%E1%BB%AF
+"""
+def chuan_hoa_dau_tu_tieng_viet(word):
+    """Move the tone mark of one word to its normalized position.
+
+    The word is returned unchanged when ``is_valid_vietnam_word`` rejects it,
+    or when it has fewer than two counted vowels and no "qu"/"gi" prefix.
+    Otherwise the tone mark found in the word is stripped from every vowel and
+    re-applied to a single vowel chosen by the rules commented below.
+
+    :param word: a single word (callers in this file pass lower-cased words).
+    :return: the word with the tone mark repositioned.
+    """
+    if not is_valid_vietnam_word(word):
+        return word
+    chars = list(word)
+    dau_cau = 0  # tone column of the last tone-marked vowel seen (0 = no tone)
+    nguyen_am_index = []  # positions of the vowels that may carry the tone
+    qu_or_gi = False
+    for index, char in enumerate(chars):
+        x, y = nguyen_am_to_ids.get(char, (-1, -1))
+        if x == -1:
+            continue
+        elif x == 9:  # check qu
+            if index != 0 and chars[index - 1] == 'q':
+                chars[index] = 'u'
+                qu_or_gi = True
+        elif x == 5:  # check gi
+            if index != 0 and chars[index - 1] == 'g':
+                chars[index] = 'i'
+                qu_or_gi = True
+        if y != 0:
+            # Remember the tone and strip it from this vowel.
+            dau_cau = y
+            chars[index] = bang_nguyen_am[x][0]
+        # The vowel at position 1 is not counted once a qu/gi prefix was seen.
+        if not qu_or_gi or index != 1:
+            nguyen_am_index.append(index)
+    if len(nguyen_am_index) < 2:
+        if qu_or_gi:
+            if len(chars) == 2:
+                # The word is just "qu"/"gi": the tone goes on the second letter.
+                x, y = nguyen_am_to_ids.get(chars[1])
+                chars[1] = bang_nguyen_am[x][dau_cau]
+            else:
+                # Tone goes on the third character if it is a vowel,
+                # otherwise back on the i/u of the prefix.
+                x, y = nguyen_am_to_ids.get(chars[2], (-1, -1))
+                if x != -1:
+                    chars[2] = bang_nguyen_am[x][dau_cau]
+                else:
+                    chars[1] = bang_nguyen_am[5][dau_cau] if chars[1] == 'i' else bang_nguyen_am[9][dau_cau]
+            return ''.join(chars)
+        return word
+    # A vowel from row 4 or row 8 of bang_nguyen_am (ê, ơ) always takes the tone.
+    for index in nguyen_am_index:
+        x, y = nguyen_am_to_ids[chars[index]]
+        if x == 4 or x == 8:  # ê, ơ
+            chars[index] = bang_nguyen_am[x][dau_cau]
+            # for index2 in nguyen_am_index:
+            #     if index2 != index:
+            #         x, y = nguyen_am_to_ids[chars[index]]
+            #         chars[index2] = bang_nguyen_am[x][0]
+            return ''.join(chars)
+    if len(nguyen_am_index) == 2:
+        if nguyen_am_index[-1] == len(chars) - 1:
+            # Two vowels ending the word: tone on the first one.
+            x, y = nguyen_am_to_ids[chars[nguyen_am_index[0]]]
+            chars[nguyen_am_index[0]] = bang_nguyen_am[x][dau_cau]
+            # x, y = nguyen_am_to_ids[chars[nguyen_am_index[1]]]
+            # chars[nguyen_am_index[1]] = bang_nguyen_am[x][0]
+        else:
+            # Two vowels followed by more characters: tone on the second one.
+            # x, y = nguyen_am_to_ids[chars[nguyen_am_index[0]]]
+            # chars[nguyen_am_index[0]] = bang_nguyen_am[x][0]
+            x, y = nguyen_am_to_ids[chars[nguyen_am_index[1]]]
+            chars[nguyen_am_index[1]] = bang_nguyen_am[x][dau_cau]
+    else:
+        # Three or more vowels: tone on the second one.
+        # x, y = nguyen_am_to_ids[chars[nguyen_am_index[0]]]
+        # chars[nguyen_am_index[0]] = bang_nguyen_am[x][0]
+        x, y = nguyen_am_to_ids[chars[nguyen_am_index[1]]]
+        chars[nguyen_am_index[1]] = bang_nguyen_am[x][dau_cau]
+        # x, y = nguyen_am_to_ids[chars[nguyen_am_index[2]]]
+        # chars[nguyen_am_index[2]] = bang_nguyen_am[x][0]
+    return ''.join(chars)
+def is_valid_vietnam_word(word):
+    chars = list(word)
+    nguyen_am_index = -1
+    for index, char in enumerate(chars):
+        x, y = nguyen_am_to_ids.get(char, (-1, -1))
+        if x != -1:
+            if nguyen_am_index == -1:
+                nguyen_am_index = index
+            else:
+                if index - nguyen_am_index != 1:
+                    return False
+                nguyen_am_index = index
+    return True
+def chuan_hoa_dau_cau_tieng_viet(sentence):
+    """
+        Chuyển câu tiếng việt về chuẩn gõ dấu kiểu cũ.
+        :param sentence:
+        :return:
+        """
+    sentence = sentence.lower()
+    words = sentence.split()
+    for index, word in enumerate(words):
+        words[index] = chuan_hoa_dau_tu_tieng_viet(word)
+    return ' '.join(words)

dataset/data_generation/typing_error_gen.py ADDED Viewed

	@@ -0,0 +1,13 @@

+import re
+from normalize import chuan_hoa_dau_tu_tieng_viet
+import numpy as np
+with open("common-vietnamese-syllables.txt", "r") as file:
+    vi_syllables = [line.strip("\n") for line in file.readlines()]
+file = open("../../dataset/noising_resources/kieu_go_dau_cu_moi.txt", "w+")
+for syllable in vi_syllables:
+    normalized = chuan_hoa_dau_tu_tieng_viet(syllable)
+    if normalized != syllable:
+        print(normalized, syllable, file = file)
+file.close()

dataset/log/prepare_data.log ADDED Viewed

File without changes

dataset/noise.py ADDED Viewed

	@@ -0,0 +1,655 @@

+import string
+from nltk.tokenize import word_tokenize
+import numpy as np
+import re
+import unidecode
+import nltk
+import json
+import os
+real_file_path = "/".join(os.path.realpath(__file__).split("/")[:-1])
+nltk.download('punkt')
+from dataset.vocab import Vocab
+from ast import literal_eval
+class SynthesizeData(object):
+    """
+    Utils class to create artificial misspelled words
+    Args:
+        vocab: Vocab instance (the constructor takes the vocabulary object itself, not a path).
+        The underlying vocab file is expected to be a set of words, separated by ' ', with no newline character.
+    """
+    def __init__(self, vocab: Vocab):
+        self.vocab = vocab
+        self.tokenizer = word_tokenize
+        self.vn_alphabet = ['a', 'ă', 'â', 'b', 'c', 'd', 'đ', 'e', 'ê', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'ô',
+                            'ơ', 'p', 'q', 'r', 's', 't', 'u', 'ư', 'v', 'x', 'y']
+        self.alphabet_len = len(self.vn_alphabet)
+        self.word_couples = [pair.strip("\n").split(" ") for pair in open(os.path.join(real_file_path, "noising_resources/kieu_go_dau_cu_moi.txt"), "r", encoding='utf-8').readlines()]
+        self.homowords = literal_eval(open( os.path.join(real_file_path, "noising_resources/confusion_set.json"), "r", encoding='utf-8').read())
+        self.homo_leters_dict = literal_eval(open( os.path.join(real_file_path, "noising_resources/homo_leter.json"), "r", encoding='utf-8').read())
+        self.teencode_dict = {'mình': ['mk', 'mik', 'mjk'], 'vô': ['zô', 'zo', 'vo'], 'vậy': ['zậy', 'z', 'zay', 'za'],
+                              'phải': ['fải', 'fai', ], 'biết': ['bit', 'biet'],
+                              'rồi': ['rùi', 'ròi', 'r'], 'bây': ['bi', 'bay'], 'giờ': ['h', ],
+                              'không': ['k', 'ko', 'khong', 'hk', 'hong', 'hông', '0', 'kg', 'kh', ],
+                              'đi': ['di', 'dj', ], 'gì': ['j', ], 'em': ['e', ], 'được': ['dc', 'đc', ], 'tao': ['t'],
+                              'tôi': ['t'], 'chồng': ['ck'], 'vợ': ['vk']
+                              }
+        self.typo = json.load( open(os.path.join(real_file_path,"noising_resources/typo.json"), "r", encoding='utf-8'))
+        self.all_char_candidates = self.get_all_char_candidates()
+        self.all_word_candidates = self.get_all_word_candidates()
+    def replace_teencode(self, word):
+        candidates = self.teencode_dict.get(word, None)
+        if candidates is not None:
+            chosen_one = 0
+            if len(candidates) > 1:
+                chosen_one = np.random.randint(0, len(candidates))
+            return candidates[chosen_one]
+    def replace_char_candidate(self, char):
+        """
+        return a homophone char/subword of the input char.
+        """
+        return np.random.choice(self.homo_leters_dict[char])
+    def replace_word_candidate(self, word):
+        """
+        Return a new typo word of the input word for example òa oà
+        """
+        capital_flag = word[0].isupper()
+        word = word.lower()
+        if capital_flag and word in self.teencode_dict:
+            return self.replace_teencode(word).capitalize()
+        elif word in self.teencode_dict:
+            return self.replace_teencode(word)
+        for couple in self.word_couples:
+            for i in range(2):
+                if couple[i] == word:
+                    if i == 0:
+                        if capital_flag:
+                            return couple[1].capitalize()
+                        else:
+                            return couple[1]
+                    else:
+                        if capital_flag:
+                            return couple[0].capitalize()
+                        else:
+                            return couple[0]
+    def replace_homo_candidate(self, word):
+        """
+        Return a homo word of the input word
+        """
+        capital_flag = word[0].isupper()
+        word = word.lower()
+        def random_capitalize(word):
+            index = np.random.randint(0, len(word))
+            return word[0:index] + word[index].upper() + word[index+1:]
+        candidate_type = np.random.choice(["phu_am_dau", "phu_am_cuoi", "nguyen_am"]\
+            , p = [0.1, 0.3, 0.6])
+        if candidate_type == "nguyen_am":
+            coin = np.random.choice([0, 1], p = [0.7, 0.3])
+            candidates = list(self.homowords[word][candidate_type][coin])
+        else:
+            candidates = list(self.homowords[word][candidate_type])
+        if len(candidates) == 0:
+            if capital_flag:
+                return word
+            return random_capitalize(word)
+        candidate = np.random.choice(candidates)
+        if capital_flag:
+            return candidate.capitalize()
+        return candidate
+    def replace_char_candidate_typo(self, char):
+        """
+        return a homophone char/subword of the input char.
+        """
+        candidates = self.typo[char]
+        num_lower_priority = len(candidates) - 1
+        round_flag = 10 * num_lower_priority
+        return np.random.choice(candidates, p = [0.7, *[3 / round_flag for i in range(num_lower_priority)]])
+    def get_all_char_candidates(self):
+        return list(self.homo_leters_dict.keys())
+    def get_all_word_candidates(self):
+        all_word_candidates = []
+        for couple in self.word_couples:
+            all_word_candidates.extend(couple)
+        return all_word_candidates
+    def remove_diacritics(self, text, onehot_label):
+        """
+        Replace one randomly chosen word that has diacritics with the same word
+        without diacritics (via unidecode). Both arguments are modified in place.
+        Args:
+            text: a list of word tokens
+            onehot_label: onehot array indicating the positions of words that
+            were already modified, so this function only chooses a word whose
+            onehot label is 0.
+        return: (True, text, onehot_label) when a word was replaced;
+                (False, text, onehot_label) with nothing changed when no
+                suitable word was found after the retry limit.
+        """
+        # Remember whether every token was a single whitespace-free word, to
+        # detect below whether this function broke that property.
+        if len(text) == len(' '.join(text).split()):
+            its_me = True
+        else:
+            its_me = False
+        idx = np.random.randint(0, len(onehot_label))
+        prevent_loop = 0
+        noised_token = unidecode.unidecode(text[idx])
+        # Redraw while the word is already noised, not in the vocab, found in
+        # string.punctuation, or unchanged by removing diacritics.
+        while onehot_label[idx] != 0 or not self.vocab.exist(text[idx]) or text[idx] in string.punctuation or text[idx] == noised_token:
+            idx = np.random.randint(0, len(onehot_label))
+            noised_token = unidecode.unidecode(text[idx])
+            prevent_loop += 1
+            # Give up after more than 10 redraws.
+            if prevent_loop > 10:
+                return False, text, onehot_label
+        onehot_label[idx] = 1
+        token = text[idx]
+        text[idx] = unidecode.unidecode(text[idx])
+        # Diagnostic only: the replacement changed the whitespace token count.
+        if (len(text) != len(' '.join(text).split())) and its_me:
+            print("ERROR:")
+            print("text: ", text)
+            print("replaced token: ", text[idx])
+            print("org token: ", token)
+        return True, text, onehot_label
+    def replace_with_random_letter(self, text, onehot_label):
+        """
+        Replace, add or remove a random letter in a randomly chosen word of at
+        least 3 characters. Both arguments are modified in place.
+        Args:
+            text: a list of word tokens
+            onehot_label: onehot array indicating the positions of words that
+            were already modified, so this function only chooses a word whose
+            onehot label is 0.
+        return: (True, text, onehot_label) when a word was modified;
+                (False, text, onehot_label) when no suitable word was found
+                after the retry limit, or when the edit changed the whitespace
+                token count (in that case text and onehot_label were already
+                modified).
+        """
+        # Remember whether every token was a single whitespace-free word.
+        if len(text) == len(' '.join(text).split()):
+            its_me = True
+        else:
+            its_me = False
+        idx = np.random.randint(0, len(onehot_label))
+        prevent_loop = 0
+        # Redraw while the word is already noised, not in the vocab, or too short.
+        while onehot_label[idx] != 0 or not self.vocab.exist(text[idx]) or len(text[idx]) < 3:
+            idx = np.random.randint(0, len(onehot_label))
+            prevent_loop += 1
+            if prevent_loop > 10:
+                return False, text, onehot_label
+        # replace, add or remove? 0 is replace, 1 is add, 2 is remove
+        # 0.8 1 edits, 0.2 2 edits
+        num_edit = np.random.choice([1,2], p = [0.8, 0.2])
+        # The edit kind is drawn once, so both edits of a 2-edit run are the same kind.
+        coin = np.random.choice([0, 1, 2])
+        for i in range(num_edit):
+            token = list(text[idx])
+            if coin == 0:
+                # Replace one letter with a random alphabet letter.
+                chosen_idx = np.random.randint(0, len(token))
+                replace_candidate = self.vn_alphabet[np.random.randint(
+                    0, self.alphabet_len)]
+                token[chosen_idx] = replace_candidate
+                text[idx] = "".join(token)
+            elif coin == 1:
+                # Insert a random letter: drawing len(token) prepends it to the
+                # word, any other index inserts it right after that position.
+                chosen_idx = np.random.randint(0, len(token) + 1)
+                if chosen_idx == len(token):
+                    added_chars = self.vn_alphabet[np.random.randint(0, self.alphabet_len)] + \
+                        token[0]
+                    chosen_idx = 0
+                else:
+                    added_chars = token[chosen_idx] + \
+                        self.vn_alphabet[np.random.randint(0, self.alphabet_len)]
+                token[chosen_idx] = added_chars
+                text[idx] = "".join(token)
+            else:
+                # Remove one letter.
+                chosen_idx = np.random.randint(0, len(token))
+                token[chosen_idx] = ""
+                text[idx] = "".join(token)
+        onehot_label[idx] = 1
+        # Note: "org token" below prints the character list from the last edit
+        # iteration, not the word as it was before noising.
+        if (len(text) != len(' '.join(text).split())) and its_me:
+            print("ERROR:")
+            print("text: ", text)
+            print("replaced token: ", text[idx])
+            print("org token: ", token)
+            print("coin: ", coin)
+            return False, text, onehot_label
+        return True, text, onehot_label
+    def replace_with_new_typo_word(self, text, onehot_label):
+        """
+        Replace a candidate word (if exist in the word_couple) with its homophone. if successful, return True, else False
+        Args:
+            text: a list of word tokens
+            onehot_label: onehot array indicate position of word that has already modify, so this
+            function only choose the word that do not has onehot label == 1.
+        return: True, text, onehot_label if successful replace, else False, text, onehot_label
+        """
+        # account for the case that the word in the text is upper case but its lowercase match the candidates list
+        if len(text) == len(' '.join(text).split()):
+            its_me = True
+        else:
+            its_me = False
+        candidates = []
+        for i in range(len(text)):
+            if text[i].lower() in self.all_word_candidates or text[i].lower() in self.teencode_dict.keys():
+                candidates.append((i, text[i]))
+        if len(candidates) == 0:
+            return False, text, onehot_label
+        idx = np.random.randint(0, len(candidates))
+        prevent_loop = 0
+        while onehot_label[candidates[idx][0]] != 0 or not self.vocab.exist(candidates[idx][1]):
+            idx = np.random.choice(np.arange(0, len(candidates)))
+            prevent_loop += 1
+            if prevent_loop > 10:
+                return False, text, onehot_label
+        text[candidates[idx][0]] = self.replace_word_candidate(
+            candidates[idx][1])
+        if (len(text) != len(' '.join(text).split())) and its_me:
+            print("ERROR:")
+            print("text: ", text)
+            print("replaced token: ", text[candidates[idx][0]])
+            print("org token: ", candidates[idx][1])
+        onehot_label[candidates[idx][0]] = 1
+        return True, text, onehot_label
+    def replace_with_homophone_word(self, text, onehot_label):
+        """
+        Replace a candidate word (if exist in the word_couple) with its homophone. if successful, return True, else False
+        Args:
+            text: a list of word tokens
+            onehot_label: onehot array indicate position of word that has already modify, so this
+            function only choose the word that do not has onehot label == 1.
+        return: True, text, onehot_label if successful replace, else False, text, onehot_label
+        """
+        # account for the case that the word in the text is upper case but its lowercase match the candidates list
+        if len(text) == len(' '.join(text).split()):
+            its_me = True
+        else:
+            its_me = False
+        candidates = []
+        for i in range(len(text)):
+            if text[i].lower() in self.homowords:
+                candidates.append((i, text[i]))
+        if len(candidates) == 0:
+            return False, text, onehot_label
+        idx = np.random.randint(0, len(candidates))
+        prevent_loop = 0
+        while onehot_label[candidates[idx][0]] != 0 or not self.vocab.exist(candidates[idx][1]):
+            idx = np.random.choice(np.arange(0, len(candidates)))
+            prevent_loop += 1
+            if prevent_loop > 10:
+                return False, text, onehot_label
+        text[candidates[idx][0]] = self.replace_homo_candidate(
+            candidates[idx][1])
+        if (len(text) != len(' '.join(text).split())) and its_me:
+            print("ERROR:")
+            print("text: ", text)
+            print("replaced token: ", text[candidates[idx][0]])
+            print("org token: ", candidates[idx][1])
+            return False, text, onehot_label
+        onehot_label[candidates[idx][0]] = 1
+        return True, text, onehot_label
+    def replace_with_homophone_letter(self, text, onehot_label):
+        """
+        Replace (or sometimes drop) a leading or trailing subword/letter of one
+        word with one of its homophones. Both arguments are modified in place.
+        Args:
+            text: a list of word tokens
+            onehot_label: onehot array indicating the positions of words that
+            were already modified, so this function only chooses a word whose
+            onehot label is 0.
+        return: True, text, onehot_label if a word was replaced, else False, text, onehot_label
+        """
+        # Remember whether every token was a single whitespace-free word.
+        if len(text) == len(' '.join(text).split()):
+            its_me = True
+        else:
+            its_me = False
+        # Collect (word index, subword, anchored regex) for every known subword
+        # found at the start or at the end of a word.
+        candidates = []
+        for i in range(len(text)):
+            for char in self.all_char_candidates:
+                if re.search("^" + char, text[i]) is not None:
+                    candidates.append((i, char, "^" + char ))
+                if re.search(char + "$", text[i]) is not None:
+                    candidates.append((i, char, char + "$"))
+        if len(candidates) == 0:
+            return False, text, onehot_label
+        else:
+            idx = np.random.randint(0, len(candidates))
+            prevent_loop = 0
+            # Redraw while the word is already noised, not in the vocab, or shorter than 2.
+            while onehot_label[candidates[idx][0]] != 0 or not self.vocab.exist(text[candidates[idx][0]]) or len(text[candidates[idx][0]]) < 2:
+                idx = np.random.randint(0, len(candidates))
+                prevent_loop += 1
+                if prevent_loop > 10:
+                    return False, text, onehot_label
+            replaced = self.replace_char_candidate(candidates[idx][1])
+            ## With probability 0.2 remove the candidate instead of replacing it. cát -> cá
+            coin = np.random.choice([0, 1], p = [0.8, 0.2])
+            text_to_replace = text[candidates[idx][0]]
+            result = re.sub(candidates[idx][2], replaced if coin == 0 else "",
+                         text_to_replace)
+            # Removal emptied the whole word: fall back to the replacement.
+            if result == "":
+                result = re.sub(candidates[idx][2], replaced,
+                         text_to_replace)
+            text[candidates[idx][0]] = result
+            # Diagnostic only: the replacement changed the whitespace token count.
+            if (len(text) != len(' '.join(text).split())) and its_me:
+                print("ERROR:")
+                print("text: ", text)
+                print("replaced token: ", text[candidates[idx][0]])
+                print("letter: ", candidates[idx][1])
+                print("replaced letter: ", replaced)
+            onehot_label[candidates[idx][0]] = 1
+            return True, text, onehot_label
+    def replace_with_typo_letter(self, text, onehot_label):
+        """
+        Replace a subword/letter with its homophones
+        Args:
+            text: a list of word tokens
+            onehot_label: onehot array indicate position of word that has already modify, so this
+            function only choose the word that do not has onehot label == 1.
+        return: True, text, onehot_label if successful replace, else False, None, None
+        """
+        if len(text) == len(' '.join(text).split()):
+            its_me = True
+        else:
+            its_me = False
+        # find index noise
+        idx = np.random.randint(0, len(onehot_label))
+        prevent_loop = 0
+        while onehot_label[idx] != 0 or not self.vocab.exist(text[idx]):
+            idx = np.random.randint(0, len(onehot_label))
+            prevent_loop += 1
+            if prevent_loop > 10:
+                return False, text, onehot_label
+        index_noise = idx
+        onehot_label[index_noise] = 1
+        org_word = text[index_noise]
+        word_noise = text[index_noise]
+        pattern = "(" + "|".join(self.typo.keys()) + "){1}"
+        candidates = re.findall(pattern, word_noise)
+        if len(candidates) == 0:
+            return False, text, onehot_label
+        accent_pattern = "(s|f|r|x|j|1|2|3|4|5){1}"
+        for candidate in candidates:
+            replaced = self.replace_char_candidate_typo(candidate)
+            # Move accent to the end of text
+            result = re.findall(accent_pattern, replaced)
+            if len(result) != 0:
+                word_noise = re.sub(candidate, replaced[0:-1], word_noise)
+                word_noise += replaced[-1]
+            else:
+                word_noise = re.sub(candidate, replaced, word_noise)
+        text[index_noise] = word_noise
+        if len(word_noise) < 3:
+            return True, text, onehot_label
+        ### Introduce one or two edit on text
+        num_edits = np.random.choice([0, 1, 2], p = [0.5, 0.35, 0.15])
+        for i in range(num_edits):
+            coin = np.random.choice([0, 1, 2, 3])
+            word_noise = list(text[index_noise])
+            start_char = word_noise.pop(0)
+            if coin == 0:
+                chosen_idx = np.random.randint(0, len(word_noise))
+                word_noise[chosen_idx] = self.vn_alphabet[np.random.randint(0, self.alphabet_len)]
+                text[index_noise] = start_char + "".join(word_noise)
+            elif coin == 1:
+                chosen_idx = np.random.randint(0, len(word_noise))
+                word_noise[chosen_idx] += self.vn_alphabet[np.random.randint(0, self.alphabet_len)]
+                text[index_noise] = start_char + "".join(word_noise)
+            elif coin == 2:
+                if len(word_noise) < 2:
+                    continue
+                chosen_idxs = np.random.choice(range(len(word_noise)), size = 2)
+                word_noise[chosen_idxs[0]], word_noise[chosen_idxs[1]] = \
+                    word_noise[chosen_idxs[1]], word_noise[chosen_idxs[0]]
+                text[index_noise] = start_char + "".join(word_noise)
+            else:
+                chosen_idx = np.random.randint(0, len(word_noise))
+                word_noise[chosen_idx] = ""
+                text[index_noise] = start_char + "".join(word_noise)
+        return True, text, onehot_label
+    def split_word(self, text, onehot_label):
+        # find index noise
+        idx = np.random.randint(0, len(onehot_label))
+        prevent_loop = 0
+        while onehot_label[idx] not in [0, 1] or len(text[idx]) < 3 or text[idx] in r'''!"#$%&'()*+,-./:;<=>?@[]^_`{|}~''' :
+            idx = np.random.randint(0, len(onehot_label))
+            prevent_loop += 1
+            if prevent_loop > 10:
+                return False, text, onehot_label
+        org_word = text[idx]
+        new_text = text[:idx]
+        new_onehot = onehot_label[:idx]
+        index_split = np.random.randint(1, len(org_word))
+        new_text.extend([org_word[:index_split], org_word[index_split:]])
+        new_onehot.extend([2, 2])
+        if idx < len(text) - 1:
+            new_text.extend(text[idx+1:])
+            new_onehot.extend(onehot_label[idx+1:])
+        return True, new_text, new_onehot
+    def merge_word(self, text, onehot_label):
+        length = len(onehot_label)
+        if length < 2:
+            return False, text, onehot_label
+        def validate_len(idx, size):
+            while idx + size > length:
+                if idx > 0:
+                    idx -= 1
+                else:
+                    size -= 1
+            return idx, size
+        def validate_value(idx, size):
+            for i in range(idx, idx+size):
+                if onehot_label[i] not in [0, 1] or text[i] in r'''!"#$%&'()*+,-./:;<=>?@[]^_`{|}~''':
+                    return False
+            return True
+        # find index noise
+        min_words = 2
+        max_words = 3 if length > 3 else length
+        num_words = np.random.randint(min_words, max_words + 1)
+        idx = np.random.randint(0, length)
+        prevent_loop = 0
+        idx, num_words = validate_len(idx, num_words)
+        while not validate_value(idx, num_words) :
+            prevent_loop += 1
+            if prevent_loop > 10:
+                return False, text, onehot_label
+            idx = np.random.randint(0, length)
+            num_words = np.random.randint(min_words, max_words + 1)
+            idx, num_words = validate_len(idx, num_words)
+        new_text = text[:idx]
+        new_onehot = onehot_label[:idx]
+        new_text.append(''.join(text[idx:idx+num_words]))
+        new_onehot.append(-num_words+1)
+        if idx + num_words < length:
+            new_text.extend(text[idx+num_words:])
+            new_onehot.extend(onehot_label[idx+num_words:])
+        return True, new_text, new_onehot
+    def add_normal_noise(self, sentence, percent_err=0.2, num_type_err=4):
+        tokens = sentence.split()
+        if len(tokens) <= 0:
+            print(f"SOMETHING WROONG - sent: {sentence}")
+        onehot_label = [0] * len(tokens)
+        num_wrong = int(np.ceil(percent_err * len(tokens)))
+        num_wrong = np.random.randint(1, num_wrong + 1)
+        if np.random.rand() < 0.05:
+            num_wrong = 0
+        prevent_loop = 0
+        for i in range(0, num_wrong):
+            err = np.random.choice(range(num_type_err + 1)\
+                , p = [0.15, 0.15, 0.1, 0.2, 0.4])
+            if err == 0:
+                _, tokens, onehot_label = self.remove_diacritics(
+                    tokens, onehot_label)
+            elif err == 1:
+                _, tokens, onehot_label = self.replace_with_typo_letter(
+                    tokens, onehot_label)
+            elif err == 2:
+                _, tokens, onehot_label = self.replace_with_random_letter(
+                    tokens, onehot_label)
+            elif err == 3:
+                _, tokens, onehot_label = self.replace_with_homophone_letter(
+                tokens, onehot_label)
+            else:
+                _, tokens, onehot_label = self.replace_with_homophone_word(
+                    tokens, onehot_label)
+            prevent_loop += 1
+            if prevent_loop > 10:
+                return ' '.join(tokens), ' '.join([str(i) for i in onehot_label])
+            # print(tokens)
+            self.verify(tokens, sentence)
+        return ' '.join(tokens), ' '.join([str(i) for i in onehot_label])
+    def add_split_merge_noise(self, sentence, percent_err=0.15, num_type_err=2, percent_normal_err = 0.15):
+        def count_zero_one(onehot_label):
+            return sum([1 if onehot in [0, 1] else 0 for onehot in onehot_label])
+        ## Introduce normal noise before split merge
+        normal_noise, normal_onehot = self.add_normal_noise(
+                sentence, percent_err=percent_normal_err)
+        tokens = normal_noise.split()
+        length = len(tokens)
+        onehot_label = [int(x) for x in normal_onehot.split(" ")]
+        num_wrong = int(np.ceil(percent_err * length))
+        num_wrong = np.random.randint(1, num_wrong + 1)
+        if np.random.rand() < 0.05:
+            num_wrong = 0
+        min_zeroes = length - num_wrong
+        zero_one_num = length
+        prevent_loop = 0
+        while zero_one_num  > min_zeroes:
+            err = np.random.randint(0, num_type_err)
+            if err == 0:
+                _, tokens, onehot_label = self.split_word(
+                    tokens, onehot_label)
+            else:
+                _, tokens, onehot_label = self.merge_word(
+                    tokens, onehot_label)
+            prevent_loop += 1
+            if prevent_loop > 10:
+                return ' '.join(tokens), ' '.join([str(i) for i in onehot_label])
+            zero_one_num = count_zero_one(onehot_label)
+        return ' '.join(tokens), ' '.join([str(i) for i in onehot_label])
+    def verify(self, noised_tokens, sentence):
+        if len(noised_tokens) != len(' '.join(noised_tokens).split()):
+                print("ERROR:")
+                print("TEXT  : ", sentence)
+                print("TOKENS: ", ' '.join(noised_tokens))
+                exit()
+        return True
+if __name__ == "__main__":
+    text = "Ô kìa ai như cô thắm , con bác năm ở xa mới về , nghiêng nghiêng"
+    dict_pickle_path = '../data/vi/datasets/vi_wiki/vi_wiki.vocab.test.pkl'
+    vocab = Vocab()
+    vocab.load_vocab_dict(dict_pickle_path)
+    noiser = SynthesizeData(vocab)
+    noised_text, onehot_label = noiser.add_split_merge_noise(text, percent_err=0.5)
+    print(noised_text)

dataset/noising_resources/accents.json ADDED Viewed

	@@ -0,0 +1,498 @@

+{"ă": [
+        ["ắằẳẵặ", "âấầẩẫậ", "aáàảãạ"
+        ],
+        [
+            0.7,
+            0.15,
+            0.15
+        ]
+    ], "â": [
+        ["ấầẩẫậ", "aáàảãạ", "ăắằẳẵặ"
+        ],
+        [
+            0.7,
+            0.15,
+            0.15
+        ]
+    ], "á": [
+        ["aàảãạ", "ăắằẳẵặ", "âấầẩẫậ"
+        ],
+        [
+            0.7,
+            0.15,
+            0.15
+        ]
+    ], "à": [
+        ["aáảãạ", "ăắằẳẵặ", "âấầẩẫậ"
+        ],
+        [
+            0.7,
+            0.15,
+            0.15
+        ]
+    ], "ả": [
+        ["aáàãạ", "ăắằẳẵặ", "âấầẩẫậ"
+        ],
+        [
+            0.7,
+            0.15,
+            0.15
+        ]
+    ], "ã": [
+        ["aáàảạ", "ăắằẳẵặ", "âấầẩẫậ"
+        ],
+        [
+            0.7,
+            0.15,
+            0.15
+        ]
+    ], "ạ": [
+        ["aáàảã", "ăắằẳẵặ", "âấầẩẫậ"
+        ],
+        [
+            0.7,
+            0.15,
+            0.15
+        ]
+    ], "ắ": [
+        ["ăằẳẵặ", "aáàảãạ", "âấầẩẫậ"
+        ],
+        [
+            0.7,
+            0.15,
+            0.15
+        ]
+    ], "ằ": [
+        ["ăắẳẵặ", "aáàảãạ", "âấầẩẫậ"
+        ],
+        [
+            0.7,
+            0.15,
+            0.15
+        ]
+    ], "ẳ": [
+        ["ăắằẵặ", "aáàảãạ", "âấầẩẫậ"
+        ],
+        [
+            0.7,
+            0.15,
+            0.15
+        ]
+    ], "ặ": [
+        ["ăắằẳẵ", "aáàảãạ", "âấầẩẫậ"
+        ],
+        [
+            0.7,
+            0.15,
+            0.15
+        ]
+    ], "ẵ": [
+        ["ăắằẳặ", "aáàảãạ", "âấầẩẫậ"
+        ],
+        [
+            0.7,
+            0.15,
+            0.15
+        ]
+    ], "ấ": [
+        ["âầẩẫậ", "aáàảãạ", "ăắằẳẵặ"
+        ],
+        [
+            0.7,
+            0.15,
+            0.15
+        ]
+    ], "ầ": [
+        ["âấẩẫậ", "aáàảãạ", "ăắằẳẵặ"
+        ],
+        [
+            0.7,
+            0.15,
+            0.15
+        ]
+    ], "ẩ": [
+        ["âấầẫậ", "aáàảãạ", "ăắằẳẵặ"
+        ],
+        [
+            0.7,
+            0.15,
+            0.15
+        ]
+    ], "ẫ": [
+        ["âấầẩậ", "aáàảãạ", "ăắằẳẵặ"
+        ],
+        [
+            0.7,
+            0.15,
+            0.15
+        ]
+    ], "ậ": [
+        ["âấầẩẫ", "aáàảãạ", "ăắằẳẵặ"
+        ],
+        [
+            0.7,
+            0.15,
+            0.15
+        ]
+    ], "é": [
+        ["eèẻẽẹ", "êếềểễệ"
+        ],
+        [
+            0.7,
+            0.3
+        ]
+    ], "è": [
+        ["eéẻẽẹ", "êếềểễệ"
+        ],
+        [
+            0.7,
+            0.3
+        ]
+    ], "ẻ": [
+        ["eéèẽẹ", "êếềểễệ"
+        ],
+        [
+            0.7,
+            0.3
+        ]
+    ], "ẽ": [
+        ["eéèẻẹ", "êếềểễệ"
+        ],
+        [
+            0.7,
+            0.3
+        ]
+    ], "ẹ": [
+        ["eéèẻẽ", "êếềểễệ"
+        ],
+        [
+            0.7,
+            0.3
+        ]
+    ], "ê": [
+        ["eéèẻẽẹ", "ếềểễệ"
+        ],
+        [
+            0.7,
+            0.3
+        ]
+    ], "ế": [
+        ["eéèẻẽẹ", "êềểễệ"
+        ],
+        [
+            0.3,
+            0.7
+        ]
+    ], "ề": [
+        ["eéèẻẽẹ", "êếểễệ"
+        ],
+        [
+            0.3,
+            0.7
+        ]
+    ], "ể": [
+        ["eéèẻẽẹ", "êếềễệ"
+        ],
+        [
+            0.3,
+            0.7
+        ]
+    ], "ễ": [
+        ["eéèẻẽẹ", "êếềểệ"
+        ],
+        [
+            0.3,
+            0.7
+        ]
+    ], "ệ": [
+        ["eéèẻẽẹ", "êếềểễ"
+        ],
+        [
+            0.3,
+            0.7
+        ]
+    ], "í": [
+        ["iìỉĩị", "ýỳỷỹỵ"
+        ],
+        [
+            0.7,
+            0.3
+        ]
+    ], "ì": [
+        ["iíỉĩị", "ýỳỷỹỵ"
+        ],
+        [
+            0.7,
+            0.3
+        ]
+    ], "ỉ": [
+        ["iíìĩị", "ýỳỷỹỵ"
+        ],
+        [
+            0.7,
+            0.3
+        ]
+    ], "ĩ": [
+        ["iíìỉị", "ýỳỷỹỵ"
+        ],
+        [
+            0.7,
+            0.3
+        ]
+    ], "ị": [
+        ["iíìỉĩ", "ýỳỷỹỵ"
+        ],
+        [
+            0.7,
+            0.3
+        ]
+    ], "ó": [
+        ["oòỏọõ", "ôốồổỗộ", "ơớờởợỡ"
+        ],
+        [
+            0.7,
+            0.15,
+            0.15
+        ]
+    ], "ò": [
+        ["oóỏọõ", "ôốồổỗộ", "ơớờởợỡ"
+        ],
+        [
+            0.7,
+            0.15,
+            0.15
+        ]
+    ], "ỏ": [
+        ["oóòọõ", "ôốồổỗộ", "ơớờởợỡ"
+        ],
+        [
+            0.7,
+            0.15,
+            0.15
+        ]
+    ], "õ": [
+        ["oóòỏọ", "ôốồổỗộ", "ơớờởợỡ"
+        ],
+        [
+            0.7,
+            0.15,
+            0.15
+        ]
+    ], "ọ": [
+        ["oóòỏõ", "ôốồổỗộ", "ơớờởợỡ"
+        ],
+        [
+            0.7,
+            0.15,
+            0.15
+        ]
+    ], "ô": [
+        ["oóòỏọõ", "ốồổỗộ", "ơớờởợỡ"
+        ],
+        [
+            0.15,
+            0.7,
+            0.15
+        ]
+    ], "ố": [
+        ["oóòỏọõ", "ôồổỗộ", "ơớờởợỡ"
+        ],
+        [
+            0.15,
+            0.7,
+            0.15
+        ]
+    ], "ồ": [
+        ["oóòỏọõ", "ôốổỗộ", "ơớờởợỡ"
+        ],
+        [
+            0.15,
+            0.7,
+            0.15
+        ]
+    ], "ổ": [
+        ["oóòỏọõ", "ôốồỗộ", "ơớờởợỡ"
+        ],
+        [
+            0.15,
+            0.7,
+            0.15
+        ]
+    ], "ộ": [
+        ["oóòỏọõ", "ôốồổỗ", "ơớờởợỡ"
+        ],
+        [
+            0.15,
+            0.7,
+            0.15
+        ]
+    ], "ỗ": [
+        ["oóòỏọõ", "ôốồổộ", "ơớờởợỡ"
+        ],
+        [
+            0.15,
+            0.7,
+            0.15
+        ]
+    ], "ơ": [
+        ["oóòỏọõ", "ôốồổỗộ", "ớờởợỡ"
+        ],
+        [
+            0.15,
+            0.15,
+            0.7
+        ]
+    ], "ớ": [
+        ["oóòỏọõ", "ôốồổỗộ", "ơờởợỡ"
+        ],
+        [
+            0.15,
+            0.15,
+            0.7
+        ]
+    ], "ờ": [
+        ["oóòỏọõ", "ôốồổỗộ", "ơớởợỡ"
+        ],
+        [
+            0.15,
+            0.15,
+            0.7
+        ]
+    ], "ở": [
+        ["oóòỏọõ", "ôốồổỗộ", "ơớờợỡ"
+        ],
+        [
+            0.15,
+            0.15,
+            0.7
+        ]
+    ], "ợ": [
+        ["oóòỏọõ", "ôốồổỗộ", "ơớờởỡ"
+        ],
+        [
+            0.15,
+            0.15,
+            0.7
+        ]
+    ], "ỡ": [
+        ["oóòỏọõ", "ôốồổỗộ", "ơớờởợ"
+        ],
+        [
+            0.15,
+            0.15,
+            0.7
+        ]
+    ], "ú": [
+        ["uùủũụ", "ưứừữửự"
+        ],
+        [
+            0.7,
+            0.3
+        ]
+    ], "ù": [
+        ["uúủũụ", "ưứừữửự"
+        ],
+        [
+            0.7,
+            0.3
+        ]
+    ], "ủ": [
+        ["uúùũụ", "ưứừữửự"
+        ],
+        [
+            0.7,
+            0.3
+        ]
+    ], "ũ": [
+        ["uúùủụ", "ưứừữửự"
+        ],
+        [
+            0.7,
+            0.3
+        ]
+    ], "ụ": [
+        ["uúùủũ", "ưứừữửự"
+        ],
+        [
+            0.7,
+            0.3
+        ]
+    ], "ư": [
+        ["uúùủũụ", "ứừữửự"
+        ],
+        [
+            0.3,
+            0.7
+        ]
+    ], "ứ": [
+        ["uúùủũụ", "ưừữửự"
+        ],
+        [
+            0.3,
+            0.7
+        ]
+    ], "ừ": [
+        ["uúùủũụ", "ưứữửự"
+        ],
+        [
+            0.3,
+            0.7
+        ]
+    ], "ử": [
+        ["uúùủũụ", "ưứừữự"
+        ],
+        [
+            0.3,
+            0.7
+        ]
+    ], "ữ": [
+        ["uúùủũụ", "ưứừửự"
+        ],
+        [
+            0.3,
+            0.7
+        ]
+    ], "ự": [
+        ["uúùủũụ", "ưứừữử"
+        ],
+        [
+            0.3,
+            0.7
+        ]
+    ], "ý": [
+        ["yỳỷỵỹ", "iíìỉĩị"
+        ],
+        [
+            0.7,
+            0.3
+        ]
+    ], "ỳ": [
+        ["yýỷỵỹ", "iíìỉĩị"
+        ],
+        [
+            0.7,
+            0.3
+        ]
+    ], "ỷ": [
+        ["yýỳỵỹ", "iíìỉĩị"
+        ],
+        [
+            0.7,
+            0.3
+        ]
+    ], "ỵ": [
+        ["yýỳỷỹ", "iíìỉĩị"
+        ],
+        [
+            0.7,
+            0.3
+        ]
+    ], "ỹ": [
+        ["yýỳỷỵ", "iíìỉĩị"
+        ],
+        [
+            0.7,
+            0.3
+        ]
+    ]
+}

dataset/noising_resources/confusion_set.json ADDED Viewed

The diff for this file is too large to render. See raw diff

dataset/noising_resources/homo_leter.json ADDED Viewed

	@@ -0,0 +1,27 @@

+{"s": ["x"],
+ "x": ["s"],
+ "gi": ["d", "j"],
+ "d": ["gi", "z", "đ"],
+ "ch": ["tr", "c"],
+ "tr": ["ch", "t"],
+ "ng": ["n", "ngh"],
+ "n": ["ng", "nh", "l", "m"],
+ "nh": ["n", "ngh"],
+ "ngh": ["ng", "nh"],
+ "y": ["i"],
+ "i": ["y", "j"],
+ "l": ["n"],
+ "qu": ["w", "q"],
+ "w": ["qu"],
+ "ph": ["f"],
+ "f": ["ph"],
+ "th": ["t"],
+ "t": ["th", "c", "p", "tr"],
+ "z": ["d"],
+ "c": ["k", "t", "ch"],
+ "k": ["c"],
+ "q": ["qu"],
+ "j": ["i", "gi"],
+ "đ": ["d"],
+ "m": ["n"],
+ "p": ["t"]}

dataset/noising_resources/kieu_go_dau_cu_moi.txt ADDED Viewed

	@@ -0,0 +1,78 @@

+hóa hoá
+hòa hoà
+ủy uỷ
+thủy thuỷ
+khỏe khoẻ
+tòa toà
+khóa khoá
+họa hoạ
+túy tuý
+thỏa thoả
+hủy huỷ
+thúy thuý
+tùy tuỳ
+tỏa toả
+dọa doạ
+hỏa hoả
+xóa xoá
+thùy thuỳ
+thụy thuỵ
+tọa toạ
+úy uý
+lũy luỹ
+khỏa khoả
+lụy luỵ
+tụy tuỵ
+tủy tuỷ
+ngụy nguỵ
+òa oà
+đóa đoá
+xòe xoè
+hòe hoè
+lòa loà
+nhòa nhoà
+khóe khoé
+trụy truỵ
+góa goá
+tóe toé
+xòa xoà
+lóa loá
+lòe loè
+đọa đoạ
+nhòe nhoè
+lõa loã
+lóe loé
+nhụy nhuỵ
+ngọa ngoạ
+súy suý
+xõa xoã
+xúy xuý
+quá qúa
+chọe choẹ
+quả qủa
+chóe choé
+thóa thoá
+giá gía
+chúy chuý
+ọe oẹ
+khụy khuỵ
+nóe noé
+họe hoẹ
+húy huý
+ngõa ngoã
+chòe choè
+dụy duỵ
+chùy chuỳ
+hùy huỳ
+thõa thoã
+khủy khuỷ
+quí qúi
+chóa choá
+quà qùa
+trủy truỷ
+già gìa
+tóa toá
+lúy luý
+giả gỉa
+chõa choã
+đòa đoà

dataset/noising_resources/typo.json ADDED Viewed

	@@ -0,0 +1,650 @@

+{
+    "ă": [
+        "aw",
+        "a8"
+    ],
+    "â": [
+        "aa",
+        "a6"
+    ],
+    "ấ": [
+        "aas",
+        "a61",
+        "âs",
+        "â1"
+    ],
+    "ầ": [
+        "aaf",
+        "a62",
+        "âf",
+        "â2"
+    ],
+    "ẩ": [
+        "aar",
+        "a63",
+        "âr",
+        "â3"
+    ],
+    "ẫ": [
+        "aax",
+        "a64",
+        "âx",
+        "â4"
+    ],
+    "ậ": [
+        "aaj",
+        "a65",
+        "âj",
+        "â5"
+    ],
+    "á": [
+        "as",
+        "a1"
+    ],
+    "à": [
+        "af",
+        "a2"
+    ],
+    "ả": [
+        "ar",
+        "a3"
+    ],
+    "ã": [
+        "ax",
+        "a4"
+    ],
+    "ạ": [
+        "aj",
+        "a5"
+    ],
+    "ắ": [
+        "aws",
+        "ăs",
+        "ă1",
+        "a81"
+    ],
+    "ổ": [
+        "oor",
+        "ô3",
+        "ôr",
+        "o63"
+    ],
+    "ỗ": [
+        "oox",
+        "ô4",
+        "ôx",
+        "o64"
+    ],
+    "ộ": [
+        "ooj",
+        "ô5",
+        "ôj",
+        "o65"
+    ],
+    "ơ": [
+        "ow",
+        "o7"
+    ],
+    "ằ": [
+        "awf",
+        "ă2",
+        "ăf",
+        "a82"
+    ],
+    "ẳ": [
+        "awr",
+        "ă3",
+        "ăr",
+        "a83"
+    ],
+    "ẵ": [
+        "awx",
+        "ă4",
+        "ăx",
+        "a84"
+    ],
+    "ặ": [
+        "awj",
+        "ă5",
+        "ăj",
+        "a85"
+    ],
+    "ó": [
+        "os",
+        "o1"
+    ],
+    "ò": [
+        "of",
+        "o2"
+    ],
+    "ỏ": [
+        "or",
+        "o3"
+    ],
+    "õ": [
+        "ox",
+        "o4"
+    ],
+    "ọ": [
+        "oj",
+        "o5"
+    ],
+    "ô": [
+        "oo",
+        "o6"
+    ],
+    "ố": [
+        "oos",
+        "ô1",
+        "ôs",
+        "o61"
+    ],
+    "ồ": [
+        "oof",
+        "ô2",
+        "ôf",
+        "o62"
+    ],
+    "ớ": [
+        "ows",
+        "ơ1",
+        "ơs",
+        "o71"
+    ],
+    "ờ": [
+        "owf",
+        "ơ2",
+        "ơf",
+        "o72"
+    ],
+    "ở": [
+        "owr",
+        "ơ3",
+        "ơr",
+        "o73"
+    ],
+    "ỡ": [
+        "owx",
+        "ơ4",
+        "ơx",
+        "o74"
+    ],
+    "ợ": [
+        "owj",
+        "ơ5",
+        "ơj",
+        "o75"
+    ],
+    "é": [
+        "es",
+        "e1"
+    ],
+    "è": [
+        "ef",
+        "e2"
+    ],
+    "ẻ": [
+        "er",
+        "e3"
+    ],
+    "ẽ": [
+        "ex",
+        "e4"
+    ],
+    "ẹ": [
+        "ej",
+        "e5"
+    ],
+    "ê": [
+        "ee",
+        "e6"
+    ],
+    "ế": [
+        "ees",
+        "ês",
+        "ê1",
+        "e61"
+    ],
+    "ề": [
+        "eef",
+        "ê2",
+        "e62",
+        "êf"
+    ],
+    "ể": [
+        "eer",
+        "ê3",
+        "êr",
+        "e63"
+    ],
+    "ễ": [
+        "eex",
+        "ê4",
+        "êx",
+        "e64"
+    ],
+    "ệ": [
+        "eej",
+        "ê5",
+        "êj",
+        "e65"
+    ],
+    "ú": [
+        "us",
+        "u1"
+    ],
+    "ù": [
+        "uf",
+        "u2"
+    ],
+    "ủ": [
+        "ur",
+        "u3"
+    ],
+    "ũ": [
+        "ux",
+        "u4"
+    ],
+    "ụ": [
+        "uj",
+        "u5"
+    ],
+    "ư": [
+        "uw",
+        "u7"
+    ],
+    "ứ": [
+        "uws",
+        "ư1",
+        "ưs",
+        "u71"
+    ],
+    "ừ": [
+        "uwf",
+        "ư2",
+        "ưf",
+        "u72"
+    ],
+    "ử": [
+        "uwr",
+        "ư3",
+        "ưr",
+        "u73"
+    ],
+    "ữ": [
+        "uwx",
+        "ư4",
+        "ưx",
+        "u74"
+    ],
+    "ự": [
+        "uwj",
+        "ư5",
+        "ưj",
+        "u75"
+    ],
+    "í": [
+        "is",
+        "i1"
+    ],
+    "ì": [
+        "if",
+        "i2"
+    ],
+    "ỉ": [
+        "ir",
+        "i3"
+    ],
+    "ị": [
+        "ij",
+        "i5"
+    ],
+    "ĩ": [
+        "ix",
+        "i4"
+    ],
+    "ý": [
+        "ys",
+        "y1"
+    ],
+    "ỳ": [
+        "yf",
+        "y2"
+    ],
+    "ỷ": [
+        "yr",
+        "y3"
+    ],
+    "ỵ": [
+        "yj",
+        "y5"
+    ],
+    "đ": [
+        "dd",
+        "d9"
+    ],
+    "Ă": [
+        "Aw",
+        "A8"
+    ],
+    "Â": [
+        "Aa",
+        "A6"
+    ],
+    "Ấ": [
+        "Aas",
+        "A61",
+        "Âs",
+        "Â1"
+    ],
+    "Ầ": [
+        "Aaf",
+        "A62",
+        "Âf",
+        "Â2"
+    ],
+    "Ẩ": [
+        "Aar",
+        "A63",
+        "Âr",
+        "Â3"
+    ],
+    "Ẫ": [
+        "Aax",
+        "A64",
+        "Âx",
+        "Â4"
+    ],
+    "Ậ": [
+        "Aaj",
+        "A65",
+        "Âj",
+        "Â5"
+    ],
+    "Á": [
+        "As",
+        "A1"
+    ],
+    "À": [
+        "Af",
+        "A2"
+    ],
+    "Ả": [
+        "Ar",
+        "A3"
+    ],
+    "Ã": [
+        "Ax",
+        "A4"
+    ],
+    "Ạ": [
+        "Aj",
+        "A5"
+    ],
+    "Ắ": [
+        "Aws",
+        "Ă1",
+        "Ăs",
+        "A81"
+    ],
+    "Ổ": [
+        "Oor",
+        "Ô3",
+        "Ôr",
+        "O63"
+    ],
+    "Ỗ": [
+        "Oox",
+        "Ô4",
+        "Ôx",
+        "O64"
+    ],
+    "Ộ": [
+        "Ooj",
+        "Ô5",
+        "Ôj",
+        "O65"
+    ],
+    "Ơ": [
+        "Ow",
+        "O7"
+    ],
+    "Ằ": [
+        "Awf",
+        "Ă2",
+        "Ăf",
+        "A82"
+    ],
+    "Ẳ": [
+        "Awr",
+        "Ă3",
+        "Ăr",
+        "A83"
+    ],
+    "Ẵ": [
+        "Awx",
+        "Ă4",
+        "Ăx",
+        "A84"
+    ],
+    "Ặ": [
+        "Awj",
+        "Ă5",
+        "Ăj",
+        "A85"
+    ],
+    "Ó": [
+        "Os",
+        "O1"
+    ],
+    "Ò": [
+        "Of",
+        "O2"
+    ],
+    "Ỏ": [
+        "Or",
+        "O3"
+    ],
+    "Õ": [
+        "Ox",
+        "O4"
+    ],
+    "Ọ": [
+        "Oj",
+        "O5"
+    ],
+    "Ô": [
+        "Oo",
+        "O6"
+    ],
+    "Ố": [
+        "Oos",
+        "Ô1",
+        "Ôs",
+        "O61"
+    ],
+    "Ồ": [
+        "Oof",
+        "Ô2",
+        "Ôf",
+        "O62"
+    ],
+    "Ớ": [
+        "Ows",
+        "Ơ1",
+        "Ơs",
+        "O71"
+    ],
+    "Ờ": [
+        "Owf",
+        "Ơ2",
+        "Ơf",
+        "O72"
+    ],
+    "Ở": [
+        "Owr",
+        "Ơ3",
+        "Ơr",
+        "O73"
+    ],
+    "Ỡ": [
+        "Owx",
+        "Ơ4",
+        "Ơx",
+        "O74"
+    ],
+    "Ợ": [
+        "Owj",
+        "Ơ5",
+        "Ơj",
+        "O75"
+    ],
+    "É": [
+        "Es",
+        "E1"
+    ],
+    "È": [
+        "Ef",
+        "E2"
+    ],
+    "Ẻ": [
+        "Er",
+        "E3"
+    ],
+    "Ẽ": [
+        "Ex",
+        "E4"
+    ],
+    "Ẹ": [
+        "Ej",
+        "E5"
+    ],
+    "Ê": [
+        "Ee",
+        "E6"
+    ],
+    "Ế": [
+        "Ees",
+        "Ê1",
+        "Ês",
+        "E61"
+    ],
+    "Ề": [
+        "Eef",
+        "Ê2",
+        "Êf",
+        "E62"
+    ],
+    "Ể": [
+        "Eer",
+        "Ê3",
+        "Êr",
+        "E63"
+    ],
+    "Ễ": [
+        "Eex",
+        "Ê4",
+        "Êx",
+        "E64"
+    ],
+    "Ệ": [
+        "Eej",
+        "Ê5",
+        "Êj",
+        "E65"
+    ],
+    "Ú": [
+        "Us",
+        "U1"
+    ],
+    "Ù": [
+        "Uf",
+        "U2"
+    ],
+    "Ủ": [
+        "Ur",
+        "U3"
+    ],
+    "Ũ": [
+        "Ux",
+        "U4"
+    ],
+    "Ụ": [
+        "Uj",
+        "U5"
+    ],
+    "Ư": [
+        "Uw",
+        "U7"
+    ],
+    "Ứ": [
+        "Uws",
+        "Ư1",
+        "Ưs",
+        "U71"
+    ],
+    "Ừ": [
+        "Uwf",
+        "Ư2",
+        "Ưf",
+        "U72"
+    ],
+    "Ử": [
+        "Uwr",
+        "Ư3",
+        "Ưr",
+        "U73"
+    ],
+    "Ữ": [
+        "Uwx",
+        "Ư4",
+        "Ưx",
+        "U74"
+    ],
+    "Ự": [
+        "Uwj",
+        "Ư5",
+        "Ưj",
+        "U75"
+    ],
+    "Í": [
+        "Is",
+        "I1"
+    ],
+    "Ì": [
+        "If",
+        "I2"
+    ],
+    "Ỉ": [
+        "Ir",
+        "I3"
+    ],
+    "Ị": [
+        "Ij",
+        "I5"
+    ],
+    "Ĩ": [
+        "Ix",
+        "I4"
+    ],
+    "Ý": [
+        "Ys",
+        "Y1"
+    ],
+    "Ỳ": [
+        "Yf",
+        "Y2"
+    ],
+    "Ỷ": [
+        "Yr",
+        "Y3"
+    ],
+    "Ỵ": [
+        "Yj",
+        "Y5"
+    ],
+    "Đ": [
+        "Dd",
+        "D9"
+    ]
+}

dataset/prepare_dataset.py ADDED Viewed

	@@ -0,0 +1,310 @@

+from vocab import Vocab
+from noise import SynthesizeData
+import os
+import sys
+import ray
+import re
+import time
+from datetime import datetime as dt
+sys.path.append("..")
+import numpy as np
+from params import PERCENT_NOISE, NUM_CPUS, NUM_PROCESSES
+from utils.logger import get_logger
+from viet_text_tools import normalize_diacritics
+from transformers import AutoTokenizer
+CHAR_TRANSFORMER_MAX_SEQ_LEN = 512  # sequence-length cap; its consumer is outside this excerpt (presumably the character-level model input limit, TODO confirm)
+tokenizer = AutoTokenizer.from_pretrained("vinai/bartpho-word-base", use_fast=False)  # loaded once at import time; use_fast=False asks for the slow tokenizer implementation
+logger = get_logger("./log/prepare_data.log")  # project helper from utils.logger; presumably logs to this path relative to the working directory (verify)
+@ray.remote
+class PrepareActor(object):
+    def __init__(self, id, lang, data_root='../data', corpus="binhbq") -> None:
+        self.data_root, self.lang, self.corpus = data_root, lang, corpus
+        self.id = id
+        self.data_dir = f'{data_root}/{corpus}'
+    def open_files(self):
+        self.train_noise_file_name = f'{self.corpus}.train.noise'  + str(self.id)
+        self.train_file_name =  f'{self.corpus}.train'  + str(self.id)
+        self.train_onehot_file_name = f'{self.corpus}.onehot.train'  + str(self.id)
+        self.train_length_file_name = f'{self.corpus}.length.train' + str(self.id)
+        self.train_file_path = self.data_dir + '/' + self.train_file_name
+        self.train_noise_file_path = self.data_dir + '/' + self.train_noise_file_name
+        self.train_onehot_file_path = self.data_dir + '/' + self.train_onehot_file_name
+        self.train_length_file_path = self.data_dir + '/' + self.train_length_file_name
+        self.train_file = open(self.train_file_path, 'w', encoding='utf-8')
+        self.train_noise_file = open(self.train_noise_file_path, 'w', encoding='utf-8')
+        self.train_onehot_file = open(self.train_onehot_file_path, 'w', encoding='utf-8')
+        self.train_length_file = open(self.train_length_file_path, 'w', encoding='utf-8')
+        self.valid_file_name =  f'{self.corpus}.valid'  + str(self.id)
+        self.valid_noise_file_name =  f'{self.corpus}.valid.noise'  + str(self.id)
+        self.valid_onehot_file_name = f'{self.corpus}.onehot.valid'  + str(self.id)
+        self.valid_length_file_name = f'{self.corpus}.length.valid'  + str(self.id)
+        self.valid_file_path = self.data_dir + '/' + self.valid_file_name
+        self.valid_noise_file_path = self.data_dir + '/' + self.valid_noise_file_name
+        self.valid_onehot_file_path = self.data_dir + '/' + self.valid_onehot_file_name
+        self.valid_length_file_path = self.data_dir + '/' + self.valid_length_file_name
+        self.valid_file = open(self.valid_file_path, 'w', encoding='utf-8')
+        self.valid_noise_file = open(self.valid_noise_file_path, 'w', encoding='utf-8')
+        self.valid_onehot_file = open(self.valid_onehot_file_path, 'w', encoding='utf-8')
+        self.valid_length_file = open(self.valid_length_file_path, 'w', encoding='utf-8')
+        self.test_file_name =  f'{self.corpus}.test'  + str(self.id)
+        self.test_noise_file_name =  f'{self.corpus}.test.noise'  + str(self.id)
+        self.test_onehot_file_name = f'{self.corpus}.onehot.test'  + str(self.id)
+        self.test_length_file_name = f'{self.corpus}.length.test'  + str(self.id)
+        self.test_file_path = self.data_dir + '/' + self.test_file_name
+        self.test_noise_file_path = self.data_dir + '/' + self.test_noise_file_name
+        self.test_onehot_file_path = self.data_dir + '/' + self.test_onehot_file_name
+        self.test_length_file_path = self.data_dir + '/' + self.test_length_file_name
+        self.test_file = open(self.test_file_path, 'w', encoding='utf-8')
+        self.test_noise_file = open(self.test_noise_file_path, 'w', encoding='utf-8')
+        self.test_onehot_file = open(self.test_onehot_file_path, 'w', encoding='utf-8')
+        self.test_length_file = open(self.test_length_file_path, 'w', encoding='utf-8')
+    def close_files(self):
+        if self.train_noise_file:
+            self.train_noise_file.close()
+        if self.train_onehot_file:
+            self.train_onehot_file.close()
+        if self.train_length_file:
+            self.train_length_file.close()
+        if self.train_file:
+            self.train_file.close()
+        if self.test_noise_file:
+            self.test_noise_file.close()
+        if self.test_onehot_file:
+            self.test_onehot_file.close()
+        if self.test_length_file:
+            self.test_length_file.close()
+        if self.test_file:
+            self.test_file.close()
+        if self.valid_noise_file:
+            self.valid_noise_file.close()
+        if self.valid_onehot_file:
+            self.valid_onehot_file.close()
+        if self.valid_length_file:
+            self.valid_length_file.close()
+        if self.valid_file:
+            self.valid_file.close()
+    def prepare_subword_sents_and_vocab(self, lines: ray.data.Dataset):
+        vocab = Vocab(self.lang)
+        self.subword_sents = []
+        print(f"{dt.now()} PrepareActor[{self.id}].prepare_sublist_and_vocab() BEGIN...")
+        for line in lines.iter_rows():
+            line = line.strip("\n")
+            words = line.split(" ")
+            ###
+            if len(words) > 150:
+                splited_lines = re.split("[.;]+", line)
+                for splited_line in splited_lines:
+                    words = splited_line.split(" ")
+                    if len(words) < 10 or len(words) > 150:
+                        continue
+                    words = [normalize_diacritics(word) for word in words]
+                    vocab.update_subword_freq(words)
+                    splited_line = " ".join(words)
+                    self.subword_sents.append(splited_line)
+                continue
+            ###
+            if len(words) < 10:
+                continue
+            words = [normalize_diacritics(word) for word in words]
+            line = " ".join(words)
+            vocab.update_subword_freq(words)
+            self.subword_sents.append(line)
+        print(f"{dt.now()} PrepareActor[{self.id}].prepare_sublist_and_vocab() COMPLETED...")
+        return vocab
+    def gen_noised_and_onehot(self, noiser:SynthesizeData = None):
+        print(f"{dt.now()} PrepareActor[{self.id}].gen_training_data() BEGIN...")
+        self.open_files()
+        logger = get_logger(f"log/prepare_data_worker{self.id}.log")
+        assert noiser != None
+        self.noiser = noiser
+        np.random.seed(2001)
+        np.random.shuffle(self.subword_sents)
+        train_examples = 0
+        #### Train 0.89 Valid 0.01 Test 0.10
+        max_train_examples = int(0.89 * len(self.subword_sents))
+        max_valid_examples = int(0.90 * len(self.subword_sents))
+        for line in self.subword_sents:
+            train_examples += 1
+            if train_examples < max_train_examples:
+                data_for = "train"
+            elif train_examples < max_valid_examples:
+                data_for = "valid"
+            else:
+                data_for = "test"
+            if len(line) > (CHAR_TRANSFORMER_MAX_SEQ_LEN - 2):
+                continue
+            normal_noise, normal_onehot = self.noiser.add_normal_noise(
+                line, percent_err=PERCENT_NOISE)
+            split_merge_noise, split_merge_onehot = self.noiser.add_split_merge_noise(
+                line, percent_err=PERCENT_NOISE, percent_normal_err=PERCENT_NOISE)
+            la = len(normal_noise)
+            lb = len(split_merge_noise)
+            if la > (CHAR_TRANSFORMER_MAX_SEQ_LEN - 2):
+                logger.log(f"INFO:  Noised longer than Transformer's max limit (NORMAL NOISE).")
+                logger.log(f"TEXT: {normal_noise}")
+                continue
+            if lb > (CHAR_TRANSFORMER_MAX_SEQ_LEN - 2):
+                logger.log(f"INFO:  Noised longer than Transformer's max limit (SPLIT MERGE NOISE).")
+                logger.log(f"TEXT: {split_merge_noise}")
+                continue
+            if data_for == "train":
+                self.train_noise_file.write(normal_noise + '\n')
+                self.train_noise_file.write(split_merge_noise + '\n')
+                self.train_onehot_file.write(normal_onehot + '\n')
+                self.train_onehot_file.write(split_merge_onehot + '\n')
+                self.train_file.write(line + "\n")
+                self.train_length_file.write(str(la) + "\n")
+                self.train_length_file.write(str(lb) + "\n")
+            elif data_for == "test":
+                self.test_noise_file.write(normal_noise + '\n')
+                self.test_noise_file.write(split_merge_noise + '\n')
+                self.test_onehot_file.write(normal_onehot + '\n')
+                self.test_onehot_file.write(split_merge_onehot + '\n')
+                self.test_file.write(line + "\n")
+                self.test_length_file.write(str(la) + "\n")
+                self.test_length_file.write(str(lb) + "\n")
+            else:
+                self.valid_noise_file.write(normal_noise + '\n')
+                self.valid_noise_file.write(split_merge_noise + '\n')
+                self.valid_onehot_file.write(normal_onehot + '\n')
+                self.valid_onehot_file.write(split_merge_onehot + '\n')
+                self.valid_file.write(line + "\n")
+                self.valid_length_file.write(str(la) + "\n")
+                self.valid_length_file.write(str(lb) + "\n")
+        print(f"{dt.now()} PrepareActor[{self.id}].gen_training_data() COMPLETED...")
+        self.close_files()
+class PrepareDataset:
+    def __init__(self, data_root='../data', lang='vi', corpus='binhvq'):
+        self.data_root, self.lang, self.corpus = data_root, lang, corpus
+        self.data_dir = f'{data_root}/{corpus}'
+        self.vocab = Vocab(self.lang)
+        # Number of CPUS
+        self.MAX_CPUS = 12
+        self.NUM_CPUS = NUM_CPUS if NUM_CPUS < self.MAX_CPUS else self.MAX_CPUS
+        ray.init(num_cpus=NUM_CPUS)
+        print(f"{dt.now()} PrepareDataset: Initiating {NUM_PROCESSES} PrepareActor")
+        self.actors = [PrepareActor.remote(i, lang, self.data_root, self.corpus) for i in range(NUM_PROCESSES)]
+        self.vocab_pickle_name = f'{self.corpus}.vocab.pkl'
+        self.vocab_pickle_path = self.data_dir + '/' + self.vocab_pickle_name
+        self.vocab_dict_name = f'{self.corpus}.dict.txt'
+        self.vocab_dict_path = self.data_dir + '/' + self.vocab_dict_name
+    def build_vocab_and_subwords(self, ray_ds: ray.data.Dataset):
+        print(f"{dt.now()} PrepareDataset.build_vocab_and_subwords()")
+        shards = ray_ds.split(n = NUM_PROCESSES)
+        subword_and_vocab_refs = [actor.prepare_subword_sents_and_vocab.remote(
+            shard) for actor, shard in zip(self.actors, shards)]
+        subwords_and_vocabs = ray.get(subword_and_vocab_refs)
+        # Return results is vocab
+        for i in range(NUM_PROCESSES):
+            self.vocab.merge_sub_vocabs(subwords_and_vocabs[i])
+    def build_noised_and_onehot(self):
+        print(f"{dt.now()} PrepareDataset.build_noised_and_onehot.remote() BEGIN...")
+        noiser = SynthesizeData(self.vocab)
+        noised_and_onehot_refs = [actor.gen_noised_and_onehot.remote(noiser) \
+            for actor in self.actors]
+        _ = ray.get(noised_and_onehot_refs)
+        print(f"{dt.now()} PrepareDataset.build_noised_and_onehot.remote() COMPLETE !!!")
+        print(f"{dt.now()} PrepareDataset.build_noised_and_onehot(): Writing to noised and onehot files!!!")
+    def prepare_data(self, in_file_name='vi_wiki.data.txt'):
+        print(f"{dt.now()} PrepareDataset.prepare_data(): open_files()")
+        self.in_file_path = self.data_dir + '/' + in_file_name
+        if not os.path.exists(self.in_file_path):
+            print(f"{dt.now()} PrepareDataset.prepare_data(): Cannot find input file!!!")
+            print(f'File path: {self.in_file_path}')
+            return
+        print(f"{dt.now()} PrepareDataset.prepare_data(): Processing file part by part ...")
+        with open(self.in_file_path, 'r', encoding='utf-8') as ifile:
+            lines = ifile.readlines()
+        ray_ds = ray.data.from_items(lines)
+        del lines
+        print(f"{dt.now()} PrepareDataset.prepare_data(): Building Vocabulary...")
+        self.build_vocab_and_subwords(ray_ds)
+        self.vocab.build_vocab(topk=100000)
+        print(f"{dt.now()} PrepareDataset.prepare_data(): Writing Vocabulary to text file...")
+        self.vocab.save_dict_text(self.vocab_dict_path)
+        print(f"{dt.now()} PrepareDataset.prepare_data(): Writing Vocabulary to pickle file...")
+        self.vocab.save_vocab_dict(self.vocab_pickle_path)
+        print(f"{dt.now()} PrepareDataset.prepare_data(): Gen train noised and onehot...")
+        self.build_noised_and_onehot()
+        print(f"{dt.now()} PrepareDataset - Complete preparing dataset!!!")
+if __name__ == "__main__":
+    import argparse
+    description = '''
+        prepare_dataset.py:
+        Usage: python prepare_dataset.py --dataset vi_wiki --file vi_wiki.data.txt --test False
+    '''
+    parser = argparse.ArgumentParser(description=description)
+    parser.add_argument('--file', type=str, default='corpus-small.txt')
+    parser.add_argument('--corpus', type=str, default='binhvq')
+    parser.add_argument('--data_root', type=str, default="../data")
+    args = parser.parse_args()
+    creater = PrepareDataset(data_root = args.data_root, corpus=args.corpus)
+    start_time = time.time()
+    creater.prepare_data(args.file)
+    end_time = time.time()
+    print(f"Time consumed for generate data: {end_time - start_time}")

dataset/prepare_vsec.py ADDED Viewed

	@@ -0,0 +1,46 @@

+import json
+from tqdm import tqdm
+import sys
+from viet_text_tools import normalize_diacritics
+sys.path.append("..")
+from utils.logger import get_logger
+import re
+vsec_path = "../data/vsec/VSEC.jsonl"
+test_file = open("../data/vsec/vsec.test", "w+")
+test_noise_file = open("../data/vsec/vsec.test.noise", "w+")
+with open(vsec_path, "r") as file:
+    data = [json.loads(x[0:-1]) for x in file.readlines()]
+def get_true_text(sentence: dict):
+    true_tokens = []
+    for word in sentence['annotations']:
+        if word['is_correct'] == True:
+            true_tokens.append(word['current_syllable'])
+        else:
+            true_tokens.append(word['alternative_syllables'][0])
+    true_sentence =  " ".join(true_tokens)
+    words = re.findall("\w+|[^\w\s]{1}", true_sentence)
+    return " ".join(words)
+def get_noise_text(sentence: dict):
+    noised_tokens = []
+    for word in sentence['annotations']:
+        noised_tokens.append(word['current_syllable'])
+    noised_sentence = " ".join(noised_tokens)
+    words = re.findall("\w+|[^\w\s]{1}", noised_sentence)
+    noised_tokens = []
+    for word in words:
+        new_word = normalize_diacritics(word)
+        noised_tokens.append(new_word)
+    return " ".join(noised_tokens)
+for sentence in tqdm(data):
+    true_text = get_true_text(sentence)
+    noised_text = get_noise_text(sentence)
+    test_file.write(true_text + "\n")
+    test_noise_file.write(noised_text + "\n")
+test_file.close()
+test_noise_file.close()

dataset/util.py ADDED Viewed

	@@ -0,0 +1,128 @@

+import os
+import numpy as np
+from tqdm import tqdm
+from math import ceil
+def load_vsec_dataset(base_path, corr_file, incorr_file):
+    # load files
+    if base_path:
+        assert os.path.exists(base_path) == True
+    incorr_data = []
+    opfile1 = open(os.path.join(base_path, incorr_file), "r", encoding="utf-8")
+    for line in opfile1:
+        if line.strip() != "":
+            incorr_data.append(line.strip())
+    opfile1.close()
+    corr_data = []
+    opfile2 = open(os.path.join(base_path, corr_file), "r", encoding="utf-8")
+    for line in opfile2:
+        if line.strip() != "":
+            corr_data.append(line.strip())
+    opfile2.close()
+    assert len(incorr_data) == len(corr_data)
+    data = []
+    for x, y in zip(corr_data, incorr_data):
+        data.append((x, y))
+    print(f"loaded tuples of (incorr, corr) examples from {base_path}")
+    return data
+def load_dataset(base_path, corr_file, incorr_file, length_file = None):
+    # load files
+    if base_path:
+        assert os.path.exists(base_path) == True
+    data = []
+    opfile2 = open(os.path.join(base_path, corr_file), "r", encoding="utf-8")
+    for line in tqdm(opfile2):
+        if line.strip() != "":
+            data.append([line.strip()])
+            data.append([line.strip()])
+    opfile2.close()
+    opfile1 = open(os.path.join(base_path, incorr_file), "r", encoding="utf-8")
+    for i, line in tqdm(enumerate(opfile1)):
+        if line.strip() != "":
+            data[i].append(line.strip())
+    opfile1.close()
+    opfile4 = open(os.path.join(base_path, length_file), "r", encoding="utf-8")
+    for i, line in tqdm(enumerate(opfile4)):
+        if line.strip() != "":
+            data[i].append(int(line))
+    opfile4.close()
+    print(f"loaded tuples of (incorr, corr, length) examples from {base_path}")
+    return data
+def load_epoch_dataset(base_path, corr_file, incorr_file, length_file, epoch: int, num_epoch: int):
+    """Load the window of the dataset that belongs to one epoch.
+    The number of lines of ``length_file`` is divided into ``num_epoch``
+    windows of ``ceil(count / num_epoch)`` example slots; window ``epoch``
+    (1-based) covers the slots ``[start_index, end_index)``. Every non-blank
+    line of ``corr_file`` fills two consecutive slots, while rows of
+    ``incorr_file`` and ``length_file`` are addressed by their line index.
+    NOTE(review): presumably the noise and length files hold exactly one row
+    per slot - confirm against the code that writes them.
+    Returns:
+        A list of lists; each record starts with the clean text and gets the
+        noised text and the integer length appended.
+    Raises:
+        AssertionError: if ``base_path`` does not exist or the epoch
+            arguments are out of range.
+    """
+    # load files
+    if base_path:
+        assert os.path.exists(base_path) == True
+    assert num_epoch >= 1
+    assert epoch >= 1 and epoch <= num_epoch
+    ## Count number of data
+    opfile = open(os.path.join(base_path, length_file), "r", encoding="utf-8")
+    count = 0
+    for i, line in tqdm(enumerate(opfile)):
+        count +=1
+    opfile.close()
+    print(f"Number of training datas: {count} examples!")
+    # Window size in example slots; the last epoch may get fewer examples.
+    epochdataset_examples = int(ceil(1 / num_epoch * count))
+    start_index = epochdataset_examples * (epoch - 1)
+    end_index = start_index + epochdataset_examples
+    data = []
+    opfile2 = open(os.path.join(base_path, corr_file), "r", encoding="utf-8")
+    # traverse_count counts example slots here: two per non-blank clean line.
+    traverse_count = 0
+    for i, line in tqdm(enumerate(opfile2)):
+        if line.strip() != "":
+            # First slot of this clean line.
+            if traverse_count >= start_index and traverse_count < end_index :
+                data.append([line.strip()])
+                traverse_count += 1
+            elif traverse_count >= end_index:
+                break
+            else:
+                traverse_count += 1
+            # Second slot of the same clean line; it is tested separately
+            # because a window boundary can fall between the two slots.
+            if traverse_count >= start_index and traverse_count < end_index :
+                data.append([line.strip()])
+                traverse_count += 1
+            elif traverse_count >= end_index:
+                break
+            else:
+                traverse_count += 1
+    opfile2.close()
+    opfile1 = open(os.path.join(base_path, incorr_file), "r", encoding="utf-8")
+    # From here on traverse_count advances once per line, blank or not, so it
+    # stays equal to the line index ``i``.
+    traverse_count = 0
+    for i, line in tqdm(enumerate(opfile1)):
+        if line.strip() != "":
+            if traverse_count >= start_index and traverse_count < end_index :
+                # Shift the file-wide line index into the window-local index.
+                data[i - start_index].append(line.strip())
+            elif traverse_count >= end_index:
+                break
+        traverse_count += 1
+    opfile1.close()
+    traverse_count = 0
+    opfile4 = open(os.path.join(base_path, length_file), "r", encoding="utf-8")
+    for i, line in tqdm(enumerate(opfile4)):
+        if line.strip() != "":
+            if traverse_count >= start_index and traverse_count < end_index :
+                data[i - start_index].append(int(line))
+            elif traverse_count >= end_index:
+                break
+        traverse_count += 1
+    opfile4.close()
+    # NOTE(review): records are built as [corr, incorr, length]; the message
+    # below lists the first two fields in the opposite order.
+    print(f"loaded tuples of (incorr, corr, length) examples from {base_path}")
+    return data

dataset/vocab.py ADDED Viewed

	@@ -0,0 +1,188 @@

+from __future__ import annotations
+import pickle
+import re
+import os
+import sys
+sys.path.append("..")
+from params import *
+class Vocab():
+    def __init__(self, lang='vi'):
+        self.not_alphabet_regex = '''[^aAàÀảẢãÃáÁạẠăĂằẰẳẲẵẴắẮặẶâÂầẦẩẨẫẪấẤậẬbBcCdDđĐeEèÈẻẺẽẼéÉẹẸêÊềỀểỂễỄếẾệỆfFgGhHiIìÌỉỈĩĨíÍịỊjJkKlLmMnNoOòÒỏỎõÕóÓọỌôÔồỒổỔỗỖốỐộỘơƠờỜởỞỡỠớỚợỢpPqQrRsStTuUùÙủỦũŨúÚụỤưƯừỪửỬữỮứỨựỰvVwWxXyYỳỲỷỶỹỸýÝỵỴzZ0123456789!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ ]'''
+        self.lang = lang
+        self.token_freq_list = []
+        self.token_freq, self.token2idx, self.idx2token = {}, {}, {}
+        self.pad_token = "<<PAD>>"
+        self.unk_token = "<<UNK>>"
+        self.sub_token = "<<SUB>>"
+        self.eos_token = "<<EOS>>"
+        self.chartoken2idx, self.idx2chartoken = {}, {}
+        self.char_unk_token, self.char_pad_token, self.char_start_token, self.char_end_token = \
+            "<<CHAR_UNK>>", "<<CHAR_PAD>>", "<<CHAR_START>>", "<<CHAR_END>>"
+        self.char_space_token = "<<CHAR_SPACE>>"
+    def set_lang(self, lang):
+        self.lang = lang
+    def exist(self, word):
+        return word in self.token2idx
+    def update_subword_freq(self, subwords: list):
+        for subword in subwords:
+            if not subword.isdigit():
+                if re.search(self.not_alphabet_regex, subword):
+                    continue
+                if subword not in self.token_freq:
+                    self.token_freq[subword] = 0
+                self.token_freq[subword] += 1
+    def merge_sub_vocabs(self, vocab: Vocab):
+        for subword in vocab.token_freq:
+            if subword not in self.token_freq:
+                self.token_freq[subword] = 0
+            self.token_freq[subword] += vocab.token_freq[subword]
+    def insert_special_tokens(self):
+        # add <<PAD>> special token
+        self.pad_token_idx = len(self.token2idx)
+        self.token2idx[self.pad_token] = self.pad_token_idx
+        self.idx2token[self.pad_token_idx] = self.pad_token
+        # add <<SUB>> special token
+        self.sub_token_idx = len(self.token2idx)
+        self.token2idx[self.sub_token] = self.sub_token_idx
+        self.idx2token[self.sub_token_idx] = self.sub_token
+        # add <<UNK>> special token
+        self.unk_token_idx = len(self.token2idx)
+        self.token2idx[self.unk_token] = self.unk_token_idx
+        self.idx2token[self.unk_token_idx] = self.unk_token
+        # add <<EOS>> special token
+        self.eos_token_idx = len(self.token2idx)
+        self.token2idx[self.eos_token] = self.eos_token_idx
+        self.idx2token[self.eos_token_idx] = self.eos_token
+    def insert_dicts(self, build_char_vocab=True):
+        for (token, _) in self.token_freq_list:
+            idx = len(self.token2idx)
+            self.idx2token[idx] = token
+            self.token2idx[token] = idx
+        self.insert_special_tokens()
+        print(f"Total Vocab's size: {len(self.token2idx)}")
+        self.vocab_dict = {"token2idx": self.token2idx,
+                           "idx2token": self.idx2token}
+        # load_char_tokens
+        if build_char_vocab:
+            print("loading character tokens")
+            self.get_char_tokens()
+    def build_vocab(self,  topk=100000, build_char_vocab=True):
+        # retain only topk tokens
+        if topk is not None:
+            sorted_ = sorted(self.token_freq.items(),
+                             key=lambda item: item[1], reverse=True)
+            self.token_freq_list = sorted_[:topk]
+            print(f"Total tokens retained: {len(self.token_freq_list)}")
+        self.insert_dicts(build_char_vocab)
+    def build_vocab_from_text(self, path_: str, build_char_vocab=True):
+        if not os.path.exists(path_):
+            print(f"Vocab: Cannot find dict file: {path_}")
+        else:
+            print("Building vocab from vocab dict file!")
+            with open(path_, 'r') as dict_file:
+                for line in dict_file:
+                    token_freq = line.split()
+                    if token_freq[0] not in [self.pad_token, self.sub_token, self.unk_token, self.eos_token]:
+                        try:
+                            self.token_freq_list.append((token_freq[0], token_freq[1]))
+                        except:
+                            print(line)
+            self.insert_dicts(build_char_vocab)
+    def load_vocab_dict(self, path_: str):
+        """
+        path_: path where the vocab pickle file is saved
+        """
+        with open(path_, 'rb') as fp:
+            self.vocab_dict = pickle.load(fp)
+            self.token2idx = self.vocab_dict['token2idx']
+            self.idx2token = self.vocab_dict['idx2token']
+            self.chartoken2idx = self.vocab_dict['chartoken2idx']
+            self.idx2chartoken = self.vocab_dict['idx2chartoken']
+            self.pad_token_idx = self.token2idx[self.pad_token]
+            self.sub_token_idx = self.token2idx[self.sub_token]
+            self.unk_token_idx = self.token2idx[self.unk_token]
+            self.char_unk_token_idx = self.chartoken2idx[self.char_unk_token]
+    def save_vocab_dict(self, path_: str):
+        """
+        path_: path where the vocab pickle file to be saved
+        vocab_: the dict data
+        """
+        with open(path_, 'wb') as fp:
+            pickle.dump(self.vocab_dict, fp, protocol=pickle.HIGHEST_PROTOCOL)
+        return
+    def save_dict_text(self, path_):
+        with open(path_, 'w', encoding='utf-8') as ofile:
+            print("len(self.token_freq_list): ", len(self.token_freq_list))
+            for (subword, fre) in self.token_freq_list:
+                ofile.write(f'{subword} {fre}\n')
+            ofile.write(f'{self.pad_token} -1\n')
+            ofile.write(f'{self.sub_token} -1\n')
+            ofile.write(f'{self.unk_token} -1\n')
+            ofile.write(f'{self.eos_token} -1\n')
+    def get_char_tokens(self):
+        special_tokens = [self.char_pad_token, self.char_start_token,
+                            self.char_end_token, self.char_unk_token,
+                            self.char_space_token]
+        for char in special_tokens:
+            idx = len(self.chartoken2idx)
+            self.chartoken2idx[char] = idx
+            self.idx2chartoken[idx] = char
+        if self.lang == 'vi':
+            chars = list(
+                '''aAàÀảẢãÃáÁạẠăĂằẰẳẲẵẴắẮặẶâÂầẦẩẨẫẪấẤậẬbBcCdDđĐeEèÈẻẺẽẼéÉẹẸêÊềỀểỂễỄếẾệỆfFgGhHiIìÌỉỈĩĨíÍịỊjJkKlLmMnNoOòÒỏỎõÕóÓọỌôÔồỒổỔỗỖốỐộỘơƠờỜởỞỡỠớỚợỢpPqQrRsStTuUùÙủỦũŨúÚụỤưƯừỪửỬữỮứỨựỰvVwWxXyYỳỲỷỶỹỸýÝỵỴzZ0123456789!"#$%&'()*+,-./:;<=>?@[]^_`{|}~''')
+        else:
+            chars = list(
+                '''aAbBcCdDeEfFgGhHiIjJkKlLmMnNoOpPqQrRsStTuUvVwWxXyYzZ0123456789,;.!?:'"/\_@#$%^&*~`+-=<>()[]{|}''')
+        for char in chars:
+            if char not in self.chartoken2idx:
+                idx = len(self.chartoken2idx)
+                self.chartoken2idx[char] = idx
+                self.idx2chartoken[idx] = char
+        print(f"number of unique chars found: {len(self.chartoken2idx)}")
+        self.vocab_dict["chartoken2idx"] = self.chartoken2idx
+        self.vocab_dict["idx2chartoken"] = self.idx2chartoken

models/__init__.py ADDED Viewed

File without changes

models/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (151 Bytes). View file

models/__pycache__/collator.cpython-310.pyc ADDED Viewed

Binary file (2.72 kB). View file

models/__pycache__/corrector.cpython-310.pyc ADDED Viewed

Binary file (5.21 kB). View file

models/__pycache__/model.cpython-310.pyc ADDED Viewed

Binary file (1.07 kB). View file

models/__pycache__/sampler.cpython-310.pyc ADDED Viewed

Binary file (3.83 kB). View file

models/__pycache__/tokenizer.cpython-310.pyc ADDED Viewed

Binary file (2.56 kB). View file

models/__pycache__/transformer.cpython-310.pyc ADDED Viewed

Binary file (2.29 kB). View file

models/__pycache__/util.cpython-310.pyc ADDED Viewed

Binary file (1.48 kB). View file

models/collator.py ADDED Viewed

	@@ -0,0 +1,78 @@

+from abc import abstractmethod
+from models.tokenizer import TokenAligner
+class PTCollator():
+    def __init__(self, tokenAligner: TokenAligner):
+        self.tokenAligner = tokenAligner
+    def collate(self, dataloader_batch, type = "train") -> dict:
+        if type == "train":
+            return self.collate_train(dataloader_batch)
+        elif type == "test":
+            return self.collate_test(dataloader_batch)
+        elif type == "correct":
+            return self.collate_correct(dataloader_batch)
+    @abstractmethod
+    def collate_train(self, dataloader_batch):
+        pass
+    @abstractmethod
+    def collate_test(self, dataloader_batch):
+        pass
+    @abstractmethod
+    def collate_correct(self, dataloader_batch):
+        pass
+class DataCollatorForCharacterTransformer(PTCollator):
+    def __init__(self, tokenAligner: TokenAligner):
+        super().__init__(tokenAligner)
+    def collate_train(self, dataloader_batch):
+        noised, labels = [], []
+        for sample in dataloader_batch:
+            labels.append(sample[0])
+            noised.append(sample[1])
+        batch_srcs, batch_tgts, batch_lengths, batch_attention_masks = self.tokenAligner.tokenize_for_transformer_with_tokenization(noised, labels)
+        data = dict()
+        data['batch_src'] = batch_srcs
+        data['batch_tgt'] = batch_tgts
+        data['attn_masks'] = batch_attention_masks
+        data['lengths'] = batch_lengths
+        return data
+    def collate_test(self, dataloader_batch):
+        noised, labels = [], []
+        for sample in dataloader_batch:
+            labels.append(sample[0])
+            noised.append(sample[1])
+        batch_srcs, batch_attention_masks = self.tokenAligner.tokenize_for_transformer_with_tokenization(noised, None)
+        data = dict()
+        data['batch_src'] = batch_srcs
+        data['noised_texts'] = noised
+        data['label_texts'] = labels
+        data['attn_masks'] = batch_attention_masks
+        return data
+    def collate_correct(self, dataloader_batch):
+        noised, labels = [], []
+        for sample in dataloader_batch:
+            noised.append(sample[1])
+        batch_srcs, batch_attention_masks= self.tokenAligner.tokenize_for_transformer_with_tokenization(noised)
+        data = dict()
+        data['batch_src'] = batch_srcs
+        data['noised_texts'] = noised
+        data['attn_masks'] = batch_attention_masks
+        return data

models/corrector.py ADDED Viewed

	@@ -0,0 +1,170 @@

+import torch
+import time
+from torch.utils.data import DataLoader
+from datetime import datetime as dt
+from params import *
+from models.model import ModelWrapper
+from utils.metrics import get_mned_metric_from_TruePredict, get_metric_from_TrueWrongPredictV3
+from utils.logger import get_logger
+from models.sampler import RandomBatchSampler, BucketBatchSampler
+from termcolor import colored
+import re
+class Corrector:
+    def __init__(self, model_wrapper: ModelWrapper):
+        self.model_name = model_wrapper.model_name
+        self.model = model_wrapper.model
+        self.model_wrapper = model_wrapper
+        self.logger = get_logger("./log/test.log")
+        self.device = DEVICE
+        self.model.to(self.device)
+        self.logger.log(f"Device: {self.device}")
+        self.logger.log("Loaded model")
+    def correct_transfomer_with_tr(self, batch, num_beams = 2):
+        correction = dict()
+        original = batch
+        splits = re.findall("\w\S+\w|\w+|[^\w\s]{1}", batch)
+        batch = " ".join(splits)
+        with torch.no_grad():
+            self.model.eval()
+            batch_infer_start = time.time()
+            batch = self.model_wrapper.collator.collate([[None, batch,None, None]], type = "correct")
+            result = self.model.inference(batch['batch_src'], num_beams = num_beams,
+                 tokenAligner=self.model_wrapper.collator.tokenAligner)
+            correction['predict_text'] = result
+            correction['noised_text'] = batch['noised_texts']
+            correction['original_text'] = original
+            total_infer_time = time.time() - batch_infer_start
+            correction['time'] = total_infer_time
+        t = re.sub(r"(\s*)([.,:?!;]{1})(\s*)", r"\2\3", correction['predict_text'][0])
+        t = re.sub(r"((?P<parenthesis>\()\s)", r"\g<parenthesis>", t)
+        t = re.sub(r"(\s(?P<parenthesis>\)))", r"\g<parenthesis>", t)
+        t = re.sub(r"((?P<bracket>\[)\s)", r"\g<bracket>", t)
+        t = re.sub(r"(\s(?P<bracket>\]))", r"\g<bracket>", t)
+        t = re.sub(r"([\'\"])\s(.*)\s([\'\"])", r"\1\2\3", t)
+        correction['predict_text']= re.sub(r"\s(%)", "%", t)
+        return correction
+    def _get_transfomer_with_tr_generations(self, batch, num_beams = 2):
+        correction = dict()
+        with torch.no_grad():
+            self.model.eval()
+            batch_infer_start = time.time()
+            result = self.model.inference(batch['batch_src'], num_beams = num_beams,
+                 tokenAligner=self.model_wrapper.collator.tokenAligner)
+            correction['predict_text'] = result
+            correction['noised_text'] = batch['noised_texts']
+            total_infer_time = time.time() - batch_infer_start
+            correction['time'] = total_infer_time
+        return correction
+    def step(self, batch, num_beams = 2):
+        outputs= self._get_transfomer_with_tr_generations(batch, num_beams)
+        batch_predictions = outputs['predict_text']
+        batch_label_texts = batch['label_texts']
+        batch_noised_texts = batch['noised_texts']
+        return batch_predictions, batch_noised_texts, batch_label_texts
+    def _evaluation_loop_autoregressive(self, data_loader, num_beams = 2):
+        TP, FP, FN = 0, 0, 0
+        MNED = 0.0
+        O_MNED = 0.0
+        total_infer_time = 0.0
+        twp_logger = get_logger(f"./log/true_wrong_predict{time.time()}.log")
+        with torch.no_grad():
+            self.model.eval()
+            for step, batch in enumerate(data_loader):
+                batch_infer_start = time.time()
+                batch_predictions, batch_noised_texts, batch_label_texts = \
+                        self.step(batch, num_beams = num_beams)
+                batch_infer_time = time.time() - batch_infer_start
+                _TP, _FP, _FN = get_metric_from_TrueWrongPredictV3(batch_label_texts, batch_noised_texts, batch_predictions, self.model_wrapper.tokenAligner.vocab, twp_logger)
+                TP += _TP
+                FP += _FP
+                FN += _FN
+                _MNED = get_mned_metric_from_TruePredict(batch_label_texts, batch_predictions)
+                MNED += _MNED
+                _O_MNED = get_mned_metric_from_TruePredict(batch_label_texts, batch_noised_texts)
+                O_MNED += _O_MNED
+                info = '{} - Evaluate - iter: {:08d}/{:08d} - TP: {} - FP: {} - FN: {} - _MNED: {:.5f} - _O_MNED: {:.5f} - {} time: {:.2f}s'.format(
+                    dt.now(),
+                    step,
+                    self.test_iters,
+                    _TP,
+                    _FP,
+                    _FN,
+                    _MNED,
+                    _O_MNED,
+                    self.device,
+                    batch_infer_time)
+                self.logger.log(info)
+                torch.cuda.empty_cache()
+                total_infer_time += time.time() - batch_infer_start
+        return total_infer_time, TP, FP, FN, MNED / len(data_loader), O_MNED / len(data_loader)
+    def evaluate(self, dataset, beams: int = None):
+        def test_collate_wrapper(batch):
+            return self.model_wrapper.collator.collate(batch, type = "test")
+        if not BUCKET_SAMPLING:
+            self.test_sampler = RandomBatchSampler(dataset, VALID_BATCH_SIZE, shuffle = False)
+        else:
+            self.test_sampler = BucketBatchSampler(dataset, shuffle = True)
+        data_loader = DataLoader(dataset=dataset,batch_sampler= self.test_sampler,\
+            collate_fn=test_collate_wrapper)
+        self.test_iters = len(data_loader)
+        assert beams != None
+        total_infer_time, TP, FP, FN, MNED, O_MNED = self._evaluation_loop_autoregressive(data_loader, num_beams = beams)
+        self.logger.log("Total inference time for this data is: {:4f} secs".format(total_infer_time))
+        self.logger.log("###############################################")
+        info = f"Metrics for Auto-Regressive with Beam Search number {beams}"
+        self.logger.log(colored(info, "green"))
+        dc_TP = TP
+        dc_FP = FP
+        dc_FN = FN
+        dc_precision = dc_TP / (dc_TP + dc_FP)
+        dc_recall = dc_TP / (dc_TP + dc_FN)
+        dc_F1 = 2. * dc_precision * dc_recall/ ((dc_precision + dc_recall) + 1e-8)
+        self.logger.log(f"TP: {TP}. FP: {FP}. FN: {FN}")
+        self.logger.log(f"Precision: {dc_precision}")
+        self.logger.log(f"Recall: {dc_recall}")
+        self.logger.log(f"F1: {dc_F1}")
+        self.logger.log(f"MNED: {MNED}")
+        self.logger.log(f"O_MNED: {O_MNED}")
+        return

models/model.py ADDED Viewed

	@@ -0,0 +1,22 @@

+from models.transformer import TransformerWithTR
+from models.collator import *
+from transformers import AutoTokenizer
+import transformers
+from models.tokenizer import TokenAligner
+from dataset.vocab import Vocab
+class ModelWrapper:
+    def __init__(self, model, vocab: Vocab):
+        self.model_name = model
+        if model == "tfmwtr":
+            self.tokenizer = AutoTokenizer.from_pretrained("vinai/bartpho-word-base")
+            self.tokenAligner = TokenAligner(self.tokenizer, vocab)
+            self.bart = transformers.MBartForConditionalGeneration.from_pretrained("vinai/bartpho-word-base")
+            self.model = TransformerWithTR(self.bart, self.tokenizer.pad_token_id)
+            self.collator = DataCollatorForCharacterTransformer(self.tokenAligner)
+            # self.model.resize_token_embeddings(self.tokenAligner)
+        else:
+            raise(Exception(f"Model {model} isn't implemented!"))

models/sampler.py ADDED Viewed

	@@ -0,0 +1,99 @@

+from torch.utils.data.dataloader import Sampler
+import sys
+sys.path.append("..")
+from dataset.autocorrect_dataset import SpellCorrectDataset
+import numpy as np
+from params import RANDOM_SEED, MAXIMUM_TOKENS_PER_BATCH
+import copy
+from tqdm import tqdm
+import time
+class RandomBatchSampler(Sampler):
+    def __init__(self, data: SpellCorrectDataset, batch_size = 1, shuffle = True):
+        self.data = data
+        self.seq = list(range(0, len(self.data)))
+        self.shuffle = shuffle
+        self.iters = 0
+        self.batch_size = batch_size
+        if self.shuffle:
+            np.random.seed(RANDOM_SEED)
+            np.random.shuffle(self.seq)
+        self.seq = [ self.seq[index: index + self.batch_size] \
+            for index in range(self.iters, len(self.seq), self.batch_size)]
+        self.default_seq = copy.deepcopy(self.seq)
+    def __iter__(self):
+        return iter(self.seq)
+    def __len__(self):
+        return len(self.seq)
+    def load_checkpoints(self, iters = 0):
+        self.seq = list(range(0, len(self.data)))
+        if self.shuffle:
+            np.random.seed(RANDOM_SEED)
+            np.random.shuffle(self.seq)
+        self.iters = iters
+        self.seq = [ self.seq[index: index + self.batch_size] \
+            for index in range(self.iters, len(self.seq), self.batch_size)]
+class BucketBatchSampler(Sampler):
+    def __init__(self, data: SpellCorrectDataset, shuffle = True):
+        start = time.time()
+        self.remained_indies = None
+        self.data = data
+        self.shuffle = shuffle
+        print("Initializing Bucket Batch Sampler From Scratch")
+        self.data.dataset = sorted(self.data.dataset, key = lambda x: x[2])
+        token_counts = 0
+        indies_lists = []
+        self.seq = []
+        for index, values in tqdm(enumerate(self.data.dataset)):
+            if token_counts >= MAXIMUM_TOKENS_PER_BATCH:
+                self.seq.append(indies_lists)
+                indies_lists = []
+                token_counts = 0
+            indies_lists.append(index)
+            token_counts += values[2]
+        if len(indies_lists) != 0 and token_counts != 0:
+            self.seq.append(indies_lists)
+        if shuffle:
+            np.random.seed(RANDOM_SEED)
+            np.random.shuffle(self.seq)
+        end = time.time()
+        print(f"Initialized Bucket Batch Sampler From Scratch: {end - start}")
+        self.default_seq = copy.deepcopy(self.seq)
+    def __iter__(self):
+        return iter(self.seq)
+    def __len__(self):
+        return len(self.seq)
+    def load_checkpoints(self, remained_indies):
+        start = time.time()
+        print("Loading Bucket Batch Sampler From Checkpoint")
+        remained_indies = sorted(remained_indies)
+        token_counts = 0
+        indies_lists = []
+        self.seq = []
+        for index in tqdm(remained_indies):
+            values = self.data.dataset[index]
+            if token_counts >= MAXIMUM_TOKENS_PER_BATCH:
+                self.seq.append(indies_lists)
+                indies_lists = []
+                token_counts = 0
+            indies_lists.append(index)
+            token_counts += values[2]
+        if len(indies_lists) != 0 and token_counts != 0:
+            self.seq.append(indies_lists)
+        if self.shuffle:
+            np.random.seed(RANDOM_SEED)
+            np.random.shuffle(self.seq)
+        end = time.time()
+        print(f"Loaded Bucket Batch Sampler From Checkpoint: {end - start}")

models/tokenizer.py ADDED Viewed

	@@ -0,0 +1,55 @@

+import sys
+sys.path.append("..")
+from dataset.vocab import Vocab
+from torch.nn.utils.rnn import pad_sequence
+from transformers import AutoTokenizer
+class TokenAligner():
+    def __init__(self, tokenizer: AutoTokenizer, vocab: Vocab):
+        self.tokenizer = tokenizer
+        self.vocab = vocab
+    """
+    params:
+        text  ----  str
+    """
+    def _char_tokenize(self, text):
+        characters = list(text)
+        tokens = [ token + "@@" if i < len(characters) - 1 and characters[i + 1] != " " else token for i, token in enumerate(characters)]
+        tokens = [token for token in tokens if token not in [" @@", " "]]
+        encoded = self.tokenizer.encode_plus(tokens, return_tensors = "pt")
+        token_ids = encoded['input_ids'].squeeze(0)
+        attn_mask = encoded['attention_mask'].squeeze(0)
+        return tokens, token_ids, attn_mask
+    def char_tokenize(self, batch_texts):
+        doc = dict()
+        doc['tokens'] = []
+        doc['token_ids'] = []
+        doc['attention_mask'] = []
+        for text in batch_texts:
+            tokens, token_ids, attn_mask = self._char_tokenize(text)
+            doc['tokens'].append(tokens)
+            doc['token_ids'].append(token_ids)
+            doc['attention_mask'].append(attn_mask)
+        return doc
+    def tokenize_for_transformer_with_tokenization(self, batch_noised_text, batch_label_texts = None):
+        docs = self.char_tokenize(batch_noised_text)
+        batch_srcs = docs['token_ids']
+        batch_attention_masks = docs['attention_mask']
+        batch_attention_masks = pad_sequence(batch_attention_masks ,
+            batch_first=True, padding_value=0)
+        batch_srcs = pad_sequence(batch_srcs ,
+            batch_first=True, padding_value=self.tokenizer.pad_token_id)
+        if batch_label_texts != None:
+            batch_lengths = [len(self.tokenizer.tokenize(text)) for text in batch_label_texts]
+            batch_tgts = self.tokenizer.batch_encode_plus(batch_label_texts, max_length = 512,
+                    truncation = True, padding=True, return_tensors="pt")['input_ids']
+            return batch_srcs, batch_tgts, batch_lengths, batch_attention_masks
+        return batch_srcs, batch_attention_masks