# spelling-correction/dataset/prepare_vsec.py
# Build the VSEC test split: write the corrected sentences to vsec.test and
# the original (noisy) sentences to vsec.test.noise, one sentence per line.
import json
import re
import sys

from tqdm import tqdm
from viet_text_tools import normalize_diacritics

sys.path.append("..")  # make the project-level utils package importable
from utils.logger import get_logger

vsec_path = "../data/vsec/VSEC.jsonl"
test_file = open("../data/vsec/vsec.test", "w+")
test_noise_file = open("../data/vsec/vsec.test.noise", "w+")

# VSEC.jsonl stores one JSON object (an annotated sentence) per line.
with open(vsec_path, "r") as file:
    data = [json.loads(line) for line in file]
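
# Illustrative shape of a VSEC record, inferred from the keys this script
# reads below (a sketch, not an excerpt from the dataset):
# {
#   "annotations": [
#     {"current_syllable": "...", "is_correct": true},
#     {"current_syllable": "...", "is_correct": false,
#      "alternative_syllables": ["..."]}
#   ]
# }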
def get_true_text(sentence: dict) -> str:
    """Rebuild the corrected sentence from the VSEC annotations."""
    true_tokens = []
    for word in sentence['annotations']:
        if word['is_correct']:
            true_tokens.append(word['current_syllable'])
        else:
            # Take the first suggested correction for a misspelled syllable.
            true_tokens.append(word['alternative_syllables'][0])
    true_sentence = " ".join(true_tokens)
    # Split punctuation off the words so every token is space-separated.
    words = re.findall(r"\w+|[^\w\s]", true_sentence)
    return " ".join(words)
def get_noise_text(sentence: dict) -> str:
    """Rebuild the original (noisy) sentence with normalized diacritics."""
    noised_tokens = []
    for word in sentence['annotations']:
        noised_tokens.append(word['current_syllable'])
    noised_sentence = " ".join(noised_tokens)
    # Split punctuation off the words, then standardize the tone-mark
    # placement of each token so it matches the corrected text's convention.
    words = re.findall(r"\w+|[^\w\s]", noised_sentence)
    noised_tokens = []
    for word in words:
        noised_tokens.append(normalize_diacritics(word))
    return " ".join(noised_tokens)
# Write one aligned pair of lines per sentence: corrected text to vsec.test,
# noisy text to vsec.test.noise.
for sentence in tqdm(data):
    true_text = get_true_text(sentence)
    noised_text = get_noise_text(sentence)
    test_file.write(true_text + "\n")
    test_noise_file.write(noised_text + "\n")

test_file.close()
test_noise_file.close()