import random
import string
from difflib import SequenceMatcher

import epitran
import pandas as pd
from umsc import UgMultiScriptConverter


# Built-in short practice prompts (single words / short phrases), stored in
# Uyghur Arabic script; the generate_* helpers convert to Latin on demand.
short_texts = [
    "ياخشىمۇسىز",
    "تىشلىقمۇ",
    "بەلكىم",
    "خەيرلىك كۈن",
    "خەير خوش",
    "كەچۈرۈڭ",
    "رەھمەت",
    "ئەرزىمەيدۇ",
    "ياردەملىشىڭ",
    "توختا",
    "چۈشەندىم",
    "ھەئە",
    "ياق",
]

# Built-in sentence-length practice prompts, also in Uyghur Arabic script.
# This must stay a list: it is extended in place further down the module.
long_texts = [
    "مەكتەپكە بارغاندا تېخىمۇ بىلىملىك بولۇمەن.",
    "يېزا مەنزىرىسى ھەقىقەتەن گۈزەل.",
    "بىزنىڭ ئۆيدە تۆت تەكچە، تۆتىلىسى تەك-تەكچە",
    "قىلىچ قان تامغۇزسا، بەگ ئەل ئالىدۇ؛ قەلەمدىن سىياھتانسا، ئالتۇن كېلىدۇ.",
    "ئۇ بىر كۆزگە كۆرۈنگەن ناخشىچى",
    "بۇ پۇتبول مۇسابىقىسىنىڭ ئاخىرلىشىشى.",
]

# Extend the built-in long prompts with the first column of a header-less CSV.
# This runs at import time, so 'uyghur_texts.csv' must be resolvable from the
# current working directory or the import fails.
df = pd.read_csv('uyghur_texts.csv', header=None)
long_texts += df.iloc[:, 0].tolist()


# Script converters between the 'UAS' and 'ULS' codes of UgMultiScriptConverter
# (used in this module for Uyghur Arabic script <-> Uyghur Latin script).
ug_arab_to_latn = UgMultiScriptConverter('UAS', 'ULS')
ug_latn_to_arab = UgMultiScriptConverter('ULS', 'UAS')

# Epitran transliterator for Arabic-script Uyghur ('uig-Arab'); it produces the
# IPA strings returned by calculate_pronunciation_accuracy.
ipa_converter = epitran.Epitran('uig-Arab')


def generate_short_text(script_choice):
    """Return a randomly chosen short practice text.

    Args:
        script_choice: Script selector. When it equals ``"Uyghur Latin"`` the
            chosen text is converted with ``ug_arab_to_latn``; any other value
            returns the text exactly as stored in ``short_texts``.

    Returns:
        One entry of ``short_texts``, converted to Latin script if requested.
    """
    text = random.choice(short_texts)
    if script_choice == "Uyghur Latin":
        return ug_arab_to_latn(text)
    return text


def generate_long_text(script_choice):
    """Return a randomly chosen long practice text.

    Args:
        script_choice: Script selector. When it equals ``"Uyghur Latin"`` the
            chosen text is converted with ``ug_arab_to_latn``; any other value
            returns the text exactly as stored in ``long_texts``.

    Returns:
        One entry of ``long_texts``, converted to Latin script if requested.
    """
    text = random.choice(long_texts)
    if script_choice == "Uyghur Latin":
        return ug_arab_to_latn(text)
    return text


def remove_punctuation(text):
    """Return *text* with punctuation characters deleted.

    Removes every character in ``string.punctuation`` plus a set of extra
    marks (Arabic comma/semicolon/question mark, dashes, guillemets, curly
    quotes, ...). Whitespace and all other characters are left untouched, and
    removed characters are dropped rather than replaced by a space.

    Args:
        text: The string to clean.

    Returns:
        A new string without the listed punctuation characters.
    """
    extra_punctuation = "–؛;،؟?«»‹›−—¬”“"
    all_punctuation = string.punctuation + extra_punctuation

    # A deletion-only translation table strips everything in a single pass.
    return text.translate(str.maketrans('', '', all_punctuation))


def calculate_pronunciation_accuracy(reference_text, output_text, script_choice):
    """Score ASR output against a reference text and build HTML feedback.

    The accuracy is the ``difflib.SequenceMatcher`` ratio between the
    punctuation-stripped reference and output texts, scaled to 0-100. The IPA
    transliterations are computed with the module-level ``ipa_converter`` and
    returned alongside, but they do not take part in the score.

    Args:
        reference_text: The text the speaker was asked to read.
        output_text: The recognised text. It is compared as-is and is never
            script-converted here (NOTE(review): presumably always Arabic
            script; confirm against the caller).
        script_choice: When equal to ``'Uyghur Latin'``, ``reference_text`` is
            first converted with ``ug_latn_to_arab``.

    Returns:
        A 4-tuple ``(reference_ipa, output_ipa, comparison_md,
        pronunciation_accuracy)`` where ``comparison_md`` is an HTML fragment
        showing the cleaned reference text with matched segments in green and
        unmatched segments in red, and ``pronunciation_accuracy`` is a float
        in ``[0, 100]``.
    """
    # Bring a Latin-script reference to the same script as the comparison text.
    if script_choice == 'Uyghur Latin':
        reference_text = ug_latn_to_arab(reference_text)

    reference_text_clean = remove_punctuation(reference_text)
    output_text_clean = remove_punctuation(output_text)

    reference_ipa = ipa_converter.transliterate(reference_text_clean)
    output_ipa = ipa_converter.transliterate(output_text_clean)

    matcher = SequenceMatcher(None, reference_text_clean, output_text_clean)
    pronunciation_accuracy = matcher.ratio() * 100

    # Collect the fragments and join once instead of repeated string +=.
    fragments = [
        "<h4>Pronunciation Feedback</h4>\n",
        "<div style='margin-top: 10px;'>\n",
    ]
    for opcode, i1, i2, _j1, _j2 in matcher.get_opcodes():
        # Only the reference side is rendered. get_opcodes() yields just
        # 'equal', 'replace', 'delete' and 'insert', so anything that is not
        # 'equal' is a mismatch; 'insert' yields an empty reference slice.
        color = "green" if opcode == 'equal' else "red"
        ref_segment = reference_text_clean[i1:i2]
        fragments.append(
            f'<span style="color: {color}; font-size: 20px;">{ref_segment}</span>'
        )
    fragments.append("</div>")
    comparison_md = "".join(fragments)

    return reference_ipa, output_ipa, comparison_md, pronunciation_accuracy