Laronix_voice_quality_checking_system

Sleeping

App Files Files Community

KevinGeng committed on Oct 23, 2023

Commit

0514036

•

1 Parent(s): 8275b12

add MOS and WER conversion

Browse files

Files changed (5) hide show

.gitignore +22 -0
app.py +11 -3
local/WER2INTELI.png +0 -0
local/convert_metrics.py +71 -0
local/nat2avaMOS.png +0 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,22 @@

+# Python
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+__pycache__/
+*.db
+*.sqlite3
+*.sqlite
+*.log
+*.bak
+*.swp
+*.swo
+*.tmp
+*.tmp.*
+*~
+# flagged
+flagged/
+#
+*.wav

app.py CHANGED Viewed

@@ -6,6 +6,7 @@ import torch.nn as nn
 import lightning_module
 import pdb
 import jiwer
 # ASR part
 from transformers import pipeline
@@ -54,6 +55,10 @@ def calc_mos(audio_path, ref):
     trans = p(audio_path)["text"]
     # WER
     wer = jiwer.wer(ref, trans, truth_transform=transformation, hypothesis_transform=transformation)
     # MOS
     batch = {
         'wav': out_wavs,
@@ -63,6 +68,8 @@ def calc_mos(audio_path, ref):
     with torch.no_grad():
         output = model(batch)
     predic_mos = output.mean(dim=1).squeeze().detach().numpy()*2 + 3
     # Phonemes per minute (PPM)
     with torch.no_grad():
         logits = phoneme_model(out_wavs).logits
@@ -72,7 +79,8 @@ def calc_mos(audio_path, ref):
     wav_vad = torchaudio.functional.vad(wav, sample_rate=sr)
     ppm = len(lst_phonemes) / (wav_vad.shape[-1] / sr) * 60
-    return predic_mos, trans, wer, phone_transcription, ppm
 description ="""
@@ -91,9 +99,9 @@ iface = gr.Interface(
   fn=calc_mos,
   inputs=[gr.Audio(source='microphone', type="filepath", label="Audio to evaluate"),
           gr.Textbox(placeholder="Input reference here (Don't keep this empty)", label="Reference")],
-  outputs=[gr.Textbox(placeholder="Naturalness evaluation, ranged 1 to 5, the higher the better.", label="Predicted MOS"),
            gr.Textbox(placeholder="Hypothesis", label="Hypothesis"),
-           gr.Textbox(placeholder="Word Error Rate: Only valid when Reference is given", label = "WER"),
            gr.Textbox(placeholder="Predicted Phonemes", label="Predicted Phonemes"),
            gr.Textbox(placeholder="Speaking Rate, Phonemes per minutes", label="PPM")],
   title="Laronix's Voice Quality Checking System Demo",

 import lightning_module
 import pdb
 import jiwer
+from local.convert_metrics import nat2avaMOS, WER2INTELI
 # ASR part
 from transformers import pipeline
     trans = p(audio_path)["text"]
     # WER
     wer = jiwer.wer(ref, trans, truth_transform=transformation, hypothesis_transform=transformation)
+    # Convert WER to an intelligibility score
+    INTELI_score = WER2INTELI(wer*100)
     # MOS
     batch = {
         'wav': out_wavs,
     with torch.no_grad():
         output = model(batch)
     predic_mos = output.mean(dim=1).squeeze().detach().numpy()*2 + 3
+    # MOS to AVA MOS
+    AVA_MOS = nat2avaMOS(predic_mos)
     # Phonemes per minute (PPM)
     with torch.no_grad():
         logits = phoneme_model(out_wavs).logits
     wav_vad = torchaudio.functional.vad(wav, sample_rate=sr)
     ppm = len(lst_phonemes) / (wav_vad.shape[-1] / sr) * 60
+    return AVA_MOS, INTELI_score, trans, phone_transcription, ppm
 description ="""
   fn=calc_mos,
   inputs=[gr.Audio(source='microphone', type="filepath", label="Audio to evaluate"),
           gr.Textbox(placeholder="Input reference here (Don't keep this empty)", label="Reference")],
+  outputs=[gr.Textbox(placeholder="Naturalness Score", label="Naturalness Score, ranged from 0 to 5, the higher the better."),
+           gr.Textbox(placeholder="Intelligibility Score", label = "Intelligibility Score, range from 0 to 100, the higher the better"),
            gr.Textbox(placeholder="Hypothesis", label="Hypothesis"),
            gr.Textbox(placeholder="Predicted Phonemes", label="Predicted Phonemes"),
            gr.Textbox(placeholder="Speaking Rate, Phonemes per minutes", label="PPM")],
   title="Laronix's Voice Quality Checking System Demo",

local/WER2INTELI.png ADDED Viewed

local/convert_metrics.py ADDED Viewed

	@@ -0,0 +1,71 @@

+import numpy as np
+import matplotlib.pyplot as plt
+# Natural MOS to AVA MOS
def linear_function(x):
    """Lower segment of the natural-MOS -> AVA-MOS mapping.

    A straight line through (1, 0) and (1.5, 4).
    """
    return 8 * x - 8


def quadratic_function(x):
    """Upper segment of the natural-MOS -> AVA-MOS mapping.

    A downward-opening parabola with its vertex at (5, 5); at x = 1.5 it
    evaluates to roughly 4.0, so it joins the linear segment there.
    """
    return -0.0816 * (x - 5) ** 2 + 5


def nat2avaMOS(x):
    """Convert a natural MOS value to an AVA MOS value in the range 0 to 5.

    Args:
        x: Natural MOS as a scalar. Values outside [1, 5] are clamped to
            the nearest bound before conversion.

    Returns:
        ``linear_function(x)`` for x <= 1.5, otherwise
        ``quadratic_function(x)``.
    """
    # The two segments are only defined on [1, 5]. Without clamping, an
    # input above 5 fell through both branches and returned None, and an
    # input below 1 produced a negative score.
    x = min(max(x, 1.0), 5.0)
    if x <= 1.5:
        return linear_function(x)
    return quadratic_function(x)
# Word error rate to intelligibility score (x is a percentage)
def WER2INTELI(x):
    """Convert a word error rate (in percent) to an intelligibility score.

    The mapping is piecewise:
      * x <= 10        -> 100
      * 10 < x <= 100  -> linear from 100 (at x = 10) down to 30 (at x = 100)
      * x > 100        -> exponential decay that starts at 30

    Args:
        x: Word error rate expressed as a percentage (e.g. 25 for 25 %).

    Returns:
        The intelligibility score; at most 100.
    """
    knee_wer, limit_wer = 10, 100
    top_score, floor_score = 100, 30

    if x <= knee_wer:
        return 100
    if x <= limit_wer:
        slope = (floor_score - top_score) / (limit_wer - knee_wer)
        intercept = top_score - slope * knee_wer
        return slope * x + intercept
    # Start the decay from the value the linear segment ends on. The
    # previous factor of 100 made the score jump from 30 back up to ~100
    # as soon as the WER exceeded 100 %.
    return floor_score * np.exp(-0.01 * (x - limit_wer))
+# # Generate the x values
+# x = np.linspace(0, 200, 400)  # 400 points from 0 to 200
+# # Compute the corresponding y values
+# y = [WER2INTELI(xi) for xi in x]
+# # Plot the function
+# plt.plot(x, y)
+# plt.xlabel('x')
+# plt.ylabel('f(x)')
+# plt.title('Custom Function')
+# plt.grid(True)
+# plt.show()
+# # Generate the ranges of x values
+# x1 = np.linspace(1, 1.5, 100)
+# x2 = np.linspace(1.5, 5, 100)
+# # Compute the corresponding y values
+# y1 = linear_function(x1)
+# y2 = quadratic_function(x2)
+# # Plot the linear segment
+# plt.plot(x1, y1, label='Linear Function (1 <= x <= 1.5)')
+# # Plot the quadratic segment
+# plt.plot(x2, y2, label='Quadratic Function (1.5 <= x <= 5)')
+# # Add axis labels and a title
+# plt.xlabel('Natural Mean Opinion Score')
+# plt.ylabel('AVA Mean Opinion Score')
+# plt.title('nat2avaMOS')
+# # Add the legend
+# plt.legend()
+# # Show the grid
+# plt.grid(True)
+# # Save the figure
+# plt.savefig("./local/nat2avaMOS.png")
+# plt.savefig("./local/WER2INT.png")

local/nat2avaMOS.png ADDED Viewed