KevinGeng committed on
Commit
0514036
1 Parent(s): 8275b12

add MOS and WER conversion

Browse files
Files changed (5) hide show
  1. .gitignore +22 -0
  2. app.py +11 -3
  3. local/WER2INTELI.png +0 -0
  4. local/convert_metrics.py +71 -0
  5. local/nat2avaMOS.png +0 -0
.gitignore ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.pyc
4
+ *.pyo
5
+ *.pyd
6
+ __pycache__/
7
+ *.db
8
+ *.sqlite3
9
+ *.sqlite
10
+ *.log
11
+ *.bak
12
+ *.swp
13
+ *.swo
14
+ *.tmp
15
+ *.tmp.*
16
+ *~
17
+
18
+ # flagged
19
+ flagged/
20
+
21
+ #
22
+ *.wav
app.py CHANGED
@@ -6,6 +6,7 @@ import torch.nn as nn
6
  import lightning_module
7
  import pdb
8
  import jiwer
 
9
 
10
  # ASR part
11
  from transformers import pipeline
@@ -54,6 +55,10 @@ def calc_mos(audio_path, ref):
54
  trans = p(audio_path)["text"]
55
  # WER
56
  wer = jiwer.wer(ref, trans, truth_transform=transformation, hypothesis_transform=transformation)
 
 
 
 
57
  # MOS
58
  batch = {
59
  'wav': out_wavs,
@@ -63,6 +68,8 @@ def calc_mos(audio_path, ref):
63
  with torch.no_grad():
64
  output = model(batch)
65
  predic_mos = output.mean(dim=1).squeeze().detach().numpy()*2 + 3
 
 
66
  # Phonemes per minute (PPM)
67
  with torch.no_grad():
68
  logits = phoneme_model(out_wavs).logits
@@ -72,7 +79,8 @@ def calc_mos(audio_path, ref):
72
  wav_vad = torchaudio.functional.vad(wav, sample_rate=sr)
73
  ppm = len(lst_phonemes) / (wav_vad.shape[-1] / sr) * 60
74
 
75
- return predic_mos, trans, wer, phone_transcription, ppm
 
76
 
77
 
78
  description ="""
@@ -91,9 +99,9 @@ iface = gr.Interface(
91
  fn=calc_mos,
92
  inputs=[gr.Audio(source='microphone', type="filepath", label="Audio to evaluate"),
93
  gr.Textbox(placeholder="Input reference here (Don't keep this empty)", label="Reference")],
94
- outputs=[gr.Textbox(placeholder="Naturalness evaluation, ranged 1 to 5, the higher the better.", label="Predicted MOS"),
 
95
  gr.Textbox(placeholder="Hypothesis", label="Hypothesis"),
96
- gr.Textbox(placeholder="Word Error Rate: Only valid when Reference is given", label = "WER"),
97
  gr.Textbox(placeholder="Predicted Phonemes", label="Predicted Phonemes"),
98
  gr.Textbox(placeholder="Speaking Rate, Phonemes per minutes", label="PPM")],
99
  title="Laronix's Voice Quality Checking System Demo",
 
6
  import lightning_module
7
  import pdb
8
  import jiwer
9
+ from local.convert_metrics import nat2avaMOS, WER2INTELI
10
 
11
  # ASR part
12
  from transformers import pipeline
 
55
  trans = p(audio_path)["text"]
56
  # WER
57
  wer = jiwer.wer(ref, trans, truth_transform=transformation, hypothesis_transform=transformation)
58
+
59
+ # Convert WER (as a percentage) to an intelligibility score
60
+ INTELI_score = WER2INTELI(wer*100)
61
+
62
  # MOS
63
  batch = {
64
  'wav': out_wavs,
 
68
  with torch.no_grad():
69
  output = model(batch)
70
  predic_mos = output.mean(dim=1).squeeze().detach().numpy()*2 + 3
71
+ # MOS to AVA MOS
72
+ AVA_MOS = nat2avaMOS(predic_mos)
73
  # Phonemes per minute (PPM)
74
  with torch.no_grad():
75
  logits = phoneme_model(out_wavs).logits
 
79
  wav_vad = torchaudio.functional.vad(wav, sample_rate=sr)
80
  ppm = len(lst_phonemes) / (wav_vad.shape[-1] / sr) * 60
81
 
82
+ return AVA_MOS, INTELI_score, trans, phone_transcription, ppm
83
+
84
 
85
 
86
  description ="""
 
99
  fn=calc_mos,
100
  inputs=[gr.Audio(source='microphone', type="filepath", label="Audio to evaluate"),
101
  gr.Textbox(placeholder="Input reference here (Don't keep this empty)", label="Reference")],
102
+ outputs=[gr.Textbox(placeholder="Naturalness Score", label="Naturalness Score, ranged from 0 to 5, the higher the better."),
103
+ gr.Textbox(placeholder="Intelligibility Score", label = "Intelligibility Score, range from 0 to 100, the higher the better"),
104
  gr.Textbox(placeholder="Hypothesis", label="Hypothesis"),
 
105
  gr.Textbox(placeholder="Predicted Phonemes", label="Predicted Phonemes"),
106
  gr.Textbox(placeholder="Speaking Rate, Phonemes per minutes", label="PPM")],
107
  title="Laronix's Voice Quality Checking System Demo",
local/WER2INTELI.png ADDED
local/convert_metrics.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import matplotlib.pyplot as plt
3
+
4
+ # Natural MOS to AVA MOS
5
+
6
def linear_function(x):
    """Return ``8 * x - 8``, the linear segment of the MOS mapping.

    ``nat2avaMOS`` uses this segment for inputs up to 1.5; it evaluates to
    0 at ``x == 1`` and 4 at ``x == 1.5``.
    """
    return 8 * x - 8


def quadratic_function(x):
    """Return ``-0.0816 * (x - 5) ** 2 + 5``, the quadratic segment.

    ``nat2avaMOS`` uses this segment for inputs above 1.5; it reaches its
    maximum of 5 at ``x == 5``.
    """
    return -0.0816 * (x - 5) ** 2 + 5


def nat2avaMOS(x):
    """Convert a natural MOS value to an AVA MOS value.

    The mapping is piecewise: ``linear_function`` for ``x <= 1.5`` and
    ``quadratic_function`` for ``1.5 < x <= 5``.

    Inputs outside ``[1, 5]`` are clamped to that interval first. Without
    the clamp, values above 5 matched no branch and the function returned
    ``None``, and values below 1 produced negative scores.

    Args:
        x: A scalar natural MOS value (a Python number or a scalar-like
            object supporting comparison and arithmetic).

    Returns:
        The converted score, between 0 and 5 for any non-NaN input.
    """
    # Saturate instead of extrapolating: the two segments are only meant to
    # cover the 1..5 MOS range.
    if x < 1:
        x = 1.0
    elif x > 5:
        x = 5.0

    if x <= 1.5:
        return linear_function(x)
    return quadratic_function(x)
18
+
19
+ # Word error rate to Intelligibility Score (x is a percentage)
20
def WER2INTELI(x):
    """Convert a word error rate (in percent) to an intelligibility score.

    The mapping is piecewise and non-increasing in ``x``:

    * ``x <= 10``: the score is 100.
    * ``10 < x <= 100``: the score falls linearly from 100 (at 10) to
      30 (at 100).
    * ``x > 100``: the score decays exponentially from 30 towards 0.

    The exponential tail starts from 30 so that it continues from the end
    of the linear segment. It previously started from 100, which made the
    score jump from 30 back up to about 100 as soon as the WER exceeded
    100 %.

    Args:
        x: Word error rate expressed as a percentage (e.g. 25 for 25 %).

    Returns:
        The intelligibility score, in the range (0, 100].
    """
    full_score, floor_score = 100, 30
    low_wer, high_wer = 10, 100

    if x <= low_wer:
        return full_score
    if x <= high_wer:
        # Straight line through (10, 100) and (100, 30).
        slope = (floor_score - full_score) / (high_wer - low_wer)
        return full_score + slope * (x - low_wer)
    # Continue from the score reached at 100 % WER and decay smoothly.
    return floor_score * np.exp(-0.01 * (x - high_wer))
29
+
30
+ # # 生成 x 值
31
+ # x = np.linspace(0, 200, 400) # 从0到200生成400个点
32
+
33
+ # # 计算对应的 y 值
34
+ # y = [WER2INTELI(xi) for xi in x]
35
+
36
+ # # 绘制函数图像
37
+ # plt.plot(x, y)
38
+ # plt.xlabel('x')
39
+ # plt.ylabel('f(x)')
40
+ # plt.title('Custom Function')
41
+ # plt.grid(True)
42
+ # plt.show()
43
+
44
+ # # 生成 x 值的范围
45
+ # x1 = np.linspace(1, 1.5, 100)
46
+ # x2 = np.linspace(1.5, 5, 100)
47
+
48
+ # # 计算对应的 y 值
49
+ # y1 = linear_function(x1)
50
+ # y2 = quadratic_function(x2)
51
+
52
+ # # 绘制线性部分
53
+ # plt.plot(x1, y1, label='Linear Function (1 <= x <= 1.5)')
54
+
55
+ # # 绘制二次部分
56
+ # plt.plot(x2, y2, label='Quadratic Function (1.5 <= x <= 5)')
57
+
58
+ # # 添加标签和标题
59
+ # plt.xlabel('Natural Mean Opinion Score')
60
+ # plt.ylabel('AVA Mean Opinion Score')
61
+ # plt.title('nat2avaMOS')
62
+
63
+ # # 添加图例
64
+ # plt.legend()
65
+
66
+ # # 显示图形
67
+ # plt.grid(True)
68
+
69
+ # # 显示图像
70
+ # plt.savefig("./local/nat2avaMOS.png")
71
+ # plt.savefig("./local/WER2INTELI.png")
local/nat2avaMOS.png ADDED