KevinGeng committed on
Commit
2090f04
·
1 Parent(s): 1732ec4

gradio-4 support

Browse files
Files changed (4) hide show
  1. app.py +26 -21
  2. local/description.md +5 -0
  3. local/indicator_plot.py +112 -0
  4. requirements.txt +8 -96
app.py CHANGED
@@ -7,6 +7,7 @@ import lightning_module
7
  import pdb
8
  import jiwer
9
  from local.convert_metrics import nat2avaMOS, WER2INTELI
 
10
 
11
  # ASR part
12
  from transformers import pipeline
@@ -55,10 +56,11 @@ def calc_mos(audio_path, ref):
55
  trans = p(audio_path)["text"]
56
  # WER
57
  wer = jiwer.wer(ref, trans, truth_transform=transformation, hypothesis_transform=transformation)
58
-
59
  # WER convert to Intelligibility score
60
  INTELI_score = WER2INTELI(wer*100)
61
 
 
 
62
  # MOS
63
  batch = {
64
  'wav': out_wavs,
@@ -70,6 +72,9 @@ def calc_mos(audio_path, ref):
70
  predic_mos = output.mean(dim=1).squeeze().detach().numpy()*2 + 3
71
  # MOS to AVA MOS
72
  AVA_MOS = nat2avaMOS(predic_mos)
 
 
 
73
  # Phonemes per minute (PPM)
74
  with torch.no_grad():
75
  logits = phoneme_model(out_wavs).logits
@@ -78,34 +83,34 @@ def calc_mos(audio_path, ref):
78
  lst_phonemes = phone_transcription[0].split(" ")
79
  wav_vad = torchaudio.functional.vad(wav, sample_rate=sr)
80
  ppm = len(lst_phonemes) / (wav_vad.shape[-1] / sr) * 60
81
-
82
- return AVA_MOS, INTELI_score, trans, phone_transcription, ppm
83
 
84
 
 
 
85
 
86
- description ="""
87
- MOS prediction demo using UTMOS-strong w/o phoneme encoder model, which is trained on the main track dataset.
88
- This demo only accepts .wav format. Best at 16 kHz sampling rate.
89
-
90
- Paper is available [here](https://arxiv.org/abs/2204.02152)
91
-
92
- Add ASR based on wav2vec-960, currently only English available.
93
- This is a lite version of ASR, delievring faster calculation and compromise to recognition performance
94
- Add WER interface by Laronix Pty LTD
95
- """
96
-
97
-
98
  iface = gr.Interface(
99
  fn=calc_mos,
100
- inputs=[gr.Audio(source='microphone', type="filepath", label="Audio to evaluate"),
101
  gr.Textbox(placeholder="Input reference here (Don't keep this empty)", label="Reference")],
102
- outputs=[gr.Textbox(placeholder="Naturalness Score", label="Naturalness Score, ranged from 0 to 5, the higher the better."),
103
- gr.Textbox(placeholder="Intelligibility Score", label = "Intelligibility Score, range from 0 to 100, the higher the better"),
 
 
104
  gr.Textbox(placeholder="Hypothesis", label="Hypothesis"),
105
- gr.Textbox(placeholder="Predicted Phonemes", label="Predicted Phonemes"),
106
- gr.Textbox(placeholder="Speaking Rate, Phonemes per minutes", label="PPM")],
107
- title="Laronix's Voice Quality Checking System Demo",
108
  description=description,
109
  allow_flagging="auto",
 
110
  )
 
111
  iface.launch()
 
7
  import pdb
8
  import jiwer
9
  from local.convert_metrics import nat2avaMOS, WER2INTELI
10
+ from local.indicator_plot import Intelligibility_Plot, Naturalness_Plot
11
 
12
  # ASR part
13
  from transformers import pipeline
 
56
  trans = p(audio_path)["text"]
57
  # WER
58
  wer = jiwer.wer(ref, trans, truth_transform=transformation, hypothesis_transform=transformation)
 
59
  # WER convert to Intelligibility score
60
  INTELI_score = WER2INTELI(wer*100)
61
 
62
+ INT_fig = Intelligibility_Plot(INTELI_score)
63
+
64
  # MOS
65
  batch = {
66
  'wav': out_wavs,
 
72
  predic_mos = output.mean(dim=1).squeeze().detach().numpy()*2 + 3
73
  # MOS to AVA MOS
74
  AVA_MOS = nat2avaMOS(predic_mos)
75
+
76
+ MOS_fig = Naturalness_Plot(AVA_MOS)
77
+
78
  # Phonemes per minute (PPM)
79
  with torch.no_grad():
80
  logits = phoneme_model(out_wavs).logits
 
83
  lst_phonemes = phone_transcription[0].split(" ")
84
  wav_vad = torchaudio.functional.vad(wav, sample_rate=sr)
85
  ppm = len(lst_phonemes) / (wav_vad.shape[-1] / sr) * 60
86
+ # pdb.set_trace()
87
+ return AVA_MOS, MOS_fig, INTELI_score, INT_fig, trans, phone_transcription, ppm
88
 
89
 
90
+ with open("local/description.md") as f:
91
+ description = f.read()
92
 
93
+ # calc_mos("audio_2023-11-01_15-57-39.wav", "hello world")
94
+ # pdb.set_trace()
95
+ examples = [
96
+ [None, "Once upon a time, there was a young rat named Arthur who couldn't make up his mind."],
97
+ [None, "Whenever the other rats asked Arthur if he wanted to go to the park, he would say, 'I don't know.'"],
98
+ ]
 
 
 
 
 
 
99
  iface = gr.Interface(
100
  fn=calc_mos,
101
+ inputs=[gr.Audio(type='filepath', label="Audio to evaluate"),
102
  gr.Textbox(placeholder="Input reference here (Don't keep this empty)", label="Reference")],
103
+ outputs=[gr.Textbox(placeholder="Naturalness Score, ranged from 0 to 5, the higher the better.", label="Naturalness Score, ranged from 0 to 5, the higher the better.", visible=False),
104
+ gr.Plot(label="Naturalness Score, ranged from 0 to 5, the higher the better.", show_label=True, container=True),
105
+ gr.Textbox(placeholder="Intelligibility Score", label = "Intelligibility Score, range from 0 to 100, the higher the better", visible=False),
106
+ gr.Plot(label="Intelligibility Score, range from 0 to 100, the higher the better", show_label=True, container=True),
107
  gr.Textbox(placeholder="Hypothesis", label="Hypothesis"),
108
+ gr.Textbox(placeholder="Predicted Phonemes", label="Predicted Phonemes", visible=False),
109
+ gr.Textbox(placeholder="Speaking Rate, Phonemes per minutes", label="Speaking Rate, Phonemes per minutes", visible=False)],
110
+ title="Speech Analysis by Laronix AI",
111
  description=description,
112
  allow_flagging="auto",
113
+ examples=examples,
114
  )
115
+
116
  iface.launch()
local/description.md ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ ## Let’s get started!
2
+ + Use this in a quiet environment.
3
+ + Make sure your microphone is working and attached.
4
+ + If prompted in a pop-up, allow the browser access to your microphone.
5
+ + When you’re ready, hit record and read the reference sentence below:
local/indicator_plot.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import plotly.graph_objects as go
2
+
3
def Intelligibility_Plot(Int_Score, fair_thre=30, good_thre = 70, Upper=100, Lower=0):
    '''
    Build a bullet-gauge indicator figure for an intelligibility score.

    The bar colour depends on the score: red at or below fair_thre,
    yellow at or below good_thre, green above good_thre.

    Int_Score: a real number between Lower and Upper (0 and 100 by default)
    fair_thre: scores at or below this value are drawn in red
    good_thre: scores above fair_thre and at or below this value are drawn in yellow
    Upper: the upper bound of the accepted score range
    Lower: the lower bound of the accepted score range

    Returns the plotly figure.
    Raises AssertionError if Int_Score is not a real number or lies outside
    [Lower, Upper].
    '''
    from numbers import Real

    # Validate Int_Score. The checks raise explicitly instead of using
    # `assert` statements so they are not stripped under `python -O`;
    # AssertionError is kept so the failure type is unchanged.
    # numbers.Real covers int, float and NumPy scalar types, and avoids the
    # `float|int` union form that isinstance() only accepts on Python 3.10+.
    if not isinstance(Int_Score, Real):
        raise AssertionError(
            f"Int_Score must be a real number, got {type(Int_Score).__name__}"
        )
    # The chained comparison is also False for NaN, so NaN is rejected.
    if not Lower <= Int_Score <= Upper:
        raise AssertionError(
            f"Int_Score must be between {Lower} and {Upper}, got {Int_Score}"
        )

    # Design 1: colour the bar itself according to the thresholds.
    if Int_Score <= fair_thre:
        color = "#F2ADA0"
    elif Int_Score <= good_thre:
        color = "#e8ee89"
    else:
        color = "#75DA99"

    fig = go.Figure(go.Indicator(
        mode="number+gauge",
        gauge={'shape': "bullet",
               # Axis runs a little past Upper (Upper+10).
               'axis': {'range': [Lower, Upper+10]},
               'bgcolor': 'white',
               'bar': {'color': color},
               },
        value=Int_Score,
        domain={'x': [0, 1], 'y': [0, 1]},
        )
    )
    # Design 2 (not used): keep a grey bar and show every threshold band in
    # the background through the gauge 'steps' option, with the ranges
    # [Lower, fair_thre], [fair_thre, good_thre] and [good_thre, Upper].
    fig.update_layout(
        autosize=False,
        width=650,
        height=250,
        margin=dict(
            l=10,
            r=10,
            b=10,
            t=10,
            pad=4
        ),)
    return fig
63
+
64
def Naturalness_Plot(Nat_Score, fair_thre=2, good_thre = 4, Upper=5, Lower=0):
    '''
    Build a bullet-gauge indicator figure for a naturalness score.

    The bar colour depends on the score: red at or below fair_thre,
    yellow at or below good_thre, green above good_thre.

    Nat_Score: a real number between Lower and Upper (0 and 5 by default)
    fair_thre: scores at or below this value are drawn in red
    good_thre: scores above fair_thre and at or below this value are drawn in yellow
    Upper: the upper bound of the accepted score range
    Lower: the lower bound of the accepted score range

    Returns the plotly figure.
    Raises AssertionError if Nat_Score is not a real number or lies outside
    [Lower, Upper].
    '''
    from numbers import Real

    # Validate Nat_Score. The checks raise explicitly instead of using
    # `assert` statements so they are not stripped under `python -O`;
    # AssertionError is kept so the failure type is unchanged.
    # numbers.Real accepts ints and NumPy scalars as well as float, matching
    # what Intelligibility_Plot accepts.
    if not isinstance(Nat_Score, Real):
        raise AssertionError(
            f"Nat_Score must be a real number, got {type(Nat_Score).__name__}"
        )
    # The chained comparison is also False for NaN, so NaN is rejected.
    if not Lower <= Nat_Score <= Upper:
        raise AssertionError(
            f"Nat_Score must be between {Lower} and {Upper}, got {Nat_Score}"
        )

    if Nat_Score <= fair_thre:
        color = "#F2ADA0"
    elif Nat_Score <= good_thre:
        color = "#e8ee89"
    else:
        color = "#75DA99"

    fig = go.Figure(go.Indicator(
        mode="number+gauge",
        gauge={'shape': "bullet",
               # Axis runs a little past Upper (Upper+0.4).
               'axis': {'range': [Lower, Upper+0.4]},
               "bar": {'color': color}},
        value=Nat_Score,
        domain={'x': [0, 1], 'y': [0, 1]},
        )
    )
    fig.update_layout(
        autosize=False,
        width=650,
        height=250,
        margin=dict(
            l=10,
            r=10,
            b=10,
            t=10,
            pad=4
        ),)
    return fig
103
+
104
+ # test case Intelligibility_Plot
105
+ # x = Intelligibility_Plot(10)
106
+ # x.show()
107
+ # x = Naturalness_Plot(3.5)
108
+ # x.show()
109
+ # x = Intelligibility_Plot(50)
110
+ # x.show()
111
+ # x = Intelligibility_Plot(90)
112
+ # x.show()
requirements.txt CHANGED
@@ -1,101 +1,13 @@
1
- absl-py==1.0.0
2
- aiohttp==3.8.1
3
- aiosignal==1.2.0
4
- analytics-python==1.4.0
5
- antlr4-python3-runtime==4.8
6
- anyio==3.5.0
7
- asgiref==3.5.0
8
- async-timeout==4.0.2
9
- attrs==21.4.0
10
- backoff==1.10.0
11
- bcrypt==3.2.0
12
- bitarray==2.4.0
13
- cachetools==5.0.0
14
- certifi==2021.10.8
15
- cffi==1.15.0
16
- charset-normalizer==2.0.12
17
- click==8.0.4
18
- colorama==0.4.4
19
- cryptography==36.0.1
20
- cycler==0.11.0
21
- Cython==0.29.28
22
- fairseq @ git+https://github.com/pytorch/fairseq.git@d03f4e771484a433f025f47744017c2eb6e9c6bc
23
- fastapi==0.75.0
24
- ffmpy==0.3.0
25
- fonttools==4.30.0
26
- frozenlist==1.3.0
27
- fsspec==2022.2.0
28
- future==0.18.2
29
- google-auth==2.6.0
30
- google-auth-oauthlib==0.4.6
31
- gradio==3.46.1
32
- grpcio==1.44.0
33
- h11==0.12.0
34
- hydra-core==1.0.7
35
- idna==3.3
36
- importlib-metadata==4.11.3
37
- Jinja2==3.0.3
38
- kiwisolver==1.3.2
39
- linkify-it-py==1.0.3
40
- Markdown==3.3.6
41
- markdown-it-py==2.0.1
42
- MarkupSafe==2.1.0
43
- matplotlib==3.5.1
44
- mdit-py-plugins==0.3.0
45
- mdurl==0.1.0
46
- monotonic==1.6
47
- multidict==6.0.2
48
- numpy==1.22.3
49
- oauthlib==3.2.0
50
- omegaconf==2.0.6
51
- orjson==3.6.7
52
- packaging==21.3
53
- pandas==1.4.1
54
- paramiko==2.10.1
55
- Pillow==9.0.1
56
- portalocker==2.4.0
57
- protobuf==3.19.4
58
- pyasn1==0.4.8
59
- pyasn1-modules==0.2.8
60
- pycparser==2.21
61
- pycryptodome==3.14.1
62
- pydantic==1.9.0
63
- pyDeprecate==0.3.1
64
- pydub==0.25.1
65
- PyNaCl==1.5.0
66
- pyparsing==3.0.7
67
- python-dateutil==2.8.2
68
- python-multipart==0.0.5
69
- pytorch-lightning==1.5.10
70
- pytz==2021.3
71
- PyYAML==6.0
72
- regex==2022.3.2
73
- requests==2.27.1
74
- requests-oauthlib==1.3.1
75
- rsa==4.8
76
- sacrebleu==2.0.0
77
- six==1.16.0
78
- sniffio==1.2.0
79
- starlette==0.17.1
80
- tabulate==0.8.9
81
- tensorboard==2.8.0
82
- tensorboard-data-server==0.6.1
83
- tensorboard-plugin-wit==1.8.1
84
- torch==1.11.0
85
- torchaudio==0.11.0
86
- torchmetrics==0.7.2
87
- tqdm==4.63.0
88
- typing-extensions==4.1.1
89
- uc-micro-py==1.0.1
90
- urllib3==1.26.8
91
- uvicorn==0.17.6
92
- Werkzeug==2.0.3
93
- yarl==1.7.2
94
- zipp==3.7.0
95
-
96
  transformers
 
 
97
  tensorboardX
98
  jiwer
99
  phonemizer
100
  librosa
101
- speake
 
 
1
+ gradio
2
+ pytorch_lightning
3
+ torch
4
+ jiwer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  transformers
6
+ numpy
7
+ fairseq @ git+https://github.com/pytorch/fairseq.git@d03f4e771484a433f025f47744017c2eb6e9c6bc
8
  tensorboardX
9
  jiwer
10
  phonemizer
11
  librosa
12
+ speake
13
+ plotly