Siddhant committed on
Commit f78ed8b
1 Parent(s): 3168a2f

Add eval metrics

app.py CHANGED
@@ -1,7 +1,12 @@
1
  import os
2
  import shutil
3
- import os
4
  from huggingface_hub import HfApi
 
5
  api = HfApi()
6
  import nltk
7
  nltk.download('averaged_perceptron_tagger_eng')
@@ -29,7 +34,13 @@ text2speech = Text2Speech.from_pretrained(
29
  noise_scale=0.333,
30
  noise_scale_dur=0.333,
31
  )
32
-
33
  import numpy as np
34
  from VAD.vad_iterator import VADIterator
35
  import torch
@@ -66,6 +77,8 @@ import soundfile as sf
66
  import kaldiio
67
  from espnet2.bin.s2t_inference_ctc import Speech2TextGreedySearch
68
from espnet2.bin.s2t_inference import Speech2Text
69
 
70
  s2t = Speech2TextGreedySearch.from_pretrained(
71
  "pyf98/owsm_ctc_v3.1_1B",
@@ -74,6 +87,9 @@ s2t = Speech2TextGreedySearch.from_pretrained(
74
  lang_sym='<eng>',
75
  task_sym='<asr>',
76
)
77
 
78
  start_event = torch.cuda.Event(enable_timing=True)
79
  end_event = torch.cuda.Event(enable_timing=True)
@@ -84,23 +100,13 @@ res = s2t(speech)
84
  end_event.record()
85
  torch.cuda.synchronize()
86
 
87
- def int2float(sound):
88
- """
89
- Taken from https://github.com/snakers4/silero-vad
90
- """
91
-
92
- abs_max = np.abs(sound).max()
93
- sound = sound.astype("float32")
94
- if abs_max > 0:
95
- sound *= 1 / 32768
96
- sound = sound.squeeze() # depends on the use case
97
- return sound
98
-
99
  text_str=""
100
  asr_output_str=""
101
  vad_output=None
102
  audio_output = None
103
audio_output1 = None
104
  min_speech_ms=500
105
  max_speech_ms=float("inf")
106
  # ASR_model = LightningWhisperMLX(model="distil-large-v3", batch_size=6, quant=None)
@@ -239,8 +245,7 @@ def relevant_vote4_last_response(
239
  import json
240
  import time
241
 
242
-
243
- def transcribe(stream, new_chunk, option):
244
  sr, y = new_chunk
245
  global text_str
246
  global chat
@@ -252,6 +257,11 @@ def transcribe(stream, new_chunk, option):
252
  global start_record_time
253
  global sids
254
global spembs
255
  if stream is None:
256
  stream=y
257
  chat.init_chat({"role": "system", "content": "You are a helpful and friendly AI assistant. You are polite, respectful, and aim to provide concise and complete responses of less than 15 words."})
@@ -295,11 +305,14 @@ def transcribe(stream, new_chunk, option):
295
  array = torch.cat(vad_output).cpu().numpy()
296
  duration_ms = len(array) / sr * 1000
297
  if (not(duration_ms < min_speech_ms or duration_ms > max_speech_ms)):
298
- print(len(array))
299
- array = librosa.util.fix_length(array, size=(16000 * 30))
300
- print(len(array))
301
  start_time = time.time()
302
- prompt=" ".join(s2t(array)[0][0].split()[1:])
303
  vad_output = None
304
  if len(prompt.strip().split())<2:
305
  text_str1=text_str
@@ -310,8 +323,11 @@ def transcribe(stream, new_chunk, option):
310
  print(len(prompt.strip().split()))
311
  print(prompt)
312
  asr_output_str=prompt
 
313
  # yield (stream,asr_output_str,text_str, audio_output)
314
- print("--- %s seconds ---" % (time.time() - start_time))
315
  # prompt=ASR_model.transcribe(array)["text"].strip()
316
  chat.append({"role": user_role, "content": prompt})
317
  chat_messages = chat.to_list()
@@ -322,14 +338,16 @@ def transcribe(stream, new_chunk, option):
322
  temperature=0.0,
323
  do_sample=False,
324
  )
325
-
326
- print("--- %s seconds ---" % (time.time() - start_time))
327
  generated_text = output[0]['generated_text'][-1]["content"]
328
 
329
  # torch.mps.empty_cache()
330
 
331
  chat.append({"role": "assistant", "content": generated_text})
332
text_str=generated_text
333
  # import pdb;pdb.set_trace()
334
  with torch.no_grad():
335
  if option=="ChatTTS":
@@ -347,7 +365,7 @@ def transcribe(stream, new_chunk, option):
347
  audio_output=(text2speech.fs, audio_chunk)
348
  audio_output1=(orig_sr,stream)
349
  stream=y
350
- print("--- %s seconds ---" % (time.time() - start_time))
351
  # else:
352
  # audio_output=None
353
  text_str1=text_str
@@ -362,20 +380,22 @@ def transcribe(stream, new_chunk, option):
362
  if current_record_time-start_record_time>300:
363
  gr.Info("Conversations are limited to 5 minutes. The session will restart in approximately 60 seconds. Please wait for the demo to reset. Close this message once you have read it.", duration=None)
364
  yield stream,gr.Textbox(visible=False),gr.Textbox(visible=False),gr.Audio(visible=False),gr.Audio(visible=False)
365
- # api.upload_folder(
366
- # folder_path="flagged_data_points",
367
- # path_in_repo="checkpoint_"+str(start_record_time),
368
- # repo_id="Siddhant/Cascaded_demo_data",
369
- # repo_type="dataset",
370
- # token=access_token,
371
- # # ignore_patterns="**/logs/*.txt", # Ignore all text logs
372
- # )
373
  chat.buffer=[{"role": "system", "content": "You are a helpful and friendly AI assistant. You are polite, respectful, and aim to provide concise and complete responses of less than 15 words."}]
374
  text_str=""
375
  audio_output = None
376
  audio_output1 = None
377
  asr_output_str = ""
378
start_record_time = None
379
  shutil.rmtree('flagged_data_points')
380
  os.mkdir("flagged_data_points")
381
  yield (stream,asr_output_str,text_str1, audio_output, audio_output1)
@@ -486,6 +506,8 @@ def handle_LLM_selection(option):
486
 
487
  def handle_ASR_selection(option):
488
yield gr.Textbox(visible=False),gr.Textbox(visible=False),gr.Audio(visible=False)
489
  global s2t
490
  if option=="espnet/owsm_v3.1_ebf":
491
  s2t = Speech2Text.from_pretrained(
@@ -495,6 +517,14 @@ def handle_ASR_selection(option):
495
  beam_size=1,
496
  predict_time=False,
497
)
498
  else:
499
  s2t = Speech2TextGreedySearch.from_pretrained(
500
  option,
@@ -508,17 +538,36 @@ def handle_ASR_selection(option):
508
  end_event = torch.cuda.Event(enable_timing=True)
509
  torch.cuda.synchronize()
510
  start_event.record()
511
- speech = librosa.util.fix_length(dummy_input, size=(16000 * 30))
512
- res = s2t(speech)
513
  end_event.record()
514
  torch.cuda.synchronize()
515
  yield gr.Textbox(visible=True),gr.Textbox(visible=True),gr.Audio(visible=True)
516
- # demo = gr.Interface(
517
- # transcribe,
518
- # ["state", gr.Audio(sources=["microphone"], streaming=True, waveform_options=gr.WaveformOptions(sample_rate=16000))],
519
- # ["state", gr.Textbox(label="ASR output"),gr.Textbox(label="LLM output"), gr.Audio(label="Output", autoplay=True)],
520
- # live=True,
521
- # )
522
  with gr.Blocks(
523
  title="E2E Spoken Dialog System",
524
  ) as demo:
@@ -527,7 +576,7 @@ with gr.Blocks(
527
  user_audio = gr.Audio(sources=["microphone"], streaming=True, waveform_options=gr.WaveformOptions(sample_rate=16000))
528
  with gr.Row():
529
  ASR_radio = gr.Radio(
530
- choices=["pyf98/owsm_ctc_v3.1_1B", "espnet/owsm_ctc_v3.2_ft_1B", "espnet/owsm_v3.1_ebf"],
531
  label="Choose ASR:",
532
  value="pyf98/owsm_ctc_v3.1_1B",
533
  )
@@ -574,6 +623,15 @@ with gr.Blocks(
574
  output_audio1 = gr.Audio(label="Output1", autoplay=False, visible=False)
575
  output_asr_text = gr.Textbox(label="ASR output")
576
output_text = gr.Textbox(label="LLM output")
577
  state = gr.State()
578
  with gr.Row():
579
  privacy_text = gr.Textbox(label="Privacy Notice",interactive=False, value="By using this demo, you acknowledge that interactions with this dialog system are collected for research and improvement purposes. The data will only be used to enhance the performance and understanding of the system. If you have any concerns about data collection, please discontinue use.")
@@ -604,10 +662,11 @@ with gr.Blocks(
604
  diversity_response = gr.Textbox(label="diversity_response",visible=False,interactive=False)
605
  ip_address = gr.Textbox(label="ip_address",visible=False,interactive=False)
606
  callback.setup([user_audio, output_asr_text, output_text, output_audio,output_audio1,natural_response,diversity_response,ip_address],"flagged_data_points")
607
- user_audio.stream(transcribe, inputs=[state, user_audio, radio], outputs=[state, output_asr_text, output_text, output_audio, output_audio1]).then(lambda *args: callback.flag(list(args)),[user_audio], None,preprocess=False)
608
  radio.change(fn=handle_selection, inputs=[radio], outputs=[output_asr_text, output_text, output_audio])
609
  LLM_radio.change(fn=handle_LLM_selection, inputs=[LLM_radio], outputs=[output_asr_text, output_text, output_audio])
610
  ASR_radio.change(fn=handle_ASR_selection, inputs=[ASR_radio], outputs=[output_asr_text, output_text, output_audio])
 
611
  output_audio.play(
612
  flash_buttons, [], [natural_response,diversity_response]+btn_list
613
  ).then(lambda *args: callback.flag(list(args)),[user_audio,output_asr_text, output_text, output_audio,output_audio1], None,preprocess=False)
 
1
  import os
2
  import shutil
3
+ import soundfile
4
+ from eval.TTS_intelligibility import handle_espnet_TTS_intelligibility
5
+ from eval.ASR_WER import handle_espnet_ASR_WER
6
+ from eval.TTS_speech_quality import TTS_psuedomos
7
+ from eval.LLM_Metrics import perplexity, vert, bert_score, DialoGPT_perplexity
8
  from huggingface_hub import HfApi
9
+ from utils import int2float
10
  api = HfApi()
11
  import nltk
12
  nltk.download('averaged_perceptron_tagger_eng')
 
34
  noise_scale=0.333,
35
  noise_scale_dur=0.333,
36
  )
37
+ try:
38
+ import versa
39
+ except ImportError:
40
+ from subprocess import call
41
+ with open('versa.sh', 'rb') as file:
42
+ script = file.read()
43
+ rc = call(script, shell=True)
44
  import numpy as np
45
  from VAD.vad_iterator import VADIterator
46
  import torch
 
77
  import kaldiio
78
  from espnet2.bin.s2t_inference_ctc import Speech2TextGreedySearch
79
  from espnet2.bin.s2t_inference import Speech2Text
80
+ from espnet2.bin.asr_inference import Speech2Text as S2T_ASR
81
+ import whisper
82
 
83
  s2t = Speech2TextGreedySearch.from_pretrained(
84
  "pyf98/owsm_ctc_v3.1_1B",
 
87
  lang_sym='<eng>',
88
  task_sym='<asr>',
89
  )
90
+ latency_ASR=0.0
91
+ latency_LM=0.0
92
+ latency_TTS=0.0
93
 
94
  start_event = torch.cuda.Event(enable_timing=True)
95
  end_event = torch.cuda.Event(enable_timing=True)
 
100
  end_event.record()
101
  torch.cuda.synchronize()
102

103
  text_str=""
104
  asr_output_str=""
105
  vad_output=None
106
  audio_output = None
107
  audio_output1 = None
108
+ LLM_response_arr=[]
109
+ total_response_arr=[]
110
  min_speech_ms=500
111
  max_speech_ms=float("inf")
112
  # ASR_model = LightningWhisperMLX(model="distil-large-v3", batch_size=6, quant=None)
 
245
  import json
246
  import time
247
 
248
+ def transcribe(stream, new_chunk, option, asr_option):
 
249
  sr, y = new_chunk
250
  global text_str
251
  global chat
 
257
  global start_record_time
258
  global sids
259
  global spembs
260
+ global latency_ASR
261
+ global latency_LM
262
+ global latency_TTS
263
+ global LLM_response_arr
264
+ global total_response_arr
265
  if stream is None:
266
  stream=y
267
  chat.init_chat({"role": "system", "content": "You are a helpful and friendly AI assistant. You are polite, respectful, and aim to provide concise and complete responses of less than 15 words."})
 
305
  array = torch.cat(vad_output).cpu().numpy()
306
  duration_ms = len(array) / sr * 1000
307
if (not(duration_ms < min_speech_ms or duration_ms > max_speech_ms)):
308
  start_time = time.time()
309
+ if asr_option=="whisper":
310
+ prompt=s2t.transcribe(torch.tensor(array).float(), beam_size=1)["text"]
311
+ elif asr_option=="librispeech_asr":
312
+ prompt=s2t(array)[0][0]
313
+ else:
314
+ array = librosa.util.fix_length(array, size=(16000 * 30))
315
+ prompt=" ".join(s2t(array)[0][0].split()[1:])
316
  vad_output = None
317
  if len(prompt.strip().split())<2:
318
  text_str1=text_str
 
323
  print(len(prompt.strip().split()))
324
  print(prompt)
325
  asr_output_str=prompt
326
+ total_response_arr.append(prompt.replace("\n"," "))
327
  # yield (stream,asr_output_str,text_str, audio_output)
328
+ start_LM_time=time.time()
329
+ latency_ASR=(start_LM_time - start_time)
330
+ # print("--- %s seconds ---" % (time.time() - start_time))
331
  # prompt=ASR_model.transcribe(array)["text"].strip()
332
  chat.append({"role": user_role, "content": prompt})
333
  chat_messages = chat.to_list()
 
338
  temperature=0.0,
339
  do_sample=False,
340
  )
341
+ start_TTS_time=time.time()
342
+ latency_LM=(start_TTS_time - start_LM_time)
343
  generated_text = output[0]['generated_text'][-1]["content"]
344
 
345
  # torch.mps.empty_cache()
346
 
347
  chat.append({"role": "assistant", "content": generated_text})
348
  text_str=generated_text
349
+ LLM_response_arr.append(text_str.replace("\n"," "))
350
+ total_response_arr.append(text_str.replace("\n"," "))
351
  # import pdb;pdb.set_trace()
352
  with torch.no_grad():
353
  if option=="ChatTTS":
 
365
  audio_output=(text2speech.fs, audio_chunk)
366
  audio_output1=(orig_sr,stream)
367
  stream=y
368
+ latency_TTS=(time.time() - start_TTS_time)
369
  # else:
370
  # audio_output=None
371
  text_str1=text_str
 
380
  if current_record_time-start_record_time>300:
381
  gr.Info("Conversations are limited to 5 minutes. The session will restart in approximately 60 seconds. Please wait for the demo to reset. Close this message once you have read it.", duration=None)
382
  yield stream,gr.Textbox(visible=False),gr.Textbox(visible=False),gr.Audio(visible=False),gr.Audio(visible=False)
383
+ api.upload_folder(
384
+ folder_path="flagged_data_points",
385
+ path_in_repo="checkpoint_"+str(start_record_time),
386
+ repo_id="Siddhant/Cascaded_demo_data",
387
+ repo_type="dataset",
388
+ token=access_token,
389
+ # ignore_patterns="**/logs/*.txt", # Ignore all text logs
390
+ )
391
  chat.buffer=[{"role": "system", "content": "You are a helpful and friendly AI assistant. You are polite, respectful, and aim to provide concise and complete responses of less than 15 words."}]
392
  text_str=""
393
  audio_output = None
394
  audio_output1 = None
395
  asr_output_str = ""
396
  start_record_time = None
397
+ LLM_response_arr=[]
398
+ total_response_arr=[]
399
  shutil.rmtree('flagged_data_points')
400
  os.mkdir("flagged_data_points")
401
  yield (stream,asr_output_str,text_str1, audio_output, audio_output1)
 
506
 
507
  def handle_ASR_selection(option):
508
  yield gr.Textbox(visible=False),gr.Textbox(visible=False),gr.Audio(visible=False)
509
+ if option=="librispeech_asr":
510
+ option="espnet/simpleoier_librispeech_asr_train_asr_conformer7_wavlm_large_raw_en_bpe5000_sp"
511
  global s2t
512
  if option=="espnet/owsm_v3.1_ebf":
513
  s2t = Speech2Text.from_pretrained(
 
517
  beam_size=1,
518
  predict_time=False,
519
  )
520
+ elif option=="espnet/simpleoier_librispeech_asr_train_asr_conformer7_wavlm_large_raw_en_bpe5000_sp":
521
+ s2t = S2T_ASR.from_pretrained(
522
+ model_tag=option,
523
+ device="cuda",
524
+ beam_size=1,
525
+ )
526
+ elif option=="whisper":
527
+ s2t = whisper.load_model("large", device="cuda")
528
  else:
529
  s2t = Speech2TextGreedySearch.from_pretrained(
530
  option,
 
538
  end_event = torch.cuda.Event(enable_timing=True)
539
  torch.cuda.synchronize()
540
  start_event.record()
541
+
542
+ if option=="whisper":
543
+ audio, rate = soundfile.read("tts_samples/sample1.wav")
544
+ array=librosa.resample(audio, orig_sr=rate, target_sr=16000)
545
+ res=s2t.transcribe(torch.tensor(array).float(), beam_size=1)["text"]
546
+ elif option=="espnet/simpleoier_librispeech_asr_train_asr_conformer7_wavlm_large_raw_en_bpe5000_sp":
547
+ res = s2t(dummy_input)[0][0]
548
+ else:
549
+ speech = librosa.util.fix_length(dummy_input, size=(16000 * 30))
550
+ res = s2t(speech)
551
  end_event.record()
552
  torch.cuda.synchronize()
553
  yield gr.Textbox(visible=True),gr.Textbox(visible=True),gr.Audio(visible=True)
554
+
555
+ def handle_eval_selection(option, TTS_audio_output, LLM_Output, ASR_audio_output, ASR_transcript):
556
+ global LLM_response_arr
557
+ global total_response_arr
558
+ yield (option,gr.Textbox(visible=True))
559
+ if option=="Latency":
560
+ text=f"ASR Latency: {latency_ASR:.2f}\nLLM Latency: {latency_LM:.2f}\nTTS Latency: {latency_TTS:.2f}"
561
+ yield (None,text)
562
+ elif option=="TTS Intelligibility":
563
+ yield (None,handle_espnet_TTS_intelligibility(TTS_audio_output,LLM_Output))
564
+ elif option=="TTS Speech Quality":
565
+ yield (None,TTS_psuedomos(TTS_audio_output))
566
+ elif option=="ASR WER":
567
+ yield (None,handle_espnet_ASR_WER(ASR_audio_output, ASR_transcript))
568
+ elif option=="Text Dialog Metrics":
569
+ yield (None,perplexity(LLM_Output.replace("\n"," "))+vert(LLM_response_arr)+bert_score(total_response_arr)+DialoGPT_perplexity(ASR_transcript.replace("\n"," "),LLM_Output.replace("\n"," ")))
570
+
571
  with gr.Blocks(
572
  title="E2E Spoken Dialog System",
573
  ) as demo:
 
576
  user_audio = gr.Audio(sources=["microphone"], streaming=True, waveform_options=gr.WaveformOptions(sample_rate=16000))
577
  with gr.Row():
578
  ASR_radio = gr.Radio(
579
+ choices=["pyf98/owsm_ctc_v3.1_1B", "espnet/owsm_ctc_v3.2_ft_1B", "espnet/owsm_v3.1_ebf", "librispeech_asr", "whisper"],
580
  label="Choose ASR:",
581
  value="pyf98/owsm_ctc_v3.1_1B",
582
  )
 
623
  output_audio1 = gr.Audio(label="Output1", autoplay=False, visible=False)
624
  output_asr_text = gr.Textbox(label="ASR output")
625
  output_text = gr.Textbox(label="LLM output")
626
+ eval_radio = gr.Radio(
627
+ choices=["Latency", "TTS Intelligibility", "TTS Speech Quality", "ASR WER","Text Dialog Metrics"],
628
+ label="Choose Evaluation metrics:",
629
+ )
630
+ # TTS Intelligibility_radio = gr.Radio(
631
+ # choices=["ESPnet", "TTS Intelligibility", "TTS Speech Quality"],
632
+ # label="Choose ASR model:",
633
+ # )
634
+ output_eval_text = gr.Textbox(label="Evaluation Results")
635
  state = gr.State()
636
  with gr.Row():
637
  privacy_text = gr.Textbox(label="Privacy Notice",interactive=False, value="By using this demo, you acknowledge that interactions with this dialog system are collected for research and improvement purposes. The data will only be used to enhance the performance and understanding of the system. If you have any concerns about data collection, please discontinue use.")
 
662
  diversity_response = gr.Textbox(label="diversity_response",visible=False,interactive=False)
663
  ip_address = gr.Textbox(label="ip_address",visible=False,interactive=False)
664
  callback.setup([user_audio, output_asr_text, output_text, output_audio,output_audio1,natural_response,diversity_response,ip_address],"flagged_data_points")
665
+ user_audio.stream(transcribe, inputs=[state, user_audio, radio, ASR_radio], outputs=[state, output_asr_text, output_text, output_audio, output_audio1]).then(lambda *args: callback.flag(list(args)),[user_audio], None,preprocess=False)
666
  radio.change(fn=handle_selection, inputs=[radio], outputs=[output_asr_text, output_text, output_audio])
667
  LLM_radio.change(fn=handle_LLM_selection, inputs=[LLM_radio], outputs=[output_asr_text, output_text, output_audio])
668
  ASR_radio.change(fn=handle_ASR_selection, inputs=[ASR_radio], outputs=[output_asr_text, output_text, output_audio])
669
+ eval_radio.change(fn=handle_eval_selection, inputs=[eval_radio,output_audio,output_text,output_audio1,output_asr_text], outputs=[eval_radio,output_eval_text])
670
  output_audio.play(
671
  flash_buttons, [], [natural_response,diversity_response]+btn_list
672
  ).then(lambda *args: callback.flag(list(args)),[user_audio,output_asr_text, output_text, output_audio,output_audio1], None,preprocess=False)
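A minimal, self-contained sketch of the per-stage latency bookkeeping that the updated transcribe() performs (one wall-clock checkpoint per stage); the staged_latencies helper and the time.sleep stand-ins below are illustrative only, not part of the app:

import time

def staged_latencies():
    # One wall-clock checkpoint per stage; the differences are the per-stage
    # latencies, mirroring the latency_ASR / latency_LM / latency_TTS globals.
    t0 = time.time()
    time.sleep(0.01)   # stand-in for ASR decoding
    t1 = time.time()
    time.sleep(0.01)   # stand-in for LLM generation
    t2 = time.time()
    time.sleep(0.01)   # stand-in for TTS synthesis
    t3 = time.time()
    return t1 - t0, t2 - t1, t3 - t2

latency_ASR, latency_LM, latency_TTS = staged_latencies()
print(f"ASR Latency: {latency_ASR:.2f}\nLLM Latency: {latency_LM:.2f}\nTTS Latency: {latency_TTS:.2f}")
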
eval/ASR_WER.py ADDED
@@ -0,0 +1,55 @@
1
+ from utils import int2float
2
+ def handle_espnet_ASR_WER(ASR_audio_output,ASR_transcript):
3
+ from versa import espnet_levenshtein_metric, espnet_wer_setup, owsm_levenshtein_metric, owsm_wer_setup, whisper_levenshtein_metric, whisper_wer_setup
4
+ score_modules_espnet = {
5
+ "module": espnet_levenshtein_metric,
6
+ "args": espnet_wer_setup(
7
+ model_tag="default",
8
+ beam_size=1,
9
+ text_cleaner="whisper_en",
10
+ use_gpu=True,
11
+ ),
12
+ }
13
+ dict1=score_modules_espnet["module"](
14
+ score_modules_espnet["args"],
15
+ int2float(ASR_audio_output[1]),
16
+ ASR_transcript,
17
+ ASR_audio_output[0],
18
+ )
19
+ espnet_wer=(dict1["espnet_wer_delete"]+dict1["espnet_wer_insert"]+dict1["espnet_wer_replace"])/(dict1["espnet_wer_insert"]+dict1["espnet_wer_replace"]+dict1["espnet_wer_equal"])
20
+ espnet_cer=(dict1["espnet_cer_delete"]+dict1["espnet_cer_insert"]+dict1["espnet_cer_replace"])/(dict1["espnet_cer_insert"]+dict1["espnet_cer_replace"]+dict1["espnet_cer_equal"])
21
+ score_modules_owsm = {
22
+ "module": owsm_levenshtein_metric,
23
+ "args": owsm_wer_setup(
24
+ model_tag="default",
25
+ beam_size=1,
26
+ text_cleaner="whisper_en",
27
+ use_gpu=True,
28
+ ),
29
+ }
30
+ dict1=score_modules_owsm["module"](
31
+ score_modules_owsm["args"],
32
+ int2float(ASR_audio_output[1]),
33
+ ASR_transcript,
34
+ ASR_audio_output[0],
35
+ )
36
+ owsm_wer=(dict1["owsm_wer_delete"]+dict1["owsm_wer_insert"]+dict1["owsm_wer_replace"])/(dict1["owsm_wer_insert"]+dict1["owsm_wer_replace"]+dict1["owsm_wer_equal"])
37
+ owsm_cer=(dict1["owsm_cer_delete"]+dict1["owsm_cer_insert"]+dict1["owsm_cer_replace"])/(dict1["owsm_cer_insert"]+dict1["owsm_cer_replace"]+dict1["owsm_cer_equal"])
38
+ score_modules_whisper = {
39
+ "module": whisper_levenshtein_metric,
40
+ "args": whisper_wer_setup(
41
+ model_tag="default",
42
+ beam_size=1,
43
+ text_cleaner="whisper_en",
44
+ use_gpu=True,
45
+ ),
46
+ }
47
+ dict1=score_modules_whisper["module"](
48
+ score_modules_whisper["args"],
49
+ int2float(ASR_audio_output[1]),
50
+ ASR_transcript,
51
+ ASR_audio_output[0],
52
+ )
53
+ whisper_wer=(dict1["whisper_wer_delete"]+dict1["whisper_wer_insert"]+dict1["whisper_wer_replace"])/(dict1["whisper_wer_insert"]+dict1["whisper_wer_replace"]+dict1["whisper_wer_equal"])
54
+ whisper_cer=(dict1["whisper_cer_delete"]+dict1["whisper_cer_insert"]+dict1["whisper_cer_replace"])/(dict1["whisper_cer_insert"]+dict1["whisper_cer_replace"]+dict1["whisper_cer_equal"])
55
+ return f"ESPnet WER: {espnet_wer*100:.2f}\nESPnet CER: {espnet_cer*100:.2f}\nOWSM WER: {owsm_wer*100:.2f}\nOWSM CER: {owsm_cer*100:.2f}\nWhisper WER: {whisper_wer*100:.2f}\nWhisper CER: {whisper_cer*100:.2f}"
eval/LLM_Metrics.py ADDED
@@ -0,0 +1,106 @@
1
+ from multiprocessing import Pool
2
+ from eval.vert import get_self_bleu2_geometric, get_auto_bleu2_geometric, run_f
3
+ import numpy as np
4
+ from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel
5
+ import torch
6
+ from sklearn.metrics.pairwise import cosine_similarity
7
+ from scipy.stats import gmean
8
+ def perplexity(LLM_Output):
9
+ import evaluate
10
+ # import pdb;pdb.set_trace()
11
+ perplexity = evaluate.load("perplexity", module_type="metric")
12
+ results = perplexity.compute(model_id='gpt2',predictions=[LLM_Output])
13
+ return f"Perplexity: {results['mean_perplexity']:.2f}\n"
14
+
15
+ def vert(LLM_response_arr):
16
+ # import pdb;pdb.set_trace()
17
+ terms = [x.strip().split() for x in LLM_response_arr]
18
+
19
+
20
+ tasks = [
21
+ ('Self-BLEU2-geometric', get_self_bleu2_geometric),
22
+ ('Auto-BLEU2-geometric', get_auto_bleu2_geometric),
23
+ ]
24
+ n_processes = min(16, len(tasks))
25
+ with Pool(n_processes) as pool:
26
+ metrics = pool.map(run_f, [(t[1], terms) for t in tasks])
27
+ metric_arr=[]
28
+ str1=""
29
+ for (metric_name, _), metric in zip(tasks, metrics):
30
+ metric, sem = np.mean(metric), np.std(metric) / np.sqrt(len(metric))
31
+
32
+ metric, sem = [
33
+ round(100 * x, 2) for x in [metric, sem]
34
+ ]
35
+ metric_arr.append(metric)
36
+
37
+ str1+=(f'{metric_name}: {metric}\n')
38
+ str1+=(f'VERT: {round(gmean(metric_arr), 2)}\n')
39
+ return str1
40
+
41
+ def bert_score(total_response_arr):
42
+ # import pdb;pdb.set_trace()
43
+ def cosine_similarity_context_response(context, response, model, tokenizer):
44
+ # Tokenize and encode both context and response
45
+ context_inputs = tokenizer(context, return_tensors="pt", truncation=True)
46
+ response_inputs = tokenizer(response, return_tensors="pt", truncation=True)
47
+ for k in context_inputs:
48
+ context_inputs[k]=context_inputs[k].cuda()
49
+ for k in response_inputs:
50
+ response_inputs[k]=response_inputs[k].cuda()
51
+
52
+ # Get embeddings from the model
53
+ with torch.no_grad():
54
+ context_embedding = model(**context_inputs).last_hidden_state.mean(dim=1)
55
+ response_embedding = model(**response_inputs).last_hidden_state.mean(dim=1)
56
+
57
+ # Compute cosine similarity
58
+ similarity = cosine_similarity(context_embedding.cpu().numpy(), response_embedding.cpu().numpy())
59
+ return similarity[0][0]
60
+
61
+ bert_model_name = "bert-base-uncased"
62
+ bert_model = AutoModel.from_pretrained(bert_model_name).cuda()
63
+ bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
64
+ similarity = cosine_similarity_context_response(" ".join(total_response_arr[:-1]), total_response_arr[-1], bert_model, bert_tokenizer)
65
+ return (f"Cosine Similarity: {similarity*100:.2f}"+"\n")
66
+
67
+ def DialoGPT_perplexity(user_utterance, response):
68
+ # import pdb;pdb.set_trace()
69
+ def evaluate_response_with_dialoGPT(context, response, model, tokenizer):
70
+ """
71
+ Evaluate the appropriateness of a response based on the given context using DialoGPT.
72
+
73
+ Args:
74
+ context (str): The dialogue context (previous conversation).
75
+ response (str): The generated response to evaluate.
76
+ model: Pre-trained DialoGPT model.
77
+ tokenizer: Corresponding tokenizer for the DialoGPT model.
78
+
79
+ Returns:
80
+ float: Perplexity score of the response given the context.
81
+ """
82
+ model.eval()
83
+
84
+ # Combine context and response as input
85
+ input_text = context + tokenizer.eos_token + response + tokenizer.eos_token
86
+ inputs = tokenizer(input_text, return_tensors="pt", truncation=True)
87
+ inputs['input_ids']=inputs['input_ids'].cuda()
88
+ inputs['attention_mask']=inputs['attention_mask'].cuda()
89
+ # import pdb;pdb.set_trace()
90
+
91
+ # Compute model outputs and loss
92
+ with torch.no_grad():
93
+ outputs = model(**inputs, labels=inputs["input_ids"].cuda())
94
+ loss = outputs.loss
95
+
96
+ # Calculate perplexity
97
+ perplexity = torch.exp(loss)
98
+ return perplexity.cpu().item()
99
+
100
+ # Load DialoGPT model and tokenizer
101
+ model_name = "microsoft/DialoGPT-medium" # Choose small/medium/large based on your resources
102
+ model = AutoModelForCausalLM.from_pretrained(model_name).cuda()
103
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
104
+ perplexity = evaluate_response_with_dialoGPT(user_utterance, response, model, tokenizer)
105
+ return (f"DialoGPT Perplexity: {perplexity:.2f}"+"\n")
106
+
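A sketch of how these text-dialog helpers compose: each returns a formatted string, so handle_eval_selection can simply concatenate them. A CUDA device plus the evaluate, transformers, scikit-learn and scipy dependencies are assumed; the toy dialog below is illustrative:

from eval.LLM_Metrics import perplexity, vert, bert_score, DialoGPT_perplexity

llm_responses = ["Paris is the capital of France.", "It has about two million residents."]
dialog_history = ["what is the capital of france", llm_responses[0]]

report = (perplexity(llm_responses[-1])            # GPT-2 perplexity of the last reply
          + vert(llm_responses)                    # Self/Auto-BLEU2 diversity plus VERT
          + bert_score(dialog_history)             # BERT cosine similarity, context vs. reply
          + DialoGPT_perplexity(dialog_history[0], dialog_history[1]))
print(report)
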
eval/TTS_intelligibility.py ADDED
@@ -0,0 +1,58 @@
1
+ from utils import int2float
2
+ def handle_espnet_TTS_intelligibility(TTS_audio_output,LLM_Output):
3
+ from versa import espnet_levenshtein_metric, espnet_wer_setup, owsm_levenshtein_metric, owsm_wer_setup, whisper_levenshtein_metric, whisper_wer_setup
4
+ score_modules_espnet = {
5
+ "module": espnet_levenshtein_metric,
6
+ "args": espnet_wer_setup(
7
+ model_tag="default",
8
+ beam_size=1,
9
+ text_cleaner="whisper_en",
10
+ use_gpu=True,
11
+ ),
12
+ }
13
+ # import pdb;pdb.set_trace()
14
+ dict1=score_modules_espnet["module"](
15
+ score_modules_espnet["args"],
16
+ int2float(TTS_audio_output[1]),
17
+ LLM_Output,
18
+ TTS_audio_output[0],
19
+ )
20
+ espnet_wer=(dict1["espnet_wer_delete"]+dict1["espnet_wer_insert"]+dict1["espnet_wer_replace"])/(dict1["espnet_wer_delete"]+dict1["espnet_wer_replace"]+dict1["espnet_wer_equal"])
21
+ espnet_cer=(dict1["espnet_cer_delete"]+dict1["espnet_cer_insert"]+dict1["espnet_cer_replace"])/(dict1["espnet_cer_delete"]+dict1["espnet_cer_replace"]+dict1["espnet_cer_equal"])
22
+ score_modules_owsm = {
23
+ "module": owsm_levenshtein_metric,
24
+ "args": owsm_wer_setup(
25
+ model_tag="default",
26
+ beam_size=1,
27
+ text_cleaner="whisper_en",
28
+ use_gpu=True,
29
+ ),
30
+ }
31
+ # import pdb;pdb.set_trace()
32
+ dict1=score_modules_owsm["module"](
33
+ score_modules_owsm["args"],
34
+ int2float(TTS_audio_output[1]),
35
+ LLM_Output,
36
+ TTS_audio_output[0],
37
+ )
38
+ owsm_wer=(dict1["owsm_wer_delete"]+dict1["owsm_wer_insert"]+dict1["owsm_wer_replace"])/(dict1["owsm_wer_delete"]+dict1["owsm_wer_replace"]+dict1["owsm_wer_equal"])
39
+ owsm_cer=(dict1["owsm_cer_delete"]+dict1["owsm_cer_insert"]+dict1["owsm_cer_replace"])/(dict1["owsm_cer_delete"]+dict1["owsm_cer_replace"]+dict1["owsm_cer_equal"])
40
+ score_modules_whisper = {
41
+ "module": whisper_levenshtein_metric,
42
+ "args": whisper_wer_setup(
43
+ model_tag="default",
44
+ beam_size=1,
45
+ text_cleaner="whisper_en",
46
+ use_gpu=True,
47
+ ),
48
+ }
49
+ # import pdb;pdb.set_trace()
50
+ dict1=score_modules_whisper["module"](
51
+ score_modules_whisper["args"],
52
+ int2float(TTS_audio_output[1]),
53
+ LLM_Output,
54
+ TTS_audio_output[0],
55
+ )
56
+ whisper_wer=(dict1["whisper_wer_delete"]+dict1["whisper_wer_insert"]+dict1["whisper_wer_replace"])/(dict1["whisper_wer_delete"]+dict1["whisper_wer_replace"]+dict1["whisper_wer_equal"])
57
+ whisper_cer=(dict1["whisper_cer_delete"]+dict1["whisper_cer_insert"]+dict1["whisper_cer_replace"])/(dict1["whisper_cer_delete"]+dict1["whisper_cer_replace"]+dict1["whisper_cer_equal"])
58
+ return f"ESPnet WER: {espnet_wer*100:.2f}\nESPnet CER: {espnet_cer*100:.2f}\nOWSM WER: {owsm_wer*100:.2f}\nOWSM CER: {owsm_cer*100:.2f}\nWhisper WER: {whisper_wer*100:.2f}\nWhisper CER: {whisper_cer*100:.2f}"
eval/TTS_speech_quality.py ADDED
@@ -0,0 +1,42 @@
1
+ from utils import int2float
2
+ def TTS_psuedomos(TTS_audio_output):
3
+ from versa import pseudo_mos_metric, pseudo_mos_setup, sheet_ssqa, sheet_ssqa_setup
4
+
5
+ predictor_dict, predictor_fs = pseudo_mos_setup(
6
+ use_gpu=True,
7
+ predictor_types=["utmos", "dnsmos", "plcmos"],
8
+ predictor_args={"utmos":{"fs": 16000},"dnsmos":{"fs": 16000},"plcmos":{"fs": 16000}},
9
+ )
10
+ score_modules = {
11
+ "module": pseudo_mos_metric,
12
+ "args": {
13
+ "predictor_dict": predictor_dict,
14
+ "predictor_fs": predictor_fs,
15
+ "use_gpu": True,
16
+ },
17
+ }
18
+ dict1=score_modules["module"](
19
+ int2float(TTS_audio_output[1]),
20
+ TTS_audio_output[0],
21
+ **score_modules["args"],
22
+ )
23
+ str1=""
24
+ for k in dict1:
25
+ str1=str1+f"{k}: {dict1[k]:.2f}\n"
26
+ sheet_model = sheet_ssqa_setup(
27
+ model_tag="default",
28
+ model_path=None,
29
+ model_config=None,
30
+ use_gpu=True,
31
+ )
32
+ score_modules = {
33
+ "module": sheet_ssqa,
34
+ "args": {"model": sheet_model, "use_gpu": True},
35
+ }
36
+ dict1 = score_modules["module"](
37
+ score_modules["args"]["model"], int2float(TTS_audio_output[1]), TTS_audio_output[0],
38
+ use_gpu=score_modules["args"]["use_gpu"]
39
+ )
40
+ for k in dict1:
41
+ str1=str1+f"{k}: {dict1[k]:.2f}\n"
42
+ return str1
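A quick sketch of calling TTS_psuedomos directly (versa and a GPU assumed); the input is the same (sampling_rate, int16 waveform) tuple the demo's TTS path yields, and the noise buffer below only stands in for synthesized speech:

import numpy as np
from eval.TTS_speech_quality import TTS_psuedomos

fs = 16000
wav = (np.random.randn(fs) * 1000).astype(np.int16)   # 1 s of low-level noise
print(TTS_psuedomos((fs, wav)))   # one "metric: value" line per predictor
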
eval/vert.py ADDED
@@ -0,0 +1,272 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import numpy as np
7
+ import nltk
8
+ import math
9
+ import sys
10
+ from fractions import Fraction
11
+ import warnings
12
+ from collections import Counter
13
+ from nltk.translate.bleu_score import modified_precision, closest_ref_length, brevity_penalty, SmoothingFunction
14
+ import warnings
15
+ def corpus_bleu(
16
+ list_of_references,
17
+ hypotheses,
18
+ weights=(0.25, 0.25, 0.25, 0.25),
19
+ smoothing_function=None,
20
+ auto_reweigh=False,
21
+ averaging_mode="geometric",
22
+ no_length_penalty=False
23
+ ):
24
+ """
25
+ Calculate a single corpus-level BLEU score (aka. system-level BLEU) for all
26
+ the hypotheses and their respective references.
27
+
28
+ Instead of averaging the sentence level BLEU scores (i.e. macro-average
29
+ precision), the original BLEU metric (Papineni et al. 2002) accounts for
30
+ the micro-average precision (i.e. summing the numerators and denominators
31
+ for each hypothesis-reference(s) pairs before the division).
32
+
33
+ >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
34
+ ... 'ensures', 'that', 'the', 'military', 'always',
35
+ ... 'obeys', 'the', 'commands', 'of', 'the', 'party']
36
+ >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
37
+ ... 'ensures', 'that', 'the', 'military', 'will', 'forever',
38
+ ... 'heed', 'Party', 'commands']
39
+ >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which',
40
+ ... 'guarantees', 'the', 'military', 'forces', 'always',
41
+ ... 'being', 'under', 'the', 'command', 'of', 'the', 'Party']
42
+ >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
43
+ ... 'army', 'always', 'to', 'heed', 'the', 'directions',
44
+ ... 'of', 'the', 'party']
45
+
46
+ >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
47
+ ... 'interested', 'in', 'world', 'history']
48
+ >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
49
+ ... 'because', 'he', 'read', 'the', 'book']
50
+
51
+ >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
52
+ >>> hypotheses = [hyp1, hyp2]
53
+ >>> corpus_bleu(list_of_references, hypotheses) # doctest: +ELLIPSIS
54
+ 0.5920...
55
+
56
+ The example below shows that corpus_bleu() is different from averaging
57
+ sentence_bleu() for hypotheses
58
+
59
+ >>> score1 = sentence_bleu([ref1a, ref1b, ref1c], hyp1)
60
+ >>> score2 = sentence_bleu([ref2a], hyp2)
61
+ >>> (score1 + score2) / 2 # doctest: +ELLIPSIS
62
+ 0.6223...
63
+
64
+ :param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses
65
+ :type list_of_references: list(list(list(str)))
66
+ :param hypotheses: a list of hypothesis sentences
67
+ :type hypotheses: list(list(str))
68
+ :param weights: weights for unigrams, bigrams, trigrams and so on
69
+ :type weights: list(float)
70
+ :param smoothing_function:
71
+ :type smoothing_function: SmoothingFunction
72
+ :param auto_reweigh: Option to re-normalize the weights uniformly.
73
+ :type auto_reweigh: bool
74
+ :return: The corpus-level BLEU score.
75
+ :rtype: float
76
+ """
77
+ # Before proceeding to compute BLEU, perform sanity checks.
78
+
79
+ p_numerators = Counter() # Key = ngram order, and value = no. of ngram matches.
80
+ p_denominators = Counter() # Key = ngram order, and value = no. of ngram in ref.
81
+ hyp_lengths, ref_lengths = 0, 0
82
+
83
+ assert len(list_of_references) == len(hypotheses), (
84
+ "The number of hypotheses and their reference(s) should be the " "same "
85
+ )
86
+
87
+ # Iterate through each hypothesis and their corresponding references.
88
+ for references, hypothesis in zip(list_of_references, hypotheses):
89
+ # For each order of ngram, calculate the numerator and
90
+ # denominator for the corpus-level modified precision.
91
+ for i, _ in enumerate(weights, start=1):
92
+ p_i = modified_precision(references, hypothesis, i)
93
+ p_numerators[i] += p_i.numerator
94
+ p_denominators[i] += p_i.denominator
95
+
96
+ # Calculate the hypothesis length and the closest reference length.
97
+ # Adds them to the corpus-level hypothesis and reference counts.
98
+ hyp_len = len(hypothesis)
99
+ hyp_lengths += hyp_len
100
+ ref_lengths += closest_ref_length(references, hyp_len)
101
+
102
+ # Calculate corpus-level brevity penalty.
103
+ if no_length_penalty and averaging_mode == 'geometric':
104
+ bp = 1.0
105
+ elif no_length_penalty and averaging_mode == 'arithmetic':
106
+ bp = 0.0
107
+ else:
108
+ assert not no_length_penalty
109
+ assert averaging_mode != 'arithmetic', 'Not sure how to apply length penalty in arithmetic mode'
110
+ bp = brevity_penalty(ref_lengths, hyp_lengths)
111
+
112
+ # Uniformly re-weighting based on maximum hypothesis lengths if largest
113
+ # order of n-grams < 4 and weights is set at default.
114
+ if auto_reweigh:
115
+ if hyp_lengths < 4 and weights == (0.25, 0.25, 0.25, 0.25):
116
+ weights = (1 / hyp_lengths,) * hyp_lengths
117
+
118
+ # Collects the various precision values for the different ngram orders.
119
+ p_n = [
120
+ Fraction(p_numerators[i], p_denominators[i], _normalize=False)
121
+ for i, _ in enumerate(weights, start=1)
122
+ ]
123
+
124
+ # Returns 0 if there's no matching n-grams
125
+ # We only need to check for p_numerators[1] == 0, since if there's
126
+ # no unigrams, there won't be any higher order ngrams.
127
+ if p_numerators[1] == 0:
128
+ return 0
129
+
130
+ # If there's no smoothing, use method0 from the SmoothingFunction class.
131
+ if not smoothing_function:
132
+ smoothing_function = SmoothingFunction().method0
133
+ # Smoothen the modified precision.
134
+ # Note: smoothing_function() may convert values into floats;
135
+ # it tries to retain the Fraction object as much as the
136
+ # smoothing method allows.
137
+ p_n = smoothing_function(
138
+ p_n, references=references, hypothesis=hypothesis, hyp_len=hyp_lengths
139
+ )
140
+
141
+ if averaging_mode == "geometric":
142
+ s = (w_i * math.log(p_i) for w_i, p_i in zip(weights, p_n))
143
+ s = bp * math.exp(math.fsum(s))
144
+ elif averaging_mode == "arithmetic":
145
+ s = (w_i * p_i for w_i, p_i in zip(weights, p_n))
146
+ s = math.fsum(s)
147
+
148
+ return s
149
+
150
+
151
+ def sentence_bleu(
152
+ references,
153
+ hypothesis,
154
+ weights=(0.25, 0.25, 0.25, 0.25),
155
+ smoothing_function=None,
156
+ auto_reweigh=False,
157
+ averaging_mode="geometric",
158
+ no_length_penalty=False
159
+ ):
160
+ return corpus_bleu(
161
+ [references], [hypothesis], weights, smoothing_function, auto_reweigh, averaging_mode, no_length_penalty
162
+ )
163
+
164
+ def get_target_sequences(manifest, ground_truth, to_take=1000):
165
+ import json
166
+ import pathlib
167
+
168
+ with open(ground_truth, 'r') as fin:
169
+ original_continuations = json.loads(fin.read())
170
+
171
+ sequence2length = [(k, v[0]) for k, v in original_continuations.items()]
172
+ assert all(float(v) >= 6.0 for (_, v) in sequence2length) # 6 seconds
173
+
174
+ sequence2length.sort(key=lambda x: x[1])
175
+ to_take_sequences = set(v[0] for v in sequence2length[:to_take])
176
+ to_take_ids = []
177
+
178
+ with open(manifest, 'r') as f:
179
+ f.readline()
180
+
181
+ for i, line in enumerate(f.readlines()):
182
+ seq_id = line.split()[0]
183
+ seq_id = pathlib.Path(seq_id).name.split('__')[0]
184
+
185
+ if seq_id in to_take_sequences:
186
+ to_take_ids.append(i)
187
+
188
+ print(f'Took {len(to_take_ids)} ids')
189
+ return set(to_take_ids)
190
+
191
+ def get_self_bleu(utterances, averaging_mode, weights):
192
+ self_bleu = []
193
+
194
+ for i in range(len(utterances)):
195
+ hypo = utterances[i]
196
+ rest = utterances[:i] + utterances[i+1:]
197
+
198
+ self_bleu.append(sentence_bleu(rest, hypo, weights,
199
+ no_length_penalty=True, averaging_mode=averaging_mode))
200
+
201
+ return self_bleu
202
+
203
+
204
+ def get_self_bleu2_arithmetic(utterances):
205
+ weights = (0.5, 0.5) # equal weight for unigrams and bigrams
206
+ return get_self_bleu(utterances, averaging_mode='arithmetic', weights=weights)
207
+
208
+
209
+ def get_self_bleu2_geometric(utterances):
210
+ weights = (0.5, 0.5)
211
+ return get_self_bleu(utterances, averaging_mode='geometric', weights=weights)
212
+
213
+
214
+ def get_auto_bleu2_arithmetic(utterances):
215
+ weights = (0.5, 0.5)
216
+ return [auto_bleu(u, mean_mode='arithmetic', weights=weights) for u in utterances]
217
+
218
+
219
+ def get_auto_bleu2_geometric(utterances):
220
+ weights = (0.5, 0.5)
221
+ return [auto_bleu(u, mean_mode='geometric', weights=weights) for u in utterances]
222
+
223
+
224
+ def get_auto_bleu3_geometric(utterances):
225
+ weights = (1./3, 1./3, 1./3)
226
+ return [auto_bleu(u, mean_mode='geometric', weights=weights) for u in utterances]
227
+
228
+
229
+ def get_auto_bleu3_arithmetic(utterances):
230
+ weights = (1./3, 1./3, 1./3)
231
+ return [auto_bleu(u, mean_mode='arithmetic', weights=weights) for u in utterances]
232
+
233
+
234
+ def get_self_bleu3_arithmetic(utterances):
235
+ weights = (1./3, 1./3, 1./3)
236
+ return get_self_bleu(utterances, averaging_mode='arithmetic', weights=weights)
237
+
238
+
239
+ def get_self_bleu3_geometric(utterances):
240
+ weights = (1./3, 1./3, 1./3)
241
+ return get_self_bleu(utterances, averaging_mode='geometric', weights=weights)
242
+
243
+
244
+ def auto_bleu(sentence, weights, mean_mode='arithmetic'):
245
+ if len(sentence) <= 1:
246
+ return 0
247
+
248
+ N = len(weights)
249
+
250
+ bleu_n = np.zeros([N])
251
+ for n in range(N):
252
+ targ_ngrams = list(nltk.ngrams(sentence, n+1))
253
+ for p in range(len(targ_ngrams)):
254
+ left = sentence[:p]
255
+ right = sentence[(p+n+1):]
256
+ rest_ngrams = list(nltk.ngrams(left, n+1)) + \
257
+ list(nltk.ngrams(right, n+1))
258
+ # compute the nb of matching ngrams
259
+ bleu_n[n] += targ_ngrams[p] in rest_ngrams
260
+ bleu_n[n] /= len(targ_ngrams) # average them to get a proportion
261
+
262
+ weights = np.array(weights)
263
+ if mean_mode == 'arithmetic':
264
+ return (bleu_n * weights).sum()
265
+ elif mean_mode == 'geometric':
266
+ return (bleu_n ** weights).prod()
267
+ else:
268
+ raise ValueError(f'Unknown aggregation mode {mean_mode}')
269
+
270
+ def run_f(task_params):
271
+ f, terms = task_params
272
+ return f(terms)
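A small sketch of how LLM_Metrics.vert() drives these helpers: tokenize the responses, score Self-BLEU2 and Auto-BLEU2 (geometric), and take the geometric mean as VERT; the toy responses are illustrative:

import numpy as np
from scipy.stats import gmean
from eval.vert import get_self_bleu2_geometric, get_auto_bleu2_geometric, run_f

responses = [
    "the cat sat on the mat and the cat slept",
    "a dog ran in the park and a dog barked",
    "the cat and the dog played in the park",
]
terms = [r.split() for r in responses]

self_bleu2 = 100 * np.mean(run_f((get_self_bleu2_geometric, terms)))
auto_bleu2 = 100 * np.mean(run_f((get_auto_bleu2_geometric, terms)))
print(f"Self-BLEU2-geometric: {self_bleu2:.2f}")
print(f"Auto-BLEU2-geometric: {auto_bleu2:.2f}")
print(f"VERT: {gmean([self_bleu2, auto_bleu2]):.2f}")
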
requirements.txt CHANGED
@@ -12,4 +12,5 @@ sounddevice==0.5.0
webrtcvad-wheels
webrtcvad==2.0.10
gradio==4.43.0
- ChatTTS
+ ChatTTS
+ evaluate
tts_samples/sample1.wav ADDED
Binary file (414 kB).
 
utils.py ADDED
@@ -0,0 +1,12 @@
+ import numpy as np
+ def int2float(sound):
+     """
+     Taken from https://github.com/snakers4/silero-vad
+     """
+
+     abs_max = np.abs(sound).max()
+     sound = sound.astype("float32")
+     if abs_max > 0:
+         sound *= 1 / 32768
+     sound = sound.squeeze()  # depends on the use case
+     return sound
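And a sketch of the int2float contract the eval helpers rely on, 16-bit PCM in, float32 in [-1, 1] out:

import numpy as np
from utils import int2float

chunk = np.random.randint(-32768, 32767, size=16000, dtype=np.int16)   # fake 1 s microphone chunk
wav = int2float(chunk)
print(wav.dtype, bool(np.abs(wav).max() <= 1.0))   # float32 True
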
versa.sh ADDED
@@ -0,0 +1,4 @@
+ git clone https://github.com/shinjiwlab/versa.git
+ cd versa
+ pip install .
+ cd ..