dkounadis
/

artificial-styletts2

+# https://github.com/audeering/shift/tree/main - Synthesize the Harvard sentences with Mimic-3 voices at speaking rates 1x and 4x
+import shutil
+import csv
+import io
+import os
+import typing
+import wave
+import sys
+from mimic3_tts.__main__ import (CommandLineInterfaceState,
+                                 get_args,
+                                 initialize_args,
+                                 initialize_tts,
+                                 # print_voices,
+                                 # process_lines,
+                                 shutdown_tts,
+                                 OutputNaming,
+                                 process_line)
+import time
+import json
+import os
+import numpy as np
+from pathlib import Path
+import audiofile
+# ================================================ LIST OF VOICES
+ROOT_DIR = '/data/dkounadis/mimic3-voices/'
+foreign_voices = []
+english_voices = []
+for lang in os.listdir(ROOT_DIR + 'voices'):
+        for voice in os.listdir(ROOT_DIR + 'voices/' + lang):
+            if 'en_' in lang:
+                try:
+                    with open(ROOT_DIR + 'voices/' + lang + '/' + voice + '/speakers.txt', 'r') as f:
+                        for spk in f:
+                            english_voices.append(lang + '/' + voice + '#' + spk.rstrip())
+                        # voice_spk_string = lang + '/' + voice + '#' + spk.rstrip() for spk in f
+                except FileNotFoundError:
+                    english_voices.append(lang + '/' + voice)
+            else:
+                try:
+                    with open(ROOT_DIR + 'voices/' + lang + '/' + voice + '/speakers.txt', 'r') as f:
+                        for spk in f:
+                            foreign_voices.append(lang + '/' + voice + '#' + spk.rstrip())
+                except FileNotFoundError:
+                    foreign_voices.append(lang + '/' + voice)
+#
+[print(i) for i in foreign_voices]
+print('\n_______________________________\n')
+[print(i) for i in english_voices]
+# ====================================================== LIST Mimic-3 ALL VOICES
+# list_voices = [
+#     'en_US/m-ailabs_low#mary_ann',
+#     'en_UK/apope_low',
+#     'de_DE/thorsten-emotion_low#neutral',  # is the 4x really interesting we can just write it in Section
+#     # 'ko_KO/kss_low',
+#     'fr_FR/m-ailabs_low#gilles_g_le_blanc',
+#     #'human',
+#     ]  # special - for human we load specific style file - no Mimic3 is run
+# ==================================    ====== END INTERFACE
+def process_lines(state: CommandLineInterfaceState, wav_path=None):
+    '''MIMIC3 INTERNAL CALL that yields the sigh sound'''
+    args = state.args
+    result_idx = 0
+    print(f'why waitings in the for loop LIN {state.texts=}\n')
+    for line in state.texts:
+        # print(f'LIN {line=}\n')  # prints \n so is empty not getting the predifne text of state.texts
+        line_voice: typing.Optional[str] = None
+        line_id = ""
+        line = line.strip()
+        # if not line:
+        #     continue
+        if args.output_naming == OutputNaming.ID:
+            # Line has the format id|text instead of just text
+            with io.StringIO(line) as line_io:
+                reader = csv.reader(line_io, delimiter=args.csv_delimiter)
+                row = next(reader)
+                line_id, line = row[0], row[-1]
+                if args.csv_voice:
+                    line_voice = row[1]
+        process_line(line, state,
+                     line_id=line_id,
+                     line_voice=line_voice)
+        result_idx += 1
+    time.sleep(4)
+    # Write combined audio to stdout
+    if state.all_audio:
+        # _LOGGER.debug("Writing WAV audio to stdout")
+        if sys.stdout.isatty() and (not state.args.stdout):
+            with io.BytesIO() as wav_io:
+                wav_file_play: wave.Wave_write = wave.open(wav_io, "wb")
+                with wav_file_play:
+                    wav_file_play.setframerate(state.sample_rate_hz)
+                    wav_file_play.setsampwidth(state.sample_width_bytes)
+                    wav_file_play.setnchannels(state.num_channels)
+                    wav_file_play.writeframes(state.all_audio)
+                    # play_wav_bytes(state.args, wav_io.getvalue())
+                # wav_path = '_direct_call_2.wav'
+                with open(wav_path, 'wb') as wav_file:
+                    wav_file.write(wav_io.getvalue())
+                    wav_file.seek(0)
+                    print('\n\n5T', wav_path)
+    else:
+        print('\n\nDOES NOT TTSING --> ADD SOME time.sleep(4)', wav_path)
+# -----------------------------------------------------------------------------
+# cat _tmp_ssml.txt | mimic3 --cuda --ssml --noise-w 0.90001 --length-scale 0.91 --noise-scale 0.04 > noise_w=0.90_en_happy_2.wav
+# ======================================================================
+for lang, list_voices in [
+    ['english', english_voices],
+    ['foreign', foreign_voices]
+                            ]:
+    for rate in [1, 4]:
+        # # --
+        # # assure mimic-3 generator .onnx exists
+        # home_voice_dir = f'/home/audeering.local/dkounadis/.local/share/mycroft/mimic3/voices/{_voice.split("#")[0]}/'
+        # Path(home_voice_dir).mkdir(parents=True, exist_ok=True)
+        # speaker_free_voice_name = _voice.split("#")[0] if '#' in _voice else _voice
+        # if (
+        #     (not os.path.isfile(home_voice_dir + 'generator.onnx')) or
+        #     (os.path.getsize(home_voice_dir + 'generator.onnx') < 500)  # .onnx - is just LFS header
+        #         ):
+        #     # Copy
+        #     shutil.copyfile(
+        #         f'/data/dkounadis/mimic3-voices/voices/{speaker_free_voice_name}/generator.onnx',
+        #         home_voice_dir + 'generator.onnx')
+        # # --
+        with open('harvard.json', 'r') as f:
+            harvard_individual_sentences = json.load(f)['sentences']
+        total_audio_mimic3 = []
+        ix = 0
+        for list_of_10 in harvard_individual_sentences[:4]:  # 77
+            # text = ' '.join(list_of_10['sentences'])
+            for text in list_of_10['sentences']:
+                _voice = list_voices[ix % len(list_voices)]
+                _str = _voice.replace('/', '_').replace('#', '_').replace('_low', '')
+                if 'cmu-arctic' in _str:
+                    _str = _str.replace('cmu-arctic', 'cmu_arctic') #+ '.wav'
+                print(ix, lang, text)
+                # Synthesis Mimic-3 then use it as prompt for StyleTTS2
+                # MIMIC-3 if _voice is not HUMAN
+                _ssml = (
+                    '<speak>'
+                    '<prosody volume=\'64\'>'
+                    f'<prosody rate=\'{rate}\'>'
+                    f'<voice name=\'{_voice}\'>'
+                    '<s>'
+                    f'{text[:-1] + ", .. !!!"}'
+                    '</s>'
+                    '</voice>'
+                    '</prosody>'
+                    '</prosody>'
+                    '</speak>'
+                )
+                with open('_tmp_ssml.txt', 'w') as f:
+                    f.write(_ssml)
+                # ps = subprocess.Popen(f'cat _tmp_ssml.txt | mimic3 --ssml > {reference_wav}', shell=True)
+                # ps.wait()  # using ps to call mimic3 because samples dont have time to be written in stdout buffer
+                args = get_args()
+                args.ssml = True
+                args.text = [_ssml]  #['aa', 'bb'] #txt
+                args.interactive = False
+                # args.output_naming = OutputNaming.TIME
+                state = CommandLineInterfaceState(args=args)
+                initialize_args(state)
+                initialize_tts(state)
+                # args.texts = [txt] #['aa', 'bb'] #txt
+                # state.stdout = '.' #None #'makeme.wav'
+                # state.output_dir = '.noopy'
+                # state.interactive = False
+                # state.output_naming = OutputNaming.TIME
+                # # state.ssml = 1234546575
+                # state.stdout = True
+                # state.tts = True
+                style_path = 'tmp1.wav'
+                process_lines(state, wav_path=style_path)
+                shutdown_tts(state)
+                x, fs = audiofile.read(style_path)
+                ix += 1
+                total_audio_mimic3.append(x)
+        # save styletts2 .wav
+        total_audio_mimic3 = np.concatenate(total_audio_mimic3) # -- concat 77x lists
+        audiofile.write(f'harvards_upload_mimic3_{rate}_{lang}.wav', total_audio_mimic3, 22050)
+        print(total_audio_mimic3.shape, 'LEN\n')