dkounadis
/

artificial-styletts2

@@ -18,12 +18,14 @@ tags:
 ---
-# Affective TTS & Soundscape Synthesis
-Affective TTS tool for [SHIFT Horizon](https://shift-europe.eu/).
-  - Synthesizes affective speech with sound scape, trees, water, leaves, background from plain text or subtitles (.srt) & overlays it to videos.
   - `134` built-in affective voices available, tuned for [StyleTTS2](https://github.com/yl4579/StyleTTS2).
-  - [GitHub](https://github.com/audeering/shift)
 ### Available Voices
@@ -40,7 +42,7 @@ cd shift/
 pip install -r requirements.txt
 ```
-Start Flask
 ```
 CUDA_DEVICE_ORDER=PCI_BUS_ID HF_HOME=./hf_home CUDA_VISIBLE_DEVICES=2 python api.py
@@ -48,7 +50,7 @@ CUDA_DEVICE_ORDER=PCI_BUS_ID HF_HOME=./hf_home CUDA_VISIBLE_DEVICES=2 python api
 ## Inference
-The following need `api.py` to be running, e.g. `.. on computeXX`.
 **Text 2 Speech**
@@ -77,6 +79,13 @@ python tts.py --text assets/head_of_fortuna_en.srt --video assets/head_of_fortun
 python tts.py --text assets/head_of_fortuna_GPT.txt --video assets/head_of_fortuna.mp4
 ```
 ## Examples
 Substitute Native voice via TTS

 ---
+# Affective TTS & Soundscapes
+Synthesize affective TTS using the [SHIFT TTS tool](https://github.com/audeering/shift), as well as audio soundscapes.
+  - Affective TTS is based on this [phenomenon](https://huggingface.co/dkounadis/artificial-styletts2/discussions/2)
+  - Soundscapes, e.g. trees, water, leaves, are text-described generations from [AudioGen](https://huggingface.co/dkounadis/artificial-styletts2/discussions/3)
+  - `landscape2soundscape.py` shows an example of combining TTS & soundscape and overlaying them onto a video.
+  - Synthesizes speech from plain text or subtitles (.srt) & overlays it onto videos.
   - `134` built-in affective voices available, tuned for [StyleTTS2](https://github.com/yl4579/StyleTTS2).
 ### Available Voices
 pip install -r requirements.txt
 ```
+Flask
 ```
 CUDA_DEVICE_ORDER=PCI_BUS_ID HF_HOME=./hf_home CUDA_VISIBLE_DEVICES=2 python api.py
 ## Inference
+The following commands need `api.py` to be running, e.g. in a `tmux` session.
 **Text 2 Speech**
 python tts.py --text assets/head_of_fortuna_GPT.txt --video assets/head_of_fortuna.mp4
 ```
+**Landscape 2 Soundscape**
+```python
+# TTS & soundscape - overlay to .mp4
+python landscape2soundscape.py
+```
 ## Examples
 Substitute Native voice via TTS

api.py CHANGED Viewed

@@ -396,6 +396,7 @@ def serve_wav():
     print(f'\n=SERVER saved as {OUT_FILE=}\n')
     response = send_from_directory(CACHE_DIR, path=OUT_FILE)
     response.headers['suffix-file-type'] = OUT_FILE
     return response

     print(f'\n=SERVER saved as {OUT_FILE=}\n')
     response = send_from_directory(CACHE_DIR, path=OUT_FILE)
     response.headers['suffix-file-type'] = OUT_FILE
+    print('_________________________________________________________\n              ? \n_______________')
     return response

landscape2soundscape.py CHANGED Viewed

@@ -3,28 +3,18 @@ import subprocess
 import cv2
 # with subprocess and an extra argument 'scene' and a 'resized image saved as png' we can call the server
 # yt-dlp is installed in .d4
 # Download Part of Video
 # yt-dlp https://www.youtube.com/watch?v=UZ9uyQI3pF0 --downloader ffmpeg --downloader-args "ffmpeg_i:-ss 997 -to 2512"
 # ffmpeg -i Sandra\ Kotevska\,\ Painting\ Rose\ bush\,\ mixed\ media\,\ 2017.\ \[NMzC_036MtE\].mkv -f mp3 -ar 22050 -vn out44.wav -ac 1
 # https://superuser.com/questions/583393/how-to-extract-subtitle-from-video-using-ffmpeg
-def _shift(x):
-    n = x.shape[0]
-    i = np.random.randint(.24 * n, .74 * n)
-    return np.roll(x, i)
 #___________________________________________________________________________________________________
 #   VIDEO FROM IMAGE with CAPTIONS
 #
 # UPLOAD to: Simaviro: Documents General WORK PACKAGES WP1 ContentRepository ANBPR_ROMANIA TTSvideos
 # __________________________________________________________________________________________________
 # TO DOWNLOAD SRT for YouTube
 # yt-dlp --write-sub --sub-lang en --convert-subs "srt" https://www.youtube.com/watch?v=F1Ib7TAu7eg&list=PL4x2B6LSwFewdDvRnUTpBM7jkmpwouhPv&index=2
 # _voice = 'en_US/vctk_low#p330'
 # _voice = 'en_US/cmu-arctic_low#lnh' #en_US/vctk_low#p249'  # 'en_US/vctk_low#p282'
 # _voice = ''en_US/vctk_low#p351''
@@ -93,7 +83,7 @@ DESCRIPTIONS = [
     ],
     # 6
     [
-        '06_Menzel_AI900_001.jpg'
         '06_Menzel_AI900_001.txt',
         'Olive trees in Seville',
         'Adolph Menzel - Bauplatz mit Weiden - 1846',
@@ -181,92 +171,17 @@ for _img_, _text_, soundscape_text, _title_, _voice_ in DESCRIPTIONS[:20]:
     offset_h = 24
     im[offset_h:h+offset_h, :w, :] = (.4 * im[offset_h:h+offset_h, :w, :] + .6 * fram).astype(np.uint8)
     # cv2.imshow('i', im); cv2.waitKey(); cv2.destroyAllWindows()
-    # logo aud
-    logo = cv2.imread('assets/audeering_logo.jpg')[:740, :, :]
-    logo = cv2.resize(logo, (logo.shape[1]//2, logo.shape[0]//2))
-    h, w, _ = logo.shape
-    offset_h = im.shape[0] - h
-    im[offset_h:h+offset_h, :w, :] = (.23 * im[offset_h:h+offset_h, :w, :] + .77 * logo).astype(np.uint8)
-    # logo SMB
-    logo = cv2.imread('assets/SMB_logo.png')#[:740, :, :]
-    logo = cv2.resize(logo, (logo.shape[1]//2, logo.shape[0]//2))
-    h, w, _ = logo.shape
-    offset_h = im.shape[0] - h
-    # fill logo SMB with the pixels of im - where SMB is empty
-    ptc = im[offset_h:h+offset_h, :w, :]
-    logo[logo == 0] = ptc[logo == 0]  # fill empty
-    im[offset_h:h+offset_h, :w, :] = (.13 * im[offset_h:h+offset_h, :w, :] + .86 * logo).astype(np.uint8)
-    # # logo shift
-    # logo = cv2.imread('assets/shift_logo.png')#[:740, :, :]
-    # logo = cv2.resize(logo, (logo.shape[1]//2, logo.shape[0]//2))
-    # h, w, _ = logo.shape
-    # offset_h = im.shape[0] - h #-274
-    # offset_w = im.shape[1] - w #400
-    # # # fill logo SMB with the pixels of im - where SMB is empty
-    # ptc = im[offset_h:h+offset_h, :w, :]
-    # # msk = np.tile(logo[:, :,0:1] > 252, [1,1,3])
-    # # logo[msk] = ptc[msk]  # fill empty
-    # im[offset_h:h+offset_h, offset_w:w+offset_w, :] = (.0 * im[offset_h:h+offset_h, offset_w:w+offset_w, :] + 1 * logo).astype(np.uint8)
-    # silent video - img
-    # im = cv2.resize(im, (700, 700))
-    cv2.imwrite('pic_logo_emb.png', im)
-    # raw, _ = soundfile.read(soundscape_file)  # 12345, 2
-    # # fill
-    # soundscape = []
-    # for _replica in range(math.ceil(len(total) / raw.shape[0])+1):
-    #     soundscape.append(raw)  # _shift non defined for stereo
-    # soundscape = np.concatenate(soundscape, 0)
-    # total = .36 * np.concatenate([total[:, None],
-    #                              total[:, None]], 1) + .64 * soundscape[:len(total), :]
-    # outfile
     OUT_FILE = _img_.split('/')[-1].replace('.','__') + '.mp4'  # assets / -1
     print(f'{OUT_FILE=}\n')
-    # call API passing img
     subprocess.run(
             [
              "python",
              "tts.py",
              "--text", PIC_DIR + _text_,
-             '--image', 'pic_logo_emb.png',
               # "--title", _title_,
               # '--soundscape_text', soundscape_text,
              '--voice', _voice_,
-             '--out_file', OUT_FILE,
-                ])
-    # soundfile.write(AUDIO_TRACK, total, 22050)
-    # subprocess.call(
-    #     ["ffmpeg",
-    #         "-y",
-    #         "-i",
-    #         SILENT_VIDEO,
-    #         "-i",
-    #         AUDIO_TRACK,
-    #         #"-c:v",
-    #         #"copy",
-    #         "-map",
-    #         "0:v:0",
-    #         "-map",
-    #         " 1:a:0",
-    #         "-vf",
-    #         "pad",
-    #         OUT_FILE])

 import cv2
 # with subprocess and an extra argument 'scene' and a 'resized image saved as png' we can call the server
 # yt-dlp is installed in .d4
 # Download Part of Video
 # yt-dlp https://www.youtube.com/watch?v=UZ9uyQI3pF0 --downloader ffmpeg --downloader-args "ffmpeg_i:-ss 997 -to 2512"
 # ffmpeg -i Sandra\ Kotevska\,\ Painting\ Rose\ bush\,\ mixed\ media\,\ 2017.\ \[NMzC_036MtE\].mkv -f mp3 -ar 22050 -vn out44.wav -ac 1
 # https://superuser.com/questions/583393/how-to-extract-subtitle-from-video-using-ffmpeg
 #___________________________________________________________________________________________________
 #   VIDEO FROM IMAGE with CAPTIONS
 #
 # UPLOAD to: Simaviro: Documents General WORK PACKAGES WP1 ContentRepository ANBPR_ROMANIA TTSvideos
 # __________________________________________________________________________________________________
 # TO DOWNLOAD SRT for YouTube
 # yt-dlp --write-sub --sub-lang en --convert-subs "srt" https://www.youtube.com/watch?v=F1Ib7TAu7eg&list=PL4x2B6LSwFewdDvRnUTpBM7jkmpwouhPv&index=2
 # _voice = 'en_US/vctk_low#p330'
 # _voice = 'en_US/cmu-arctic_low#lnh' #en_US/vctk_low#p249'  # 'en_US/vctk_low#p282'
 # _voice = ''en_US/vctk_low#p351''
     ],
     # 6
     [
+        '06_Menzel_AI900_001.jpg',
         '06_Menzel_AI900_001.txt',
         'Olive trees in Seville',
         'Adolph Menzel - Bauplatz mit Weiden - 1846',
     offset_h = 24
     im[offset_h:h+offset_h, :w, :] = (.4 * im[offset_h:h+offset_h, :w, :] + .6 * fram).astype(np.uint8)
     # cv2.imshow('i', im); cv2.waitKey(); cv2.destroyAllWindows()
+    cv2.imwrite('_tmp_banner.png', im)
     OUT_FILE = _img_.split('/')[-1].replace('.','__') + '.mp4'  # assets / -1
     print(f'{OUT_FILE=}\n')
     subprocess.run(
             [
              "python",
              "tts.py",
              "--text", PIC_DIR + _text_,
+             '--image', '_tmp_banner.png',
               # "--title", _title_,
               # '--soundscape_text', soundscape_text,
              '--voice', _voice_,
+             '--out_file', OUT_FILE,  # save to correct location is handled in client
+                ])

logo_raw_smb_aud.png ADDED Viewed