ai-story-server

Paused

App Files Files Community

jbilcke-hf HF staff committed on Nov 21, 2023

Commit

2b8e454

•

1 Parent(s): fbc5de6

Update app.py

Browse files

Files changed (1) hide show

app.py +17 -11

app.py CHANGED Viewed

@@ -588,29 +588,35 @@ def generate_speech_for_sentence(history, chatbot_role, sentence, return_as_byte
             else:
                 # likely got a ' or " or some other text without alphanumeric in it
                 audio_stream = None
             # XTTS is actually using streaming response but we are playing audio by sentence
             # If you want direct XTTS voice streaming (send each chunk to voice ) you may set DIRECT_STREAM=1 environment variable
             if audio_stream is not None:
                 # frame_length = 0
                 for chunk in audio_stream:
                     try:
-                        wav_bytestream += chunk
                         # frame_length += len(chunk)
                     except:
                         # hack to continue on playing. sometimes last chunk is empty , will be fixed on next TTS
                         continue
-        # Filter output for better voice
-        filter_output=True
-        if filter_output:
-            data_s16 = np.frombuffer(wav_bytestream, dtype=np.int16, count=len(wav_bytestream)//2, offset=0)
-            float_data = data_s16 * 0.5**15
-            reduced_noise = nr.reduce_noise(y=float_data, sr=24000,prop_decrease =0.8,n_fft=1024)
-            wav_bytestream = (reduced_noise * 32767).astype(np.int16)
-            wav_bytestream = wav_bytestream.tobytes()
         # Directly encode the WAV bytestream to base64
-        base64_audio = base64.b64encode(pcm_to_wav(wav_bytestream)).decode('utf8')
         if audio_stream is not None:
             return (history, base64_audio)

             else:
                 # likely got a ' or " or some other text without alphanumeric in it
                 audio_stream = None
+            sentence_wav_bytestream = b""
             # XTTS is actually using streaming response but we are playing audio by sentence
             # If you want direct XTTS voice streaming (send each chunk to voice ) you may set DIRECT_STREAM=1 environment variable
             if audio_stream is not None:
                 # frame_length = 0
                 for chunk in audio_stream:
                     try:
+                        if chunk is not None:
+                            sentence_wav_bytestream += chunk
                         # frame_length += len(chunk)
                     except:
                         # hack to continue on playing. sometimes last chunk is empty , will be fixed on next TTS
                         continue
+            # Filter output for better voice
+            filter_output=True
+            if filter_output:
+                data_s16 = np.frombuffer(sentence_wav_bytestream, dtype=np.int16, count=len(wav_bytestream)//2, offset=0)
+                float_data = data_s16 * 0.5**15
+                reduced_noise = nr.reduce_noise(y=float_data, sr=24000,prop_decrease =0.8,n_fft=1024)
+                sentence_wav_bytestream = (reduced_noise * 32767).astype(np.int16)
+                sentence_wav_bytestream = sentence_wav_bytestream.tobytes()
+            total_wav_bytestream += sentence_wav_bytestream
         # Directly encode the WAV bytestream to base64
+        base64_audio = base64.b64encode(pcm_to_wav(total_wav_bytestream)).decode('utf8')
         if audio_stream is not None:
             return (history, base64_audio)