Spaces:
Paused
Paused
Commit
•
2b8e454
1
Parent(s):
fbc5de6
Update app.py
Browse files
app.py
CHANGED
@@ -588,29 +588,35 @@ def generate_speech_for_sentence(history, chatbot_role, sentence, return_as_byte
|
|
588 |
else:
|
589 |
# likely got a ' or " or some other text without alphanumeric in it
|
590 |
audio_stream = None
|
591 |
-
|
|
|
|
|
592 |
# XTTS actually uses a streaming response, but we play audio sentence by sentence
|
593 |
# If you want direct XTTS voice streaming (send each chunk to voice), you may set the DIRECT_STREAM=1 environment variable
|
594 |
if audio_stream is not None:
|
595 |
# frame_length = 0
|
596 |
for chunk in audio_stream:
|
597 |
try:
|
598 |
-
|
|
|
599 |
# frame_length += len(chunk)
|
600 |
except:
|
601 |
# hack to continue playing: sometimes the last chunk is empty, will be fixed on next TTS
|
602 |
continue
|
603 |
|
604 |
-
|
605 |
-
|
606 |
-
|
607 |
-
|
608 |
-
|
609 |
-
|
610 |
-
|
611 |
-
|
|
|
|
|
|
|
612 |
# Directly encode the WAV bytestream to base64
|
613 |
-
base64_audio = base64.b64encode(pcm_to_wav(
|
614 |
|
615 |
if audio_stream is not None:
|
616 |
return (history, base64_audio)
|
|
|
588 |
else:
|
589 |
# likely got a ' or " or some other text without alphanumeric in it
|
590 |
audio_stream = None
|
591 |
+
|
592 |
+
sentence_wav_bytestream = b""
|
593 |
+
|
594 |
# XTTS actually uses a streaming response, but we play audio sentence by sentence
|
595 |
# If you want direct XTTS voice streaming (send each chunk to voice), you may set the DIRECT_STREAM=1 environment variable
|
596 |
if audio_stream is not None:
|
597 |
# frame_length = 0
|
598 |
for chunk in audio_stream:
|
599 |
try:
|
600 |
+
if chunk is not None:
|
601 |
+
sentence_wav_bytestream += chunk
|
602 |
# frame_length += len(chunk)
|
603 |
except:
|
604 |
# hack to continue playing: sometimes the last chunk is empty, will be fixed on next TTS
|
605 |
continue
|
606 |
|
607 |
+
# Filter output for better voice
|
608 |
+
filter_output=True
|
609 |
+
if filter_output:
|
610 |
+
data_s16 = np.frombuffer(sentence_wav_bytestream, dtype=np.int16, count=len(wav_bytestream)//2, offset=0)
|
611 |
+
float_data = data_s16 * 0.5**15
|
612 |
+
reduced_noise = nr.reduce_noise(y=float_data, sr=24000,prop_decrease =0.8,n_fft=1024)
|
613 |
+
sentence_wav_bytestream = (reduced_noise * 32767).astype(np.int16)
|
614 |
+
sentence_wav_bytestream = sentence_wav_bytestream.tobytes()
|
615 |
+
|
616 |
+
total_wav_bytestream += sentence_wav_bytestream
|
617 |
+
|
618 |
# Directly encode the WAV bytestream to base64
|
619 |
+
base64_audio = base64.b64encode(pcm_to_wav(total_wav_bytestream)).decode('utf8')
|
620 |
|
621 |
if audio_stream is not None:
|
622 |
return (history, base64_audio)
|