Spaces:
Sleeping
Sleeping
RobCaamano
committed on
Commit
•
8b4615f
1
Parent(s):
bea947c
Cleaned
Browse files
tts.py
CHANGED
@@ -7,13 +7,22 @@ import shutil
|
|
7 |
import argparse
|
8 |
import torch
|
9 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
os.environ["COQUI_TOS_AGREED"] = "1"
|
11 |
|
|
|
12 |
def adjust_speed(input_file, speed_factor):
    """Time-stretch a WAV file by speed_factor and return the new file's path.

    Args:
        input_file: Path to the source .wav file.
        speed_factor: Playback-speed multiplier (>1.0 shortens, <1.0 lengthens).

    Returns:
        Path to the adjusted WAV file ("<name>_adjusted.wav").
    """
    output_file = input_file.replace(".wav", "_adjusted.wav")
    stream = ffmpeg.input(input_file)
    # ffmpeg's atempo filter only accepts factors in [0.5, 2.0]; callers can
    # request factors outside that range (e.g. required_speed < 0.5), so chain
    # multiple atempo stages until the remaining factor is in range.
    remaining = speed_factor
    while remaining < 0.5:
        stream = stream.filter('atempo', 0.5)
        remaining /= 0.5
    while remaining > 2.0:
        stream = stream.filter('atempo', 2.0)
        remaining /= 2.0
    stream = stream.filter('atempo', remaining)
    stream.output(output_file, acodec='pcm_s16le').run()
    return output_file
|
16 |
|
|
|
17 |
def generate_speech(text, speaker_voice_map, output_file):
|
18 |
combined_audio = AudioSegment.empty()
|
19 |
temp_files = []
|
@@ -27,6 +36,7 @@ def generate_speech(text, speaker_voice_map, output_file):
|
|
27 |
if not line.strip():
|
28 |
continue
|
29 |
|
|
|
30 |
match = re.match(r"\[SPEAKER_(\d+)\] \[(\d+\.\d+)-(\d+\.\d+)\] (.+)", line)
|
31 |
if not match:
|
32 |
continue
|
@@ -39,21 +49,25 @@ def generate_speech(text, speaker_voice_map, output_file):
|
|
39 |
if not speaker_wav:
|
40 |
continue
|
41 |
|
|
|
42 |
os.makedirs('./audio/temp', exist_ok=True)
|
43 |
temp_file_path = f"./audio/temp/temp_output_part_{len(temp_files)}.wav"
|
44 |
temp_files.append(temp_file_path)
|
45 |
|
|
|
46 |
tts_speed = 2.0 # original 1.0
|
47 |
tts.tts_to_file(text=sentence, file_path=temp_file_path, speaker_wav=speaker_wav, language="es", speed=tts_speed)
|
48 |
|
49 |
segment_audio = AudioSegment.from_wav(temp_file_path)
|
50 |
|
|
|
51 |
if segment_audio.duration_seconds * 1000 > segment_duration:
|
52 |
#while tts_speed < 2.0 and segment_audio.duration_seconds * 1000 > segment_duration:
|
53 |
# tts_speed += 0.5
|
54 |
# tts.tts_to_file(text=sentence, file_path=temp_file_path, speaker_wav=speaker_wav, language="es", speed=tts_speed)
|
55 |
# segment_audio = AudioSegment.from_wav(temp_file_path)
|
56 |
|
|
|
57 |
if segment_audio.duration_seconds * 1000 > segment_duration:
|
58 |
required_speed = segment_duration / (segment_audio.duration_seconds * 1000)
|
59 |
if required_speed < 1.0:
|
@@ -61,9 +75,11 @@ def generate_speech(text, speaker_voice_map, output_file):
|
|
61 |
temp_file_path = adjust_speed(temp_file_path, required_speed)
|
62 |
segment_audio = AudioSegment.from_wav(temp_file_path)
|
63 |
|
|
|
64 |
if combined_audio.duration_seconds == 0 and start_time > 0:
|
65 |
combined_audio = AudioSegment.silent(duration=start_time * 1000) + combined_audio
|
66 |
|
|
|
67 |
if segment_audio.duration_seconds * 1000 > segment_duration:
|
68 |
segment_audio = segment_audio[:segment_duration]
|
69 |
else:
|
@@ -71,11 +87,14 @@ def generate_speech(text, speaker_voice_map, output_file):
|
|
71 |
|
72 |
combined_audio += segment_audio
|
73 |
|
|
|
74 |
combined_audio.export(output_file, format="wav")
|
75 |
|
|
|
76 |
for temp_file in temp_files:
|
77 |
os.remove(temp_file)
|
78 |
|
|
|
79 |
def map_speaker_ids(directory):
|
80 |
speaker_voice_map = {}
|
81 |
for file in os.listdir(directory):
|
@@ -85,10 +104,15 @@ def map_speaker_ids(directory):
|
|
85 |
return speaker_voice_map
|
86 |
|
87 |
def main(speaker_directory, aligned_text_file, output_audio_file):
    """Run the dubbing pipeline: map speakers, read aligned text, synthesize.

    Args:
        speaker_directory: Directory of reference voice WAV files.
        aligned_text_file: Path to the speaker-aligned translated transcript.
        output_audio_file: Destination path for the combined WAV output.
    """
    voices = map_speaker_ids(speaker_directory)
    with open(aligned_text_file, 'r') as fh:
        aligned_text = fh.read()
    generate_speech(aligned_text, voices, output_audio_file)
    # Remove the scratch directory left behind by generate_speech, if any.
    temp_dir = './audio/temp'
    if os.path.exists(temp_dir):
        shutil.rmtree(temp_dir)
|
94 |
|
|
|
7 |
import argparse
|
8 |
import torch
|
9 |
|
10 |
+
'''
|
11 |
+
PLEASE NOTE:
|
12 |
+
tts_speed initially 1.0 and increases to 2.0 if necessary. This has been commented out for improved execution times due to CPU usage.
|
13 |
+
Feel free to remove comments when used locally.
|
14 |
+
'''
|
15 |
+
|
16 |
+
# Accept TOS for tts
|
17 |
os.environ["COQUI_TOS_AGREED"] = "1"
|
18 |
|
19 |
+
# Adjust speed of audio segment
|
20 |
def adjust_speed(input_file, speed_factor):
    """Re-time a WAV file by speed_factor via ffmpeg and return the new path.

    Args:
        input_file: Path to the source .wav file.
        speed_factor: Tempo multiplier; values > 1.0 speed the audio up.

    Returns:
        Path of the written "<name>_adjusted.wav" file.
    """
    output_file = input_file.replace(".wav", "_adjusted.wav")
    stream = ffmpeg.input(input_file)
    # atempo is only valid for factors in [0.5, 2.0]; decompose any
    # out-of-range factor into a chain of in-range atempo stages so that
    # extreme required speeds (e.g. < 0.5) do not make ffmpeg fail.
    factor = speed_factor
    while factor < 0.5:
        stream = stream.filter('atempo', 0.5)
        factor /= 0.5
    while factor > 2.0:
        stream = stream.filter('atempo', 2.0)
        factor /= 2.0
    stream = stream.filter('atempo', factor)
    stream.output(output_file, acodec='pcm_s16le').run()
    return output_file
|
24 |
|
25 |
+
# Generate and process speech for each line
|
26 |
def generate_speech(text, speaker_voice_map, output_file):
|
27 |
combined_audio = AudioSegment.empty()
|
28 |
temp_files = []
|
|
|
36 |
if not line.strip():
|
37 |
continue
|
38 |
|
39 |
+
# Extract speaker ID, timestamps & text
|
40 |
match = re.match(r"\[SPEAKER_(\d+)\] \[(\d+\.\d+)-(\d+\.\d+)\] (.+)", line)
|
41 |
if not match:
|
42 |
continue
|
|
|
49 |
if not speaker_wav:
|
50 |
continue
|
51 |
|
52 |
+
# Create temp
|
53 |
os.makedirs('./audio/temp', exist_ok=True)
|
54 |
temp_file_path = f"./audio/temp/temp_output_part_{len(temp_files)}.wav"
|
55 |
temp_files.append(temp_file_path)
|
56 |
|
57 |
+
# Initial TTS (original : 1.0 speed)
|
58 |
tts_speed = 2.0 # original 1.0
|
59 |
tts.tts_to_file(text=sentence, file_path=temp_file_path, speaker_wav=speaker_wav, language="es", speed=tts_speed)
|
60 |
|
61 |
segment_audio = AudioSegment.from_wav(temp_file_path)
|
62 |
|
63 |
+
# Increase TTS speed if audio is longer than duration
|
64 |
if segment_audio.duration_seconds * 1000 > segment_duration:
|
65 |
#while tts_speed < 2.0 and segment_audio.duration_seconds * 1000 > segment_duration:
|
66 |
# tts_speed += 0.5
|
67 |
# tts.tts_to_file(text=sentence, file_path=temp_file_path, speaker_wav=speaker_wav, language="es", speed=tts_speed)
|
68 |
# segment_audio = AudioSegment.from_wav(temp_file_path)
|
69 |
|
70 |
+
# Speed up using FFmpeg if audio is longer than duration
|
71 |
if segment_audio.duration_seconds * 1000 > segment_duration:
|
72 |
required_speed = segment_duration / (segment_audio.duration_seconds * 1000)
|
73 |
if required_speed < 1.0:
|
|
|
75 |
temp_file_path = adjust_speed(temp_file_path, required_speed)
|
76 |
segment_audio = AudioSegment.from_wav(temp_file_path)
|
77 |
|
78 |
+
# Add silence at start of audio if needed
|
79 |
if combined_audio.duration_seconds == 0 and start_time > 0:
|
80 |
combined_audio = AudioSegment.silent(duration=start_time * 1000) + combined_audio
|
81 |
|
82 |
+
# Trim or pad audio to match segment duration (Should not trim since audio sped up)
|
83 |
if segment_audio.duration_seconds * 1000 > segment_duration:
|
84 |
segment_audio = segment_audio[:segment_duration]
|
85 |
else:
|
|
|
87 |
|
88 |
combined_audio += segment_audio
|
89 |
|
90 |
+
# Export combined audio
|
91 |
combined_audio.export(output_file, format="wav")
|
92 |
|
93 |
+
# Delete temp files
|
94 |
for temp_file in temp_files:
|
95 |
os.remove(temp_file)
|
96 |
|
97 |
+
# Map speaker IDs to their voice files
|
98 |
def map_speaker_ids(directory):
|
99 |
speaker_voice_map = {}
|
100 |
for file in os.listdir(directory):
|
|
|
104 |
return speaker_voice_map
|
105 |
|
106 |
def main(speaker_directory, aligned_text_file, output_audio_file):
    """Entry point: build the speaker map, load the transcript, generate audio.

    Args:
        speaker_directory: Folder containing per-speaker reference WAVs.
        aligned_text_file: Speaker-aligned translated transcript file.
        output_audio_file: Where the final combined WAV is written.
    """
    # Generate speaker voice map and read translated text
    speaker_map = map_speaker_ids(speaker_directory)
    with open(aligned_text_file, 'r') as transcript:
        text = transcript.read()

    # Generate speech
    generate_speech(text, speaker_map, output_audio_file)

    # Remove temp folder
    scratch = './audio/temp'
    if os.path.exists(scratch):
        shutil.rmtree(scratch)
|
118 |
|