RobCaamano commited on
Commit
a882af7
1 Parent(s): 6e84375

Update tts.py

Browse files
Files changed (1) hide show
  1. tts.py +98 -96
tts.py CHANGED
@@ -1,96 +1,98 @@
1
- from TTS.api import TTS
2
- from pydub import AudioSegment
3
- import os
4
- import re
5
- import ffmpeg
6
- import shutil
7
- import argparse
8
-
9
- def adjust_speed(input_file, speed_factor):
10
- output_file = input_file.replace(".wav", "_adjusted.wav")
11
- ffmpeg.input(input_file).filter('atempo', speed_factor).output(output_file, acodec='pcm_s16le').run()
12
- return output_file
13
-
14
- def generate_speech(text, speaker_voice_map, output_file):
15
- combined_audio = AudioSegment.empty()
16
- temp_files = []
17
-
18
- tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to("cuda")
19
-
20
- for line in text.split("\n"):
21
- if not line.strip():
22
- continue
23
-
24
- match = re.match(r"\[SPEAKER_(\d+)\] \[(\d+\.\d+)-(\d+\.\d+)\] (.+)", line)
25
- if not match:
26
- continue
27
-
28
- speaker_id, start_time, end_time, sentence = match.groups()
29
- start_time, end_time = float(start_time), float(end_time)
30
- segment_duration = (end_time - start_time) * 1000 # Duration in milliseconds
31
-
32
- speaker_wav = speaker_voice_map.get(f"SPEAKER_{speaker_id}")
33
- if not speaker_wav:
34
- continue
35
-
36
- os.makedirs('./audio/temp', exist_ok=True)
37
- temp_file_path = f"./audio/temp/temp_output_part_{len(temp_files)}.wav"
38
- temp_files.append(temp_file_path)
39
-
40
- tts_speed = 1.0
41
- tts.tts_to_file(text=sentence, file_path=temp_file_path, speaker_wav=speaker_wav, language="es", speed=tts_speed)
42
-
43
- segment_audio = AudioSegment.from_wav(temp_file_path)
44
-
45
- if segment_audio.duration_seconds * 1000 > segment_duration:
46
- while tts_speed < 2.0 and segment_audio.duration_seconds * 1000 > segment_duration:
47
- tts_speed += 0.5
48
- tts.tts_to_file(text=sentence, file_path=temp_file_path, speaker_wav=speaker_wav, language="es", speed=tts_speed)
49
- segment_audio = AudioSegment.from_wav(temp_file_path)
50
-
51
- if segment_audio.duration_seconds * 1000 > segment_duration:
52
- required_speed = segment_duration / (segment_audio.duration_seconds * 1000)
53
- if required_speed < 1.0:
54
- required_speed = 1.0 / required_speed
55
- temp_file_path = adjust_speed(temp_file_path, required_speed)
56
- segment_audio = AudioSegment.from_wav(temp_file_path)
57
-
58
- if combined_audio.duration_seconds == 0 and start_time > 0:
59
- combined_audio = AudioSegment.silent(duration=start_time * 1000) + combined_audio
60
-
61
- if segment_audio.duration_seconds * 1000 > segment_duration:
62
- segment_audio = segment_audio[:segment_duration]
63
- else:
64
- segment_audio = segment_audio + AudioSegment.silent(duration=segment_duration - len(segment_audio))
65
-
66
- combined_audio += segment_audio
67
-
68
- combined_audio.export(output_file, format="wav")
69
-
70
- for temp_file in temp_files:
71
- os.remove(temp_file)
72
-
73
- def map_speaker_ids(directory):
74
- speaker_voice_map = {}
75
- for file in os.listdir(directory):
76
- if file.endswith(".wav"):
77
- speaker_id = file.replace(".wav", "")
78
- speaker_voice_map[speaker_id] = os.path.join(directory, file)
79
- return speaker_voice_map
80
-
81
- def main(speaker_directory, aligned_text_file, output_audio_file):
82
- speaker_voice_map = map_speaker_ids(speaker_directory)
83
- with open(aligned_text_file, 'r') as file:
84
- translated_text = file.read()
85
- generate_speech(translated_text, speaker_voice_map, output_audio_file)
86
- if os.path.exists('./audio/temp'):
87
- shutil.rmtree('./audio/temp')
88
-
89
- if __name__ == "__main__":
90
- parser = argparse.ArgumentParser(description="Generate speech from translated text")
91
- parser.add_argument("speaker_directory", help="Directory containing speaker voice clips")
92
- parser.add_argument("aligned_text_file", help="Path to the translated and aligned text file")
93
- parser.add_argument("output_audio_file", help="Path to save the generated speech audio file")
94
- args = parser.parse_args()
95
-
96
- main(args.speaker_directory, args.aligned_text_file, args.output_audio_file)
 
 
 
1
+ from TTS.api import TTS
2
+ from pydub import AudioSegment
3
+ import os
4
+ import re
5
+ import ffmpeg
6
+ import shutil
7
+ import argparse
8
+
def adjust_speed(input_file, speed_factor):
    """Time-stretch a WAV file by *speed_factor* using ffmpeg's atempo filter.

    Writes the result next to the input as ``<name>_adjusted.wav`` and
    returns that path.  The original file is left in place.

    Args:
        input_file: Path to a ``.wav`` file.
        speed_factor: Playback-rate multiplier (>1.0 shortens the audio).

    Returns:
        Path of the speed-adjusted WAV file.
    """
    output_file = input_file.replace(".wav", "_adjusted.wav")
    stream = ffmpeg.input(input_file)
    # ffmpeg's atempo filter only accepts factors in [0.5, 2.0]; chain
    # multiple atempo passes so factors outside that range still work
    # (the caller can request speeds well above 2.0).
    remaining = speed_factor
    while remaining > 2.0:
        stream = stream.filter('atempo', 2.0)
        remaining /= 2.0
    while remaining < 0.5:
        stream = stream.filter('atempo', 0.5)
        remaining /= 0.5
    stream = stream.filter('atempo', remaining)
    # overwrite_output avoids ffmpeg's interactive "file exists" prompt
    # when the same segment is re-adjusted.
    stream.output(output_file, acodec='pcm_s16le').overwrite_output().run()
    return output_file
13
+
def generate_speech(text, speaker_voice_map, output_file):
    """Synthesize time-aligned speech for a diarized, translated transcript.

    Each input line must match ``[SPEAKER_NN] [start-end] sentence``.  For
    every matching line the sentence is synthesized with XTTS v2 using that
    speaker's reference clip, sped up until it fits its ``start``-``end``
    window (first via the TTS ``speed`` knob, then via ffmpeg ``atempo``
    through ``adjust_speed``), padded or truncated to exactly the window
    length, and placed at ``start`` seconds on the output timeline.

    Args:
        text: Transcript, one ``[SPEAKER_NN] [s.s-e.e] text`` line per
            utterance.  Blank or non-matching lines are skipped.
        speaker_voice_map: Maps ``SPEAKER_NN`` ids to reference-wav paths
            (see ``map_speaker_ids``).  Utterances whose speaker is not
            mapped are skipped.
        output_file: Path for the combined output WAV.
    """
    combined_audio = AudioSegment.empty()
    temp_files = []

    tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to("cuda")

    # Create the scratch directory once, not on every loop iteration.
    os.makedirs('./audio/temp', exist_ok=True)

    for line in text.split("\n"):
        if not line.strip():
            continue

        match = re.match(r"\[SPEAKER_(\d+)\] \[(\d+\.\d+)-(\d+\.\d+)\] (.+)", line)
        if not match:
            continue

        speaker_id, start_time, end_time, sentence = match.groups()
        start_time, end_time = float(start_time), float(end_time)
        segment_duration = (end_time - start_time) * 1000  # Duration in milliseconds

        speaker_wav = speaker_voice_map.get(f"SPEAKER_{speaker_id}")
        if not speaker_wav:
            continue

        temp_file_path = f"./audio/temp/temp_output_part_{len(temp_files)}.wav"
        temp_files.append(temp_file_path)

        tts_speed = 1.0
        tts.tts_to_file(text=sentence, file_path=temp_file_path, speaker_wav=speaker_wav, language="es", speed=tts_speed)

        segment_audio = AudioSegment.from_wav(temp_file_path)

        # Retry at increasing TTS speeds (up to 2.0) while the synthesized
        # clip is still longer than its target window.
        while tts_speed < 2.0 and segment_audio.duration_seconds * 1000 > segment_duration:
            tts_speed += 0.5
            tts.tts_to_file(text=sentence, file_path=temp_file_path, speaker_wav=speaker_wav, language="es", speed=tts_speed)
            segment_audio = AudioSegment.from_wav(temp_file_path)

        # Still too long: time-stretch with ffmpeg to the exact factor needed.
        if segment_audio.duration_seconds * 1000 > segment_duration:
            # actual/target > 1.0 is the speed-up factor atempo needs
            # (equivalent to inverting target/actual as before).
            required_speed = (segment_audio.duration_seconds * 1000) / segment_duration
            temp_file_path = adjust_speed(temp_file_path, required_speed)
            # FIX: register the "_adjusted" file too so it gets cleaned up.
            temp_files.append(temp_file_path)
            segment_audio = AudioSegment.from_wav(temp_file_path)

        # FIX: pad silence for ANY gap before this segment's start time, not
        # only before the first one — otherwise later segments drift earlier
        # than their [start-end] stamps.
        gap_ms = start_time * 1000 - len(combined_audio)
        if gap_ms > 0:
            combined_audio += AudioSegment.silent(duration=gap_ms)

        # Force the segment to exactly its window length.
        if segment_audio.duration_seconds * 1000 > segment_duration:
            segment_audio = segment_audio[:segment_duration]
        else:
            segment_audio = segment_audio + AudioSegment.silent(duration=segment_duration - len(segment_audio))

        combined_audio += segment_audio

    combined_audio.export(output_file, format="wav")

    for temp_file in temp_files:
        if os.path.exists(temp_file):
            os.remove(temp_file)
72
+
def map_speaker_ids(directory):
    """Map speaker IDs to their reference voice clips.

    Scans *directory* (non-recursively) for ``.wav`` files and returns a
    dict keyed by each file's name with the ``.wav`` part removed
    (e.g. ``SPEAKER_00``), mapped to the file's full path.
    """
    wav_names = (name for name in os.listdir(directory) if name.endswith(".wav"))
    return {name.replace(".wav", ""): os.path.join(directory, name) for name in wav_names}
80
+
def main(speaker_directory, aligned_text_file, output_audio_file):
    """Drive the full dubbing pass: map voices, synthesize, clean up.

    Args:
        speaker_directory: Directory of per-speaker reference ``.wav`` clips.
        aligned_text_file: Path to the translated, time-aligned transcript.
        output_audio_file: Destination path for the generated WAV.
    """
    speaker_voice_map = map_speaker_ids(speaker_directory)
    # Explicit encoding: the default is platform-dependent and the
    # transcript is translated (non-ASCII) text.
    with open(aligned_text_file, 'r', encoding='utf-8') as file:
        translated_text = file.read()
    try:
        generate_speech(translated_text, speaker_voice_map, output_audio_file)
    finally:
        # Remove the scratch directory even if synthesis failed part-way.
        if os.path.exists('./audio/temp'):
            shutil.rmtree('./audio/temp')
88
+
if __name__ == "__main__":
    # Script entry point: parse CLI args and run the dubbing pipeline.
    # Environment variable values must be strings — assigning the int 1
    # raises "TypeError: str expected, not int".  "1" pre-accepts the
    # Coqui model license so the XTTS download does not prompt.
    os.environ["COQUI_TOS_AGREED"] = "1"

    parser = argparse.ArgumentParser(description="Generate speech from translated text")
    parser.add_argument("speaker_directory", help="Directory containing speaker voice clips")
    parser.add_argument("aligned_text_file", help="Path to the translated and aligned text file")
    parser.add_argument("output_audio_file", help="Path to save the generated speech audio file")
    args = parser.parse_args()

    main(args.speaker_directory, args.aligned_text_file, args.output_audio_file)