# sc45's picture
# Initial Commit
# f0ceee4
# raw history blame
# No virus
# 4.15 kB
from TTS.api import TTS
from pydub import AudioSegment
import os
import re
import ffmpeg
import shutil
import argparse
def adjust_speed(input_file, speed_factor):
    """Write a tempo-adjusted copy of an audio file and return its path.

    The input is run through ffmpeg's ``atempo`` filter and encoded as
    16-bit PCM (``pcm_s16le``). The result is written next to the input,
    with ``_adjusted`` inserted before the file extension.

    Args:
        input_file: Path of the audio file to read.
        speed_factor: Tempo multiplier handed to ``atempo`` (per the ffmpeg
            documentation, values above 1.0 make the audio shorter).

    Returns:
        Path of the newly written, adjusted file.
    """
    # Split off only the final extension. A plain str.replace(".wav", ...)
    # would rewrite every ".wav" occurring anywhere in the path, and would
    # return the input path itself when the extension differs, making ffmpeg
    # write onto its own input.
    stem, extension = os.path.splitext(input_file)
    output_file = f"{stem}_adjusted{extension}"
    (
        ffmpeg.input(input_file)
        .filter('atempo', speed_factor)
        .output(output_file, acodec='pcm_s16le')
        # Overwrite a leftover file from an earlier, interrupted run instead
        # of letting ffmpeg stop to ask for confirmation.
        .run(overwrite_output=True)
    )
    return output_file
def generate_speech(text, speaker_voice_map, output_file, language="es", device="cuda"):
    """Synthesize a timed, multi-speaker transcript into one WAV file.

    Every usable line of ``text`` looks like
    ``[SPEAKER_<digits>] [<start>-<end>] <sentence>`` where ``start`` and
    ``end`` are decimal numbers in seconds. Lines that do not match this
    shape, lines whose speaker has no entry in ``speaker_voice_map``, and
    lines with a non-positive duration are skipped.

    For each remaining line the sentence is synthesized with the XTTS v2
    model using the speaker's reference clip. If the result is longer than
    the line's time slot, synthesis is retried at speeds 1.5 and 2.0; if it
    is still too long, the audio is time-compressed with ``adjust_speed``.
    The segment is then trimmed or padded with silence to the slot length
    and placed at its start time in the output.

    Args:
        text: Transcript contents, one timed sentence per line.
        speaker_voice_map: Mapping from ``"SPEAKER_<digits>"`` to the path
            of that speaker's reference WAV clip.
        output_file: Path the combined WAV is exported to.
        language: Language code passed to the TTS model.
        device: Device the TTS model is moved to.
    """
    line_pattern = re.compile(r"\[SPEAKER_(\d+)\] \[(\d+\.\d+)-(\d+\.\d+)\] (.+)")
    temp_dir = './audio/temp'

    combined_audio = AudioSegment.empty()
    temp_files = []
    part_index = 0
    tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
    os.makedirs(temp_dir, exist_ok=True)

    def _synthesize(sentence, speaker_wav, path, speed):
        # Render one sentence to `path` and load it back as an AudioSegment.
        tts.tts_to_file(
            text=sentence,
            file_path=path,
            speaker_wav=speaker_wav,
            language=language,
            speed=speed,
        )
        return AudioSegment.from_wav(path)

    try:
        for line in text.split("\n"):
            match = line_pattern.match(line)
            if not match:
                continue

            speaker_id, start_time, end_time, sentence = match.groups()
            start_time, end_time = float(start_time), float(end_time)
            segment_duration = (end_time - start_time) * 1000  # Duration in milliseconds
            if segment_duration <= 0:
                # Nothing can fit in an empty slot, and the speed computation
                # below would otherwise divide by zero.
                continue

            speaker_wav = speaker_voice_map.get(f"SPEAKER_{speaker_id}")
            if not speaker_wav:
                continue

            temp_file_path = f"{temp_dir}/temp_output_part_{part_index}.wav"
            part_index += 1
            temp_files.append(temp_file_path)

            # First try to make the sentence fit by asking the model itself
            # to speak faster (1.0 -> 1.5 -> 2.0).
            tts_speed = 1.0
            segment_audio = _synthesize(sentence, speaker_wav, temp_file_path, tts_speed)
            while tts_speed < 2.0 and segment_audio.duration_seconds * 1000 > segment_duration:
                tts_speed += 0.5
                segment_audio = _synthesize(sentence, speaker_wav, temp_file_path, tts_speed)

            # Still too long: compress the remaining excess with ffmpeg.
            audio_ms = segment_audio.duration_seconds * 1000
            if audio_ms > segment_duration:
                required_speed = audio_ms / segment_duration
                adjusted_path = adjust_speed(temp_file_path, required_speed)
                temp_files.append(adjusted_path)
                segment_audio = AudioSegment.from_wav(adjusted_path)

            # Fill the time between the audio produced so far and this
            # segment's start, so every segment (not just the first) lands
            # on its timestamp even after gaps or skipped lines.
            gap_ms = start_time * 1000 - len(combined_audio)
            if gap_ms > 0:
                combined_audio += AudioSegment.silent(duration=gap_ms)

            # Force the segment to exactly the slot length.
            if segment_audio.duration_seconds * 1000 > segment_duration:
                segment_audio = segment_audio[:segment_duration]
            else:
                padding_ms = max(0, segment_duration - len(segment_audio))
                segment_audio = segment_audio + AudioSegment.silent(duration=padding_ms)

            combined_audio += segment_audio

        combined_audio.export(output_file, format="wav")
    finally:
        # Remove intermediate files even when synthesis or export fails.
        for temp_file in temp_files:
            try:
                os.remove(temp_file)
            except FileNotFoundError:
                # The file was registered but never written (the failure
                # happened before it was created); nothing to clean up.
                pass
def map_speaker_ids(directory):
    """Map speaker ids to the WAV clips found directly in ``directory``.

    Every entry whose name ends in ``.wav`` contributes one item: the key is
    the entry name without that trailing ``.wav`` and the value is the entry
    name joined onto ``directory``. Other entries are ignored.

    Args:
        directory: Directory to list.

    Returns:
        Dict from speaker id (file name minus the ``.wav`` suffix) to path.
    """
    suffix = ".wav"
    speaker_voice_map = {}
    # Sorted so the resulting dict has a deterministic order; os.listdir
    # returns entries in arbitrary order.
    for file_name in sorted(os.listdir(directory)):
        if not file_name.endswith(suffix):
            continue
        # Strip only the trailing suffix; str.replace would also remove any
        # ".wav" appearing earlier in the name.
        speaker_id = file_name[: -len(suffix)]
        speaker_voice_map[speaker_id] = os.path.join(directory, file_name)
    return speaker_voice_map
def main(speaker_directory, aligned_text_file, output_audio_file):
    """Generate the dubbed audio file for an aligned transcript.

    Builds the speaker-to-clip map from ``speaker_directory``, reads the
    transcript from ``aligned_text_file``, synthesizes it into
    ``output_audio_file``, and finally deletes the ``./audio/temp`` scratch
    directory if it exists.

    Args:
        speaker_directory: Directory containing the speaker voice clips.
        aligned_text_file: Path of the transcript to synthesize.
        output_audio_file: Path the generated audio is written to.
    """
    speaker_voice_map = map_speaker_ids(speaker_directory)
    # Read as UTF-8 explicitly: the transcript carries non-ASCII text and the
    # platform default encoding is not UTF-8 everywhere.
    with open(aligned_text_file, 'r', encoding='utf-8') as file:
        translated_text = file.read()
    try:
        generate_speech(translated_text, speaker_voice_map, output_audio_file)
    finally:
        # Drop the scratch directory whether or not synthesis succeeded.
        if os.path.exists('./audio/temp'):
            shutil.rmtree('./audio/temp')
if __name__ == "__main__":
    # Command-line entry point: three required positional arguments, passed
    # straight through to main() in the same order.
    cli = argparse.ArgumentParser(description="Generate speech from translated text")
    positional_arguments = (
        ("speaker_directory", "Directory containing speaker voice clips"),
        ("aligned_text_file", "Path to the translated and aligned text file"),
        ("output_audio_file", "Path to save the generated speech audio file"),
    )
    for argument_name, argument_help in positional_arguments:
        cli.add_argument(argument_name, help=argument_help)
    options = cli.parse_args()
    main(
        options.speaker_directory,
        options.aligned_text_file,
        options.output_audio_file,
    )