import soundfile
import pyrubberband
import pathlib
import os
import io
from . import TTS
from pydub import AudioSegment
from pydub.silence import detect_leading_silence
import langcodes
# Set working folder and default configuration values
workingFolder = "workingFolder"
synth_sample_rate = 24000
debug_mode = False
tts_service = "azure"
batch_tts_synthesize = False
skip_translation = False
stop_after_translation = False
skip_synthesize = False
force_stretch_with_twopass = False
output_format = "mp3"

def trim_clip(inputSound):
    trim_leading_silence: AudioSegment = lambda x: x[detect_leading_silence(x):]
    trim_trailing_silence: AudioSegment = lambda x: trim_leading_silence(x.reverse()).reverse()
    strip_silence: AudioSegment = lambda x: trim_trailing_silence(trim_leading_silence(x))
    strippedSound = strip_silence(inputSound)
    return strippedSound

# Function to insert audio into canvas at a specific point
def insert_audio(canvas, audioToOverlay, startTimeMs):
    # Create a copy of the canvas
    canvasCopy = canvas
    # Overlay the audio onto the copy
    canvasCopy = canvasCopy.overlay(audioToOverlay, position=int(startTimeMs))
    # Return the copy
    return canvasCopy

# Function to create a canvas of a specific duration in milliseconds
def create_canvas(canvasDuration, frame_rate=int(synth_sample_rate)):
    canvas = AudioSegment.silent(duration=canvasDuration, frame_rate=frame_rate)
    return canvas
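
# Illustrative sketch only (not part of the pipeline): to place a clip 1.2 seconds
# into a 5-second silent canvas, the two helpers above would be combined like so:
#   canvas = create_canvas(5000)
#   canvas = insert_audio(canvas, someClip, 1200)  # someClip is a hypothetical AudioSegment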

def get_speed_factor(subsDict, trimmedAudio, desiredDuration, num):
    virtualTempFile = AudioSegment.from_file(trimmedAudio, format="wav")
    rawDuration = virtualTempFile.duration_seconds
    trimmedAudio.seek(0)  # This MUST be done to reset the file pointer to the start of the virtual file, otherwise errors occur the next time it is read
    # Calculate the speed factor and put it into the dictionary
    desiredDuration = float(desiredDuration)
    speedFactor = (rawDuration * 1000) / desiredDuration
    subsDict[num]['speed_factor'] = speedFactor
    return subsDict
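
# Worked example of the speed-factor arithmetic above (illustrative numbers only):
# a trimmed clip lasting 3.0 seconds that must fit a 2500 ms subtitle slot yields
# speedFactor = (3.0 * 1000) / 2500 = 1.2, meaning the clip must be played 1.2x
# faster to fit its slot; a factor below 1.0 means the clip is shorter than the
# slot and will be slowed down instead.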

def stretch_audio(audioFileToStretch, speedFactor, num):
    virtualTempAudioFile = io.BytesIO()
    # Read the audio samples and sample rate from the file
    y, sampleRate = soundfile.read(audioFileToStretch)
    # Note: pyrubberband shells out to the Rubber Band command-line tool, so the 'rubberband' binary must be installed and on the PATH
    stretched_audio = pyrubberband.time_stretch(y, sampleRate, speedFactor, rbargs={'--fine': '--fine'})  # rbargs must be a dictionary of option/value pairs, so the flag is passed as both key and value
    soundfile.write(virtualTempAudioFile, stretched_audio, sampleRate, format='wav')
    if debug_mode:
        soundfile.write(os.path.join(workingFolder, f'{num}_s.wav'), stretched_audio, sampleRate)  # For debugging, saves the stretched audio files
    return AudioSegment.from_file(virtualTempAudioFile, format="wav")

def build_audio(subsDict, langDict, totalAudioLength, outputFileName, twoPassVoiceSynth=False):
    if tts_service == 'azure':
        twoPassVoiceSynth = False  # Azure doesn't need two pass voice synth, so disable it

    virtualTrimmedFileDict = {}
    # First trim silence off the audio files
    for key, value in subsDict.items():
        filePathTrimmed = os.path.join(workingFolder, str(key)) + "_t.wav"
        subsDict[key]['TTS_FilePath_Trimmed'] = filePathTrimmed

        # Trim the clip and re-write file
        rawClip = AudioSegment.from_file(value['TTS_FilePath'], format="mp3", frame_rate=int(synth_sample_rate))
        trimmedClip = trim_clip(rawClip)
        if debug_mode:
            trimmedClip.export(filePathTrimmed, format="wav")

        # Create virtual file in dictionary with audio to be read later
        tempTrimmedFile = io.BytesIO()
        trimmedClip.export(tempTrimmedFile, format="wav")
        virtualTrimmedFileDict[key] = tempTrimmedFile
        keyIndex = list(subsDict.keys()).index(key)
        print(f" Trimmed Audio: {keyIndex+1} of {len(subsDict)}", end="\r")
    print("\n")
    # Calculate speed factors if necessary. Azure doesn't need this, so skip it
    if tts_service != 'azure':
        # Calculate speed factors for each clip, aka how much to stretch the audio
        for key, value in subsDict.items():
            subsDict = get_speed_factor(subsDict, virtualTrimmedFileDict[key], value['duration_ms'], num=key)
            keyIndex = list(subsDict.keys()).index(key)
            print(f" Calculated Speed Factor: {keyIndex+1} of {len(subsDict)}", end="\r")
        print("\n")
    # If two pass voice synth is enabled, have API re-synthesize the clips at the new speed
    # Azure allows direct specification of audio duration, so no need to re-synthesize
    if twoPassVoiceSynth and tts_service != 'azure':
        if batch_tts_synthesize and tts_service == 'azure':
            subsDict = TTS.synthesize_dictionary_batch(subsDict, langDict, skipSynthesize=skip_synthesize, secondPass=True)
        else:
            subsDict = TTS.synthesize_dictionary(subsDict, langDict, skipSynthesize=skip_synthesize, secondPass=True)

        for key, value in subsDict.items():
            # Trim the clip and re-write file
            rawClip = AudioSegment.from_file(value['TTS_FilePath'], format="mp3", frame_rate=int(synth_sample_rate))
            trimmedClip = trim_clip(rawClip)
            if debug_mode:
                # Remove '.wav' from the end of the file path
                secondPassTrimmedFile = value['TTS_FilePath_Trimmed'][:-4] + "_p2_t.wav"
                trimmedClip.export(secondPassTrimmedFile, format="wav")
            trimmedClip.export(virtualTrimmedFileDict[key], format="wav")
            keyIndex = list(subsDict.keys()).index(key)
            print(f" Trimmed Audio (2nd Pass): {keyIndex+1} of {len(subsDict)}", end="\r")
        print("\n")

        if force_stretch_with_twopass:
            for key, value in subsDict.items():
                subsDict = get_speed_factor(subsDict, virtualTrimmedFileDict[key], value['duration_ms'], num=key)
                keyIndex = list(subsDict.keys()).index(key)
                print(f" Calculated Speed Factor (2nd Pass): {keyIndex+1} of {len(subsDict)}", end="\r")
            print("\n")
    # Create canvas to overlay audio onto
    canvas = create_canvas(totalAudioLength)

    # Stretch audio and insert into canvas
    for key, value in subsDict.items():
        if (not twoPassVoiceSynth or force_stretch_with_twopass) and tts_service != 'azure':  # Don't stretch if Azure is used
            stretchedClip = stretch_audio(virtualTrimmedFileDict[key], speedFactor=subsDict[key]['speed_factor'], num=key)
        else:
            stretchedClip = AudioSegment.from_file(virtualTrimmedFileDict[key], format="wav")
            virtualTrimmedFileDict[key].seek(0)  # Reset the file pointer to the start in case the virtual file needs to be read again later
        canvas = insert_audio(canvas, stretchedClip, value['start_ms'])
        keyIndex = list(subsDict.keys()).index(key)
        print(f" Final Audio Processed: {keyIndex+1} of {len(subsDict)}", end="\r")
    print("\n")
    # Determine string to use for output format and file extension based on the config setting
    # Note: outputFileName is assumed to already end with a "." separator, since only the extension letters are appended here
    outputFormat = output_format.lower()
    if outputFormat == "mp3":
        outputFileName += "mp3"
        formatString = "mp3"
    elif outputFormat == "wav":
        outputFileName += "wav"
        formatString = "wav"
    elif outputFormat == "aac":
        outputFileName += "aac"
        formatString = "adts"  # Pydub doesn't accept "aac" as a format, so use "adts" with the .aac extension (alternatively "mp4" with a .m4a extension)

    canvas = canvas.set_channels(2)  # Change from mono to stereo
    try:
        print("\nExporting audio file...")
        canvas.export(outputFileName, format=formatString, bitrate="192k")
    except Exception:
        outputFileName = outputFileName + ".bak"
        canvas.export(outputFileName, format=formatString, bitrate="192k")
        print("\nThere was an issue exporting the audio; it might be a permission error. The file was saved as a backup with the extension .bak")
        print("Try removing the .bak extension, then listen to the file to see if it worked.\n")
        input("Press Enter to exit...")

    return subsDict
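
# A minimal usage sketch, assuming the rest of the pipeline has already synthesized
# one TTS clip per subtitle line. The file paths, durations, and langDict contents
# below are placeholders for illustration, not values produced by this module.
if __name__ == "__main__":
    exampleSubsDict = {
        '1': {'TTS_FilePath': os.path.join(workingFolder, '1.mp3'), 'start_ms': 0,    'duration_ms': 2500},
        '2': {'TTS_FilePath': os.path.join(workingFolder, '2.mp3'), 'start_ms': 3000, 'duration_ms': 1800},
    }
    exampleLangDict = {'languageCode': 'es-ES'}  # Only consulted by TTS on a second synthesis pass
    # The output name ends with "." because build_audio appends only the extension letters
    build_audio(exampleSubsDict, exampleLangDict, totalAudioLength=6000, outputFileName="output - es-ES.")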