Spaces:

expressapi
/

vidverse

Sleeping

vidverse/app/scripts/audio_builder.py

badal

feat: initial commit

2f2406a over 2 years ago

8.88 kB

	import soundfile
	import pyrubberband
	import pathlib
	import os
	import io


	from . import TTS

	from pydub import AudioSegment
	from pydub.silence import detect_leading_silence
	import langcodes


	# Folder that receives intermediate/debug audio files written by this module
	workingFolder = "workingFolder"

	# Frame rate (Hz) used for the silent canvas and when loading synthesized clips
	synth_sample_rate = 24000
	# When true, trimmed and stretched clips are also saved into workingFolder
	debug_mode = False
	# Name of the TTS backend; 'azure' disables stretching and two-pass synthesis
	tts_service = "azure"
	batch_tts_synthesize = False
	# NOTE(review): the translation flags are not read by the functions visible in
	# this file; presumably other modules consume them -- confirm before removing.
	skip_translation = False
	stop_after_translation = False
	# Forwarded to the TTS module as skipSynthesize during the second pass
	skip_synthesize = False
	# When true, clips are still time-stretched after a two-pass synthesis
	force_stretch_with_twopass = False
	# Export container for the final track: "mp3", "wav" or "aac"
	output_format = "mp3"


	def trim_clip(inputSound):
	    """Return ``inputSound`` with its leading and trailing silence removed.

	    Args:
	        inputSound: The pydub ``AudioSegment`` to trim.

	    Returns:
	        The trimmed segment. Silence is whatever ``detect_leading_silence``
	        reports when called with its default settings.
	    """
	    # Cut everything before the first non-silent point.
	    leadTrimmed = inputSound[detect_leading_silence(inputSound):]
	    # Trailing silence is the leading silence of the reversed clip, so
	    # reverse, trim the front the same way, then flip the result back.
	    flipped = leadTrimmed.reverse()
	    flipped = flipped[detect_leading_silence(flipped):]
	    return flipped.reverse()

	def insert_audio(canvas, audioToOverlay, startTimeMs):
	    """Overlay a clip onto the canvas at a given offset.

	    Args:
	        canvas: The audio segment acting as the base track.
	        audioToOverlay: The clip to mix in.
	        startTimeMs: Offset in milliseconds; converted with ``int()`` before
	            it is passed on as the overlay position.

	    Returns:
	        The result of ``canvas.overlay(...)``. The incoming ``canvas``
	        reference is not rebound here, so callers must use the return value.
	    """
	    offsetMs = int(startTimeMs)
	    return canvas.overlay(audioToOverlay, position=offsetMs)

	def create_canvas(canvasDuration, frame_rate=int(synth_sample_rate)):
	    """Build a silent audio segment to use as the mixing canvas.

	    Args:
	        canvasDuration: Length of the canvas in milliseconds.
	        frame_rate: Frame rate handed to ``AudioSegment.silent``. The default
	            is the module-level ``synth_sample_rate`` as it was when this
	            function was defined.

	    Returns:
	        The silent ``AudioSegment``.
	    """
	    return AudioSegment.silent(duration=canvasDuration, frame_rate=frame_rate)

	def get_speed_factor(subsDict, trimmedAudio, desiredDuration, num):
	    """Store how much a clip must be sped up or slowed down to fit its slot.

	    The factor is the clip's actual length divided by the desired length, so
	    values above 1 mean the clip is too long for the slot.

	    Args:
	        subsDict: Mapping of subtitle id to its data dict; updated in place.
	        trimmedAudio: Seekable file-like object holding the clip as WAV.
	        desiredDuration: Target length in milliseconds (anything ``float()``
	            accepts).
	        num: Key of the entry in ``subsDict`` to update.

	    Returns:
	        ``subsDict``, with ``subsDict[num]['speed_factor']`` set.
	    """
	    clip = AudioSegment.from_file(trimmedAudio, format="wav")
	    clipSeconds = clip.duration_seconds
	    # Rewind the in-memory file so the next reader starts at the beginning;
	    # without this, later reads of the same buffer fail.
	    trimmedAudio.seek(0)

	    targetMs = float(desiredDuration)
	    subsDict[num]['speed_factor'] = (clipSeconds * 1000) / targetMs
	    return subsDict

	def stretch_audio(audioFileToStretch, speedFactor, num):
	    """Time-stretch a clip by ``speedFactor`` and return it as an AudioSegment.

	    Args:
	        audioFileToStretch: Path or file-like object readable by
	            ``soundfile.read``.
	        speedFactor: Rate passed to ``pyrubberband.time_stretch``.
	        num: Clip identifier; only used to name the debug file.

	    Returns:
	        The stretched audio, loaded from an in-memory WAV.
	    """
	    samples, sampleRate = soundfile.read(audioFileToStretch)

	    # rbargs has to be a dict of option -> value pairs, so the value-less
	    # '--fine' flag is passed as both key and value.
	    stretchedSamples = pyrubberband.time_stretch(samples, sampleRate, speedFactor, rbargs={'--fine': '--fine'})

	    # Keep the stretched WAV in memory rather than in a temp file on disk.
	    stretchedBuffer = io.BytesIO()
	    soundfile.write(stretchedBuffer, stretchedSamples, sampleRate, format='wav')
	    if debug_mode:
	        # Also save the stretched clip to disk for inspection.
	        soundfile.write(os.path.join(workingFolder, f'{num}_s.wav'), stretchedSamples, sampleRate)
	    return AudioSegment.from_file(stretchedBuffer, format="wav")


	def build_audio(subsDict, langDict, totalAudioLength, outputFileName, twoPassVoiceSynth=False):
	    """Assemble the synthesized clips into one audio track and export it.

	    Every clip in ``subsDict`` is trimmed of silence, optionally
	    re-synthesized and/or time-stretched to fit its slot, overlaid onto a
	    silent canvas at its ``start_ms`` offset, and the canvas is exported as
	    stereo audio in the module-level ``output_format``.

	    Args:
	        subsDict: Mapping of subtitle id to a dict holding at least
	            ``TTS_FilePath``, ``duration_ms`` and ``start_ms``. Entries are
	            updated in place with ``TTS_FilePath_Trimmed`` and, for non-Azure
	            services, ``speed_factor``.
	        langDict: Passed through to the TTS module for the second pass.
	        totalAudioLength: Length of the canvas in milliseconds.
	        outputFileName: Output path; the extension text is appended directly
	            with no separator. NOTE(review): presumably callers pass a name
	            ending in "." -- confirm.
	        twoPassVoiceSynth: Re-synthesize the clips at the computed speed
	            instead of stretching them. Always disabled when ``tts_service``
	            is ``'azure'``.

	    Returns:
	        ``subsDict`` (as returned by the TTS module if a second pass ran).

	    Raises:
	        ValueError: If ``output_format`` is not "mp3", "wav" or "aac".
	    """
	    # Resolve the export settings first so that a bad output_format fails
	    # immediately instead of after all the audio work has been done.
	    # pydub doesn't accept "aac" as a format name, so AAC is written as an
	    # ADTS stream with the "aac" file extension.
	    exportFormats = {
	        "mp3": ("mp3", "mp3"),
	        "wav": ("wav", "wav"),
	        "aac": ("aac", "adts"),
	    }
	    outputFormat = output_format.lower()
	    if outputFormat not in exportFormats:
	        raise ValueError(
	            f"Unsupported output_format {output_format!r}; "
	            f"expected one of: {', '.join(exportFormats)}"
	        )
	    fileExtension, formatString = exportFormats[outputFormat]
	    outputFileName += fileExtension

	    isAzure = tts_service == 'azure'
	    if isAzure:
	        # Azure doesn't need two pass voice synth, so disable it
	        twoPassVoiceSynth = False
	    # Clips are stretched locally unless a second synthesis pass already
	    # fitted them (and stretching isn't forced). Azure is never stretched.
	    shouldStretch = (not twoPassVoiceSynth or force_stretch_with_twopass) and not isAzure

	    # First trim silence off the audio files, keeping the results in memory.
	    virtualTrimmedFileDict = {}
	    clipCount = len(subsDict)
	    for position, (key, value) in enumerate(subsDict.items(), start=1):
	        filePathTrimmed = os.path.join(workingFolder, str(key)) + "_t.wav"
	        value['TTS_FilePath_Trimmed'] = filePathTrimmed

	        trimmedClip = _load_trimmed_clip(value['TTS_FilePath'])
	        if debug_mode:
	            trimmedClip.export(filePathTrimmed, format="wav")

	        virtualTrimmedFileDict[key] = _export_to_wav_buffer(trimmedClip)
	        print(f" Trimmed Audio: {position} of {clipCount}", end="\r")
	    print("\n")

	    # Calculate how much each clip has to be stretched. Azure doesn't need
	    # this, so skip it.
	    if not isAzure:
	        for position, (key, value) in enumerate(subsDict.items(), start=1):
	            subsDict = get_speed_factor(subsDict, virtualTrimmedFileDict[key], value['duration_ms'], num=key)
	            print(f" Calculated Speed Factor: {position} of {clipCount}", end="\r")
	        print("\n")

	    # If two pass voice synth is enabled, have the API re-synthesize the
	    # clips at the new speed. (twoPassVoiceSynth is already False for Azure.)
	    if twoPassVoiceSynth:
	        # Batch synthesis was only ever selected when the service is Azure,
	        # which never reaches this branch, so the per-clip call is the only
	        # reachable path.
	        subsDict = TTS.synthesize_dictionary(subsDict, langDict, skipSynthesize=skip_synthesize, secondPass=True)

	        clipCount = len(subsDict)
	        for position, (key, value) in enumerate(subsDict.items(), start=1):
	            trimmedClip = _load_trimmed_clip(value['TTS_FilePath'])
	            if debug_mode:
	                # Remove '.wav' from the end of the file path
	                secondPassTrimmedFile = value['TTS_FilePath_Trimmed'][:-4] + "_p2_t.wav"
	                trimmedClip.export(secondPassTrimmedFile, format="wav")
	            # Use a fresh buffer: overwriting the first-pass buffer would
	            # leave its old tail behind whenever the new clip is shorter.
	            virtualTrimmedFileDict[key] = _export_to_wav_buffer(trimmedClip)
	            print(f" Trimmed Audio (2nd Pass): {position} of {clipCount}", end="\r")
	        print("\n")

	        if force_stretch_with_twopass:
	            for position, (key, value) in enumerate(subsDict.items(), start=1):
	                subsDict = get_speed_factor(subsDict, virtualTrimmedFileDict[key], value['duration_ms'], num=key)
	                print(f" Calculated Speed Factor (2nd Pass): {position} of {clipCount}", end="\r")
	            print("\n")

	    # Create canvas to overlay audio onto
	    canvas = create_canvas(totalAudioLength)

	    # Stretch audio (when required) and insert it into the canvas
	    clipCount = len(subsDict)
	    for position, (key, value) in enumerate(subsDict.items(), start=1):
	        clipBuffer = virtualTrimmedFileDict[key]
	        if shouldStretch:
	            finalClip = stretch_audio(clipBuffer, speedFactor=value['speed_factor'], num=key)
	        else:
	            finalClip = AudioSegment.from_file(clipBuffer, format="wav")
	            # Rewind so the buffer can be read again later.
	            clipBuffer.seek(0)

	        canvas = insert_audio(canvas, finalClip, value['start_ms'])
	        print(f" Final Audio Processed: {position} of {clipCount}", end="\r")
	    print("\n")

	    canvas = canvas.set_channels(2)  # Change from mono to stereo
	    print("\nExporting audio file...")
	    try:
	        canvas.export(outputFileName, format=formatString, bitrate="192k")
	    except Exception:
	        # Best-effort fallback: retry under a backup name. Only ordinary
	        # errors are handled, so Ctrl+C and interpreter exit still propagate.
	        outputFileName = outputFileName + ".bak"
	        canvas.export(outputFileName, format=formatString, bitrate="192k")
	        print("\nThere was an issue exporting the audio, it might be a permission error. The file was saved as a backup with the extension .bak")
	        print("Try removing the .bak extension then listen to the file to see if it worked.\n")
	        input("Press Enter to exit...")

	    return subsDict


	def _load_trimmed_clip(ttsFilePath):
	    """Load a synthesized MP3 clip and strip its leading/trailing silence."""
	    rawClip = AudioSegment.from_file(ttsFilePath, format="mp3", frame_rate=int(synth_sample_rate))
	    return trim_clip(rawClip)


	def _export_to_wav_buffer(clip):
	    """Export ``clip`` as WAV into a new in-memory buffer and return the buffer."""
	    wavBuffer = io.BytesIO()
	    clip.export(wavBuffer, format="wav")
	    return wavBuffer