# audio_palette/utils/audio_palette.py

import typing
from datetime import datetime

import PIL
from PIL import Image
from moviepy.editor import *

from lib import *

datetime_format = "%d/%m/%Y %H:%M:%S"


def now():
    return datetime.now().strftime(datetime_format)
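
# Example (hypothetical timestamp): now() -> "25/12/2023 14:30:05",
# following the day/month/year hour:minute:second format defined above.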


class AudioPalette:
    """End-to-end pipeline: pace prediction, image captioning, sentiment
    analysis, prompt construction, audio generation, and video stitching."""

    def __init__(self, pace_model_weights_path, resnet50_tf_model_weights_path, height, width, channels):
        self.pace_model = PaceModel(height, width, channels, resnet50_tf_model_weights_path, pace_model_weights_path)
        self.image_captioning = ImageCaptioning()
        self.audio_generation = AudioGeneration()
        self.sentiment_analyser = SentimentAnalyser()

        # Maps the pace model's class labels to the tempo descriptors used in prompts
        self.pace_map = {
            "Fast": "high",
            "Medium": "medium",
            "Slow": "low"
        }

    def prompt_construction(self, caption: str, pace: str, sentiment: typing.Union[str, None], instrument: typing.Union[str, None], first: bool = True):
        """Build a text-to-audio prompt from the caption, pace, and optional instrument."""
        instrument = instrument if instrument is not None else ""

        if first:
            prompt = f"A {instrument} soundtrack for {caption} with {self.pace_map[pace]} beats per minute. High Quality."
        else:
            prompt = f"A {instrument} soundtrack for {caption} with {self.pace_map[pace]} beats per minute. High Quality. Transitions smoothly from the previous audio while sounding different."

        # if sentiment:
        #     prompt += f" As a {sentiment} music."

        return prompt
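
    # Illustrative example with hypothetical arguments (sentiment currently has no
    # effect because the lines using it are commented out):
    #   self.prompt_construction("a dog running on the beach", "Fast", "positive", "piano")
    #   -> "A piano soundtrack for a dog running on the beach with high beats per minute. High Quality."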

    def generate_single(self, input_image: PIL.Image.Image, instrument: typing.Union[str, None], ngrok_endpoint: str):
        """Generate one audio track for a single image and return [prompt, pace, caption, audio_file]."""
        pace = self.pace_model.predict(input_image)
        print(f"[{now()}]", pace)
        print(f"[{now()}] Pace Prediction Done")

        generated_text = self.image_captioning.query(input_image)[0].get("generated_text")
        print(f"[{now()}]", generated_text)
        print(f"[{now()}] Captioning Done")

        sentiment = self.sentiment_analyser.sentiment(generated_text)
        print(f"[{now()}] Sentiment Analysis Done")

        prompt = self.prompt_construction(generated_text, pace, sentiment, instrument)
        print(f"[{now()}] Generated Prompt:", prompt)

        audio_file = self.audio_generation.generate(prompt, ngrok_endpoint)
        print(f"[{now()}]", audio_file)
        print(f"[{now()}] Audio Generation Done")

        outputs = [prompt, pace, generated_text, audio_file]
        return outputs

    def stitch_images(self, file_paths: typing.List[str], audio_paths: typing.List[str]):
        """Stitch the images into a video (5 seconds per image, 24 fps) with the generated audio."""
        clips = [ImageClip(m).set_duration(5) for m in file_paths]
        audio_clips = [AudioFileClip(a) for a in audio_paths]

        concat_audio = concatenate_audioclips(audio_clips)
        new_audio = CompositeAudioClip([concat_audio])

        concat_clip = concatenate_videoclips(clips, method="compose")
        concat_clip.audio = new_audio

        file_name = "generated_video.mp4"
        concat_clip.write_videofile(file_name, fps=24)

        return file_name

    def generate_multiple(self, file_paths: typing.List[str], instrument: typing.Union[str, None], ngrok_endpoint: str):
        """Generate audio for a sequence of images and stitch them into a single video."""
        images = [Image.open(image_path) for image_path in file_paths]

        pace = []
        generated_text = []
        sentiments = []
        prompts = []

        # Extracting the pace for all the images
        for image in images:
            pace_prediction = self.pace_model.predict(image)
            pace.append(pace_prediction)
        print(f"[{now()}]", pace)
        print(f"[{now()}] Pace Prediction Done")

        # Generating the caption for all the images
        for image in images:
            caption = self.image_captioning.query(image)[0].get("generated_text")
            generated_text.append(caption)
        print(f"[{now()}]", generated_text)
        print(f"[{now()}] Captioning Done")

        # Extracting the sentiments from the generated captions
        for text in generated_text:
            sentiment = self.sentiment_analyser.sentiment(text)
            sentiments.append(sentiment)
        print(f"[{now()}] Sentiment Analysis Done:", sentiments)

        # Only the first prompt omits the transition instruction
        first = True
        for generated_caption, senti, pace_pred in zip(generated_text, sentiments, pace):
            prompts.append(self.prompt_construction(generated_caption, pace_pred, senti, instrument, first))
            first = False
        print(f"[{now()}] Generated Prompts:", prompts)

        audio_file = self.audio_generation.generate(prompts, ngrok_endpoint)
        print(f"[{now()}]", audio_file)
        print(f"[{now()}] Audio Generation Done")

        video_file = self.stitch_images(file_paths, [audio_file])

        return video_file
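

# Illustrative usage sketch (not part of the original module). The weight paths,
# image files, image dimensions, and ngrok endpoint below are hypothetical
# placeholders; the real values come from the surrounding application's configuration.
if __name__ == "__main__":
    palette = AudioPalette(
        pace_model_weights_path="weights/pace_model.h5",          # hypothetical path
        resnet50_tf_model_weights_path="weights/resnet50_tf.h5",  # hypothetical path
        height=224,
        width=224,
        channels=3,
    )

    # Single image -> [prompt, pace, caption, audio_file]
    image = Image.open("example.jpg")  # hypothetical input image
    prompt, pace, caption, audio_file = palette.generate_single(
        image, instrument="piano", ngrok_endpoint="https://example.ngrok.app"
    )

    # Multiple images -> one stitched video with generated audio
    video_file = palette.generate_multiple(
        ["frame_1.jpg", "frame_2.jpg"], instrument=None,
        ngrok_endpoint="https://example.ngrok.app"
    )
    print(f"[{now()}] Video written to {video_file}")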