import re
import typing
from datetime import datetime, timezone, timedelta

import PIL
from PIL import Image
from moviepy.editor import (
    ImageClip,
    AudioFileClip,
    CompositeAudioClip,
    concatenate_audioclips,
    concatenate_videoclips,
)
from gradio import Error

from lib import PaceModel, ImageCaptioning, AudioGeneration, SentimentAnalyser

datetime_format = "%d/%m/%Y %H:%M:%S"
ist_offset = timedelta(hours=5, minutes=30)


def now():
    """Return the current time in IST (UTC+05:30), formatted for log prefixes."""
    utc_time = datetime.now(timezone.utc)
    ist_time = utc_time.astimezone(timezone(ist_offset))
    return ist_time.strftime(datetime_format)


class AudioPalette:
    """Turns images into audio (and optionally video) by combining pace
    prediction, image captioning, and sentiment analysis into prompts for an
    audio-generation backend reachable through an ngrok tunnel."""

    def __init__(self, pace_model_weights_path, resnet50_tf_model_weights_path, height, width, channels):
        self.pace_model = PaceModel(height, width, channels, resnet50_tf_model_weights_path, pace_model_weights_path)
        self.image_captioning = ImageCaptioning()
        self.audio_generation = AudioGeneration()
        self.sentiment_analyser = SentimentAnalyser()
        # Maps the pace model's class labels to the wording used in prompts.
        self.pace_map = {
            "Fast": "high",
            "Medium": "medium",
            "Slow": "low",
        }
        # Accepts both the legacy *.ngrok.io and the newer *.ngrok-free.app tunnel URLs.
        self.ngrok_url_pattern = re.compile(
            r"(https://[a-z0-9\-]+\.ngrok\.io/)|(https://[a-z0-9\-]+\.ngrok-free\.app/)"
        )

    def prompt_construction(
        self,
        caption: str,
        pace: str,
        sentiment: typing.Union[str, None],
        instrument: typing.Union[str, None],
        first: bool = True,
    ):
        """Builds the text prompt for audio generation. Every clip after the
        first also asks for a smooth transition from the previous audio."""
        instrument = instrument if instrument is not None else ""
        if first:
            prompt = f"A {instrument} soundtrack for {caption} with {self.pace_map[pace]} beats per minute. High Quality."
        else:
            prompt = f"A {instrument} soundtrack for {caption} with {self.pace_map[pace]} beats per minute. High Quality. Transitions smoothly from the previous audio while sounding different."

        # if sentiment:
        #     prompt += f" As a {sentiment} music."

        return prompt

    def generate_single(self, input_image: PIL.Image.Image, instrument: typing.Union[str, None], ngrok_endpoint: typing.Union[str, None]):
        """Runs the full pipeline for one image and returns [prompt, pace, caption, audio_file]."""
        if not self.ngrok_url_pattern.search(ngrok_endpoint):
            print(f"[{now()}] Invalid ngrok endpoint - {ngrok_endpoint}")
            raise Error(f"Invalid ngrok endpoint - {ngrok_endpoint}")
        print(f"[{now()}] {ngrok_endpoint}")

        pace = self.pace_model.predict(input_image)
        print(f"[{now()}]", pace)
        print(f"[{now()}] Pace Prediction Done")

        try:
            generated_text = self.image_captioning.query(input_image)[0].get("generated_text")
        except Exception as e:
            print(f"[{now()}] image captioning error")
            raise Error(repr(e))
        print(f"[{now()}]", generated_text)
        print(f"[{now()}] Captioning Done")

        sentiment = self.sentiment_analyser.sentiment(generated_text)
        print(f"[{now()}] Sentiment Analysis Done")

        prompt = self.prompt_construction(generated_text, pace, sentiment, instrument)
        print(f"[{now()}] Generated Prompt:", prompt)

        try:
            audio_file = self.audio_generation.generate(prompt, ngrok_endpoint)
        except Exception as e:
            print(f"[{now()}] {e}")
            raise Error(repr(e))
        print(f"[{now()}]", audio_file)
        print(f"[{now()}] Audio Generation Done")

        outputs = [prompt, pace, generated_text, audio_file]
        return outputs

    def stitch_images(self, file_paths: typing.List[str], audio_paths: typing.List[str]):
        """Builds a slideshow video: each image is shown for 5 seconds and the
        generated audio clips are concatenated into a single soundtrack."""
        clips = [ImageClip(path).set_duration(5) for path in file_paths]
        audio_clips = [AudioFileClip(path) for path in audio_paths]
        concat_audio = concatenate_audioclips(audio_clips)
        new_audio = CompositeAudioClip([concat_audio])
        concat_clip = concatenate_videoclips(clips, method="compose")
        concat_clip.audio = new_audio
        file_name = "generated_video.mp4"
        concat_clip.write_videofile(file_name, fps=24)
        return file_name

    def generate_multiple(self, file_paths: typing.List[str], instrument: typing.Union[str, None], ngrok_endpoint: typing.Union[str, None]):
        """Runs the pipeline over several images and returns the path to the stitched video."""
        if not self.ngrok_url_pattern.search(ngrok_endpoint):
            print(f"[{now()}] Invalid ngrok endpoint - {ngrok_endpoint}")
            raise Error(f"Invalid ngrok endpoint - {ngrok_endpoint}")
        print(f"[{now()}] {ngrok_endpoint}")

        images = [Image.open(image_path) for image_path in file_paths]
        pace = []
        generated_text = []
        sentiments = []
        prompts = []

        # Extracting the pace for all the images
        for image in images:
            pace_prediction = self.pace_model.predict(image)
            pace.append(pace_prediction)
        print(f"[{now()}]", pace)
        print(f"[{now()}] Pace Prediction Done")

        # Generating the caption for all the images
        try:
            for image in images:
                caption = self.image_captioning.query(image)[0].get("generated_text")
                generated_text.append(caption)
        except Exception as e:
            print(f"[{now()}] image captioning error")
            raise Error(repr(e))
        print(f"[{now()}]", generated_text)
        print(f"[{now()}] Captioning Done")

        # Extracting the sentiments from the generated captions
        for text in generated_text:
            sentiment = self.sentiment_analyser.sentiment(text)
            sentiments.append(sentiment)
        print(f"[{now()}] Sentiment Analysis Done:", sentiments)

        # Only the first prompt omits the transition instruction.
        first = True
        for generated_caption, senti, pace_pred in zip(generated_text, sentiments, pace):
            prompts.append(self.prompt_construction(generated_caption, pace_pred, senti, instrument, first))
            first = False
        print(f"[{now()}] Generated Prompts:", prompts)

        try:
            audio_file = self.audio_generation.generate(prompts, ngrok_endpoint)
        except Exception as e:
            print(f"[{now()}] {e}")
            raise Error(repr(e))
        print(f"[{now()}]", audio_file)
        print(f"[{now()}] Audio Generation Done")

        video_file = self.stitch_images(file_paths, [audio_file])
        return video_file