import os

import torch
from diffusers.pipelines.auto_pipeline import AutoPipelineForText2Image
from diffusers.pipelines.pipeline_utils import DiffusionPipeline
from diffusers.utils.export_utils import export_to_video
from typing import Optional

# Model names and the output directory come from a configuration file so they
# can be changed without touching this module. Ensure that IMG_MODEL_NAME,
# VIDEO_MODEL_NAME, and OUTPUT_DIR are defined in utils/config.py.
from utils.config import IMG_MODEL_NAME, VIDEO_MODEL_NAME, OUTPUT_DIR


class VisualSynthesizer:
    def __init__(self, img_model: str = IMG_MODEL_NAME, video_model: str = VIDEO_MODEL_NAME):
        """
        Initializes the VisualSynthesizer with text-to-image and text-to-video models.

        Args:
            img_model (str): The Hugging Face model ID for the text-to-image diffusion model.
            video_model (str): The Hugging Face model ID for the text-to-video model.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.torch_dtype = torch.float16 if self.device == "cuda" else torch.float32
        torch.backends.cudnn.benchmark = True  # Let cuDNN auto-tune kernels for repeated input sizes

        # Initialize the text-to-image pipeline with the specified model
        self.image_pipe = AutoPipelineForText2Image.from_pretrained(
            img_model,
            torch_dtype=self.torch_dtype,
            variant="fp16" if self.torch_dtype == torch.float16 else None,
            low_cpu_mem_usage=True
        ).to(self.device)

        # The text-to-video pipeline is heavy, so it is loaded lazily on the
        # first call to generate_video() rather than here.
        self.video_model = video_model
        self.video_pipe = None

    def generate_image(self, prompt: str, negative_prompt: str = "blurry, distorted, poorly drawn, watermark",
                       num_inference_steps: int = 50, guidance_scale: float = 7.5):
        """
        Generates an image from a text prompt.

        Args:
            prompt (str): Text prompt to guide image generation.
            negative_prompt (str): Negative prompt describing features to avoid.
            num_inference_steps (int): Number of denoising steps.
            guidance_scale (float): Classifier-free guidance scale.

        Returns:
            PIL.Image: Generated image.
        """
        # Run the pipeline and return the first (and only) generated image
        image = self.image_pipe(
            prompt,
            negative_prompt=negative_prompt,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale
        ).images[0]
        return image

    def generate_video(
        self,
        prompt: str,
        negative_prompt: Optional[str] = None,
        num_frames: int = 24,  # ~1 second at 24 fps
        fps: int = 8,
        output_path: str = "output.mp4",
        guidance_scale: float = 12.5,
        num_inference_steps: int = 25
    ) -> str:
        """
        Generates a short video from a text prompt.

        Args:
            prompt (str): Text prompt to guide generation.
            negative_prompt (str): Optional negative prompt.
            num_frames (int): Number of video frames.
            fps (int): Frame rate of the exported video.
            output_path (str): Filename for the video, written inside OUTPUT_DIR.
            guidance_scale (float): Classifier-free guidance scale.
            num_inference_steps (int): Number of denoising steps.

        Returns:
            str: Path to the saved video file.
""" # video_output = self.video_pipe( # prompt=prompt, # negative_prompt=negative_prompt, # num_frames=num_frames, # guidance_scale=guidance_scale, # num_inference_steps=num_inference_steps # ).frames # result = self.video_pipe(prompt, num_frames=num_frames, **kwargs) # frames = result.frames[0] # video_path = export_to_video(frames, output_video_path=f"{OUTPUT_DIR}_video", fps=fps) # return video_path pass