diff --git a/.gitignore b/.gitignore
index 5bbee1b..1d17dae 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1 @@
 .venv
-data
-scripts/wandb
-models
-scripts/yolov8*
diff --git a/requirements.txt b/requirements.txt
index d1c8048..85f0bbc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,7 +9,13 @@ numpy
 rich
 tqdm
 transformers
-opencv-python-headless
 fastapi
 uvicorn
 matplotlib
+accelerate
+torchvision
+ftfy
+tensorboard
+Jinja2
+datasets
+peft
diff --git a/scripts/clear_memory.py b/scripts/clear_memory.py
deleted file mode 100644
index 7b6010e..0000000
--- a/scripts/clear_memory.py
+++ /dev/null
@@ -1,18 +0,0 @@
-import gc
-import torch
-from logger import rich_logger as l
-
-def clear_memory():
-    """
-    Clears the memory by collecting garbage and emptying the CUDA cache.
-
-    This function is useful when dealing with memory-intensive operations in Python, especially when using libraries like PyTorch.
-
-    Note:
-        This function requires the `gc` and `torch` modules to be imported.
-
-    """
-    gc.collect()
-    torch.cuda.empty_cache()
-    l.info("Memory Cleared")
-    
\ No newline at end of file
diff --git a/scripts/config.py b/scripts/config.py
index b620197..10947d3 100644
--- a/scripts/config.py
+++ b/scripts/config.py
@@ -1,13 +1,60 @@
-LOGS_DIR = '../logs'
-DATA_DIR = '../data'
-Project_Name = 'product_placement_api'
-entity = 'vikramxd'
-image_dir = '../sample_data'
-mask_dir = '../masks'
-segmentation_model = 'facebook/sam-vit-large'
-detection_model = 'yolov8l'
-kandinsky_model_name = 'kandinsky-community/kandinsky-2-2-decoder-inpaint'
-video_model_name = 'stabilityai/stable-video-diffusion-img2vid-xt'
-target_width = 2560
-target_height = 1440
-roi_scale = 0.6
+MODEL_NAME = "stabilityai/stable-diffusion-xl-base-1.0"
+VAE_NAME = "madebyollin/sdxl-vae-fp16-fix"
+DATASET_NAME = "hahminlew/kream-product-blip-captions"
+PROJECT_NAME = "Product Photography"
+
+class Config:
+    def __init__(self):
+        self.pretrained_model_name_or_path = MODEL_NAME
+        self.pretrained_vae_model_name_or_path = VAE_NAME
+        self.revision = None
+        self.variant = None
+        self.dataset_name = DATASET_NAME
+        self.dataset_config_name = None
+        self.train_data_dir = None
+        self.image_column = 'image'
+        self.caption_column = 'text'
+        self.validation_prompt = None
+        self.num_validation_images = 4
+        self.validation_epochs = 1
+        self.max_train_samples = None
+        self.output_dir = "output"
+        self.cache_dir = None
+        self.seed = None
+        self.resolution = 1024
+        self.center_crop = False
+        self.random_flip = False
+        self.train_text_encoder = False
+        self.train_batch_size = 16
+        self.num_train_epochs = 200
+        self.max_train_steps = None
+        self.checkpointing_steps = 500
+        self.checkpoints_total_limit = None
+        self.resume_from_checkpoint = None
+        self.gradient_accumulation_steps = 1
+        self.gradient_checkpointing = False
+        self.learning_rate = 1e-4
+        self.scale_lr = False
+        self.lr_scheduler = "constant"
+        self.lr_warmup_steps = 500
+        self.snr_gamma = None
+        self.allow_tf32 = False
+        self.dataloader_num_workers = 0
+        self.use_8bit_adam = True
+        self.adam_beta1 = 0.9
+        self.adam_beta2 = 0.999
+        self.adam_weight_decay = 1e-2
+        self.adam_epsilon = 1e-08
+        self.max_grad_norm = 1.0
+        self.push_to_hub = False
+        self.hub_token = None
+        self.prediction_type = None
+        self.hub_model_id = None
+        self.logging_dir = "logs"
+        self.report_to = "wandb"
+        self.mixed_precision = None
+        self.local_rank = -1
+        self.enable_xformers_memory_efficient_attention = False
+        self.noise_offset = 0
+        self.rank = 4
+        self.debug_loss = False
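Note: the rewritten scripts/config.py swaps the old module-level constants for a Config object whose fields mirror the argument list of the diffusers SDXL LoRA training scripts. The diff does not include the consumer, so the following is only a sketch of how a training entry point might use it; the override values are illustrative assumptions, not defaults from this repo.

    # Hypothetical consumer of the new Config; a train.py like this is not part of this diff.
    from config import Config, PROJECT_NAME

    config = Config()
    config.mixed_precision = "fp16"   # fields are plain attributes, so a run can override them in code
    config.max_train_steps = 1000     # illustrative value
    assert config.resolution == 1024  # SDXL's native resolution, per the defaults above
    print(f"{PROJECT_NAME}: fine-tuning {config.pretrained_model_name_or_path} "
          f"with LoRA rank {config.rank} on {config.dataset_name}")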
diff --git a/scripts/endpoint.py b/scripts/endpoint.py
deleted file mode 100644
index cbb9ebe..0000000
--- a/scripts/endpoint.py
+++ /dev/null
@@ -1,65 +0,0 @@
-from fastapi import FastAPI,HTTPException
-from fastapi.responses import FileResponse
-from fastapi.middleware.cors import CORSMiddleware
-from models import kandinsky_inpainting_inference
-from segment_everything import extend_image, generate_mask_from_bbox, invert_mask
-from video_pipeline import fetch_video_pipeline
-from diffusers.utils import load_image
-from logger import rich_logger as l
-from fastapi import UploadFile, File
-from config import segmentation_model, detection_model,target_height, target_width, roi_scale
-from PIL import Image
-import io
-import tempfile
-
-
-
-
-
-app = FastAPI(title="Product Diffusion API",
-              description="API for Product Diffusion",
-              version="0.1.0",
-              openapi_url="/api/v1/openapi.json")
-
-
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_methods=["*"],
-    allow_headers=["*"],
-    allow_credentials=True
-
-)
-
-@app.post("/api/v1/image_outpainting")
-async def image_outpainting(image: UploadFile, prompt: str, negative_prompt: str,num_inference_steps:int=30):
-    """
-    Perform Outpainting on an image.
-
-    Args:
-        image (UploadFile): The input image file.
-        prompt (str): The prompt for the outpainting.
-        negative_prompt (str): The negative prompt for the outpainting.
-
-    Returns:
-        JSONResponse: The output image path.
-    """
-    image_data = await image.read()
-    image = Image.open(io.BytesIO(image_data))
-    image = load_image(image)
-    image = extend_image(image, target_width=target_width, target_height=target_height, roi_scale=roi_scale)
-    mask_image = generate_mask_from_bbox(image, segmentation_model, detection_model)
-    mask_image = Image.fromarray(mask_image)
-    mask_image = invert_mask(mask_image)
-    output_image = kandinsky_inpainting_inference(prompt, negative_prompt, image, mask_image,num_inference_steps=num_inference_steps)
-    with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as temp_file:
-        output_image.save(temp_file, format='JPEG')
-        temp_file_path = temp_file.name
-    return FileResponse(temp_file_path, media_type='image/jpeg', filename='output_image.jpg')
-
-
-
-
-
\ No newline at end of file
diff --git a/scripts/logger.py b/scripts/logger.py
index 2e0f42f..c493b93 100644
--- a/scripts/logger.py
+++ b/scripts/logger.py
@@ -25,5 +25,4 @@ for level in log_levels:
     file_handler = RotatingFileHandler(log_file, maxBytes=10 * 1024 * 1024, backupCount=5)
     file_handler.setLevel(level)
     file_handler.setFormatter(logging.Formatter('%(asctime)s [%(levelname)s] %(module)s - %(message)s'))
-    rich_logger.addHandler(file_handler)
-
+    rich_logger.addHandler(file_handler)
\ No newline at end of file
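Note: the deleted endpoint returned a FileResponse backed by a NamedTemporaryFile created with delete=False and never removed it, leaking one file per request. If the route is ever restored, Starlette's background-task hook is the usual cleanup; a sketch, assuming the same handler shape (not code from this diff):

    # Cleanup sketch for the removed handler pattern.
    import os
    import tempfile
    from fastapi.responses import FileResponse
    from starlette.background import BackgroundTask

    def image_response(output_image):
        with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as temp_file:
            output_image.save(temp_file, format='JPEG')
            temp_file_path = temp_file.name
        # Remove the temp file only after the response body has been sent.
        return FileResponse(temp_file_path, media_type='image/jpeg',
                            filename='output_image.jpg',
                            background=BackgroundTask(os.remove, temp_file_path))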
diff --git a/scripts/models.py b/scripts/models.py
deleted file mode 100644
index 2ca9eea..0000000
--- a/scripts/models.py
+++ /dev/null
@@ -1,82 +0,0 @@
-from logger import rich_logger as l
-from wandb.integration.diffusers import autolog
-from config import Project_Name
-from clear_memory import clear_memory
-import numpy as np
-import torch
-from diffusers.utils import load_image
-from pipeline import fetch_kandinsky_pipeline
-from config import controlnet_adapter_model_name,controlnet_base_model_name,kandinsky_model_name
-from diffusers import StableDiffusionInpaintPipeline, DPMSolverMultistepScheduler
-from video_pipeline import fetch_video_pipeline
-from config import video_model_name
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-def kandinsky_inpainting_inference(prompt, negative_prompt, image, mask_image,num_inference_steps=800,strength=1.0,guidance_scale = 7.8):
-    """
-    Perform Kandinsky inpainting inference on the given image.
-
-    Args:
-        prompt (str): The prompt for the inpainting process.
-        negative_prompt (str): The negative prompt for the inpainting process.
-        image (PIL.Image.Image): The input image to be inpainted.
-        mask_image (PIL.Image.Image): The mask image indicating the areas to be inpainted.
-
-    Returns:
-        PIL.Image.Image: The output inpainted image.
-    """
-    clear_memory()
-    l.info("Kandinsky Inpainting Inference ->")
-    pipe = fetch_kandinsky_pipeline(controlnet_adapter_model_name, controlnet_base_model_name,kandinsky_model_name, image)
-    output_image = pipe(prompt=prompt,negative_prompt=negative_prompt,image=image,mask_image=mask_image,num_inference_steps=num_inference_steps,strength=strength,guidance_scale = guidance_scale,height = 1472, width = 2560).images[0]
-    return output_image
-
-
-
-
-
-
-
-
-def image_to_video_pipeline(image, video_model_name, decode_chunk_size, motion_bucket_id, generator=torch.manual_seed(42)):
-    """
-    Converts an image to a video using a specified video model.
-
-    Args:
-        image (Image): The input image to convert to video.
-        video_model_name (str): The name of the video model to use.
-        decode_chunk_size (int): The size of the chunks to decode.
-        motion_bucket_id (str): The ID of the motion bucket.
-        generator (torch.Generator, optional): The random number generator. Defaults to torch.manual_seed(42).
-
-    Returns:
-        list: The frames of the generated video.
-    """
-    clear_memory()
-    l.info("Stable Video Diffusion Image 2 Video pipeline Inference ->")
-    pipe = fetch_video_pipeline(video_model_name)
-    frames = pipe(image=image, decode_chunk_size=decode_chunk_size, motion_bucket_id=motion_bucket_id, generator=generator).frames[0]
-    return frames
-
-
-
-
-
-
-
-
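Note: the deleted module imported controlnet_adapter_model_name and controlnet_base_model_name from config, names the old scripts/config.py (above) never defined, so kandinsky_inpainting_inference could not have run as committed. Stripped of that indirection, it reduces to a stock diffusers inpainting call; a standalone sketch using the model id from the old config (paths, prompts, and step count below are placeholders):

    import torch
    from diffusers import AutoPipelineForInpainting
    from diffusers.utils import load_image

    # Roughly what the removed kandinsky_inpainting_inference() amounted to.
    pipe = AutoPipelineForInpainting.from_pretrained(
        "kandinsky-community/kandinsky-2-2-decoder-inpaint", torch_dtype=torch.float16
    ).to("cuda")
    image = load_image("product.jpg")  # placeholder input image
    mask = load_image("mask.png")      # placeholder mask; white marks the region to repaint
    result = pipe(
        prompt="product on a marble countertop, studio lighting",  # placeholder prompt
        negative_prompt="blurry, low quality",
        image=image,
        mask_image=mask,
        num_inference_steps=30,  # the deleted default of 800 steps was unusually high
        guidance_scale=7.8,
    ).images[0]
    result.save("output.jpg")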
diff --git a/scripts/pipeline.py b/scripts/pipeline.py
deleted file mode 100644
index af0e6bf..0000000
--- a/scripts/pipeline.py
+++ /dev/null
@@ -1,100 +0,0 @@
-from diffusers import ControlNetModel,StableDiffusionControlNetInpaintPipeline,AutoPipelineForInpainting
-import torch
-
-
-
-
-
-
-
-class PipelineFetcher:
-    """
-    A class that fetches different pipelines for image processing.
-
-    Args:
-        controlnet_adapter_model_name (str): The name of the controlnet adapter model.
-        controlnet_base_model_name (str): The name of the controlnet base model.
-        kandinsky_model_name (str): The name of the Kandinsky model.
-        image (str): The image to be processed.
-
-    """
-
-    def __init__(self, controlnet_adapter_model_name, controlnet_base_model_name, kandinsky_model_name, image: str):
-        self.controlnet_adapter_model_name = controlnet_adapter_model_name
-        self.controlnet_base_model_name = controlnet_base_model_name
-        self.kandinsky_model_name = kandinsky_model_name
-        self.image = image
-
-    def ControlNetInpaintPipeline(self):
-        """
-        Fetches the ControlNet inpainting pipeline.
-
-        Returns:
-            pipe (StableDiffusionControlNetInpaintPipeline): The ControlNet inpainting pipeline.
-
-        """
-        controlnet = ControlNetModel.from_pretrained(self.controlnet_adapter_model_name, torch_dtype=torch.float16)
-        pipe = StableDiffusionControlNetInpaintPipeline.from_pretrained(
-            self.controlnet_base_model_name, controlnet=controlnet, torch_dtype=torch.float16
-        )
-        pipe.to('cuda')
-
-        return pipe
-
-    def KandinskyPipeline(self):
-        """
-        Fetches the Kandinsky pipeline.
-
-        Returns:
-            pipe (AutoPipelineForInpainting): The Kandinsky pipeline.
-
-        """
-        pipe = AutoPipelineForInpainting.from_pretrained(self.kandinsky_model_name, torch_dtype=torch.float16)
-        pipe = pipe.to('cuda')
-        pipe.unet = torch.compile(pipe.unet)
-
-        return pipe
-
-
-
-
-
-
-def fetch_control_pipeline(controlnet_adapter_model_name, controlnet_base_model_name, kandinsky_model_name, image):
-    """
-    Fetches the control pipeline for image processing.
-
-    Args:
-        controlnet_adapter_model_name (str): The name of the controlnet adapter model.
-        controlnet_base_model_name (str): The name of the controlnet base model.
-        kandinsky_model_name (str): The name of the Kandinsky model.
-        image: The input image for processing.
-
-    Returns:
-        pipe: The control pipeline for image processing.
-    """
-    pipe_fetcher = PipelineFetcher(controlnet_adapter_model_name, controlnet_base_model_name, kandinsky_model_name, image)
-    pipe = pipe_fetcher.ControlNetInpaintPipeline()
-    return pipe
-
-
-def fetch_kandinsky_pipeline(controlnet_adapter_model_name, controlnet_base_model_name, kandinsky_model_name, image):
-    """
-    Fetches the Kandinsky pipeline.
-
-    Args:
-        controlnet_adapter_model_name (str): The name of the controlnet adapter model.
-        controlnet_base_model_name (str): The name of the controlnet base model.
-        kandinsky_model_name (str): The name of the Kandinsky model.
-        image: The input image.
-
-    Returns:
-        pipe: The Kandinsky pipeline.
-    """
-    pipe_fetcher = PipelineFetcher(controlnet_adapter_model_name, controlnet_base_model_name, kandinsky_model_name, image)
-    pipe = pipe_fetcher.KandinskyPipeline()
-    pipe = pipe.to('cuda')
-
-    return pipe
-
diff --git a/scripts/run.py b/scripts/run.py
deleted file mode 100644
index cccc06a..0000000
--- a/scripts/run.py
+++ /dev/null
@@ -1,39 +0,0 @@
-import argparse
-import os
-from segment_everything import generate_mask_from_bbox, extend_image, invert_mask
-from models import kandinsky_inpainting_inference, load_image
-from PIL import Image
-from config import segmentation_model, detection_model,target_height, target_width, roi_scale
-
-def main(args):
-    """
-    Main function that performs the product diffusion process.
-
-    Args:
-        args (Namespace): Command-line arguments.
-
-    Returns:
-        None
-    """
-    os.makedirs(args.output_dir, exist_ok=True)
-    os.makedirs(args.mask_dir, exist_ok=True)
-    output_image_path = os.path.join(args.output_dir, f'{args.uid}_output.jpg')
-    image = load_image(args.image_path)
-    extended_image = extend_image(image, target_width=target_width, target_height=target_height, roi_scale=roi_scale)
-    mask = generate_mask_from_bbox(extended_image, segmentation_model, detection_model)
-    mask_image = Image.fromarray(mask)
-    inverted_mask = invert_mask(mask_image)
-    #inverted_mask = Image.fromarray(inverted_mask)
-    output_image = kandinsky_inpainting_inference(args.prompt, args.negative_prompt, extended_image, inverted_mask)
-    output_image.save(output_image_path)
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description='Perform Outpainting on an image.')
-    parser.add_argument('--image_path', type=str, required=True, help='Path to the input image.')
-    parser.add_argument('--prompt', type=str, required=True, help='Prompt for the Kandinsky inpainting.')
-    parser.add_argument('--negative_prompt', type=str, required=True, help='Negative prompt for the Kandinsky inpainting.')
-    parser.add_argument('--output_dir', type=str, required=True, help='Directory to save the output image.')
-    parser.add_argument('--mask_dir', type=str, required=True, help='Directory to save the mask image.')
-    parser.add_argument('--uid', type=str, required=True, help='Unique identifier for the image and mask.')
-    args = parser.parse_args()
-    main(args)
\ No newline at end of file
diff --git a/scripts/segment_everything.py b/scripts/segment_everything.py
deleted file mode 100644
index c2e9532..0000000
--- a/scripts/segment_everything.py
+++ /dev/null
@@ -1,125 +0,0 @@
-from ultralytics import YOLO
-from transformers import SamModel, SamProcessor
-import torch
-from diffusers.utils import load_image
-from PIL import Image, ImageOps
-import numpy as np
-import torch
-from diffusers import StableVideoDiffusionPipeline
-
-
-
-
-
-
-
-
-
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-
-
-
-
-
-
-
-
-
-def extend_image(image, target_width, target_height, roi_scale=0.5):
-    """
-    Extends an image to fit within the specified target dimensions while maintaining the aspect ratio.
-
-    Args:
-        image (PIL.Image.Image): The image to be extended.
-        target_width (int): The desired width of the extended image.
-        target_height (int): The desired height of the extended image.
-        roi_scale (float, optional): The scale factor applied to the resized image. Defaults to 0.5.
-
-    Returns:
-        PIL.Image.Image: The extended image.
-    """
-    original_image = image
-    original_width, original_height = original_image.size
-    scale = min(target_width / original_width, target_height / original_height)
-    new_width = int(original_width * scale * roi_scale)
-    new_height = int(original_height * scale * roi_scale)
-    original_image_resized = original_image.resize((new_width, new_height))
-    extended_image = Image.new("RGB", (target_width, target_height), "white")
-    paste_x = (target_width - new_width) // 2
-    paste_y = (target_height - new_height) // 2
-    extended_image.paste(original_image_resized, (paste_x, paste_y))
-    return extended_image
-
-
-
-
-def generate_mask_from_bbox(image: Image, segmentation_model: str ,detection_model) -> Image:
-    """
-    Generates a mask from the bounding box of an image using YOLO and SAM-ViT models.
-
-    Args:
-        image_path (str): The path to the input image.
-
-    Returns:
-        numpy.ndarray: The generated mask as a NumPy array.
-    """
-
-    yolo = YOLO(detection_model)
-    processor = SamProcessor.from_pretrained(segmentation_model)
-    model = SamModel.from_pretrained(segmentation_model).to(device)
-    results = yolo(image)
-    bboxes = results[0].boxes.xyxy.tolist()
-    input_boxes = [[[bboxes[0]]]]
-    inputs = processor(load_image(image), input_boxes=input_boxes, return_tensors="pt").to("cuda")
-    with torch.no_grad():
-        outputs = model(**inputs)
-    mask = processor.image_processor.post_process_masks(
-        outputs.pred_masks.cpu(),
-        inputs["original_sizes"].cpu(),
-        inputs["reshaped_input_sizes"].cpu()
-    )[0][0][0].numpy()
-    return mask
-
-
-
-
-
-def invert_mask(mask_image: Image) -> np.ndarray:
-    """Method to invert mask
-    Args:
-        mask_image (np.ndarray): input mask image
-    Returns:
-        np.ndarray: inverted mask image
-    """
-    inverted_mask_image = ImageOps.invert(mask_image)
-    return inverted_mask_image
-
-
-
-
-
-
-
-def fetch_video_pipeline(video_model_name):
-    """
-    Fetches the video pipeline for image processing.
-
-    Args:
-        video_model_name (str): The name of the video model.
-
-    Returns:
-        pipe (StableVideoDiffusionPipeline): The video pipeline.
-
-    """
-    pipe = StableVideoDiffusionPipeline.from_pretrained(
-        video_model_name, torch_dtype=torch.float16,
-    )
-    pipe = pipe.to('cuda')
-    pipe.unet = torch.compile(pipe.unet)
-
-
-    return pipe
-
diff --git a/scripts/video_pipeline.py b/scripts/video_pipeline.py
deleted file mode 100644
index e69de29..0000000
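Taken together with the requirements.txt additions (accelerate, datasets, peft) and the Config rewrite, this change set points the repo at SDXL LoRA fine-tuning on hahminlew/kream-product-blip-captions, replacing the removed inpainting/video serving code. A sketch of the presumed data-and-adapter setup follows; nothing below is in the diff itself, and the target_modules list is an assumption that follows the attention-projection convention of the diffusers LoRA examples (add_adapter requires a recent diffusers with peft installed):

    import torch
    from datasets import load_dataset
    from diffusers import UNet2DConditionModel
    from peft import LoraConfig

    from config import Config  # the new scripts/config.py above

    config = Config()

    # Image/caption pairs; column names match Config.image_column / caption_column.
    dataset = load_dataset(config.dataset_name, split="train")

    unet = UNet2DConditionModel.from_pretrained(
        config.pretrained_model_name_or_path, subfolder="unet", torch_dtype=torch.float16
    )
    unet.requires_grad_(False)  # freeze base weights, as the diffusers examples do

    # Rank comes from Config.rank (4); the adapter is injected into the usual
    # SDXL attention projections.
    lora_config = LoraConfig(
        r=config.rank,
        lora_alpha=config.rank,
        init_lora_weights="gaussian",
        target_modules=["to_k", "to_q", "to_v", "to_out.0"],
    )
    unet.add_adapter(lora_config)  # injected LoRA layers are created trainable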