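# app.py for a Hugging Face Space: renders a long AnimateDiff video with the
# AnimateLCM motion module, generates a matching music track with AudioLDM2,
# muxes both into video.mp4 via PyAV, then serves a minimal Gradio interface.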
import av, pathlib, re
import diffusers, gradio, torch, transformers
import huggingface_hub
import animatediff.models.unet
from animatediff.generate import controlnet_preprocess, img2img_preprocess, wild_card_conversion, region_preprocess, unload_controlnet_models
from animatediff.settings import get_model_config, get_infer_config
from animatediff.utils.pipeline import send_to_device
from animatediff.utils.util import set_tensor_interpolation_method
from animatediff.pipelines import load_text_embeddings
from animatediff.pipelines.lora import load_lcm_lora
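# Output geometry: 432x768 portrait, 1440 frames (180 s at the 8 fps used when muxing below).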
width = 432
height = 768
length = 1440
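# Load the prompt-travel config and inference defaults, then prepare the
# ControlNet and img2img inputs the config references.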
model_config = get_model_config('config/prompts/prompt_travel.json')
is_sdxl = False
infer_config = get_infer_config(True, is_sdxl)
set_tensor_interpolation_method(model_config.tensor_interpolation_slerp)
device = torch.device('cuda')
save_dir = pathlib.Path('output')
controlnet_image_map, controlnet_type_map, controlnet_ref_map, controlnet_no_shrink = controlnet_preprocess(model_config.controlnet_map, width, height, length, save_dir, device, is_sdxl)
img2img_map = img2img_preprocess(model_config.img2img_map, width, height, length, save_dir)
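# Snapshot vanilla SD 1.5 locally so its components (tokenizer, text encoder,
# VAE, UNet, feature extractor) can be loaded individually by subfolder.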
base_model = pathlib.Path('/tmp/base')
diffusers.StableDiffusionPipeline.from_pretrained('stable-diffusion-v1-5/stable-diffusion-v1-5').save_pretrained(base_model)
tokenizer = transformers.CLIPTokenizer.from_pretrained(base_model, subfolder='tokenizer')
text_encoder = transformers.CLIPTextModel.from_pretrained(base_model, subfolder='text_encoder')
vae = diffusers.AutoencoderKL.from_single_file('https://huggingface.co/chaowenguoback/pal/blob/main/vae-ft-mse-840000-ema-pruned.safetensors')
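# Download the AnimateLCM motion module and inflate the 2D SD UNet with it.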
huggingface_hub.hf_hub_download(repo_id='wangfuyun/AnimateLCM', filename='AnimateLCM_sd15_t2v.ckpt', local_dir=pathlib.Path.cwd())
feature_extractor = transformers.CLIPImageProcessor.from_pretrained(base_model, subfolder='feature_extractor')
unet = animatediff.models.unet.UNet2DConditionModel.from_pretrained_2d(
    pretrained_model_path=base_model,
    motion_module_path=pathlib.Path.cwd().joinpath('AnimateLCM_sd15_t2v.ckpt'),
    subfolder='unet',
    unet_additional_kwargs=infer_config.unet_additional_kwargs,
)
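# Overwrite the UNet and text encoder weights with ChilloutMix; strict=False
# leaves the motion-module parameters ChilloutMix does not provide untouched.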
pipeline = diffusers.StableDiffusionPipeline.from_single_file('https://huggingface.co/chaowenguoback/15/blob/main/chilloutMix-Ni.safetensors', config='stable-diffusion-v1-5/stable-diffusion-v1-5', safety_checker=None, use_safetensors=True)
unet.load_state_dict(pipeline.unet.state_dict(), strict=False)
text_encoder.load_state_dict(pipeline.text_encoder.state_dict(), strict=False)
del pipeline
unet.enable_xformers_memory_efficient_attention()
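# Assemble the AnimationPipeline around an LCM scheduler for few-step sampling.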
pipeline = animatediff.pipelines.AnimationPipeline(
    vae=vae,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    unet=unet,
    scheduler=diffusers.LCMScheduler.from_config(infer_config.noise_scheduler_kwargs),
    feature_extractor=feature_extractor,
    controlnet_map=None,
)
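# load_lcm_lora presumably looks for the LoRA file under data/models/lcm_lora/sd15,
# so download it there; scale ramps from 0.15 to 0.75 over the given gradient range.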
lcm_lora = pathlib.Path.cwd().joinpath('data/models/lcm_lora/sd15')
lcm_lora.mkdir(parents=True, exist_ok=True)
huggingface_hub.hf_hub_download(repo_id='wangfuyun/AnimateLCM', filename='AnimateLCM_sd15_t2v_lora.safetensors', local_dir=lcm_lora)
load_lcm_lora(pipeline, {'start_scale': 0.15, 'end_scale': 0.75, 'gradient_start': 0.2, 'gradient_end': 0.75}, is_sdxl=is_sdxl)
pipeline.lora_map = None
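# Stack detail, outfit, and character LoRAs with per-adapter blend weights.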
pipeline.load_lora_weights('chaowenguoback/15', weight_name='add_detail.safetensors', adapter_name='detail')
pipeline.load_lora_weights('chaowenguoback/15', weight_name='b1r1av5-000007.safetensors', adapter_name='bikini')
pipeline.load_lora_weights('chaowenguoback/15', weight_name='btcstr.safetensors', adapter_name='c-string')
pipeline.load_lora_weights('chaowenguoback/15', weight_name='蓝洁瑛.safetensors', adapter_name='character')
pipeline.set_adapters(['detail', 'bikini', 'c-string', 'character'], [1, 0.4, 0.2, 0.8])
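# Halve precision, load text embeddings while the text encoder sits on the GPU,
# then let send_to_device place the frozen pipeline for inference.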
pipeline.unet = pipeline.unet.half()
pipeline.text_encoder = pipeline.text_encoder.half()
pipeline.text_encoder = pipeline.text_encoder.to(device)
load_text_embeddings(pipeline)
pipeline.text_encoder = pipeline.text_encoder.to('cpu')
pipeline = send_to_device(pipeline, device, freeze=True, force_half=False, compile=False, is_sdxl=is_sdxl)
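# Expand prompt wildcards and build per-region conditioning, then drop any
# ControlNet region references that resolved to no region (index -1).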
wild_card_conversion(model_config)
is_init_img_exist = img2img_map is not None
region_condi_list, region_list, ip_adapter_config_map, region2index = region_preprocess(model_config, width, height, length, save_dir, is_init_img_exist, is_sdxl)
if controlnet_type_map:
    for c in controlnet_type_map:
        tmp_r = [region2index[r] for r in controlnet_type_map[c]["control_region_list"]]
        controlnet_type_map[c]["control_region_list"] = [r for r in tmp_r if r != -1]
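# Build a filename-friendly slug from the first prompt's leading tags.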
prompt_map = region_condi_list[0]["prompt_map"]
tag_pattern = re.compile(r"[^\w\-, ]")
prompt_tags = [tag_pattern.sub("", tag).strip().replace(" ", "-") for tag in prompt_map[list(prompt_map.keys())[0]].split(",")]
prompt_str = "_".join(prompt_tags[:6])[:50]
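# Main denoising call: 8 LCM steps at low guidance, sliding 16-frame context
# windows ('composite' schedule) to cover all 1440 frames.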
output = pipeline(
    n_prompt='nipple, waistband, back view, monochrome, longbody, lowres, bad anatomy, bad hands, fused fingers, missing fingers, too many fingers, cropped, worst quality, low quality, deformed body, bloated, ugly, unrealistic, extra hands and arms',
    num_inference_steps=8,
    guidance_scale=3,
    unet_batch_size=1,
    width=width,
    height=height,
    video_length=length,
    return_dict=False,
    context_frames=16,
    context_stride=1,
    context_overlap=16 // 4,
    context_schedule='composite',
    clip_skip=2,
    controlnet_type_map=controlnet_type_map,
    controlnet_image_map=controlnet_image_map,
    controlnet_ref_map=controlnet_ref_map,
    controlnet_no_shrink=controlnet_no_shrink,
    controlnet_max_samples_on_vram=model_config.controlnet_map.get("max_samples_on_vram", 999),
    controlnet_max_models_on_vram=model_config.controlnet_map.get("max_models_on_vram", 99),
    controlnet_is_loop=model_config.controlnet_map.get("is_loop", True),
    img2img_map=img2img_map,
    ip_adapter_config_map=ip_adapter_config_map,
    region_list=region_list,
    region_condi_list=region_condi_list,
    interpolation_factor=1,
    is_single_prompt_mode=model_config.is_single_prompt_mode,
    gradual_latent_map=model_config.gradual_latent_hires_fix_map,
    callback=None,
    callback_steps=None,
)
unload_controlnet_models(pipe=pipeline)
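# Convert the (1, C, T, H, W) output tensor into uint8 (T, H, W, C) frames for PyAV.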
frames = output.permute(0, 2, 1, 3, 4).squeeze(0)
frames = frames.mul(255).add_(0.5).clamp_(0, 255).permute(0, 2, 3, 1).to("cpu", torch.uint8).numpy()
del pipeline
torch.cuda.empty_cache()
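# Free the video pipeline, then generate a 180 s techno track with AudioLDM2
# to match the 1440-frame, 8 fps video.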
pipeline = diffusers.AudioLDM2Pipeline.from_pretrained('cvssp/audioldm2-music', torch_dtype=torch.float16).to('cuda')
pipeline.scheduler = diffusers.DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
music = pipeline(prompt='Light rhythm techno', negative_prompt='low quality, average quality', num_inference_steps=20, audio_length_in_s=180).audios[0]
del pipeline
torch.cuda.empty_cache()
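# Mux 8 fps H.264 video and 16 kHz AAC audio into one MP4. The 4x stream size
# assumes the gradual-latent hires fix upscales the 432x768 output fourfold.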
with av.open('video.mp4', mode='w') as writer:
    video = writer.add_stream('h264', rate=8)
    video.width = width * 4
    video.height = height * 4
    video.pix_fmt = 'yuv420p'
    audio = writer.add_stream('aac', rate=16000)
    for frame in frames:
        writer.mux(video.encode(av.VideoFrame.from_ndarray(frame)))
    writer.mux(video.encode())  # flush the video encoder
    for start in range(0, music.shape[0], audio.frame_size):
        frame = av.AudioFrame.from_ndarray(music[start:start + audio.frame_size][None], format='fltp', layout='mono')
        frame.sample_rate = audio.sample_rate
        frame.pts = start
        writer.mux(audio.encode(frame))
    writer.mux(audio.encode())  # flush the audio encoder
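# Minimal Gradio interface served once rendering finishes.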
def greet(name, intensity):
    return "Hello, " + name + "!" * int(intensity)

demo = gradio.Interface(
    fn=greet,
    inputs=["text", "slider"],
    outputs=["text"],
    api_name="predict",
)
demo.launch()