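# app.py for a Hugging Face Space: renders a long AnimateDiff video with the
# AnimateLCM motion module, generates a matching music track with AudioLDM2,
# muxes both into video.mp4 via PyAV, then serves a minimal Gradio interface.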
import av, pathlib, re
import diffusers, gradio, torch, transformers
import huggingface_hub
import animatediff.models.unet
from animatediff.generate import controlnet_preprocess, img2img_preprocess, wild_card_conversion, region_preprocess, unload_controlnet_models
from animatediff.settings import get_model_config, get_infer_config
from animatediff.utils.pipeline import send_to_device
from animatediff.utils.util import set_tensor_interpolation_method
from animatediff.pipelines import load_text_embeddings
from animatediff.pipelines.lora import load_lcm_lora
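# Output geometry: 432x768 portrait, 1440 frames (180 s at the 8 fps used when muxing below).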
width = 432
height = 768
length = 1440
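# Load the prompt-travel config and inference defaults, then prepare the
# ControlNet and img2img inputs the config references.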
model_config = get_model_config('config/prompts/prompt_travel.json')
is_sdxl = False
infer_config = get_infer_config(True, is_sdxl)
set_tensor_interpolation_method(model_config.tensor_interpolation_slerp)
device = torch.device('cuda')
save_dir = pathlib.Path('output')
controlnet_image_map, controlnet_type_map, controlnet_ref_map, controlnet_no_shrink = controlnet_preprocess(model_config.controlnet_map, width, height, length, save_dir, device, is_sdxl)
img2img_map = img2img_preprocess(model_config.img2img_map, width, height, length, save_dir)
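# Snapshot vanilla SD 1.5 locally so its components (tokenizer, text encoder,
# VAE, UNet, feature extractor) can be loaded individually by subfolder.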
base_model = pathlib.Path('/tmp/base')
diffusers.StableDiffusionPipeline.from_pretrained('stable-diffusion-v1-5/stable-diffusion-v1-5').save_pretrained(base_model)
tokenizer = transformers.CLIPTokenizer.from_pretrained(base_model, subfolder='tokenizer')
text_encoder = transformers.CLIPTextModel.from_pretrained(base_model, subfolder='text_encoder')
vae = diffusers.AutoencoderKL.from_single_file('https://huggingface.co/chaowenguoback/pal/blob/main/vae-ft-mse-840000-ema-pruned.safetensors')
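# Download the AnimateLCM motion module and inflate the 2D SD UNet with it.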
huggingface_hub.hf_hub_download(repo_id='wangfuyun/AnimateLCM', filename='AnimateLCM_sd15_t2v.ckpt', local_dir=pathlib.Path.cwd())
feature_extractor = transformers.CLIPImageProcessor.from_pretrained(base_model, subfolder='feature_extractor')
unet = animatediff.models.unet.UNet2DConditionModel.from_pretrained_2d(
    pretrained_model_path=base_model,
    motion_module_path=pathlib.Path.cwd().joinpath('AnimateLCM_sd15_t2v.ckpt'),
    subfolder='unet',
    unet_additional_kwargs=infer_config.unet_additional_kwargs,
)
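# Overwrite the UNet and text encoder weights with ChilloutMix; strict=False
# leaves the motion-module parameters ChilloutMix does not provide untouched.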
pipeline = diffusers.StableDiffusionPipeline.from_single_file('https://huggingface.co/chaowenguoback/15/blob/main/chilloutMix-Ni.safetensors', config='stable-diffusion-v1-5/stable-diffusion-v1-5', safety_checker=None, use_safetensors=True)
unet.load_state_dict(pipeline.unet.state_dict(), strict=False)
text_encoder.load_state_dict(pipeline.text_encoder.state_dict(), strict=False)
del pipeline
unet.enable_xformers_memory_efficient_attention()
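# Assemble the AnimationPipeline around an LCM scheduler for few-step sampling.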
pipeline = animatediff.pipelines.AnimationPipeline(
    vae=vae,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    unet=unet,
    scheduler=diffusers.LCMScheduler.from_config(infer_config.noise_scheduler_kwargs),
    feature_extractor=feature_extractor,
    controlnet_map=None,
)
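# load_lcm_lora presumably looks for the LoRA file under data/models/lcm_lora/sd15,
# so download it there; scale ramps from 0.15 to 0.75 over the given gradient range.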
lcm_lora = pathlib.Path.cwd().joinpath('data/models/lcm_lora/sd15')
lcm_lora.mkdir(parents=True, exist_ok=True)
huggingface_hub.hf_hub_download(repo_id='wangfuyun/AnimateLCM', filename='AnimateLCM_sd15_t2v_lora.safetensors', local_dir=lcm_lora)
load_lcm_lora(pipeline, {'start_scale': 0.15, 'end_scale': 0.75, 'gradient_start': 0.2, 'gradient_end': 0.75}, is_sdxl=is_sdxl)
pipeline.lora_map = None
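# Stack detail, outfit, and character LoRAs with per-adapter blend weights.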
pipeline.load_lora_weights('chaowenguoback/15', weight_name='add_detail.safetensors', adapter_name='detail')
pipeline.load_lora_weights('chaowenguoback/15', weight_name='b1r1av5-000007.safetensors', adapter_name='bikini')
pipeline.load_lora_weights('chaowenguoback/15', weight_name='btcstr.safetensors', adapter_name='c-string')
pipeline.load_lora_weights('chaowenguoback/15', weight_name='蓝洁瑛.safetensors', adapter_name='character')
pipeline.set_adapters(['detail', 'bikini', 'c-string', 'character'], [1, 0.4, 0.2, 0.8])
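# Halve precision, load text embeddings while the text encoder sits on the GPU,
# then let send_to_device place the frozen pipeline for inference.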
pipeline.unet = pipeline.unet.half()
pipeline.text_encoder = pipeline.text_encoder.half()
pipeline.text_encoder = pipeline.text_encoder.to(device)
load_text_embeddings(pipeline)
pipeline.text_encoder = pipeline.text_encoder.to('cpu')
pipeline = send_to_device(pipeline, device, freeze=True, force_half=False, compile=False, is_sdxl=is_sdxl)
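# Expand prompt wildcards and build per-region conditioning, then drop any
# ControlNet region references that resolved to no region (index -1).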
wild_card_conversion(model_config)
is_init_img_exist = img2img_map is not None
region_condi_list, region_list, ip_adapter_config_map, region2index = region_preprocess(model_config, width, height, length, save_dir, is_init_img_exist, is_sdxl)
if controlnet_type_map:
    for c in controlnet_type_map:
        tmp_r = [region2index[r] for r in controlnet_type_map[c]["control_region_list"]]
        controlnet_type_map[c]["control_region_list"] = [r for r in tmp_r if r != -1]
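# Build a filename-friendly slug from the first prompt's leading tags.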
prompt_map = region_condi_list[0]["prompt_map"]
tag_pattern = re.compile(r"[^\w\-, ]")
prompt_tags = [tag_pattern.sub("", tag).strip().replace(" ", "-") for tag in prompt_map[list(prompt_map.keys())[0]].split(",")]
prompt_str = "_".join(prompt_tags[:6])[:50]
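# Main denoising call: 8 LCM steps at low guidance, sliding 16-frame context
# windows ('composite' schedule) to cover all 1440 frames.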
output = pipeline(
    n_prompt='nipple, waistband, back view, monochrome, longbody, lowres, bad anatomy, bad hands, fused fingers, missing fingers, too many fingers, cropped, worst quality, low quality, deformed body, bloated, ugly, unrealistic, extra hands and arms',
    num_inference_steps=8,
    guidance_scale=3,
    unet_batch_size=1,
    width=width,
    height=height,
    video_length=length,
    return_dict=False,
    context_frames=16,
    context_stride=1,
    context_overlap=16 // 4,
    context_schedule='composite',
    clip_skip=2,
    controlnet_type_map=controlnet_type_map,
    controlnet_image_map=controlnet_image_map,
    controlnet_ref_map=controlnet_ref_map,
    controlnet_no_shrink=controlnet_no_shrink,
    controlnet_max_samples_on_vram=model_config.controlnet_map.get("max_samples_on_vram", 999),
    controlnet_max_models_on_vram=model_config.controlnet_map.get("max_models_on_vram", 99),
    controlnet_is_loop=model_config.controlnet_map.get("is_loop", True),
    img2img_map=img2img_map,
    ip_adapter_config_map=ip_adapter_config_map,
    region_list=region_list,
    region_condi_list=region_condi_list,
    interpolation_factor=1,
    is_single_prompt_mode=model_config.is_single_prompt_mode,
    gradual_latent_map=model_config.gradual_latent_hires_fix_map,
    callback=None,
    callback_steps=None,
)
unload_controlnet_models(pipe=pipeline)
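# Convert the (1, C, T, H, W) output tensor into uint8 (T, H, W, C) frames for PyAV.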
frames = output.permute(0, 2, 1, 3, 4).squeeze(0)
frames = frames.mul(255).add_(0.5).clamp_(0, 255).permute(0, 2, 3, 1).to("cpu", torch.uint8).numpy()
del pipeline
torch.cuda.empty_cache()
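# Free the video pipeline, then generate a 180 s techno track with AudioLDM2
# to match the 1440-frame, 8 fps video.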
pipeline = diffusers.AudioLDM2Pipeline.from_pretrained('cvssp/audioldm2-music', torch_dtype=torch.float16).to('cuda')
pipeline.scheduler = diffusers.DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
music = pipeline(prompt='Light rhythm techno', negative_prompt='low quality, average quality', num_inference_steps=20, audio_length_in_s=180).audios[0]
del pipeline
torch.cuda.empty_cache()
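# Mux 8 fps H.264 video and 16 kHz AAC audio into one MP4. The 4x stream size
# assumes the gradual-latent hires fix upscales the 432x768 output fourfold.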
with av.open('video.mp4', mode='w') as writer:
    video = writer.add_stream('h264', rate=8)
    video.width = width * 4
    video.height = height * 4
    video.pix_fmt = 'yuv420p'
    audio = writer.add_stream('aac', rate=16000)
    for frame in frames:
        writer.mux(video.encode(av.VideoFrame.from_ndarray(frame)))
    writer.mux(video.encode())  # flush the video encoder
    for start in range(0, music.shape[0], audio.frame_size):
        frame = av.AudioFrame.from_ndarray(music[start:start + audio.frame_size][None], format='fltp', layout='mono')
        frame.sample_rate = audio.sample_rate
        frame.pts = start
        writer.mux(audio.encode(frame))
    writer.mux(audio.encode())  # flush the audio encoder
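# Minimal Gradio interface served once rendering finishes.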
def greet(name, intensity):
    return "Hello, " + name + "!" * int(intensity)

demo = gradio.Interface(
    fn=greet,
    inputs=["text", "slider"],
    outputs=["text"],
    api_name="predict",
)
demo.launch()