File size: 8,261 Bytes
a7cbda8
 
13f3ef3
f56ed08
6dd1a7f
f56ed08
5b422bf
a7cbda8
6dd1a7f
 
4c2c5c5
 
f8022d8
4c2c5c5
6dd1a7f
 
 
 
 
179183d
6dd1a7f
b6abb42
500f006
 
6dd1a7f
 
 
a7cbda8
 
6dd1a7f
 
a7cbda8
6dd1a7f
157ea0f
6dd1a7f
a7cbda8
6dd1a7f
 
 
179183d
fdac431
f37bf25
6dd1a7f
 
 
f56fa59
a7cbda8
6dd1a7f
a7cbda8
 
 
3f095c9
a7cbda8
 
 
00ec618
298c252
 
a7cbda8
 
 
 
3f095c9
 
 
6dd1a7f
3f095c9
96136a4
3f095c9
 
 
 
 
 
 
 
 
bbae957
 
f6dbf53
7a3b159
f6dbf53
7a3b159
f6dbf53
7a3b159
f6dbf53
7a3b159
f6dbf53
7a3b159
f6dbf53
930c8e5
 
bbae957
 
 
 
 
4f123a1
 
 
 
 
 
 
 
 
 
179183d
bbae957
 
 
 
 
 
 
2c66889
179183d
4f123a1
298c252
2c66889
3f095c9
298c252
3f095c9
 
2c66889
a7cbda8
2c66889
a7cbda8
 
d7cca30
 
 
 
 
 
 
 
 
 
2c66889
d7cca30
e7606fd
d7cca30
 
e7606fd
d7cca30
2c66889
4c2c5c5
 
 
64f565f
4c2c5c5
179183d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9e7d1c3
4f123a1
4c2c5c5
 
333f81c
 
 
2c66889
bbae957
 
 
 
 
 
 
3b9e54d
bbae957
 
 
 
 
2c66889
 
 
 
 
 
179183d
2c66889
a03ca79
2c66889
 
 
 
 
 
 
e511517
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
import gradio as gr
from gradio_client import Client
import numpy as np
import os 
import random

# Hugging Face token read from the environment. NOTE(review): it is never
# passed to any Client below — presumably intended for authenticated calls;
# confirm whether it can be removed or should be wired in.
hf_token = os.environ.get("HF_TKN")

# global variable
MAX_SEED = np.iinfo(np.int32).max  # largest 32-bit signed int; seed upper bound
from style_template import styles  # mid-file import kept in place (project-local)
STYLE_NAMES = list(styles.keys())  # choices for the "Style template" dropdown
DEFAULT_STYLE_NAME = "(No style)"  # default dropdown selection

def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
    """Return a fresh random seed in [0, MAX_SEED] when *randomize_seed*
    is true; otherwise pass *seed* through unchanged."""
    return random.randint(0, MAX_SEED) if randomize_seed else seed

def get_instantID(portrait_in, condition_pose, controlnet, prompt, style):
    """Generate an identity-preserving stylized portrait via the hosted
    InstantID Space.

    Parameters
    ----------
    portrait_in : str
        Filepath of the face photo whose identity is copied.
    condition_pose : str | None
        Filepath of an optional reference pose image.
    controlnet : list[str]
        Active controlnets, a subset of ['pose', 'canny', 'depth'].
    prompt : str
        Positive text prompt.
    style : str
        One of the InstantID style template names.

    Returns
    -------
    The first element of the remote result (the generated image).
    """
    negative_prompt = "(lowres, low quality, worst quality:1.2), (text:1.2), watermark, (frame:1.2), deformed, ugly, deformed eyes, blur, out of focus, blurry, deformed cat, deformed, photo, anthropomorphic cat, monochrome, pet collar, gun, weapon, blue, 3d, drones, drone, buildings in background, green"

    # Fresh random seed on every call so successive generations differ.
    # (Removed the original dead assignment `seed = 42`, which was
    # immediately overwritten by this line.)
    seed = random.randint(0, MAX_SEED)

    client = Client("https://instantx-instantid.hf.space/")
    result = client.predict(
        portrait_in,      # filepath: 'Upload a photo of your face'
        condition_pose,   # filepath: 'Upload a reference pose image (Optional)'
        prompt,           # str: 'Prompt'
        negative_prompt,  # str: 'Negative Prompt'
        style,            # str: 'Style template' dropdown value
        5,                # sample steps (1-100)
        0.8,              # IdentityNet strength / fidelity (0-1.5)
        0.8,              # image adapter strength / detail (0-1.5)
        0.4,              # pose strength (0-1.5)
        0.4,              # canny strength (0-1.5)
        0.4,              # depth strength (0-1.5)
        controlnet,       # list: 'Controlnet' checkbox group
        1.5,              # guidance scale (0.1-20.0)
        seed,             # seed (0 - 2147483647)
        "EulerDiscreteScheduler",  # scheduler choice
        True,             # enable fast inference with LCM
        True,             # enhance non-face region
        api_name="/generate_image"
    )

    print(result)
    return result[0]

def get_video_i2vgen(image_in, prompt):
    """Animate a still image into a short video using the hosted
    ModelScope i2vgen-xl Space."""
    i2v_client = Client("https://modelscope-i2vgen-xl.hf.space/")
    # fn_index=1 targets the image-to-video endpoint on this Space
    # (the named "/image_to_video" api route is not used).
    video_path = i2v_client.predict(
        image_in,
        prompt,
        fn_index=1
    )
    print(video_path)
    return video_path

def get_video_svd(image_in):
    """Animate *image_in* with the hosted Stable Video Diffusion Space.

    Parameters
    ----------
    image_in : str
        Filepath of the source image.

    Returns
    -------
    The 'video' entry of the first remote result item.
    """
    # Removed the redundant function-local `from gradio_client import Client`
    # that shadowed the module-level import.
    client = Client("https://multimodalart-stable-video-diffusion.hf.space/")
    result = client.predict(
        image_in,  # filepath: 'Upload your image'
        0,         # seed value; superseded because the next flag is True
        True,      # randomize seed
        127,       # motion bucket id (1-255)
        6,         # frames per second (5-30)
        api_name="/video"
    )
    print(result)
    return result[0]["video"]

def load_sample_shot(camera_shot):
    """Map a camera-shot name to its bundled conditional-pose image path.

    Parameters
    ----------
    camera_shot : str
        A dropdown choice: "close-up", "medium close-up", "medium shot",
        "cowboy shot", "medium full shot", "full shot", or "custom".

    Returns
    -------
    str | None
        Path of the sample pose image, or None for "custom" (the user
        supplies their own). Unknown values also return None — the
        original if/elif chain raised UnboundLocalError on them; a dict
        lookup is equivalent for every valid choice and safer otherwise.
    """
    shot_to_pose = {
        "close-up": "camera_shots/close_up_shot.jpeg",
        "medium close-up": "camera_shots/medium_close_up.jpeg",
        "medium shot": "camera_shots/medium_shot.png",
        "cowboy shot": "camera_shots/cowboy_shot.jpeg",
        "medium full shot": "camera_shots/medium_full_shot.png",
        "full shot": "camera_shots/full_shot.jpeg",
        "custom": None,
    }
    return shot_to_pose.get(camera_shot)

def use_custom_cond():
    """Value used to flip the camera-shot dropdown back to its "custom" option."""
    custom_choice = "custom"
    return custom_choice

def get_short_caption(image_in):
    """Ask the hosted moondream1 Space for a one-sentence caption of the image."""
    caption_client = Client("https://vikhyatk-moondream1.hf.space/")
    caption = caption_client.predict(
        image_in,                                      # filepath: 'image'
        "Describe what is happening in one sentence",  # question posed to the model
        api_name="/answer_question"
    )
    print(caption)
    return caption

def infer(image_in, camera_shot, conditional_pose, controlnet_selection, prompt, style, chosen_model):
    """End-to-end pipeline: InstantID identity transfer, then image-to-video.

    Parameters mirror the Gradio components wired to the Submit button.

    Returns
    -------
    Filepath/result of the generated video.

    Raises
    ------
    gr.Error
        If camera_shot is "custom" but no conditional pose image was
        supplied, or if chosen_model is not a known option.
    """
    # Guard clause: a custom shot requires an uploaded pose image.
    # (Replaces the original `!= None` check and the no-op
    # `conditional_pose = conditional_pose` assignment.)
    if camera_shot == "custom" and conditional_pose is None:
        raise gr.Error("No custom conditional shot found !")

    iid_img = get_instantID(image_in, conditional_pose, controlnet_selection, prompt, style)

    #short_cap = get_short_caption(iid_img)

    if chosen_model == "i2vgen-xl":
        video_res = get_video_i2vgen(iid_img, prompt)
    elif chosen_model == "stable-video":
        # NOTE(review): this branch animates the *original* face image,
        # not the InstantID result (iid_img) — matches the original code,
        # but confirm it is intentional.
        video_res = get_video_svd(image_in)
    else:
        # The UI offers only the two models above; fail loudly instead of
        # hitting UnboundLocalError on video_res as the original would.
        raise gr.Error(f"Unknown model: {chosen_model}")

    print(video_res)

    return video_res


# Page styling: center the main column and cap its width at 1080px.
css = """
#col-container{
    margin: 0 auto;
    max-width: 1080px;
}
"""

# ---------------------------------------------------------------------------
# Gradio UI: layout, event wiring, and launch.
# ---------------------------------------------------------------------------
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        # Page title and tagline.
        gr.HTML("""
        <h2 style="text-align: center;">
            InstantID-2V
        </h2>
        <p style="text-align: center;">
            Generate alive camera shot from input face
        </p>
        """)
        
        with gr.Row():
            with gr.Column():
                # Source face image; ships with a default sample.
                face_in = gr.Image(type="filepath", label="Face to copy", value="monalisa.png")
            with gr.Column():
                with gr.Group():
                    with gr.Row():
                        # Standard shot names load a bundled pose image (see
                        # load_sample_shot); "custom" lets the user upload one.
                        camera_shot = gr.Dropdown(
                            label = "Camera Shot", 
                            info = "Use standard camera shots vocabulary, or drop your custom shot as conditional pose (1280*720 ratio is recommended)",
                            choices = [
                                "custom", "close-up", "medium close-up", "medium shot", "cowboy shot", "medium full shot", "full shot"
                            ],
                            value = "custom"
                        )
                        # Style choices come from style_template.styles.
                        style = gr.Dropdown(label="Style template", info="InstantID legacy templates", choices=STYLE_NAMES, value=DEFAULT_STYLE_NAME)
    
                    # Pose image fed to InstantID as the conditioning shot.
                    condition_shot = gr.Image(type="filepath", label="Custom conditional shot (Important) [1280*720 recommended]")
                    controlnet_selection = gr.CheckboxGroup(
                        ["pose", "canny", "depth"], label="Controlnet", value=["pose"],
                        info="Use pose for skeleton inference, canny for edge detection, and depth for depth map estimation. You can try all three to control the generation process"
                    )
                prompt = gr.Textbox(label="Short Prompt (keeping it short is better)")
                # Model choice is fixed to i2vgen-xl and hidden from the user.
                chosen_model = gr.Radio(label="Choose a model", choices=["i2vgen-xl", "stable-video"], value="i2vgen-xl", interactive=False, visible=False)
            
        with gr.Column():
            submit_btn = gr.Button("Submit")
            video_out = gr.Video()

    
    # Selecting a standard shot loads its sample pose into condition_shot.
    camera_shot.change(
        fn = load_sample_shot,
        inputs = camera_shot,
        outputs = condition_shot,
        queue=False
    )
    # Clearing the pose image flips the dropdown back to "custom".
    condition_shot.clear(
        fn = use_custom_cond,
        inputs = None,
        outputs = camera_shot,
        queue=False,
    )
    # Main pipeline: face + pose + options -> generated video.
    submit_btn.click(
        fn = infer,
        inputs = [
            face_in,
            camera_shot,
            condition_shot,
            controlnet_selection,
            prompt,
            style,
            chosen_model
        ],
        outputs = [
            video_out
        ]
    )

# Small queue; API surface and link sharing disabled.
demo.queue(max_size=3).launch(share=False, show_error=True, show_api=False)