# Hugging Face Space: chansung/vid2persona (status: Paused)
# File size: 7,925 Bytes

import gradio as gr

from vid2persona import init
from vid2persona.pipeline import vlm
from vid2persona.pipeline import llm

# One-time startup work executed at import time; the call order is kept as written.
# NOTE(review): presumably prepares the chat model named here -- confirm in vid2persona.init.
init.init_model("HuggingFaceH4/zephyr-7b-beta")
# NOTE(review): presumably sets up Google Cloud credentials for the trait extraction call -- confirm.
init.auth_gcp()
# NOTE(review): presumably populates init.gcp_project_id, init.gcp_project_location and
# init.hf_access_token, which the handlers below read -- confirm.
init.get_env_vars()
# Prompt template location handed to both vlm.get_traits and llm.chat.
prompt_tpl_path = "vid2persona/prompts"

async def extract_traits(video_path):
    """Extract character traits from a video and unlock the chat controls.

    Args:
        video_path: Path of the video clip; forwarded unchanged to
            ``vlm.get_traits``.

    Returns:
        A six-item list, in the order of the outputs this handler is wired
        to (traits, chatbot, user_input, clear, regen, stop): the traits
        payload, an empty chat history, an emptied interactive textbox and
        three buttons switched to interactive.

    Raises:
        gr.Error: If the response contains a ``characters`` entry that is
            empty, i.e. there is no character to build a persona from.
    """
    traits = await vlm.get_traits(
        init.gcp_project_id,
        init.gcp_project_location,
        video_path,
        prompt_tpl_path,
    )

    if 'characters' in traits:
        characters = traits['characters']
        # An empty list used to surface as a bare IndexError; report it to
        # the user with a readable message instead.
        if not characters:
            raise gr.Error("No characters were detected in the video.")
        # Only the first detected character is used as the persona.
        traits = characters[0]

    return [
        traits,
        [],  # start with a fresh conversation for the new persona
        gr.Textbox("", interactive=True),
        gr.Button(interactive=True),
        gr.Button(interactive=True),
        gr.Button(interactive=True),
    ]

async def conversation(
    message: str, messages: list, traits: dict,
    model_id: str, max_input_token_length: int, 
    max_new_tokens: int, temperature: float, 
    top_p: float, top_k: float, repetition_penalty: float, 
):
    """Append a new user turn and stream the reply into the chat history.

    Each yielded list lines up with the outputs
    ``[chatbot, user_input, clear, regen]``: the updated history, the textbox
    content and the two button states. The buttons stay disabled while the
    reply is streaming and are re-enabled by the final yield.
    """
    def _buttons(enabled):
        # Fresh update objects for the "clear" and "regen" buttons.
        return gr.Button(interactive=enabled), gr.Button(interactive=enabled)

    history = messages + [[message, ""]]
    # First update: show the pending turn, keep the typed text, lock buttons.
    yield [history, message, *_buttons(False)]

    reply_stream = llm.chat(
        message, history, traits,
        prompt_tpl_path, model_id,
        max_input_token_length, max_new_tokens,
        temperature, top_p, top_k,
        repetition_penalty, hf_token=init.hf_access_token,
    )
    async for chunk in reply_stream:
        # Grow the reply half of the newest turn with each streamed piece.
        history[-1][1] += chunk
        yield [history, "", *_buttons(False)]

    yield [history, "", *_buttons(True)]

async def regen_conversation(
    messages: list, traits: dict,
    model_id: str, max_input_token_length: int, 
    max_new_tokens: int, temperature: float, 
    top_p: float, top_k: float, repetition_penalty: float, 
):
    """Discard the latest reply and stream a new one for the same user turn.

    Yields nothing when the history is empty. Otherwise each yielded list
    lines up with the outputs ``[chatbot, user_input, clear, regen]``, with
    the buttons disabled until the final yield.
    """
    if len(messages) == 0:
        return

    def _buttons(enabled):
        # Fresh update objects for the "clear" and "regen" buttons.
        return gr.Button(interactive=enabled), gr.Button(interactive=enabled)

    # Re-ask the last user message with its reply reset to empty.
    message = messages[-1][0]
    history = messages[:-1] + [[message, ""]]
    yield [history, "", *_buttons(False)]

    reply_stream = llm.chat(
        message, history, traits,
        prompt_tpl_path, model_id,
        max_input_token_length, max_new_tokens,
        temperature, top_p, top_k,
        repetition_penalty, hf_token=init.hf_access_token,
    )
    async for chunk in reply_stream:
        # Grow the reply half of the newest turn with each streamed piece.
        history[-1][1] += chunk
        yield [history, "", *_buttons(False)]

    yield [history, "", *_buttons(True)]

# UI layout and event wiring for the whole application.
with gr.Blocks(css="styles.css", theme=gr.themes.Soft()) as demo:
    gr.Markdown("Vid2Persona", elem_classes=["md-center", "h1-font"])

    gr.Markdown("This project breathes life into video characters by using AI to describe their personality and then chat with you as them. "
                "[Gemini 1.0 Pro Vision model on Vertex AI](https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal/overview) is used "
                "to grasp traits of video characters, then [HuggingFaceH4/zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta) model "
                "is used to make conversation with them.",)

    gr.Markdown("This space is modified to be working on Hugging Face [ZeroGPU](https://huggingface.co/zero-gpu-explorers). If you wish to run "
                "the same application on your own machine, please check out the [project repository](https://github.com/deep-diver/Vid2Persona). "
                "You can interact with other LLMs to make conversation besides HuggingFaceH4/zephyr-7b-beta by running them locally, or by "
                "connecting them through remotely hosted within Text Generation Inference framework as [Hugging Face PRO](https://huggingface.co/blog/inference-pro) user.")

    # Video upload and the traits extracted from it.
    with gr.Column(elem_classes=["group"]):
        with gr.Row():
            video = gr.Video(label="upload short video clip", max_length=180)
            traits = gr.Json(label="extracted traits")

        with gr.Row():
            trait_gen = gr.Button("generate  traits")

    # Chat area. Its controls start disabled; extract_traits returns
    # interactive versions of them once traits are available.
    with gr.Column(elem_classes=["group"]):
        chatbot = gr.Chatbot([], label="chatbot", elem_id="chatbot", elem_classes=["chatbot-no-label"])
        with gr.Row():
            clear = gr.Button("clear conversation", interactive=False)
            regen = gr.Button("regenerate the last", interactive=False)
            stop = gr.Button("stop", interactive=False)
        user_input = gr.Textbox(placeholder="ask anything", interactive=False, elem_classes=["textbox-no-label", "textbox-no-top-bottom-borders"])

        with gr.Accordion("parameters' control pane", open=False):
            model_id = gr.Dropdown(choices=init.ALLOWED_LLM_FOR_HF_PRO_ACCOUNTS, value="HuggingFaceH4/zephyr-7b-beta", label="Model ID", visible=False)

            with gr.Row():
                max_input_token_length = gr.Slider(minimum=1024, maximum=4096, value=4096, label="max-input-tokens")
                max_new_tokens = gr.Slider(minimum=128, maximum=2048, value=256, label="max-new-tokens")

            with gr.Row():
                temperature = gr.Slider(minimum=0, maximum=2, step=0.1, value=0.6, label="temperature")
                # top-p is a cumulative probability, so the range is capped at 1.
                top_p = gr.Slider(minimum=0, maximum=1, step=0.1, value=0.9, label="top-p")
                # top-k is a token count: whole-number steps and a range that
                # actually contains the default of 50.
                top_k = gr.Slider(minimum=0, maximum=100, step=1, value=50, label="top-k")
                repetition_penalty = gr.Slider(minimum=0, maximum=2, step=0.1, value=1.2, label="repetition-penalty")

    with gr.Row():
        gr.Markdown(
            "[![GitHub Repo](https://img.shields.io/badge/GitHub%20Repo-gray?style=for-the-badge&logo=github&link=https://github.com/deep-diver/Vid2Persona)](https://github.com/deep-diver/Vid2Persona) "
            "[![Chansung](https://img.shields.io/badge/Chansung-blue?style=for-the-badge&logo=twitter&link=https://twitter.com/algo_diver)](https://twitter.com/algo_diver) "
            "[![Sayak](https://img.shields.io/badge/Sayak-blue?style=for-the-badge&logo=twitter&link=https://twitter.com/RisingSayak)](https://twitter.com/RisingSayak )",
            elem_id="bottom-md"
        )

    # Extract traits from the uploaded video, reset the chat and enable its controls.
    trait_gen.click(
        extract_traits,
        [video],
        [traits, chatbot, user_input, clear, regen, stop],
        concurrency_limit=5,
    )

    # Submitting the textbox streams a reply; the event handle is kept so
    # the stop button can cancel it.
    conv = user_input.submit(
        conversation,
        [
            user_input, chatbot, traits,
            model_id, max_input_token_length,
            max_new_tokens, temperature,
            top_p, top_k, repetition_penalty,
        ],
        [chatbot, user_input, clear, regen],
        concurrency_limit=5,
    )

    # Empty the chat; with no turns left, "clear" and "regen" are disabled again.
    clear.click(
        lambda: [
            gr.Chatbot([]),
            gr.Button(interactive=False),
            gr.Button(interactive=False),
        ],
        None, [chatbot, clear, regen],
        concurrency_limit=5,
    )

    # Regenerate the reply to the last user turn; also cancellable via stop.
    conv_regen = regen.click(
        regen_conversation,
        [
            chatbot, traits,
            model_id, max_input_token_length,
            max_new_tokens, temperature,
            top_p, top_k, repetition_penalty,
        ],
        [chatbot, user_input, clear, regen],
        concurrency_limit=5,
    )

    # Cancel any in-flight generation and re-enable the buttons that the
    # cancelled generator would otherwise have re-enabled in its final yield.
    stop.click(
        lambda: [
            gr.Button(interactive=True),
            gr.Button(interactive=True),
            gr.Button(interactive=True),
        ], None, [clear, regen, stop],
        cancels=[conv, conv_regen],
        concurrency_limit=5,
    )

    # Bundled sample clips, run through extract_traits with cached results.
    gr.Examples(
        [["assets/sample1.mp4"], ["assets/sample2.mp4"], ["assets/sample3.mp4"], ["assets/sample4.mp4"]],
        video,
        [traits, chatbot, user_input, clear, regen, stop],
        extract_traits,
        cache_examples=True
    )

# Enable the request queue (max_size=256), then start the app with debug=True.
demo.queue(max_size=256).launch(debug=True)