chansung committed
Commit 141a5cd
1 Parent(s): 014a906

initial commit
.gitignore ADDED
@@ -0,0 +1 @@
+ __pycache__
README.md CHANGED
@@ -1,6 +1,6 @@
  ---
  title: Vid2persona
- emoji: 🌍
+ emoji: 🎥🤾
  colorFrom: green
  colorTo: indigo
  sdk: gradio
@@ -8,6 +8,7 @@ sdk_version: 4.21.0
  app_file: app.py
  pinned: false
  license: mit
+ short_description: Let's talk to a person from a video clip
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,18 +1,161 @@
- import os
  import gradio as gr
- import google.auth
- import vertexai

- gcp_credentials = os.getenv("GCP_CREDENTIALS")
- with open("gcp-credentials.json", "w") as f:
-     f.write(gcp_credentials)
-
- os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = './gcp-credentials.json'
- google.auth.default()
-
- vertexai.init(project="gde-prj", location="us-central1")
-
- with gr.Blocks() as demo:
-     gr.Markdown("")
-
- demo.launch()
+ from vid2persona import init
+ from vid2persona.pipeline import vlm
+ from vid2persona.pipeline import llm
+
+ init.auth_gcp()
+ init.get_env_vars()
+ prompt_tpl_path = "vid2persona/prompts"
+
+ async def extract_traits(video_path):
+     traits = await vlm.get_traits(
+         init.gcp_project_id,
+         init.gcp_project_location,
+         video_path,
+         prompt_tpl_path
+     )
+     if 'characters' in traits:
+         traits = traits['characters'][0]
+
+     return [
+         traits, [],
+         gr.Textbox("", interactive=True),
+         gr.Button(interactive=True),
+         gr.Button(interactive=True),
+         gr.Button(interactive=True)
+     ]
+
+ async def conversation(
+     message: str, messages: list, traits: dict,
+     model_id: str, max_input_token_length: int,
+     max_new_tokens: int, temperature: float,
+     top_p: float, top_k: int, repetition_penalty: float,
+ ):
+     messages = messages + [[message, ""]]
+     yield [messages, message, gr.Button(interactive=False), gr.Button(interactive=False)]
+
+     async for partial_response in llm.chat(
+         message, messages, traits,
+         prompt_tpl_path, model_id,
+         max_input_token_length, max_new_tokens,
+         temperature, top_p, top_k,
+         repetition_penalty, hf_token=init.hf_access_token
+     ):
+         last_message = messages[-1]
+         last_message[1] = last_message[1] + partial_response
+         messages[-1] = last_message
+         yield [messages, "", gr.Button(interactive=False), gr.Button(interactive=False)]
+
+     yield [messages, "", gr.Button(interactive=True), gr.Button(interactive=True)]
+
+ async def regen_conversation(
+     messages: list, traits: dict,
+     model_id: str, max_input_token_length: int,
+     max_new_tokens: int, temperature: float,
+     top_p: float, top_k: int, repetition_penalty: float,
+ ):
+     if len(messages) > 0:
+         message = messages[-1][0]
+         messages = messages[:-1]
+         messages = messages + [[message, ""]]
+         yield [messages, "", gr.Button(interactive=False), gr.Button(interactive=False)]
+
+         async for partial_response in llm.chat(
+             message, messages, traits,
+             prompt_tpl_path, model_id,
+             max_input_token_length, max_new_tokens,
+             temperature, top_p, top_k,
+             repetition_penalty, hf_token=init.hf_access_token
+         ):
+             last_message = messages[-1]
+             last_message[1] = last_message[1] + partial_response
+             messages[-1] = last_message
+             yield [messages, "", gr.Button(interactive=False), gr.Button(interactive=False)]
+
+         yield [messages, "", gr.Button(interactive=True), gr.Button(interactive=True)]
+
+ with gr.Blocks(css="styles.css", theme=gr.themes.Soft()) as demo:
+     gr.Markdown("Vid2Persona", elem_classes=["md-center", "h1-font"])
+     gr.Markdown("This project breathes life into video characters by using AI to describe their personality and then chat with you as them.")
+
+     with gr.Column(elem_classes=["group"]):
+         with gr.Row():
+             video = gr.Video(label="upload short video clip")
+             traits = gr.Json(label="extracted traits")
+
+         with gr.Row():
+             trait_gen = gr.Button("generate traits")
+
+     with gr.Column(elem_classes=["group"]):
+         chatbot = gr.Chatbot([], label="chatbot", elem_id="chatbot", elem_classes=["chatbot-no-label"])
+         with gr.Row():
+             clear = gr.Button("clear conversation", interactive=False)
+             regen = gr.Button("regenerate the last", interactive=False)
+             stop = gr.Button("stop", interactive=False)
+         user_input = gr.Textbox(placeholder="ask anything", interactive=False, elem_classes=["textbox-no-label", "textbox-no-top-bottom-borders"])
+
+     with gr.Accordion("parameters' control pane", open=False):
+         model_id = gr.Dropdown(choices=init.ALLOWED_LLM_FOR_HF_PRO_ACCOUNTS, value="HuggingFaceH4/zephyr-7b-beta", label="Model ID")
+
+         with gr.Row():
+             max_input_token_length = gr.Slider(minimum=1024, maximum=4096, value=4096, label="max-input-tokens")
+             max_new_tokens = gr.Slider(minimum=128, maximum=2048, value=256, label="max-new-tokens")
+
+         with gr.Row():
+             temperature = gr.Slider(minimum=0, maximum=2, step=0.1, value=0.6, label="temperature")
+             top_p = gr.Slider(minimum=0, maximum=2, step=0.1, value=0.9, label="top-p")
+             top_k = gr.Slider(minimum=1, maximum=1000, step=1, value=50, label="top-k")
+             repetition_penalty = gr.Slider(minimum=0, maximum=2, step=0.1, value=1.2, label="repetition-penalty")
+
+     with gr.Row():
+         gr.Markdown(
+             "[![GitHub Repo](https://img.shields.io/badge/GitHub%20Repo-gray?style=for-the-badge&logo=github&link=https://github.com/deep-diver/Vid2Persona)](https://github.com/deep-diver/Vid2Persona) "
+             "[![Chansung](https://img.shields.io/badge/Chansung-blue?style=for-the-badge&logo=twitter&link=https://twitter.com/algo_diver)](https://twitter.com/algo_diver) "
+             "[![Sayak](https://img.shields.io/badge/Sayak-blue?style=for-the-badge&logo=twitter&link=https://twitter.com/RisingSayak)](https://twitter.com/RisingSayak)",
+             elem_id="bottom-md"
+         )
+
+     trait_gen.click(
+         extract_traits,
+         [video],
+         [traits, chatbot, user_input, clear, regen, stop]
+     )
+
+     conv = user_input.submit(
+         conversation,
+         [
+             user_input, chatbot, traits,
+             model_id, max_input_token_length,
+             max_new_tokens, temperature,
+             top_p, top_k, repetition_penalty,
+         ],
+         [chatbot, user_input, clear, regen]
+     )
+
+     clear.click(
+         lambda: [
+             gr.Chatbot([]),
+             gr.Button(interactive=False),
+             gr.Button(interactive=False),
+         ],
+         None, [chatbot, clear, regen]
+     )
+
+     conv_regen = regen.click(
+         regen_conversation,
+         [
+             chatbot, traits,
+             model_id, max_input_token_length,
+             max_new_tokens, temperature,
+             top_p, top_k, repetition_penalty,
+         ],
+         [chatbot, user_input, clear, regen]
+     )
+
+     stop.click(
+         None, None, None,
+         cancels=[conv, conv_regen]
+     )
+
+ demo.launch()
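For reference, a minimal sketch of the secrets this Space expects before app.py will start; the values below are placeholders, not real credentials, and `vid2persona/init.py` (further down) reads exactly these variables:

import json, os

# GCP_CREDENTIALS holds the service-account key JSON itself, not a file path;
# init.auth_gcp() writes it to gcp-credentials.json at startup.
os.environ["GCP_CREDENTIALS"] = json.dumps({"type": "service_account"})  # placeholder
os.environ["GCP_PROJECT_ID"] = "your-project-id"                         # placeholder
os.environ["GCP_PROJECT_LOCATION"] = "us-central1"                       # placeholder
os.environ["HF_TOKEN"] = "hf_..."  # optional; when unset, llm.chat falls back to local inference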
requirements.txt CHANGED
@@ -1,2 +1,8 @@
+ toml
  google-auth
- google-cloud-aiplatform
+ google-cloud-aiplatform
+ transformers
+ accelerate
+ bitsandbytes
+ openai
+ gradio
styles.css ADDED
@@ -0,0 +1,45 @@
+ .textbox-no-label > label > span {
+     display: none;
+ }
+
+ .textbox-no-top-bottom-borders > label > textarea {
+     border: none !important;
+ }
+
+ .chatbot-no-label > div > label {
+     display: none;
+ }
+
+ .md-center {
+     text-align: center;
+     display: block;
+ }
+
+ .h1-font > span {
+     font-size: xx-large;
+     font-weight: bold;
+ }
+
+ .json-holder {
+     overflow: scroll;
+     height: 500px;
+ }
+
+ .group {
+     padding-top: 10px;
+     padding-left: 10px;
+     padding-right: 10px;
+     padding-bottom: 10px;
+     border: 2px dashed gray;
+     border-radius: 20px;
+     box-shadow: 5px 3px 10px 1px rgba(0, 0, 0, 0.4) !important;
+ }
+
+ #bottom-md a {
+     float: left;
+     margin-right: 10px;
+ }
+
+ #chatbot {
+     height: 600px !important;
+ }
vid2persona/gen/gemini.py ADDED
@@ -0,0 +1,61 @@
+ from typing import Union, Iterable
+
+ import vertexai
+ from vertexai.generative_models import (
+     GenerativeModel, Part,
+     GenerationResponse, GenerationConfig
+ )
+
+ from .utils import parse_first_json_snippet
+
+ def _default_gen_config():
+     return GenerationConfig(
+         max_output_tokens=2048,
+         temperature=0.4,
+         top_p=1,
+         top_k=32
+     )
+
+ def init_vertexai(project_id: str, location: str) -> None:
+     vertexai.init(project=project_id, location=location)
+
+ async def _ask_about_video(
+     prompt: str="What is in the video?",
+     gen_config: GenerationConfig=_default_gen_config(),
+     model_name: str="gemini-1.0-pro-vision",
+     gcs: str=None,
+     base64_content: bytes=None
+ ) -> Union[GenerationResponse, Iterable[GenerationResponse]]:
+     if gcs is None and base64_content is None:
+         raise ValueError("Either a GCS bucket path or a base64-encoded string of the video must be provided")
+
+     if gcs is not None and base64_content is not None:
+         raise ValueError("Only one of gcs or base64_content must be provided")
+
+     if gcs is not None:
+         video = Part.from_uri(gcs, mime_type="video/mp4")
+     else:
+         video = Part.from_data(data=base64_content, mime_type="video/mp4")
+
+     model = GenerativeModel(model_name)
+     return await model.generate_content_async(
+         [video, prompt],
+         generation_config=gen_config
+     )
+
+ async def ask_about_video(prompt: str, video_clip: bytes, retry_num: int=10):
+     json_content = None
+     cur_retry = 0
+
+     while json_content is None and cur_retry < retry_num:
+         try:
+             resps = await _ask_about_video(
+                 prompt=prompt, base64_content=video_clip
+             )
+
+             json_content = parse_first_json_snippet(resps.text)
+         except Exception as e:
+             cur_retry = cur_retry + 1
+             print(f"......retry {e}")
+
+     return json_content
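A minimal usage sketch for this module on its own, assuming Vertex AI application-default credentials are already configured and a local clip.mp4 exists; the project id and location are placeholders:

import asyncio

from vid2persona.gen.gemini import init_vertexai, ask_about_video
from vid2persona.utils import get_base64_content

init_vertexai("your-project-id", "us-central1")  # placeholders
clip = get_base64_content("clip.mp4")            # raw video bytes

# Returns the first parseable JSON object from Gemini's reply,
# or None if none of the retry_num attempts produced valid JSON.
traits = asyncio.run(ask_about_video("Describe the main character as JSON.", clip))
print(traits)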
vid2persona/gen/local_openllm.py ADDED
@@ -0,0 +1,42 @@
+ import torch
+ from threading import Thread
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ from transformers import TextIteratorStreamer
+
+ model = None
+ tokenizer = None
+
+ def send_message(
+     messages: list,
+     model_id: str,
+     max_input_token_length: int,
+     parameters: dict
+ ):
+     global tokenizer
+     global model
+
+     if tokenizer is None:
+         tokenizer = AutoTokenizer.from_pretrained(model_id)
+         tokenizer.use_default_system_prompt = False
+     if model is None:
+         model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")
+
+     input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt")
+     if input_ids.shape[1] > max_input_token_length:
+         input_ids = input_ids[:, -max_input_token_length:]
+         print(f"Trimmed input from conversation as it was longer than {max_input_token_length} tokens.")
+     input_ids = input_ids.to(model.device)
+
+     streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
+     generate_kwargs = dict(
+         {"input_ids": input_ids},
+         streamer=streamer,
+         do_sample=True,
+         num_beams=1,
+         **parameters
+     )
+     t = Thread(target=model.generate, kwargs=generate_kwargs)
+     t.start()
+
+     for text in streamer:
+         yield text.replace("<|assistant|>", "")
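A sketch of driving this local streaming path directly; it assumes a GPU with enough memory for the chosen checkpoint, and the parameter keys mirror the dict built in pipeline/llm.py:

from vid2persona.gen import local_openllm

messages = [
    {"role": "system", "content": "You are a pirate."},
    {"role": "user", "content": "Introduce yourself."},
]
params = {"max_new_tokens": 256, "temperature": 0.6, "top_p": 0.9,
          "top_k": 50, "repetition_penalty": 1.2}

# Tokens stream out as the background generate() thread produces them.
for chunk in local_openllm.send_message(messages, "HuggingFaceH4/zephyr-7b-beta", 4096, params):
    print(chunk, end="", flush=True)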
vid2persona/gen/tgi_openllm.py ADDED
@@ -0,0 +1,25 @@
+ from openai import AsyncOpenAI
+
+ async def send_messages(
+     messages: list,
+     model_id: str,
+     hf_token: str,
+     parameters: dict
+ ):
+     parameters.pop('repetition_penalty')
+     parameters['max_tokens'] = parameters.pop('max_new_tokens')
+     parameters['logprobs'] = True
+     parameters['top_logprobs'] = parameters.pop('top_k')
+     # parameters['presence_penalty'] = parameters.pop('repetition_penalty')
+
+     client = AsyncOpenAI(
+         base_url=f"https://api-inference.huggingface.co/models/{model_id}/v1",
+         api_key=hf_token,
+     )
+
+     responses = await client.chat.completions.create(
+         model="tgi", messages=messages, stream=True, **parameters
+     )
+
+     async for response in responses:
+         yield response.choices[0].delta.content
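And the hosted counterpart, a sketch that streams deltas from Hugging Face's OpenAI-compatible endpoint; it requires a PRO token for the models listed in init.py, and the token string here is a placeholder:

import asyncio

from vid2persona.gen import tgi_openllm

async def main():
    # top_k is remapped to top_logprobs and repetition_penalty is dropped above
    params = {"max_new_tokens": 128, "temperature": 0.6, "top_p": 0.9,
              "top_k": 5, "repetition_penalty": 1.2}
    async for delta in tgi_openllm.send_messages(
        [{"role": "user", "content": "Hello!"}],
        "HuggingFaceH4/zephyr-7b-beta", "hf_...", params,  # placeholder token
    ):
        print(delta or "", end="", flush=True)  # delta can be None on the final chunk

asyncio.run(main())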
vid2persona/gen/utils.py ADDED
@@ -0,0 +1,37 @@
+ import json
+
+ def find_json_snippet(raw_snippet):
+     json_parsed_string = None
+
+     json_start_index = raw_snippet.find('{')
+     json_end_index = raw_snippet.rfind('}')
+
+     if json_start_index >= 0 and json_end_index >= 0:
+         json_snippet = raw_snippet[json_start_index:json_end_index+1]
+         try:
+             json_parsed_string = json.loads(json_snippet, strict=False)
+         except Exception:
+             raise ValueError('......failed to parse string into JSON format')
+     else:
+         raise ValueError('......No JSON code snippet found in string.')
+
+     return json_parsed_string
+
+ def parse_first_json_snippet(snippet):
+     json_parsed_string = None
+
+     if isinstance(snippet, list):
+         for snippet_piece in snippet:
+             try:
+                 json_parsed_string = find_json_snippet(snippet_piece)
+                 return json_parsed_string
+             except Exception:
+                 pass
+     else:
+         try:
+             json_parsed_string = find_json_snippet(snippet)
+         except Exception as e:
+             print(e)
+             raise ValueError()
+
+     return json_parsed_string
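A quick sketch of what these helpers do with a typical chatty model reply:

from vid2persona.gen.utils import parse_first_json_snippet

reply = 'Sure! Here is the profile: {"name": "Ana", "likes": ["tea"]} Hope that helps.'
profile = parse_first_json_snippet(reply)  # grabs the first {...} span and parses it
print(profile["name"])  # Ana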
vid2persona/init.py ADDED
@@ -0,0 +1,31 @@
+ import os
+ import google.auth
+
+ # https://huggingface.co/blog/inference-pro
+ ALLOWED_LLM_FOR_HF_PRO_ACCOUNTS = [
+     "mistralai/Mixtral-8x7B-Instruct-v0.1",
+     "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO",
+     "mistralai/Mistral-7B-Instruct-v0.2",
+     "mistralai/Mistral-7B-Instruct-v0.1",
+     "HuggingFaceH4/zephyr-7b-beta",
+     "meta-llama/Llama-2-7b-chat-hf",
+     "meta-llama/Llama-2-13b-chat-hf",
+     "meta-llama/Llama-2-70b-chat-hf",
+     "openchat/openchat-3.5-0106"
+ ]
+
+ def auth_gcp():
+     gcp_credentials = os.getenv("GCP_CREDENTIALS")
+     with open("gcp-credentials.json", "w") as f:
+         f.write(gcp_credentials)
+
+     os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = './gcp-credentials.json'
+     google.auth.default()
+
+ def get_env_vars():
+     global gcp_project_id, gcp_project_location
+     global hf_access_token
+
+     gcp_project_id = os.getenv("GCP_PROJECT_ID")
+     gcp_project_location = os.getenv("GCP_PROJECT_LOCATION")
+     hf_access_token = os.getenv("HF_TOKEN", None)
vid2persona/pipeline/llm.py ADDED
@@ -0,0 +1,75 @@
+ import toml
+ from string import Template
+ from transformers import AutoTokenizer
+
+ from vid2persona.gen import tgi_openllm
+ from vid2persona.gen import local_openllm
+
+ tokenizer = None
+
+ def _get_system_prompt(
+     personality_json_dict: dict,
+     prompt_tpl_path: str
+ ) -> str:
+     """Assumes a single character is passed."""
+     prompt_tpl_path = f"{prompt_tpl_path}/llm.toml"
+     system_prompt = Template(toml.load(prompt_tpl_path)['conversation']['system'])
+
+     name = personality_json_dict["name"]
+     physical_description = personality_json_dict["physicalDescription"]
+     personality_traits = [str(trait) for trait in personality_json_dict["personalityTraits"]]
+     likes = [str(like) for like in personality_json_dict["likes"]]
+     dislikes = [str(dislike) for dislike in personality_json_dict["dislikes"]]  # not referenced by the current llm.toml template
+     background = [str(info) for info in personality_json_dict["background"]]
+     goals = [str(goal) for goal in personality_json_dict["goals"]]
+     relationships = [str(relationship) for relationship in personality_json_dict["relationships"]]
+
+     system_prompt = system_prompt.substitute(
+         name=name,
+         physical_description=physical_description,
+         personality_traits=', '.join(personality_traits),
+         likes=', '.join(likes),
+         background=', '.join(background),
+         goals=', '.join(goals),
+         relationships=', '.join(relationships)
+     )
+
+     return system_prompt
+
+ async def chat(
+     message: str,
+     chat_history: list[tuple[str, str]],
+     personality_json_dict: dict,
+     prompt_tpl_path: str,
+
+     model_id: str,
+     max_input_token_length: int,
+     max_new_tokens: int,
+     temperature: float,
+     top_p: float,
+     top_k: int,
+     repetition_penalty: float,
+
+     hf_token: str,
+ ):
+     messages = []
+     system_prompt = _get_system_prompt(personality_json_dict, prompt_tpl_path)
+     messages.append({"role": "system", "content": system_prompt})
+     for user, assistant in chat_history:
+         messages.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
+     messages.append({"role": "user", "content": message})
+
+     parameters = {
+         "max_new_tokens": max_new_tokens,
+         "temperature": temperature,
+         "top_p": top_p,
+         "top_k": top_k,
+         "repetition_penalty": repetition_penalty
+     }
+
+     if hf_token is None:
+         for response in local_openllm.send_message(messages, model_id, max_input_token_length, parameters):
+             yield response
+     else:
+         async for response in tgi_openllm.send_messages(messages, model_id, hf_token, parameters):
+             yield response
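A sketch of calling the pipeline directly, bypassing the UI; the traits dict is a hand-written stand-in for what vlm.get_traits would return, and the script assumes it is run from the repository root so the prompt templates resolve:

import asyncio

from vid2persona.pipeline import llm

traits = {  # hypothetical profile in the vlm.toml schema
    "name": "Ana", "physicalDescription": "tall, green eyes",
    "personalityTraits": ["curious"], "likes": ["tea"], "dislikes": ["noise"],
    "background": ["botanist"], "goals": ["find a rare orchid"],
    "relationships": ["sister of Mia"],
}

async def main():
    async for chunk in llm.chat(
        "Who are you?", [], traits, "vid2persona/prompts",
        "HuggingFaceH4/zephyr-7b-beta", 4096, 256, 0.6, 0.9, 50, 1.2,
        hf_token=None,  # None selects the local transformers path
    ):
        print(chunk, end="")

asyncio.run(main())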
vid2persona/pipeline/vlm.py ADDED
@@ -0,0 +1,15 @@
+ import toml
+ from vid2persona.gen.gemini import init_vertexai, ask_about_video
+ from vid2persona.utils import get_base64_content
+
+ async def get_traits(
+     gcp_project_id: str, gcp_project_location: str,
+     video_clip_path: str, prompt_tpl_path: str,
+ ):
+     prompt_tpl_path = f"{prompt_tpl_path}/vlm.toml"
+     prompt = toml.load(prompt_tpl_path)['extraction']['traits']
+     init_vertexai(gcp_project_id, gcp_project_location)
+     video_clip = get_base64_content(video_clip_path)
+
+     response = await ask_about_video(prompt=prompt, video_clip=video_clip)
+     return response
vid2persona/prompts/llm.toml ADDED
@@ -0,0 +1,21 @@
+ [conversation]
+ system = """
+ You are acting as the character detailed below. The character's details cover a range of traits, from inherent personality to background.
+
+ * Name: $name
+ * Physical description: $physical_description
+ * Personality traits: $personality_traits
+ * Likes: $likes
+ * Background: $background
+ * Goals: $goals
+ * Relationships: $relationships
+
+ While generating your responses, you must consider the information above.
+ """
+
+ examples = [
+     ["Hello there! How are you doing?"],
+     ["Recite me a short poem."],
+     ["Explain the plot of Cinderella in a sentence."],
+     ["Write a 100-word article on 'Benefits of Open-Source in AI research'"],
+ ]
vid2persona/prompts/vlm.toml ADDED
@@ -0,0 +1,19 @@
+ [extraction]
+ traits = """
+ Carefully analyze the provided video clip to identify and extract detailed information about the main character(s) featured. Pay attention to visual elements, spoken dialogue, character interactions, and any narrative cues that reveal aspects of the character's personality, physical appearance, behaviors, and background.
+
+ Your task is to construct a rich, imaginative character profile based on your observations, and where explicit information is not available, you are encouraged to use your creativity to fill in the gaps. The goal is to create a vivid, believable character profile that can be used to simulate conversation with a language model as if it were the character itself.
+
+ Format the extracted data as a structured JSON object containing the following fields for each main character:
+
+ name(text): The character's name as mentioned or inferred in the video. If not provided, create a suitable name that matches the character's traits and context.
+ physicalDescription(text): Describe the character's appearance, including hair color, eye color, height, and distinctive features. Use imaginative details if necessary to provide a complete picture.
+ personalityTraits(list): List descriptive adjectives or phrases that capture the character's personality, based on their actions and dialogue. Invent traits as needed to ensure a well-rounded personality.
+ likes(list): Specify things, activities, or concepts the character enjoys or values, deduced or imagined from their behavior and interactions.
+ dislikes(list): Note what the character appears to dislike or avoid, filling in creatively where direct evidence is not available.
+ background(list): Provide background information such as occupation, family ties, or significant life events, inferring where possible or inventing details to add depth to the character's story.
+ goals(list): Describe the character's apparent motivations and objectives, whether explicitly stated or implied. Where not directly observable, construct plausible goals that align with the character's portrayed or inferred traits.
+ relationships(list): Detail the character's relationships with other characters, including the nature of each relationship and the names of other characters involved. Use creative license to elaborate on these relationships if the video provides limited information.
+
+ Ensure the JSON object is well-structured and comprehensive, ready for integration with a language model to facilitate engaging conversations as if with the character itself. For multiple main characters, provide a distinct profile for each within the same JSON object.
+ """
vid2persona/utils.py ADDED
@@ -0,0 +1,7 @@
+ import base64
+
+ def get_base64_content(file_path, decode=True):
+     with open(file_path, 'rb') as f:
+         data = f.read()
+
+     # the encode-then-decode round trip returns the original raw bytes,
+     # which is the form Part.from_data expects; decode=False yields base64 bytes
+     return base64.b64decode(base64.b64encode(data)) if decode else base64.b64encode(data)
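Both modes in one sketch; the filename is a placeholder:

from vid2persona.utils import get_base64_content

raw = get_base64_content("clip.mp4")                # raw bytes, ready for Part.from_data
b64 = get_base64_content("clip.mp4", decode=False)  # base64-encoded bytes for transport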