zeroscope-img-to-video

Paused

fffiloni committed on Oct 29, 2024

Commit

8d024ac

verified ·

1 Parent(s): ea2e352

Use Kosmos-2 for captioning instead of CoCa

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import gradio as gr
 from gradio_client import Client, handle_file
 import numpy as np
 import tempfile
@@ -11,22 +12,32 @@ pipe = DiffusionPipeline.from_pretrained("cerspense/zeroscope_v2_576w", torch_dt
 pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
 pipe.enable_model_cpu_offload()
-def create_image_caption(image_init):
-    client = Client("fffiloni/CoCa-clone")
-    result = client.predict(
-    		image=handle_file(image_init),
-    		decoding_method="Nucleus sampling",
-    		rep_penalty=1.2,
-    		top_p=0.5,
-    		min_seq_len=5,
-    		seq_len=20,
-    		api_name="/inference_caption"
     )
-    print(f"cap: {result}")
-    return result
 def export_to_video(frames: np.ndarray, fps: int) -> str:
     frames = np.clip((frames * 255), 0, 255).astype(np.uint8)
@@ -38,7 +49,7 @@ def export_to_video(frames: np.ndarray, fps: int) -> str:
     return out_file.name
 def infer(image_init):
-    prompt = create_image_caption(image_init)
     video_frames = pipe(prompt, num_inference_steps=40, height=320, width=576, num_frames=24).frames[0]
     video_path = export_to_video(video_frames, 12)
     print(video_path)

 import gradio as gr
+import os
 from gradio_client import Client, handle_file
 import numpy as np
 import tempfile
 pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
 pipe.enable_model_cpu_offload()
+hf_token = os.environ.get("HF_TOKEN")
+def get_caption(image_in):
+    kosmos2_client = Client("fffiloni/Kosmos-2-API", hf_token=hf_token)
+    kosmos2_result = kosmos2_client.predict(
+		image_input=handle_file(image_in),
+		text_input="Detailed",
+		api_name="/generate_predictions"
     )
+    print(f"KOSMOS2 RETURNS: {kosmos2_result}")
+    data = kosmos2_result[1]
+    # Extract and combine tokens starting from the second element
+    sentence = ''.join(item['token'] for item in data[1:])
+    # Find the last occurrence of "."
+    #last_period_index = full_sentence.rfind('.')
+    # Truncate the string up to the last period
+    #truncated_caption = full_sentence[:last_period_index + 1]
+    # print(truncated_caption)
+    #print(f"\n—\nIMAGE CAPTION: {truncated_caption}")
+    return sentence
 def export_to_video(frames: np.ndarray, fps: int) -> str:
     frames = np.clip((frames * 255), 0, 255).astype(np.uint8)
     return out_file.name
 def infer(image_init):
+    prompt = get_caption(image_init)
     video_frames = pipe(prompt, num_inference_steps=40, height=320, width=576, num_frames=24).frames[0]
     video_path = export_to_video(video_frames, 12)
     print(video_path)