Spaces:
Sleeping
Sleeping
use Kosmos-2 for caption instead of Coca
Browse files
app.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
import gradio as gr
|
|
|
2 |
from gradio_client import Client, handle_file
|
3 |
import numpy as np
|
4 |
import tempfile
|
@@ -11,22 +12,32 @@ pipe = DiffusionPipeline.from_pretrained("cerspense/zeroscope_v2_576w", torch_dt
|
|
11 |
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
|
12 |
pipe.enable_model_cpu_offload()
|
13 |
|
|
|
14 |
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
rep_penalty=1.2,
|
22 |
-
top_p=0.5,
|
23 |
-
min_seq_len=5,
|
24 |
-
seq_len=20,
|
25 |
-
api_name="/inference_caption"
|
26 |
)
|
27 |
-
print(f"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
|
29 |
-
return
|
30 |
|
31 |
def export_to_video(frames: np.ndarray, fps: int) -> str:
|
32 |
frames = np.clip((frames * 255), 0, 255).astype(np.uint8)
|
@@ -38,7 +49,7 @@ def export_to_video(frames: np.ndarray, fps: int) -> str:
|
|
38 |
return out_file.name
|
39 |
|
40 |
def infer(image_init):
|
41 |
-
prompt =
|
42 |
video_frames = pipe(prompt, num_inference_steps=40, height=320, width=576, num_frames=24).frames[0]
|
43 |
video_path = export_to_video(video_frames, 12)
|
44 |
print(video_path)
|
|
|
1 |
import gradio as gr
|
2 |
+
import os
|
3 |
from gradio_client import Client, handle_file
|
4 |
import numpy as np
|
5 |
import tempfile
|
|
|
12 |
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
|
13 |
pipe.enable_model_cpu_offload()
|
14 |
|
15 |
+
# Hugging Face auth token for the remote Kosmos-2 Space; None when the
# HF_TOKEN environment variable is not set.
hf_token = os.getenv("HF_TOKEN")
|
16 |
|
17 |
+
def get_caption(image_in):
    """Caption an image by calling the remote Kosmos-2 Space.

    Parameters
    ----------
    image_in : str
        Path or URL of the image to caption; wrapped with ``handle_file``
        so the gradio client can upload it.

    Returns
    -------
    str
        Caption text assembled from the token stream Kosmos-2 returns.
    """
    # Remote inference through the gradio client; auth comes from the
    # module-level ``hf_token`` (HF_TOKEN env var).
    client = Client("fffiloni/Kosmos-2-API", hf_token=hf_token)
    result = client.predict(
        image_input=handle_file(image_in),
        text_input="Detailed",
        api_name="/generate_predictions",
    )
    print(f"KOSMOS2 RETURNS: {result}")

    # The second element of the API result carries the token list.
    # NOTE(review): assumes result[1] is a list of {'token': str, ...}
    # dicts whose first entry is a sentinel to skip — per the observed
    # Kosmos-2 Space output; confirm if the Space's API changes.
    token_payload = result[1]
    pieces = [entry['token'] for entry in token_payload[1:]]
    return ''.join(pieces)
|
41 |
|
42 |
def export_to_video(frames: np.ndarray, fps: int) -> str:
|
43 |
frames = np.clip((frames * 255), 0, 255).astype(np.uint8)
|
|
|
49 |
return out_file.name
|
50 |
|
51 |
def infer(image_init):
|
52 |
+
prompt = get_caption(image_init)
|
53 |
video_frames = pipe(prompt, num_inference_steps=40, height=320, width=576, num_frames=24).frames[0]
|
54 |
video_path = export_to_video(video_frames, 12)
|
55 |
print(video_path)
|