fffiloni committed on
Commit
8d024ac
1 Parent(s): ea2e352

Use Kosmos-2 for captioning instead of CoCa

Browse files
Files changed (1) hide show
  1. app.py +25 -14
app.py CHANGED
@@ -1,4 +1,5 @@
1
  import gradio as gr
 
2
  from gradio_client import Client, handle_file
3
  import numpy as np
4
  import tempfile
@@ -11,22 +12,32 @@ pipe = DiffusionPipeline.from_pretrained("cerspense/zeroscope_v2_576w", torch_dt
11
  pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
12
  pipe.enable_model_cpu_offload()
13
 
 
14
 
15
-
16
def create_image_caption(image_init):
    """Caption an image with the ``fffiloni/CoCa-clone`` Space.

    Args:
        image_init: Image reference forwarded to ``handle_file`` (the caller
            passes whatever the UI image input produced).

    Returns:
        The value returned by the Space's ``/inference_caption`` endpoint,
        unmodified.
    """
    coca_client = Client("fffiloni/CoCa-clone")

    # Gather the endpoint arguments first so the remote call reads as one step.
    request = {
        "image": handle_file(image_init),
        "decoding_method": "Nucleus sampling",
        "rep_penalty": 1.2,
        "top_p": 0.5,
        "min_seq_len": 5,
        "seq_len": 20,
        "api_name": "/inference_caption",
    }
    caption = coca_client.predict(**request)

    print(f"cap: {caption}")
    return caption
30
 
31
  def export_to_video(frames: np.ndarray, fps: int) -> str:
32
  frames = np.clip((frames * 255), 0, 255).astype(np.uint8)
@@ -38,7 +49,7 @@ def export_to_video(frames: np.ndarray, fps: int) -> str:
38
  return out_file.name
39
 
40
  def infer(image_init):
41
- prompt = create_image_caption(image_init)
42
  video_frames = pipe(prompt, num_inference_steps=40, height=320, width=576, num_frames=24).frames[0]
43
  video_path = export_to_video(video_frames, 12)
44
  print(video_path)
 
1
  import gradio as gr
2
+ import os
3
  from gradio_client import Client, handle_file
4
  import numpy as np
5
  import tempfile
 
12
  pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
13
  pipe.enable_model_cpu_offload()
14
 
15
+ hf_token = os.environ.get("HF_TOKEN")
16
 
17
def get_caption(image_in):
    """Caption an image with the ``fffiloni/Kosmos-2-API`` Space.

    The Space's ``/generate_predictions`` endpoint is called with the image
    and the ``"Detailed"`` text input. The caption is rebuilt by joining the
    ``'token'`` field of each entry in the second element of the response.

    Args:
        image_in: Image reference forwarded to ``handle_file``.

    Returns:
        The concatenated token strings as a single caption. This is an empty
        string when the response carries no entries after the first.

    Raises:
        ValueError: If the response does not have the expected layout
            (a second element that is a sequence of mappings, each with a
            string ``'token'``).
    """
    kosmos2_client = Client("fffiloni/Kosmos-2-API", hf_token=hf_token)
    kosmos2_result = kosmos2_client.predict(
        image_input=handle_file(image_in),
        text_input="Detailed",
        api_name="/generate_predictions",
    )
    print(f"KOSMOS2 RETURNS: {kosmos2_result}")

    try:
        token_entries = kosmos2_result[1]
        # The first entry is skipped, as in the original code; presumably it
        # is a header/prompt entry rather than caption text -- TODO confirm
        # against the Space's output format.
        sentence = "".join(entry["token"] for entry in token_entries[1:])
    except (IndexError, KeyError, TypeError) as err:
        # Report the payload so a change in the Space's response format is
        # diagnosable instead of failing with a bare indexing error.
        raise ValueError(
            f"Unexpected Kosmos-2 response format: {kosmos2_result!r}"
        ) from err

    return sentence
41
 
42
  def export_to_video(frames: np.ndarray, fps: int) -> str:
43
  frames = np.clip((frames * 255), 0, 255).astype(np.uint8)
 
49
  return out_file.name
50
 
51
  def infer(image_init):
52
+ prompt = get_caption(image_init)
53
  video_frames = pipe(prompt, num_inference_steps=40, height=320, width=576, num_frames=24).frames[0]
54
  video_path = export_to_video(video_frames, 12)
55
  print(video_path)