LPX55 committed on
Commit 98ee0ed · 1 Parent(s): 3a48d0a

test: new cap

Files changed (1)
  1. app_v3.py +30 -19
app_v3.py CHANGED
@@ -1,6 +1,10 @@
 import torch
+import subprocess
+subprocess.run("rm -rf /data-nvme/zerogpu-offload/*", env={}, shell=True)
+
 import spaces
 import os
+import moondream as md
 from diffusers.utils import load_image
 from diffusers.hooks import apply_group_offloading
 from diffusers import FluxControlNetModel, FluxControlNetPipeline, AutoencoderKL
@@ -17,12 +21,15 @@ import gradio as gr
 
 huggingface_token = os.getenv("HUGGINFACE_TOKEN")
 MAX_SEED = 1000000
-MODEL_PATH = "fancyfeast/llama-joycaption-beta-one-hf-llava"
-cap_processor = AutoProcessor.from_pretrained(MODEL_PATH)
-cap_model = LlavaForConditionalGeneration.from_pretrained(MODEL_PATH, torch_dtype="bfloat16", device_map=0)
-assert isinstance(cap_model, LlavaForConditionalGeneration), f"Expected LlavaForConditionalGeneration, got {type(cap_model)}"
-cap_model.eval()
-apply_liger_kernel_to_llama(model=cap_model.language_model)
+# MODEL_PATH = "fancyfeast/llama-joycaption-beta-one-hf-llava"
+# cap_processor = AutoProcessor.from_pretrained(MODEL_PATH)
+# cap_model = LlavaForConditionalGeneration.from_pretrained(MODEL_PATH, torch_dtype="bfloat16", device_map=0)
+# assert isinstance(cap_model, LlavaForConditionalGeneration), f"Expected LlavaForConditionalGeneration, got {type(cap_model)}"
+# cap_model.eval()
+# apply_liger_kernel_to_llama(model=cap_model.language_model)
+
+md_api_key = os.getenv("MD_KEY")
+model = md.vl(api_key=md_api_key)
 
 text_encoder_2_unquant = T5EncoderModel.from_pretrained(
     "LPX55/FLUX.1-merged_uncensored",
@@ -114,23 +121,27 @@ def process_image(control_image, user_prompt, system_prompt, scale, steps,
 
     # If no user prompt provided, generate a caption first
     if not final_prompt:
+        # Generate a detailed caption
+        mcaption = model.caption(control_image, length="long")
+        detailed_caption = mcaption["caption"]
+        print(f"Detailed caption: {detailed_caption}")
         # Generate caption
-        caption_gen = caption(
-            input_image=control_image,
-            prompt=system_prompt,
-            temperature=temperature,
-            top_p=top_p,
-            max_new_tokens=max_new_tokens,
-            log_prompt=log_prompt
-        )
+        # caption_gen = caption(
+        #     input_image=control_image,
+        #     prompt=system_prompt,
+        #     temperature=temperature,
+        #     top_p=top_p,
+        #     max_new_tokens=max_new_tokens,
+        #     log_prompt=log_prompt
+        # )
 
         # Get the full caption by exhausting the generator
-        generated_caption = ""
-        for chunk in caption_gen:
-            generated_caption += chunk
-            yield generated_caption, None  # Update caption in real-time
+        # generated_caption = ""
+        # for chunk in caption_gen:
+        #     generated_caption += chunk
+        #     yield generated_caption, None  # Update caption in real-time
 
-        final_prompt = generated_caption
+        final_prompt = detailed_caption
         yield f"Using caption: {final_prompt}", None
 
     # Show the final prompt being used
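
For reference, the new captioning path can be exercised on its own. The sketch below is a minimal standalone example, assuming the moondream and Pillow packages are installed and that an MD_KEY environment variable holds a Moondream API key (the same variable app_v3.py reads); it reuses only the calls that appear in the diff (md.vl, model.caption with length="long", and the "caption" key of the result), and example.jpg is a hypothetical input image.

import os

import moondream as md
from PIL import Image

# Moondream vision-language client; MD_KEY mirrors the env var read in app_v3.py.
model = md.vl(api_key=os.getenv("MD_KEY"))

# Hypothetical input standing in for the Gradio control_image (a PIL image).
control_image = Image.open("example.jpg")

# Request a long-form caption; the result is a dict with a "caption" key,
# exactly as app_v3.py unpacks it.
mcaption = model.caption(control_image, length="long")
detailed_caption = mcaption["caption"]

# When the user supplies no prompt, this caption becomes the generation prompt.
final_prompt = detailed_caption
print(f"Using caption: {final_prompt}")

Since md.vl(api_key=...) talks to the hosted Moondream service rather than a local checkpoint, captioning no longer depends on the LLaVA weights, which is presumably why the JoyCaption setup and its streaming generator are commented out in this commit rather than removed outright.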