Spaces:

yuhangzang
/

caprl

Running on Zero

App Files Files Community

yuhangzang committed 9 days ago

Commit

483edf4

1 Parent(s): 2a6eacb

update

Browse files

Files changed (3) hide show

README.md +9 -0
app.py +112 -7
requirements.txt +6 -0

README.md CHANGED Viewed

@@ -12,3 +12,12 @@ short_description: Generate captions for images with CapRL
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+Citation:
+@article{xing2025caprl,
+  title={CapRL: Stimulating Dense Image Caption Capabilities via Reinforcement Learning},
+  author={Xing, Long and Dong, Xiaoyi and Zang, Yuhang and Cao, Yuhang and Liang, Jianze and Huang, Qidong and Wang, Jiaqi and Wu, Feng and Lin, Dahua},
+  journal={arXiv preprint arXiv:2509.22647},
+  year={2025}
+}

app.py CHANGED Viewed

@@ -1,15 +1,120 @@
 import gradio as gr
 import spaces
 import torch
-zero = torch.Tensor([0]).cuda()
-print(zero.device) # <-- 'cpu' 🤔
 @spaces.GPU
-def greet(n):
-    print(zero.device) # <-- 'cuda:0' 🤗
-    return f"Hello {zero + n} Tensor"
-demo = gr.Interface(fn=greet, inputs=gr.Number(), outputs=gr.Text())
-demo.launch()

 import gradio as gr
 import spaces
 import torch
+from PIL import Image
+from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
+# Hub repository id passed to both the model and the processor loaders.
+MODEL_ID = "internlm/CapRL-3B"
+# Fixed instruction sent alongside every uploaded image.
+DEFAULT_PROMPT = "Describe the image in detail."
+# Upper bound on the number of tokens generated per caption.
+MAX_NEW_TOKENS = 128
+def get_device() -> str:
+    """Return ``"cuda"`` when torch reports a usable CUDA device, else ``"cpu"``."""
+    if torch.cuda.is_available():
+        return "cuda"
+    return "cpu"
+def select_dtype(device: str):
+    """Choose the torch dtype used to load the model on *device*.
+
+    Any device other than ``"cuda"`` gets ``torch.float32``. On CUDA,
+    ``torch.bfloat16`` is used when the GPU reports bf16 support and
+    ``torch.float16`` otherwise.
+    """
+    if device != "cuda":
+        return torch.float32
+    return torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
+def load_model():
+    """Load the CapRL model and its processor from ``MODEL_ID``.
+
+    The device comes from ``get_device()`` and the dtype from
+    ``select_dtype()``. On CUDA the weights are placed through
+    ``device_map="auto"``; otherwise the model is moved to the chosen device
+    explicitly after loading.
+
+    Returns:
+        A ``(model, processor)`` tuple.
+    """
+    target = get_device()
+    on_gpu = target == "cuda"
+    load_kwargs = {
+        "torch_dtype": select_dtype(target),
+        "device_map": "auto" if on_gpu else None,
+        "trust_remote_code": True,
+    }
+    net = Qwen2_5_VLForConditionalGeneration.from_pretrained(MODEL_ID, **load_kwargs)
+    if not on_gpu:
+        # No device_map was given on this path, so place the model manually.
+        net.to(target)
+    proc = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+    return net, proc
+# Loaded once at import time; generate_caption reads these module-level globals.
+MODEL, PROCESSOR = load_model()
 @spaces.GPU
+@torch.inference_mode()
+def generate_caption(image: Image.Image):
+    """Generate a caption for *image* with the module-level model.
+
+    Args:
+        image: The uploaded PIL image, or ``None`` when nothing was uploaded.
+
+    Returns:
+        A ``(caption, num_generated_tokens)`` tuple. ``("", 0)`` is returned
+        when *image* is ``None``.
+    """
+    if image is None:
+        return "", 0
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image"},
+                {"type": "text", "text": DEFAULT_PROMPT},
+            ],
+        }
+    ]
+    prompt_text = PROCESSOR.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+    inputs = PROCESSOR(
+        text=[prompt_text],
+        images=[image],
+        return_tensors="pt",
+    ).to(MODEL.device)
+    output_ids = MODEL.generate(
+        **inputs,
+        max_new_tokens=MAX_NEW_TOKENS,
+        do_sample=False,
+    )
+    # generate() returns prompt + continuation; keep only the newly generated
+    # tokens so the chat-template text does not leak into the caption.
+    prompt_length = inputs["input_ids"].shape[-1]
+    new_token_ids = output_ids[:, prompt_length:]
+    caption = PROCESSOR.batch_decode(
+        new_token_ids, skip_special_tokens=True
+    )[0].strip()
+    return caption, int(new_token_ids.shape[-1])
+with gr.Blocks(title="CapRL Image Captioning") as demo:
+    # Page header shown above the two-column layout.
+    gr.Markdown("# CapRL Image Captioning\nUpload an image to generate a caption with CapRL-3B.")
+    with gr.Row():
+        # Left column: image upload and the manual trigger button.
+        with gr.Column():
+            image_input = gr.Image(type="pil", label="Input Image")
+            generate_button = gr.Button("Generate Caption")
+        # Right column: caption text and the generated-token count.
+        with gr.Column():
+            caption_output = gr.Textbox(label="Caption", lines=6)
+            token_output = gr.Number(label="Generated Tokens", precision=0)
+    # The button click and the image upload run the same handler with the
+    # same inputs and outputs, so the wiring is defined once.
+    caption_event = {
+        "fn": generate_caption,
+        "inputs": image_input,
+        "outputs": [caption_output, token_output],
+    }
+    generate_button.click(**caption_event)
+    image_input.upload(**caption_event)
+demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+gradio==5.49.1
+spaces
+transformers
+torch
+Pillow
+sentencepiece