Commit 583c33f by yuhangzang · Parent: 6cd12c0
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title: Caprl Cpu
+title: CapRL
 emoji: 🌖
 colorFrom: purple
 colorTo: green
@@ -8,7 +8,7 @@ sdk_version: 5.49.1
 app_file: app.py
 pinned: false
 license: apache-2.0
-short_description: Generate captions for images with CapRL
+short_description: Generate captions for images with CapRL (CPU-only)
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,157 @@
import gradio as gr
import torch
from PIL import Image
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

MODEL_ID = "internlm/CapRL-3B"
DEFAULT_PROMPT = "Describe the image in detail."
MAX_NEW_TOKENS = 4096


def load_model():
    # CPU-only Space: load the weights once at startup, in float32.
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float32,
        device_map="cpu",
        trust_remote_code=True,
        low_cpu_mem_usage=True,
    )
    processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
    return model, processor


MODEL, PROCESSOR = load_model()


@torch.inference_mode()
def generate_caption(image: Image.Image):
    if image is None:
        return "", 0

    try:
        if not isinstance(image, Image.Image):
            return "Error: Invalid image format", 0

        # Downscale very large inputs in place to keep CPU inference tractable.
        max_size = 4096
        if image.width > max_size or image.height > max_size:
            image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)

        device = MODEL.device
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": DEFAULT_PROMPT},
                ],
            }
        ]

        prompt_text = PROCESSOR.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

        inputs = PROCESSOR(
            text=[prompt_text],
            images=[image],
            return_tensors="pt",
        ).to(device)

        generated_ids = MODEL.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=False,
        )

        # Trim the prompt tokens so only the generated caption is decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = PROCESSOR.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )
        caption = output_text[0].strip()

        input_ids = inputs.get("input_ids")
        input_length = input_ids.shape[-1] if input_ids is not None else 0
        total_length = generated_ids.shape[-1]
        num_generated_tokens = max(total_length - input_length, 0)

        return caption, int(num_generated_tokens)

    except RuntimeError as e:
        return f"Runtime error: {e}", 0
    except Exception as e:
        return f"Error generating caption: {e}", 0


with gr.Blocks(title="CapRL Image Captioning (CPU)") as demo:
    gr.Markdown("# 🎨 CapRL for Image Captioning (CPU)")
    gr.Markdown("### CapRL: Stimulating Dense Image Caption Capabilities via Reinforcement Learning")
    gr.Markdown("✨ Upload an image to generate a detailed caption with CapRL-3B (CPU-only)! ✨")
    gr.Markdown(
        """
📖 <a href="https://arxiv.org/abs/2509.22647">Paper</a> | 🏠 <a href="https://github.com/InternLM/CapRL">GitHub</a> | 🤗 <a href="https://huggingface.co/internlm/CapRL-3B">CapRL-3B Model</a> | 🤗 <a href="https://huggingface.co/yuhangzang/CapRL-InternVL3.5-8B">CapRL-InternVL3.5-8B Model</a> |
🤗 <a href="https://huggingface.co/datasets/internlm/CapRL-2M">CapRL-2M Dataset</a>

🤗 <a href="https://huggingface.co/collections/long-xing1/caprl-68d64ac32ded31596c36e189">CapRL Collection</a> | 📰 <a href="https://huggingface.co/papers/2509.22647">Daily Paper</a> | 💾 <a href="https://huggingface.co/mradermacher/CapRL-3B-GGUF">CapRL-3B-GGUF</a> | 💾 <a href="https://huggingface.co/mradermacher/CapRL-3B-i1-GGUF">CapRL-3B-i1-GGUF</a>
"""
    )

    gr.Markdown(
        "👉 Prefer faster inference? Try the GPU Space: "
        '<a href="https://huggingface.co/spaces/yuhangzang/caprl">yuhangzang/caprl</a>'
    )

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil", label="Input Image")
            generate_button = gr.Button("Generate Caption")
        with gr.Column():
            caption_output = gr.Textbox(label="Caption", lines=6)
            token_output = gr.Number(label="Generated Tokens", precision=0)

    generate_button.click(
        fn=generate_caption,
        inputs=image_input,
        outputs=[caption_output, token_output],
        show_progress="full",
    )

    # Also caption immediately on upload, not only on button click.
    image_input.upload(
        fn=generate_caption,
        inputs=image_input,
        outputs=[caption_output, token_output],
        show_progress="full",
    )

    gr.Examples(
        examples=[
            ["./examples/example_chinese.png"],
            ["./examples/example_receipt.jpg"],
            ["./examples/example_table.png"],
        ],
        inputs=image_input,
        outputs=[caption_output, token_output],
        fn=generate_caption,
        cache_examples=True,
        label="📸 Example Images",
    )

    gr.Markdown("### Citation")
    gr.Markdown("If you find this project useful, please kindly cite:")

    citation_text = """@article{xing2025caprl,
  title={{CapRL}: Stimulating Dense Image Caption Capabilities via Reinforcement Learning},
  author={Xing, Long and Dong, Xiaoyi and Zang, Yuhang and Cao, Yuhang and Liang, Jianze and Huang, Qidong and Wang, Jiaqi and Wu, Feng and Lin, Dahua},
  journal={arXiv preprint arXiv:2509.22647},
  year={2025}
}"""

    gr.Code(value=citation_text, language="markdown", label="BibTeX Citation")


demo.launch()
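Once the Space is running, it can also be called programmatically. The snippet below is a minimal sketch using `gradio_client`; the Space id `yuhangzang/caprl-cpu` and the `/generate_caption` endpoint name are assumptions (inferred from the repo owner and the handler name above), so adjust them to whatever the deployed Space reports.

```python
# Minimal remote-call sketch (not part of this commit).
# Assumed: Space id "yuhangzang/caprl-cpu" and api_name "/generate_caption".
from gradio_client import Client, handle_file

client = Client("yuhangzang/caprl-cpu")
caption, num_tokens = client.predict(
    handle_file("examples/example_receipt.jpg"),  # any local image path
    api_name="/generate_caption",
)
print(f"{num_tokens} tokens: {caption}")
```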
examples/example_chinese.png ADDED
examples/example_receipt.jpg ADDED
examples/example_table.png ADDED
requirements.txt ADDED
@@ -0,0 +1,9 @@
gradio==5.49.1
spaces
transformers
torch
accelerate
torchvision
Pillow
sentencepiece
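As a quick sanity check that the mostly unpinned stack above resolves the model repo, one can fetch just the processor for the model id used in `app.py`; this mirrors the app's `AutoProcessor` call and downloads only the small tokenizer/preprocessor files, not the 3B weights:

```python
# Sanity-check sketch: resolve the model repo without downloading weights.
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("internlm/CapRL-3B", trust_remote_code=True)
print(type(processor).__name__)  # expect a Qwen2.5-VL processor class
```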