Commit 583c33f by yuhangzang · Parent: 6cd12c0
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title: Caprl Cpu
+title: CapRL
 emoji: 🌖
 colorFrom: purple
 colorTo: green
@@ -8,7 +8,7 @@ sdk_version: 5.49.1
 app_file: app.py
 pinned: false
 license: apache-2.0
-short_description: Generate captions for images with CapRL
+short_description: Generate captions for images with CapRL (CPU-only)
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,157 @@
import gradio as gr
import torch
from PIL import Image
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

MODEL_ID = "internlm/CapRL-3B"
DEFAULT_PROMPT = "Describe the image in detail."
MAX_NEW_TOKENS = 4096


def load_model():
    # CPU-only Space: load the weights once at startup, in float32.
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float32,
        device_map="cpu",
        trust_remote_code=True,
        low_cpu_mem_usage=True,
    )
    processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
    return model, processor


MODEL, PROCESSOR = load_model()


@torch.inference_mode()
def generate_caption(image: Image.Image):
    if image is None:
        return "", 0

    try:
        if not isinstance(image, Image.Image):
            return "Error: Invalid image format", 0

        # Downscale very large inputs in place to keep CPU inference tractable.
        max_size = 4096
        if image.width > max_size or image.height > max_size:
            image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)

        device = MODEL.device
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": DEFAULT_PROMPT},
                ],
            }
        ]

        prompt_text = PROCESSOR.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

        inputs = PROCESSOR(
            text=[prompt_text],
            images=[image],
            return_tensors="pt",
        ).to(device)

        generated_ids = MODEL.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=False,
        )

        # Trim the prompt tokens so only the generated caption is decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = PROCESSOR.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )
        caption = output_text[0].strip()

        input_ids = inputs.get("input_ids")
        input_length = input_ids.shape[-1] if input_ids is not None else 0
        total_length = generated_ids.shape[-1]
        num_generated_tokens = max(total_length - input_length, 0)

        return caption, int(num_generated_tokens)

    except RuntimeError as e:
        return f"Runtime error: {e}", 0
    except Exception as e:
        return f"Error generating caption: {e}", 0


with gr.Blocks(title="CapRL Image Captioning (CPU)") as demo:
    gr.Markdown("# 🎨 CapRL for Image Captioning (CPU)")
    gr.Markdown("### CapRL: Stimulating Dense Image Caption Capabilities via Reinforcement Learning")
    gr.Markdown("✨ Upload an image to generate a detailed caption with CapRL-3B (CPU-only)! ✨")
    gr.Markdown(
        """
📖 <a href="https://arxiv.org/abs/2509.22647">Paper</a> | 🏠 <a href="https://github.com/InternLM/CapRL">GitHub</a> | 🤗 <a href="https://huggingface.co/internlm/CapRL-3B">CapRL-3B Model</a> | 🤗 <a href="https://huggingface.co/yuhangzang/CapRL-InternVL3.5-8B">CapRL-InternVL3.5-8B Model</a> |
🤗 <a href="https://huggingface.co/datasets/internlm/CapRL-2M">CapRL-2M Dataset</a>

🤗 <a href="https://huggingface.co/collections/long-xing1/caprl-68d64ac32ded31596c36e189">CapRL Collection</a> | 📰 <a href="https://huggingface.co/papers/2509.22647">Daily Paper</a> | 💾 <a href="https://huggingface.co/mradermacher/CapRL-3B-GGUF">CapRL-3B-GGUF</a> | 💾 <a href="https://huggingface.co/mradermacher/CapRL-3B-i1-GGUF">CapRL-3B-i1-GGUF</a>
"""
    )

    gr.Markdown(
        "👉 Prefer faster inference? Try the GPU Space: "
        '<a href="https://huggingface.co/spaces/yuhangzang/caprl">yuhangzang/caprl</a>'
    )

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil", label="Input Image")
            generate_button = gr.Button("Generate Caption")
        with gr.Column():
            caption_output = gr.Textbox(label="Caption", lines=6)
            token_output = gr.Number(label="Generated Tokens", precision=0)

    generate_button.click(
        fn=generate_caption,
        inputs=image_input,
        outputs=[caption_output, token_output],
        show_progress="full",
    )

    # Also caption immediately on upload, not only on button click.
    image_input.upload(
        fn=generate_caption,
        inputs=image_input,
        outputs=[caption_output, token_output],
        show_progress="full",
    )

    gr.Examples(
        examples=[
            ["./examples/example_chinese.png"],
            ["./examples/example_receipt.jpg"],
            ["./examples/example_table.png"],
        ],
        inputs=image_input,
        outputs=[caption_output, token_output],
        fn=generate_caption,
        cache_examples=True,
        label="📸 Example Images",
    )

    gr.Markdown("### Citation")
    gr.Markdown("If you find this project useful, please kindly cite:")

    citation_text = """@article{xing2025caprl,
  title={{CapRL}: Stimulating Dense Image Caption Capabilities via Reinforcement Learning},
  author={Xing, Long and Dong, Xiaoyi and Zang, Yuhang and Cao, Yuhang and Liang, Jianze and Huang, Qidong and Wang, Jiaqi and Wu, Feng and Lin, Dahua},
  journal={arXiv preprint arXiv:2509.22647},
  year={2025}
}"""

    gr.Code(value=citation_text, language="markdown", label="BibTeX Citation")


demo.launch()
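Once the Space is running, it can also be called programmatically. The snippet below is a minimal sketch using `gradio_client`; the Space id `yuhangzang/caprl-cpu` and the `/generate_caption` endpoint name are assumptions (inferred from the repo owner and the handler name above), so adjust them to whatever the deployed Space reports.

```python
# Minimal remote-call sketch (not part of this commit).
# Assumed: Space id "yuhangzang/caprl-cpu" and api_name "/generate_caption".
from gradio_client import Client, handle_file

client = Client("yuhangzang/caprl-cpu")
caption, num_tokens = client.predict(
    handle_file("examples/example_receipt.jpg"),  # any local image path
    api_name="/generate_caption",
)
print(f"{num_tokens} tokens: {caption}")
```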
examples/example_chinese.png ADDED
examples/example_receipt.jpg ADDED
examples/example_table.png ADDED
requirements.txt ADDED
@@ -0,0 +1,9 @@
gradio==5.49.1
spaces
transformers
torch
accelerate
torchvision
Pillow
sentencepiece
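As a quick sanity check that the mostly unpinned stack above resolves the model repo, one can fetch just the processor for the model id used in `app.py`; this mirrors the app's `AutoProcessor` call and downloads only the small tokenizer/preprocessor files, not the 3B weights:

```python
# Sanity-check sketch: resolve the model repo without downloading weights.
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("internlm/CapRL-3B", trust_remote_code=True)
print(type(processor).__name__)  # expect a Qwen2.5-VL processor class
```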