Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -21,31 +21,37 @@ model = AutoModelForCausalLM.from_pretrained(
|
|
21 |
)
|
22 |
|
23 |
@spaces.GPU
|
24 |
-
def describe_image(image):
|
25 |
-
# Process the image
|
26 |
-
inputs = processor.process(images=[image], text=
|
27 |
|
28 |
# Move inputs to the correct device and make a batch of size 1
|
29 |
inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}
|
30 |
|
31 |
-
# Generate output with maximum 200 new tokens
|
32 |
output = model.generate_from_batch(
|
33 |
inputs,
|
34 |
GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>"),
|
35 |
tokenizer=processor.tokenizer
|
36 |
)
|
37 |
|
38 |
-
# Decode and return generated text
|
39 |
generated_tokens = output[0, inputs['input_ids'].size(1):]
|
40 |
generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
|
41 |
|
42 |
return generated_text
|
43 |
|
44 |
-
# Gradio interface
|
45 |
-
gr.
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
)
|
22 |
|
23 |
@spaces.GPU
def describe_image(image, prompt: str) -> str:
    """Generate text for an image, guided by a user-supplied prompt.

    Args:
        image: The image handed over by the UI (the interface requests a PIL
            image). ``None`` means nothing was uploaded.
        prompt: Free-form instruction passed to the processor alongside the
            image.

    Returns:
        The decoded text of the newly generated tokens only (the prompt
        tokens are sliced off before decoding).

    Raises:
        gr.Error: If no image was provided, so the user sees a readable
            message instead of a failure from inside the processor.
    """
    # Gradio passes None for an empty image component; fail early and clearly
    # rather than letting the processor raise an opaque exception.
    if image is None:
        raise gr.Error("Please upload an image before generating a description.")

    # Process the image together with the user-provided text prompt.
    inputs = processor.process(images=[image], text=prompt)

    # Move inputs to the model's device and add a leading batch dimension
    # (batch of size 1).
    inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}

    # Generate at most 200 new tokens, stopping at the end-of-text marker.
    output = model.generate_from_batch(
        inputs,
        GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>"),
        tokenizer=processor.tokenizer,
    )

    # Skip the first input_ids.size(1) positions (presumably the prompt
    # tokens) so only the newly generated continuation is decoded.
    generated_tokens = output[0, inputs['input_ids'].size(1):]
    generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)

    return generated_text
|
43 |
|
44 |
+
# Gradio Blocks UI: an image upload and a prompt box feed describe_image,
# and the generated text is shown in a read-out textbox.
# NOTE(review): the original indentation was lost in this view; the Row is
# assumed to contain only the two input components -- confirm against app.py.
with gr.Blocks() as demo:
    gr.Markdown("# Visual Language Model - Molmo")

    with gr.Row():
        image_input = gr.Image(type="pil", label="Upload an image")
        text_input = gr.Textbox(
            label="Enter a prompt",
            placeholder="Describe this image...",
        )

    output_text = gr.Textbox(label="Generated Description")
    submit_button = gr.Button("Generate Description")

    # Wire the button: (image, prompt) -> describe_image -> output textbox.
    submit_button.click(
        fn=describe_image,
        inputs=[image_input, text_input],
        outputs=output_text,
    )

# Start serving the interface.
demo.launch()
|