Spaces:

yasserrmd
/

MolmoVision

Running

yasserrmd committed on Sep 27

Commit

ff95e3f

•

1 Parent(s): 36e07d3

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -21,31 +21,37 @@ model = AutoModelForCausalLM.from_pretrained(
 )
 @spaces.GPU
-def describe_image(image):
-    # Process the image
-    inputs = processor.process(images=[image], text="Describe this image.")
     # Move inputs to the correct device and make a batch of size 1
     inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}
-    # Generate output with maximum 200 new tokens
     output = model.generate_from_batch(
         inputs,
         GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>"),
         tokenizer=processor.tokenizer
     )
-    # Decode and return generated text
     generated_tokens = output[0, inputs['input_ids'].size(1):]
     generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
     return generated_text
-# Gradio interface
-gr.Interface(
-    fn=describe_image,
-    inputs=gr.inputs.Image(type="pil"),
-    outputs="text",
-    title="Visual Language Model - Molmo",
-    description="Upload an image, and the model will generate a detailed description of it."
-).launch()

 )
 @spaces.GPU
+def describe_image(image, prompt):
+    # Process the image with the user-provided text prompt
+    inputs = processor.process(images=[image], text=prompt)
     # Move inputs to the correct device and make a batch of size 1
     inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}
+    # Generate output with a maximum of 200 new tokens
     output = model.generate_from_batch(
         inputs,
         GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>"),
         tokenizer=processor.tokenizer
     )
+    # Decode and return the generated text
     generated_tokens = output[0, inputs['input_ids'].size(1):]
     generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
     return generated_text
+# Gradio interface using the latest API
+with gr.Blocks() as demo:
+    gr.Markdown("# Visual Language Model - Molmo")
+    with gr.Row():
+        image_input = gr.Image(type="pil", label="Upload an image")
+        text_input = gr.Textbox(label="Enter a prompt", placeholder="Describe this image...")
+    output_text = gr.Textbox(label="Generated Description")
+    submit_button = gr.Button("Generate Description")
+    # Connect the inputs (image, text prompt) to the function and output
+    submit_button.click(fn=describe_image, inputs=[image_input, text_input], outputs=output_text)
+# Launch the app
+demo.launch()