moondream-05

Running

App Files Files Community

andito HF staff commited on Dec 6, 2024

Commit

ecd7421

1 Parent(s): 6d64276

adapt to moondream

Browse files

Files changed (2) hide show

app.py +37 -132
requirements.txt +1 -3

app.py CHANGED Viewed

@@ -1,150 +1,55 @@
 import gradio as gr
-from transformers import AutoProcessor, AutoModelForVision2Seq, TextIteratorStreamer
 from threading import Thread
-import re
-import time
 from PIL import Image
-import torch
 import spaces
 #import subprocess
 #subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
-processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
-model = AutoModelForVision2Seq.from_pretrained("HuggingFaceTB/SmolVLM-Instruct",
-        torch_dtype=torch.bfloat16,
-        #_attn_implementation="flash_attention_2"
-        ).to("cuda")
-@spaces.GPU
-def model_inference(
-    input_dict, history, decoding_strategy, temperature, max_new_tokens,
-    repetition_penalty, top_p
-):
-    text = input_dict["text"]
-    print(input_dict["files"])
-    if len(input_dict["files"]) > 1:
-      images = [Image.open(image).convert("RGB") for image in input_dict["files"]]
-    elif len(input_dict["files"]) == 1:
-      images = [Image.open(input_dict["files"][0]).convert("RGB")]
     else:
-      images = []
-    if text == "" and not images:
-        gr.Error("Please input a query and optionally image(s).")
-    if text == "" and images:
-        gr.Error("Please input a text query along the image(s).")
-    resulting_messages = [
-                {
-                    "role": "user",
-                    "content": [{"type": "image"} for _ in range(len(images))] + [
-                        {"type": "text", "text": text}
-                    ]
-                }
-            ]
-    prompt = processor.apply_chat_template(resulting_messages, add_generation_prompt=True)
-    inputs = processor(text=prompt, images=[images], return_tensors="pt")
-    inputs = {k: v.to("cuda") for k, v in inputs.items()}
-    generation_args = {
-        "max_new_tokens": max_new_tokens,
-        "repetition_penalty": repetition_penalty,
-    }
-    assert decoding_strategy in [
-        "Greedy",
-        "Top P Sampling",
-    ]
-    if decoding_strategy == "Greedy":
-        generation_args["do_sample"] = False
-    elif decoding_strategy == "Top P Sampling":
-        generation_args["temperature"] = temperature
-        generation_args["do_sample"] = True
-        generation_args["top_p"] = top_p
-    generation_args.update(inputs)
-    # Generate
-    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens= True)
-    generation_args = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)
-    generated_text = ""
-    thread = Thread(target=model.generate, kwargs=generation_args)
-    thread.start()
-    yield "..."
-    buffer = ""
-    for new_text in streamer:
-      buffer += new_text
-      generated_text_without_prompt = buffer#[len(ext_buffer):]
-      time.sleep(0.01)
-      yield buffer
 examples=[
-              [{"text": "What art era do these artpieces belong to?", "files": ["example_images/rococo.jpg", "example_images/rococo_1.jpg"]}, "Greedy", 0.4, 512, 1.2, 0.8],
-              [{"text": "I'm planning a visit to this temple, give me travel tips.", "files": ["example_images/examples_wat_arun.jpg"]},  "Greedy", 0.4, 512, 1.2, 0.8],
-              [{"text":  "What is the due date and the invoice date?", "files": ["example_images/examples_invoice.png"]}, "Greedy", 0.4, 512, 1.2, 0.8],
-              [{"text": "What is this UI about?", "files": ["example_images/s2w_example.png"]},   "Greedy", 0.4, 512, 1.2, 0.8],
-              [{"text":  "Where do the severe droughts happen according to this diagram?", "files": ["example_images/examples_weather_events.png"]},  "Greedy", 0.4, 512, 1.2, 0.8],
       ]
-demo = gr.ChatInterface(fn=model_inference, title="SmolVLM: Small yet Mighty 💫",
-                description="Play with [HuggingFaceTB/SmolVLM-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM-Instruct) in this demo. To get started, upload an image and text or try one of the examples. This checkpoint works best with single turn conversations, so clear the conversation after a single turn.",
                 examples=examples,
-                textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple"), stop_btn="Stop Generation", multimodal=True,
-                        additional_inputs=[gr.Radio(["Top P Sampling",
-              "Greedy"],
-          value="Greedy",
-          label="Decoding strategy",
-          #interactive=True,
-          info="Higher values is equivalent to sampling more low-probability tokens.",
-      ), gr.Slider(
-          minimum=0.0,
-          maximum=5.0,
-          value=0.4,
-          step=0.1,
-          interactive=True,
-          label="Sampling temperature",
-          info="Higher values will produce more diverse outputs.",
-      ),
-                                            gr.Slider(
-          minimum=8,
-          maximum=1024,
-          value=512,
-          step=1,
-          interactive=True,
-          label="Maximum number of new tokens to generate",
-      ), gr.Slider(
-          minimum=0.01,
-          maximum=5.0,
-          value=1.2,
-          step=0.01,
-          interactive=True,
-          label="Repetition penalty",
-          info="1.0 is equivalent to no penalty",
-      ),
-         gr.Slider(
-          minimum=0.01,
-          maximum=0.99,
-          value=0.8,
-          step=0.01,
-          interactive=True,
-          label="Top P",
-          info="Higher values is equivalent to sampling more low-probability tokens.",
-      )],cache_examples=False
-                )
 demo.launch(debug=True)

 import gradio as gr
 from threading import Thread
 from PIL import Image
 import spaces
+import moondream as md
 #import subprocess
 #subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
+model = md.vl(model="moondream-0_5b-int8.mf")
+def model_inference(input_dict, history):
+    # Extract image from message if present
+    if input_dict.get("files"):
+        image_path = input_dict["files"][0]
+        if isinstance(image_path, dict) and "path" in image_path:
+            image_path = image_path["path"]
+        image = Image.open(image_path)
+        encoded_image = model.encode_image(image)
+        # If there's a question, use query
+        text = input_dict.get("text", "")
+        if text not in ["", "Caption"]:
+            response = model.query(encoded_image, text)["answer"]
+        # Otherwise generate a caption
+        else:
+            response = model.caption(encoded_image)["caption"]
+        return response
     else:
+        return "Please provide an image to analyze."
 examples=[
+              [{"text": "What art era do this artpiece belong to?", "files": ["example_images/rococo.jpg"]}, []],
+              [{"text": "Caption", "files": ["example_images/rococo.jpg"]}, []],
+              [{"text": "I'm planning a visit to this temple, give me travel tips.", "files": ["example_images/examples_wat_arun.jpg"]}, []],
+              [{"text": "Caption", "files": ["example_images/examples_wat_arun.jpg"]}, []],
+              [{"text": "What is the due date and the invoice date?", "files": ["example_images/examples_invoice.png"]}, []],
+              [{"text": "Caption", "files": ["example_images/examples_invoice.png"]}, []],
+              [{"text": "What is this UI about?", "files": ["example_images/s2w_example.png"]}, []],
+              [{"text": "Caption", "files": ["example_images/s2w_example.png"]}, []],
+              [{"text": "Where do the severe droughts happen according to this diagram?", "files": ["example_images/examples_weather_events.png"]}, []],
+              [{"text": "Caption", "files": ["example_images/examples_weather_events.png"]}, []],
       ]
+demo = gr.ChatInterface(fn=model_inference, title="Moondream 0.5B: The World's Smallest Vision-Language Model",
+                description="Play with [Moondream 0.5B](https://huggingface.co/vikhyatk/moondream2) in this demo. To get started, upload an image and text or try one of the examples.",
                 examples=examples,
+                textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="single"), stop_btn="Stop Generation", multimodal=True,
+                additional_inputs=[], cache_examples=False)
 demo.launch(debug=True)

requirements.txt CHANGED Viewed

@@ -1,6 +1,4 @@
-torch
-accelerate
 huggingface_hub
 gradio
-transformers
 spaces

+moondream==0.0.5
 huggingface_hub
 gradio
 spaces