Commit ecd7421 · Parent(s): 6d64276 · committed by andito

adapt to moondream

Files changed (2):
  1. app.py +37 -132
  2. requirements.txt +1 -3
app.py CHANGED
@@ -1,150 +1,55 @@
  import gradio as gr
- from transformers import AutoProcessor, AutoModelForVision2Seq, TextIteratorStreamer
  from threading import Thread
- import re
- import time
  from PIL import Image
- import torch
  import spaces
+ import moondream as md
  #import subprocess
  #subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)


- processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
- model = AutoModelForVision2Seq.from_pretrained("HuggingFaceTB/SmolVLM-Instruct",
-     torch_dtype=torch.bfloat16,
-     #_attn_implementation="flash_attention_2"
- ).to("cuda")
-
- @spaces.GPU
- def model_inference(
-     input_dict, history, decoding_strategy, temperature, max_new_tokens,
-     repetition_penalty, top_p
- ):
-     text = input_dict["text"]
-     print(input_dict["files"])
-     if len(input_dict["files"]) > 1:
-         images = [Image.open(image).convert("RGB") for image in input_dict["files"]]
-     elif len(input_dict["files"]) == 1:
-         images = [Image.open(input_dict["files"][0]).convert("RGB")]
+ model = md.vl(model="moondream-0_5b-int8.mf")
+
+ def model_inference(input_dict, history):
+     # Extract image from message if present
+     if input_dict.get("files"):
+         image_path = input_dict["files"][0]
+         if isinstance(image_path, dict) and "path" in image_path:
+             image_path = image_path["path"]
+         image = Image.open(image_path)
+         encoded_image = model.encode_image(image)
+
+         # If there's a question, use query
+         text = input_dict.get("text", "")
+         if text not in ["", "Caption"]:
+             response = model.query(encoded_image, text)["answer"]
+         # Otherwise generate a caption
+         else:
+             response = model.caption(encoded_image)["caption"]
+
+         return response
      else:
-         images = []
-
-
-     if text == "" and not images:
-         gr.Error("Please input a query and optionally image(s).")
-
-     if text == "" and images:
-         gr.Error("Please input a text query along the image(s).")
+         return "Please provide an image to analyze."



-
-     resulting_messages = [
-         {
-             "role": "user",
-             "content": [{"type": "image"} for _ in range(len(images))] + [
-                 {"type": "text", "text": text}
-             ]
-         }
-     ]
-     prompt = processor.apply_chat_template(resulting_messages, add_generation_prompt=True)
-     inputs = processor(text=prompt, images=[images], return_tensors="pt")
-     inputs = {k: v.to("cuda") for k, v in inputs.items()}
-     generation_args = {
-         "max_new_tokens": max_new_tokens,
-         "repetition_penalty": repetition_penalty,
-
-     }
-
-     assert decoding_strategy in [
-         "Greedy",
-         "Top P Sampling",
-     ]
-     if decoding_strategy == "Greedy":
-         generation_args["do_sample"] = False
-     elif decoding_strategy == "Top P Sampling":
-         generation_args["temperature"] = temperature
-         generation_args["do_sample"] = True
-         generation_args["top_p"] = top_p
-
-     generation_args.update(inputs)
-     # Generate
-     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens= True)
-     generation_args = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)
-     generated_text = ""
-
-     thread = Thread(target=model.generate, kwargs=generation_args)
-     thread.start()
-
-     yield "..."
-     buffer = ""
-
-
-     for new_text in streamer:
-
-         buffer += new_text
-         generated_text_without_prompt = buffer#[len(ext_buffer):]
-         time.sleep(0.01)
-         yield buffer
-
-
  examples=[
-     [{"text": "What art era do these artpieces belong to?", "files": ["example_images/rococo.jpg", "example_images/rococo_1.jpg"]}, "Greedy", 0.4, 512, 1.2, 0.8],
-     [{"text": "I'm planning a visit to this temple, give me travel tips.", "files": ["example_images/examples_wat_arun.jpg"]}, "Greedy", 0.4, 512, 1.2, 0.8],
-     [{"text": "What is the due date and the invoice date?", "files": ["example_images/examples_invoice.png"]}, "Greedy", 0.4, 512, 1.2, 0.8],
-     [{"text": "What is this UI about?", "files": ["example_images/s2w_example.png"]}, "Greedy", 0.4, 512, 1.2, 0.8],
-     [{"text": "Where do the severe droughts happen according to this diagram?", "files": ["example_images/examples_weather_events.png"]}, "Greedy", 0.4, 512, 1.2, 0.8],
+     [{"text": "What art era does this artpiece belong to?", "files": ["example_images/rococo.jpg"]}, []],
+     [{"text": "Caption", "files": ["example_images/rococo.jpg"]}, []],
+     [{"text": "I'm planning a visit to this temple, give me travel tips.", "files": ["example_images/examples_wat_arun.jpg"]}, []],
+     [{"text": "Caption", "files": ["example_images/examples_wat_arun.jpg"]}, []],
+     [{"text": "What is the due date and the invoice date?", "files": ["example_images/examples_invoice.png"]}, []],
+     [{"text": "Caption", "files": ["example_images/examples_invoice.png"]}, []],
+     [{"text": "What is this UI about?", "files": ["example_images/s2w_example.png"]}, []],
+     [{"text": "Caption", "files": ["example_images/s2w_example.png"]}, []],
+     [{"text": "Where do the severe droughts happen according to this diagram?", "files": ["example_images/examples_weather_events.png"]}, []],
+     [{"text": "Caption", "files": ["example_images/examples_weather_events.png"]}, []],
  ]
- demo = gr.ChatInterface(fn=model_inference, title="SmolVLM: Small yet Mighty 💫",
-     description="Play with [HuggingFaceTB/SmolVLM-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM-Instruct) in this demo. To get started, upload an image and text or try one of the examples. This checkpoint works best with single turn conversations, so clear the conversation after a single turn.",
+
+ demo = gr.ChatInterface(fn=model_inference, title="Moondream 0.5B: The World's Smallest Vision-Language Model",
+     description="Play with [Moondream 0.5B](https://huggingface.co/vikhyatk/moondream2) in this demo. To get started, upload an image and text or try one of the examples.",
      examples=examples,
-     textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple"), stop_btn="Stop Generation", multimodal=True,
-     additional_inputs=[gr.Radio(["Top P Sampling",
-             "Greedy"],
-         value="Greedy",
-         label="Decoding strategy",
-         #interactive=True,
-         info="Higher values is equivalent to sampling more low-probability tokens.",
-
-     ), gr.Slider(
-         minimum=0.0,
-         maximum=5.0,
-         value=0.4,
-         step=0.1,
-         interactive=True,
-         label="Sampling temperature",
-         info="Higher values will produce more diverse outputs.",
-     ),
-     gr.Slider(
-         minimum=8,
-         maximum=1024,
-         value=512,
-         step=1,
-         interactive=True,
-         label="Maximum number of new tokens to generate",
-     ), gr.Slider(
-         minimum=0.01,
-         maximum=5.0,
-         value=1.2,
-         step=0.01,
-         interactive=True,
-         label="Repetition penalty",
-         info="1.0 is equivalent to no penalty",
-     ),
-     gr.Slider(
-         minimum=0.01,
-         maximum=0.99,
-         value=0.8,
-         step=0.01,
-         interactive=True,
-         label="Top P",
-         info="Higher values is equivalent to sampling more low-probability tokens.",
-     )],cache_examples=False
- )
-
-
-
+     textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="single"), stop_btn="Stop Generation", multimodal=True,
+     additional_inputs=[], cache_examples=False)

  demo.launch(debug=True)
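For reference, the whole new inference path is three moondream client calls, all visible in the added lines above: `md.vl` loads the weights, `encode_image` embeds the image once, and `query`/`caption` either answer a question or describe the image. Below is a minimal sketch of that flow outside Gradio, assuming `moondream==0.0.5` and a local copy of `moondream-0_5b-int8.mf`; `test.jpg` is a placeholder path.

```python
# Minimal sketch of the inference path adopted in app.py above.
# Assumes moondream==0.0.5 and the quantized weights on disk;
# "test.jpg" is a placeholder image path.
import moondream as md
from PIL import Image

model = md.vl(model="moondream-0_5b-int8.mf")

image = Image.open("test.jpg")
encoded = model.encode_image(image)  # encode once, reuse across prompts

# With a question, use query...
print(model.query(encoded, "What is in this image?")["answer"])

# ...with no question (or the literal text "Caption"), generate a caption.
print(model.caption(encoded)["caption"])
```

Encoding once and reusing the result is why `model_inference` calls `encode_image` before branching between `query` and `caption`.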
 
requirements.txt CHANGED
@@ -1,6 +1,4 @@
- torch
- accelerate
+ moondream==0.0.5
  huggingface_hub
  gradio
- transformers
  spaces
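Note that `app.py` expects the `moondream-0_5b-int8.mf` weights to be present on disk, and this commit does not show how they are fetched. A hypothetical sketch using `huggingface_hub` (already a dependency) follows; the `repo_id` below is an assumption, so adjust it to wherever the `.mf` file is actually hosted.

```python
# Hypothetical sketch: fetch the quantized weights before launching the app.
# The repo_id is an assumption (the commit does not say where the .mf file
# is hosted); the filename matches the one hard-coded in app.py.
from huggingface_hub import hf_hub_download

model_path = hf_hub_download(
    repo_id="vikhyatk/moondream2",        # assumed host repo
    filename="moondream-0_5b-int8.mf",    # filename used by app.py
)
# app.py loads the file by name from the working directory; alternatively,
# pass this resolved path explicitly: md.vl(model=model_path)
```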