update demo UX
app.py CHANGED
@@ -1202,29 +1202,48 @@ def answer_question(image, question):
         yield re.sub("<$", "", re.sub("END$", "", buffer))


-gr.Interface(
-    title="🌔 moondream1",
-    description="""
-    moondream1 is a tiny (1.6B parameter) vision language model trained by <a href="https://x.com/vikhyatk">@vikhyatk</a> that performs on par with models twice its size. It is trained on the LLaVa training dataset, and initialized with SigLIP as the vision tower and Phi-1.5 as the text encoder. Check out the <a href="https://huggingface.co/vikhyatk/moondream1">HuggingFace model card</a> for more details.
-    """,
-    fn=answer_question,
-    inputs=[gr.Image(type="pil"), gr.Textbox(lines=2, label="Question")],
-    examples=[
-        [Image.open("assets/demo-1.jpg"), "Who is the author of this book?"],
-        [Image.open("assets/demo-2.jpg"), "What type of food is the girl eating?"],
-        [
-            Image.open("assets/demo-3.jpg"),
-            "What kind of public transportation is in the image?",
-        ],
-        [Image.open("assets/demo-4.jpg"), "What is the girl looking at?"],
-        [Image.open("assets/demo-5.jpg"), "What kind of dog is in the picture?"],
-    ],
-    outputs=gr.TextArea(label="Answer"),
-    allow_flagging="never",
-    cache_examples=False,
-).launch()
-
-
-
-
-
+with gr.Blocks() as demo:
+    gr.HTML("<h1 class='gradio-heading'><center>🌔 moondream</center></h1>")
+    gr.HTML(
+        "<p class='gradio-sub-heading'><center>moondream1 is a tiny (1.6B parameter) vision language model trained by <a href='https://x.com/vikhyatk'>@vikhyatk</a> that performs on par with models twice its size. It is trained on the LLaVa training dataset, and initialized with SigLIP as the vision tower and Phi-1.5 as the text encoder. Check out the <a href='https://huggingface.co/vikhyatk/moondream1'>HuggingFace model card</a> for more details.</center></p>"
+    )
+    with gr.Group():
+        with gr.Row():
+            prompt = gr.Textbox(
+                label="Question", placeholder="e.g. What is this?", scale=4
+            )
+            submit = gr.Button(
+                "Submit",
+                scale=1,
+            )
+        with gr.Row():
+            img = gr.Image(type="pil", label="Upload or Drag an Image")
+            output = gr.TextArea(label="Answer")
+
+    # handling events
+    submit.click(answer_question, [img, prompt], output)
+    prompt.submit(answer_question, [img, prompt], output)
+
+demo.queue().launch(debug=True)
+
+# gr.Interface(
+#     title="🌔 moondream1",
+#     description="""
+#     moondream1 is a tiny (1.6B parameter) vision language model trained by <a href="https://x.com/vikhyatk">@vikhyatk</a> that performs on par with models twice its size. It is trained on the LLaVa training dataset, and initialized with SigLIP as the vision tower and Phi-1.5 as the text encoder. Check out the <a href="https://huggingface.co/vikhyatk/moondream1">HuggingFace model card</a> for more details.
+#     """,
+#     fn=answer_question,
+#     inputs=[gr.Image(type="pil"), gr.Textbox(lines=2, label="Question")],
+#     examples=[
+#         [Image.open("assets/demo-1.jpg"), "Who is the author of this book?"],
+#         [Image.open("assets/demo-2.jpg"), "What type of food is the girl eating?"],
+#         [
+#             Image.open("assets/demo-3.jpg"),
+#             "What kind of public transportation is in the image?",
+#         ],
+#         [Image.open("assets/demo-4.jpg"), "What is the girl looking at?"],
+#         [Image.open("assets/demo-5.jpg"), "What kind of dog is in the picture?"],
+#     ],
+#     outputs=gr.TextArea(label="Answer"),
+#     allow_flagging="never",
+#     cache_examples=False,
+# ).launch()
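A note on the unchanged context line (1202): the nested re.sub strips a trailing "END", then a dangling "<", from the streamed buffer before each yield, presumably so a partially generated stop token (something like "<END>") never flickers into the answer box. A minimal sketch of that cleanup in isolation; the sample strings are made up for illustration:

import re

def clean_partial(buffer):
    # Remove a fully streamed "END" first, then a dangling "<" that may be
    # the start of a stop token still arriving; the order means "<END" and
    # a bare trailing "<" both disappear, while ordinary text is untouched.
    return re.sub("<$", "", re.sub("END$", "", buffer))

print(clean_partial("a dog on the grassEND"))  # -> "a dog on the grass"
print(clean_partial("a dog on the grass<"))    # -> "a dog on the grass"
print(clean_partial("a dog on the grass"))     # -> "a dog on the grass"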
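The substance of the change: the one-shot gr.Interface (kept above as comments) is replaced by a gr.Blocks layout, which lets the question box, submit button, image, and answer area be arranged in rows, and wires both submit.click and prompt.submit to the same handler. Since answer_question is a generator that yields a growing buffer, demo.queue() is what lets the TextArea update incrementally. A self-contained sketch of the same pattern, with a hypothetical stub standing in for moondream's actual inference call:

import gradio as gr

def answer_question(image, question):
    # Stub for illustration only, not the real model call: yield successively
    # longer strings so the output streams, the same contract the real
    # generator-based handler satisfies.
    partial = ""
    for token in f"(stub answer to: {question})".split():
        partial += token + " "
        yield partial.strip()

with gr.Blocks() as demo:
    with gr.Row():
        prompt = gr.Textbox(label="Question", scale=4)
        submit = gr.Button("Submit", scale=1)
    with gr.Row():
        img = gr.Image(type="pil", label="Upload or Drag an Image")
        output = gr.TextArea(label="Answer")
    # Clicking the button and pressing Enter in the textbox both run the handler.
    submit.click(answer_question, [img, prompt], output)
    prompt.submit(answer_question, [img, prompt], output)

# queue() is required for generator handlers to stream partial results.
demo.queue().launch()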