vikhyatk committed on
Commit
77e99d6
•
1 Parent(s): 0f11c75

update demo UX

Files changed (1)
app.py +45 -26
app.py CHANGED
@@ -1202,29 +1202,48 @@ def answer_question(image, question):
         yield re.sub("<$", "", re.sub("END$", "", buffer))
 
 
-gr.Interface(
-    title="🌔 moondream1",
-    description="""
-    moondream1 is a tiny (1.6B parameter) vision language model trained by
-    <a href="https://x.com/vikhyatk">@vikhyatk</a> that performs on par with
-    models twice its size. It is trained on the LLaVa training dataset, and
-    initialized with SigLIP as the vision tower and Phi-1.5 as the text encoder.
-    Check out the <a href="https://huggingface.co/vikhyatk/moondream1">HuggingFace
-    model card</a> for more details.
-    """,
-    fn=answer_question,
-    inputs=[gr.Image(type="pil"), gr.Textbox(lines=2, label="Question")],
-    examples=[
-        [Image.open("assets/demo-1.jpg"), "Who is the author of this book?"],
-        [Image.open("assets/demo-2.jpg"), "What type of food is the girl eating?"],
-        [
-            Image.open("assets/demo-3.jpg"),
-            "What kind of public transportation is in the image?",
-        ],
-        [Image.open("assets/demo-4.jpg"), "What is the girl looking at?"],
-        [Image.open("assets/demo-5.jpg"), "What kind of dog is in the picture?"],
-    ],
-    outputs=gr.TextArea(label="Answer"),
-    allow_flagging="never",
-    cache_examples=False,
-).launch()
+with gr.Blocks() as demo:
+    gr.HTML("<h1 class='gradio-heading'><center>🌔 moondream</center></h1>")
+    gr.HTML(
+        "<p class='gradio-sub-heading'><center>moondream1 is a tiny (1.6B parameter) vision language model trained by <a href='https://x.com/vikhyatk'>@vikhyatk</a> that performs on par with models twice its size. It is trained on the LLaVa training dataset, and initialized with SigLIP as the vision tower and Phi-1.5 as the text encoder. Check out the <a href='https://huggingface.co/vikhyatk/moondream1'>HuggingFace model card</a> for more details.</center></p>"
+    )
+    with gr.Group():
+        with gr.Row():
+            prompt = gr.Textbox(
+                label="Question", placeholder="e.g. What is this?", scale=4
+            )
+            submit = gr.Button(
+                "Submit",
+                scale=1,
+            )
+        with gr.Row():
+            img = gr.Image(type="pil", label="Upload or Drag an Image")
+            output = gr.TextArea(label="Answer")
+
+    # handling events
+    submit.click(answer_question, [img, prompt], output)
+    prompt.submit(answer_question, [img, prompt], output)
+
+demo.queue().launch(debug=True)
+
+# gr.Interface(
+#     title="🌔 moondream1",
+#     description="""
+#     moondream1 is a tiny (1.6B parameter) vision language model trained by <a href="https://x.com/vikhyatk">@vikhyatk</a> that performs on par with models twice its size. It is trained on the LLaVa training dataset, and initialized with SigLIP as the vision tower and Phi-1.5 as the text encoder. Check out the <a href="https://huggingface.co/vikhyatk/moondream1">HuggingFace model card</a> for more details.
+#     """,
+#     fn=answer_question,
+#     inputs=[gr.Image(type="pil"), gr.Textbox(lines=2, label="Question")],
+#     examples=[
+#         [Image.open("assets/demo-1.jpg"), "Who is the author of this book?"],
+#         [Image.open("assets/demo-2.jpg"), "What type of food is the girl eating?"],
+#         [
+#             Image.open("assets/demo-3.jpg"),
+#             "What kind of public transportation is in the image?",
+#         ],
+#         [Image.open("assets/demo-4.jpg"), "What is the girl looking at?"],
+#         [Image.open("assets/demo-5.jpg"), "What kind of dog is in the picture?"],
+#     ],
+#     outputs=gr.TextArea(label="Answer"),
+#     allow_flagging="never",
+#     cache_examples=False,
+# ).launch()
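
Note: the new Blocks layout does not carry over the clickable example pairs that the old gr.Interface exposed (they survive only in the commented-out block above). If they were wanted back, a minimal sketch (not part of this commit; it assumes the same assets/ files and Gradio's gr.Examples helper, placed inside the `with gr.Blocks() as demo:` block after img and prompt are defined) could look like:

    # Hypothetical addition, not part of this commit: clickable example pairs
    # that pre-fill the image and question inputs when selected.
    gr.Examples(
        examples=[
            ["assets/demo-1.jpg", "Who is the author of this book?"],
            ["assets/demo-2.jpg", "What type of food is the girl eating?"],
            ["assets/demo-3.jpg", "What kind of public transportation is in the image?"],
            ["assets/demo-4.jpg", "What is the girl looking at?"],
            ["assets/demo-5.jpg", "What kind of dog is in the picture?"],
        ],
        inputs=[img, prompt],
    )

Selecting an example only populates the inputs; the user still presses Submit (or Enter) to stream an answer, matching the event wiring added in this commit.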