Files changed (1)
  1. app.py +24 -5

app.py CHANGED
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import spaces
 import torch
 from PIL import Image
 from einops import rearrange
@@ -1164,7 +1163,7 @@ from transformers import TextIteratorStreamer
 import hashlib
 import os
 
-model_path = snapshot_download("vikhyatk/moondream1", revision="3b9dfe7f7fc461b17aa5f16aadefe60cfc2150c9")
+model_path = snapshot_download("vikhyatk/moondream1")
 
 vision_encoder = VisionEncoder(model_path).to(DEVICE, dtype=DTYPE)
 text_model = TextModel(model_path).to(DEVICE, dtype=DTYPE)
@@ -1186,7 +1185,6 @@ def cached_vision_encoder(image):
     return image_vec.to(DEVICE, dtype=DTYPE)
 
 
-@spaces.GPU(duration=10)
 def answer_question(image, question):
     yield "Encoding image..."
 
@@ -1205,9 +1203,9 @@ def answer_question(image, question):
 
 
 with gr.Blocks() as demo:
-    gr.HTML("<h1 class='gradio-heading'><center>πŸŒ” moondream1</center></h1>")
+    gr.HTML("<h1 class='gradio-heading'><center>πŸŒ” moondream</center></h1>")
     gr.HTML(
-        "<center><p class='gradio-sub-heading'>moondream1 is an older version of the moondream model. Check out the <a href='https://huggingface.co/spaces/vikhyatk/moondream2'>moondream2</a> space for an improved version.</p></center>"
+        "<center><p class='gradio-sub-heading'>moondream1 is a tiny (1.6B parameter) vision language model trained by <a href='https://x.com/vikhyatk'>@vikhyatk</a> that performs on par with models twice its size. It is trained on the LLaVa training dataset, and initialized with SigLIP as the vision tower and Phi-1.5 as the text encoder. Check out the <a href='https://huggingface.co/vikhyatk/moondream1'>HuggingFace model card</a> for more details.</p></center>"
     )
     with gr.Group():
         with gr.Row():
@@ -1228,3 +1226,24 @@ with gr.Blocks() as demo:
 
 demo.queue().launch(debug=True)
 
+# gr.Interface(
+#     title="πŸŒ” moondream1",
+#     description="""
+#     moondream1 is a tiny (1.6B parameter) vision language model trained by <a href="https://x.com/vikhyatk">@vikhyatk</a> that performs on par with models twice its size. It is trained on the LLaVa training dataset, and initialized with SigLIP as the vision tower and Phi-1.5 as the text encoder. Check out the <a href="https://huggingface.co/vikhyatk/moondream1">HuggingFace model card</a> for more details.
+#     """,
+#     fn=answer_question,
+#     inputs=[gr.Image(type="pil"), gr.Textbox(lines=2, label="Question")],
+#     examples=[
+#         [Image.open("assets/demo-1.jpg"), "Who is the author of this book?"],
+#         [Image.open("assets/demo-2.jpg"), "What type of food is the girl eating?"],
+#         [
+#             Image.open("assets/demo-3.jpg"),
+#             "What kind of public transportation is in the image?",
+#         ],
+#         [Image.open("assets/demo-4.jpg"), "What is the girl looking at?"],
+#         [Image.open("assets/demo-5.jpg"), "What kind of dog is in the picture?"],
+#     ],
+#     outputs=gr.TextArea(label="Answer"),
+#     allow_flagging="never",
+#     cache_examples=False,
+# ).launch()
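The body of cached_vision_encoder sits outside the hunks above; only its final `return image_vec.to(DEVICE, dtype=DTYPE)` line and the hashlib/os imports are visible. A minimal sketch of what such a hash-keyed embedding cache typically looks like is below; the cache directory, the MD5 keying, and the torch.save/torch.load round trip are assumptions, while vision_encoder, DEVICE and DTYPE are the module-level objects defined in app.py.

```python
import hashlib
import os

import torch
from PIL import Image

CACHE_DIR = "image_vec_cache"  # assumed location, not shown in the diff


def cached_vision_encoder(image: Image.Image) -> torch.Tensor:
    # Key the cache on the raw pixel bytes so re-uploads of the same image
    # skip the vision encoder entirely.
    os.makedirs(CACHE_DIR, exist_ok=True)
    image_hash = hashlib.md5(image.tobytes()).hexdigest()
    cache_path = os.path.join(CACHE_DIR, f"{image_hash}.pt")

    if os.path.exists(cache_path):
        image_vec = torch.load(cache_path, map_location="cpu")
    else:
        # vision_encoder, DEVICE and DTYPE come from app.py, not this sketch.
        image_vec = vision_encoder(image)
        torch.save(image_vec.cpu(), cache_path)

    return image_vec.to(DEVICE, dtype=DTYPE)
```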
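Likewise, answer_question is a generator (it starts with `yield "Encoding image..."`) and the file imports TextIteratorStreamer, which suggests tokens are streamed into the Gradio output box as they are generated. The sketch below shows that standard transformers streaming pattern under those assumptions; the text_model.tokenizer attribute, the answer_question method on TextModel, and its keyword arguments are guesses rather than details taken from the diff.

```python
from threading import Thread

from transformers import TextIteratorStreamer


def answer_question(image, question):
    yield "Encoding image..."
    image_embeds = cached_vision_encoder(image)

    # Hypothetical wiring: generate in a background thread and stream tokens out.
    streamer = TextIteratorStreamer(text_model.tokenizer, skip_special_tokens=True)
    Thread(
        target=text_model.answer_question,
        kwargs={
            "image_embeds": image_embeds,
            "question": question,
            "streamer": streamer,
        },
    ).start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer.strip()  # Gradio re-renders the output box on every yield
```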