Sam committed on
Commit
fb30cb6
β€’
1 Parent(s): 3b630ea
Files changed (3) hide show
  1. README.md +7 -0
  2. app.py +134 -0
  3. requirements.txt +6 -0
README.md CHANGED
@@ -1,4 +1,11 @@
1
  ---
2
  license: mit
3
  title: Moondream 2 Multi Interrogation
 
 
 
 
 
 
 
4
  ---
 
1
  ---
2
  license: mit
3
  title: Moondream 2 Multi Interrogation
4
+ emoji: 🌀
5
+ colorFrom: yellow
6
+ colorTo: purple
7
+ sdk: gradio
8
+ sdk_version: "4.31.3"
9
+ app_file: app.py
10
+ pinned: true
11
  ---
app.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import AutoTokenizer, AutoModelForCausalLM
3
+ import json
4
+ import torch
5
+ import requests
6
+ import time
7
+ import random
8
+ from PIL import Image
9
+ from typing import Union
10
+
11
# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using CPU" if device == "cpu" else f"Using {device}")
13
+
14
# Already-loaded (model, tokenizer) pairs, keyed by (model_id, revision).
# MoonDream() is constructed on every request, so without this cache each
# button click would load the tokenizer and the weights again.
_LOADED_MODELS = {}


def _load_model(model_id="vikhyatk/moondream2", revision="2024-05-08"):
    """Load the model and tokenizer, reusing a previous load when possible.

    Args:
        model_id: Hub repository id passed to ``from_pretrained``.
        revision: Repository revision passed to ``from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple. The same objects are returned on
        every call with the same ``model_id`` and ``revision``.
    """
    key = (model_id, revision)
    if key not in _LOADED_MODELS:
        tokenizer = AutoTokenizer.from_pretrained(
            model_id, trust_remote_code=True, revision=revision
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_id, device_map=device, trust_remote_code=True, revision=revision
        )
        _LOADED_MODELS[key] = (model, tokenizer)
    return _LOADED_MODELS[key]
18
+
19
class MoonDream():
    """Callable wrapper that asks one question about each image in a sequence."""

    def __init__(self, model=None, tokenizer=None):
        """Store the model/tokenizer pair and move the model to ``device``.

        Args:
            model: Object providing ``encode_image``, ``answer_question`` and
                ``to``. When either ``model`` or ``tokenizer`` is omitted,
                both are obtained from ``_load_model()``.
            tokenizer: Tokenizer forwarded to ``model.answer_question``.
        """
        # Identity checks: truth-testing a tokenizer goes through its __len__,
        # which is not the question being asked here.
        if model is None or tokenizer is None:
            model, tokenizer = _load_model()
        self.model, self.tokenizer = model, tokenizer
        self.device = device
        self.model.to(self.device)

    def __call__(self, question, imgs):
        """Yield the model's answer to ``question`` for each image in ``imgs``.

        Args:
            question: Prompt passed to ``model.answer_question``.
            imgs: Iterable of images accepted by ``model.encode_image``.

        Yields:
            One answer per image, in input order.
        """
        for img in imgs:
            embeds = self.model.encode_image(img)
            yield self.model.answer_question(
                question=question, image_embeds=embeds, tokenizer=self.tokenizer
            )
33
+
34
def _respond_one(question, img):
    """Stream the growing answer text for a single image.

    The interrogator returns a generator, so it has to be iterated;
    concatenating it to a string directly raises ``TypeError``.

    Args:
        question: Prompt to ask about the image.
        img: The image to interrogate.

    Yields:
        The accumulated answer text after each piece is received.

    Returns:
        The complete accumulated text (as the generator's return value).
    """
    txt = ""
    for piece in MoonDream()(question, [img]):
        txt += piece
        yield txt
    return txt
38
+
39
def respond_batch(question, **imgs):
    """Ask ``question`` about every image group passed as a keyword argument.

    Each keyword value is an iterable of images. The answers for a group are
    yielded one by one, followed by a six-newline separator string marking the
    end of that group.
    """
    interrogator = MoonDream()
    for group in imgs.values():
        yield from interrogator(question, group)
        yield "\n\n\n\n\n\n"
47
+
48
# Start-up smoke test: interrogate three solid-colour 192x192 images and print
# each answer, stopping at the first end-of-group separator.
red, green, blue = (
    Image.new("RGB", (192, 192), rgb)
    for rgb in ((255, 0, 0), (0, 255, 0), (0, 0, 255))
)
res = respond_batch(
    "What color is this? Elaborate upon what emotion registers most strongly with you upon viewing. ",
    imgs=[red, green, blue],
)
for r in res:
    print(r)
    if "\n\n\n\n\n\n" in r:
        break
56
+
57
def dual_images(img1: "Image.Image"):
    """Stream a plain-English description of one image.

    Run once for each image, each call feeding its own output box. The output
    is the accumulated description text.

    Args:
        img1: The image to describe, or ``None`` when no image was supplied.

    Yields:
        The accumulated description after each piece arrives. A single empty
        string is yielded when ``img1`` is ``None``.
    """
    # No image supplied: emit an empty description instead of failing inside
    # the model's image encoder.
    if img1 is None:
        yield ""
        return
    md = MoonDream()
    txt = ""
    for piece in md("Describe the image in plain english ", [img1]):
        txt += piece
        yield txt
65
+
66
+ import os
67
+
68
# Make the Together API key available as TOGETHER_KEY. A value already present
# in the environment wins; otherwise the key is read from together_key.txt.
# A missing file is reported instead of aborting start-up, so the
# image-description part of the app still runs without a key.
if not os.environ.get("TOGETHER_KEY"):
    try:
        with open("together_key.txt", "r", encoding="utf-8") as f:
            os.environ["TOGETHER_KEY"] = f.read().strip()
        print("Set together key")
    except FileNotFoundError:
        print("together_key.txt not found; set TOGETHER_KEY to enable description merging")
71
+
72
def merge_descriptions_to_prompt(mi, d1, d2):
    """Ask the Together completions endpoint to merge two descriptions.

    Args:
        mi: Merge-specific instructions inserted into the prompt.
        d1: First description.
        d2: Second description.

    Returns:
        The completion text up to (not including) the first triple backtick.
    """
    from together import Together

    client = Together(api_key=os.getenv("TOGETHER_KEY"))
    prompt = f"""Describe what would result if the following two descriptions were describing one thing.
### Description 1:
```text
{d1}
```
### Description 2:
```text
{d2}
```
Merge-Specific Instructions:
```text
{mi}
```
Ensure you end your output with ```\\n
---
Complete Description:
```text"""
    completion = client.completions.create(
        prompt=prompt,
        model="meta-llama/Meta-Llama-3-70B",
        stop=["```"],
        max_tokens=1024,
    )
    merged, _, _ = completion.choices[0].text.partition("```")
    return merged
93
+
94
def xform_image_description(img, inst):
    """Describe an image, then have a completion model expand the description.

    Args:
        img: Image handed to ``dual_images`` for the initial description.
        inst: Extra instructions inserted into the prompt.

    Returns:
        The completion text up to (not including) the first triple backtick.
    """
    from together import Together

    # dual_images is a generator of progressively longer text. Drain it and
    # keep the last value; interpolating the generator itself would put its
    # repr into the prompt instead of the description.
    desc = ""
    for desc in dual_images(img):
        pass
    tog = Together(api_key=os.getenv("TOGETHER_KEY"))
    prompt = f"""Describe the image in aggressively verbose detail. I must know every freckle upon a man's brow and each blade of the grass intimately.\nDescription: ```text\n{desc}\n```\nInstructions:\n```text\n{inst}\n```\n\n\n---\nDetailed Description:\n```text"""
    res = tog.completions.create(prompt=prompt, model="meta-llama/Meta-Llama-3-70B", stop=["```"], max_tokens=1024)
    # The prompt is not echoed back in the completion text, so nothing is
    # sliced off the front (same handling as merge_descriptions_to_prompt).
    return res.choices[0].text.split("```")[0]
101
+
102
# Gradio UI definition and launch.
# Components created below, in order: a single-image input (img) with its
# submit button (btn) and output box (otpt); two image inputs (im1, im2) with
# a "submit batch" button (btn2) and one output box each (otp2, otp3); a merge
# instructions box (minst), a merge button (btn_scd) and the combined output
# box (otp4).
# NOTE(review): indentation is lost in this view, so the exact nesting of the
# Row/Column contexts cannot be determined here - confirm against app.py.
+ with gr.Blocks() as demo:
103
+ with gr.Row(visible=True):
104
+ with gr.Column():
105
+ with gr.Row():
106
+ img = gr.Image(label="images", type='pil')
107
+ with gr.Row():
108
+ btn = gr.Button("submit")
109
+ with gr.Row():
110
+ otpt = gr.Textbox(label="output", lines=3, interactive=True)
111
+ with gr.Row():
112
+ with gr.Column():
113
+ im1 = gr.Image(label="image 1", type='pil')
114
+ with gr.Column():
115
+ im2 = gr.Image(label="image 2", type='pil')
116
+ with gr.Row():
117
+ btn2 = gr.Button("submit batch")
118
+ with gr.Row():
119
+ with gr.Column():
120
+ otp2 = gr.Textbox(label="individual batch output (left)", interactive=True)
121
+ with gr.Column():
122
+ otp3 = gr.Textbox(label="individual batch output (right)", interactive=True)
123
+ with gr.Row():
124
+ minst = gr.Textbox(label="Merge Instructions")
125
+ with gr.Row():
126
+ btn_scd = gr.Button("Merge Descriptions to Single Combined Description")
127
+ with gr.Row():
128
+ otp4 = gr.Textbox(label="batch output ( combined )", interactive=True, lines=4)
# Event wiring: btn2 has two click listeners, one per image, each sending
# dual_images output to its own text box. btn describes the single image.
# btn_scd passes the merge instructions and both descriptions to
# merge_descriptions_to_prompt and shows the result in otp4.
129
+ btn2.click(dual_images, inputs=[im1], outputs=[otp2])
130
+ btn2.click(dual_images, inputs=[im2], outputs=[otp3])
131
+ btn.click(dual_images, inputs=[img], outputs=[otpt])
132
+ btn_scd.click(merge_descriptions_to_prompt, inputs=[minst, otp2, otp3], outputs=[otp4])
133
+
134
# Start the app with debug=True and share=True.
+ demo.launch(debug=True, share=True)
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ gradio==4.31.3
2
+ transformers==4.40.2
3
+ accelerate==0.30.1
4
+ einops==0.8.0
5
+ pillow==10.3.0
6
+ together==1.1.5