raghavd99 committed on
Commit
2a042a6
•
1 Parent(s): 3cfe41d
app.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import lancedb
2
+ import lancedb.embeddings.imagebind
3
+ from lancedb.embeddings import get_registry
4
+ from lancedb.pydantic import LanceModel, Vector
5
+ import pandas as pd
6
+
7
# Resolve the "imagebind" entry in LanceDB's embedding registry, then
# instantiate it. `model` is what the table schema uses to declare its
# source and vector fields.
imagebind_entry = get_registry().get("imagebind")
model = imagebind_entry.create()
8
+
9
class TextModel(LanceModel):
    """Row schema for the table: a text/image/audio triple plus its vector.

    ``image_uri`` is declared through ``model.SourceField()`` and ``vector``
    through ``model.VectorField()``; the vector width comes from
    ``model.ndims()``.
    """

    # Short text describing the item.
    text: str
    # Location of the item's image; declared as the embedding source field.
    image_uri: str = model.SourceField()
    # Location of the item's audio file.
    audio_path: str
    # Embedding column, sized by the embedding function.
    vector: Vector(model.ndims()) = model.VectorField()
14
+
15
# Sample items: entry i of each list describes the same subject.
text_list = ["A dragon", "A car", "A bird"]
image_paths = [
    "./test_inputs/dragon.jpg",
    "./test_inputs/car.jpg",
    "./test_inputs/bird_image.jpg",
]
audio_paths = [
    "./test_inputs/dragon-growl-37570.wav",
    "./test_inputs/car_audio.wav",
    "./test_inputs/bird_audio.wav",
]

# One row per item, keyed by the TextModel field names.
inputs = [
    {"text": text, "audio_path": audio_path, "image_uri": image_uri}
    for text, audio_path, image_uri in zip(text_list, audio_paths, image_paths)
]

db = lancedb.connect("~/lancedb")
# mode="overwrite" replaces a table left behind by a previous run. With the
# default mode, create_table raises once "img_bind" already exists, so the
# script could only be started once per database directory.
table = db.create_table("img_bind", schema=TextModel, mode="overwrite")
table.add(inputs)
39
+
40
+ import gradio as gr
41
+
42
def process_image(inp_img) -> tuple[str, str]:
    """Look up the stored item that best matches an image.

    Args:
        inp_img: Image query passed straight to ``table.search``.

    Returns:
        ``(text, audio_path)`` of the top search hit.

    Raises:
        IndexError: If the search returns no rows.
    """
    hits = (
        table.search(inp_img, vector_column_name="vector")
        .limit(1)
        .to_pydantic(TextModel)
    )
    best = hits[0]
    return best.text, best.audio_path
51
+
52
def process_text(inp_text) -> tuple[str, str]:
    """Look up the stored item that best matches a text prompt.

    Args:
        inp_text: Text query passed straight to ``table.search``.

    Returns:
        ``(image_uri, audio_path)`` of the top search hit.

    Raises:
        IndexError: If the search returns no rows.
    """
    hits = (
        table.search(inp_text, vector_column_name="vector")
        .limit(1)
        .to_pydantic(TextModel)
    )
    best = hits[0]
    return best.image_uri, best.audio_path
61
+
62
def process_audio(inp_audio) -> tuple[str, str]:
    """Look up the stored item that best matches an audio clip.

    Args:
        inp_audio: Audio query passed straight to ``table.search``.

    Returns:
        ``(image_uri, text)`` of the top search hit.

    Raises:
        IndexError: If the search returns no rows.
    """
    hits = (
        table.search(inp_audio, vector_column_name="vector")
        .limit(1)
        .to_pydantic(TextModel)
    )
    best = hits[0]
    return best.image_uri, best.text
71
+
72
# Stylesheet passed to gr.Blocks(css=...).
# The `img` rule was previously left unclosed; it now ends with "}".
# NOTE(review): `output-audio, output-text` are CSS type selectors, not
# id/class selectors, so as written that rule does not match components
# tagged with those names. Turning it into `#...`/`.…` selectors would hide
# the outputs (display: none) -- confirm the intent before changing it.
css = """
output-audio, output-text {
    display: none;
}
img {
    width: 300px;
    height: 450px;
    object-fit: cover;
}
"""
82
# The same styling hooks are shared by many components, so they are attached
# with `elem_classes` rather than `elem_id`: an id must be unique in the DOM.


def _image_query_column(image_path):
    """Add a column with a preloaded image, audio/text outputs and a button."""
    with gr.Column():
        query = gr.Image(value=image_path, type="filepath", elem_classes="img")
        found_audio = gr.Audio(label="Output Audio", elem_classes="output-audio")
        found_text = gr.Textbox(label="Output Text", elem_classes="output-text")
        retrieve = gr.Button("Retrieve")
    # process_image returns (text, audio_path), hence this output order.
    retrieve.click(process_image, inputs=[query], outputs=[found_text, found_audio])


def _audio_query_column(audio_path):
    """Add a column with a preloaded audio clip, image/text outputs and a button."""
    with gr.Column():
        query = gr.Audio(
            value=audio_path, type="filepath", elem_classes="output-audio"
        )
        found_image = gr.Image(type="filepath", elem_classes="img")
        found_text = gr.Textbox(label="Output Text", elem_classes="output-text")
        retrieve = gr.Button("Retrieve")
    # process_audio returns (image_uri, text), hence this output order.
    retrieve.click(process_audio, inputs=[query], outputs=[found_image, found_text])


with gr.Blocks(css=css) as app:
    with gr.Tab("Image to Text and Audio"):
        with gr.Row():
            # One column per sample image listed in `image_paths`.
            for sample_image in image_paths:
                _image_query_column(sample_image)

    with gr.Tab("Text to Image and Audio"):
        with gr.Row():
            with gr.Column():
                prompt = gr.Textbox(
                    label="Enter a prompt:", elem_classes="output-text"
                )
                prompt_audio = gr.Audio(
                    label="Output Audio", elem_classes="output-audio"
                )
                prompt_image = gr.Image(type="filepath", elem_classes="img")
        # This tab has no button: the lookup runs when the prompt is submitted.
        # process_text returns (image_uri, audio_path).
        prompt.submit(
            process_text, inputs=[prompt], outputs=[prompt_image, prompt_audio]
        )

    with gr.Tab("Audio to Image and Text"):
        with gr.Row():
            # One column per sample clip listed in `audio_paths`.
            for sample_audio in audio_paths:
                _audio_query_column(sample_audio)

if __name__ == "__main__":
    app.launch(share=True)
156
+
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ lancedb
2
+ gradio
3
+ pandas
4
+ imagebind@git+https://github.com/raghavdixit99/ImageBind.git
bird_audio.wav → test_inputs/bird_audio.wav RENAMED
File without changes
bird_image.jpg → test_inputs/bird_image.jpg RENAMED
File without changes
car.jpg → test_inputs/car.jpg RENAMED
File without changes
car_audio.wav → test_inputs/car_audio.wav RENAMED
File without changes
dog_audio.wav → test_inputs/dog_audio.wav RENAMED
File without changes
dog_image.jpg → test_inputs/dog_image.jpg RENAMED
File without changes
dragon-growl-37570.wav → test_inputs/dragon-growl-37570.wav RENAMED
File without changes
dragon.jpg → test_inputs/dragon.jpg RENAMED
File without changes