khulnasoft committed on
Commit
395e92a
1 Parent(s): 3dcc707

Create demos/demo_parse_task.json

Browse files
Files changed (1) hide show
  1. demos/demo_parse_task.json +365 -0
demos/demo_parse_task.json ADDED
@@ -0,0 +1,365 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "role": "user",
4
+ "content": "Give you some pictures e1.jpg, e2.png, e3.jpg, help me count the number of sheep?"
5
+ },
6
+ {
7
+ "role": "assistant",
8
+ "content": [
9
+ {
10
+ "task": "image-to-text",
11
+ "id": 0,
12
+ "dep": [-1],
13
+ "args": {
14
+ "image": "e1.jpg"
15
+ }
16
+ },
17
+ {
18
+ "task": "object-detection",
19
+ "id": 1,
20
+ "dep": [-1],
21
+ "args": {
22
+ "image": "e1.jpg"
23
+ }
24
+ },
25
+ {
26
+ "task": "visual-question-answering",
27
+ "id": 2,
28
+ "dep": [1],
29
+ "args": {
30
+ "image": "<GENERATED>-1",
31
+ "text": "How many sheep in the picture"
32
+ }
33
+ },
34
+ {
35
+ "task": "image-to-text",
36
+ "id": 3,
37
+ "dep": [-1],
38
+ "args": {
39
+ "image": "e2.png"
40
+ }
41
+ },
42
+ {
43
+ "task": "object-detection",
44
+ "id": 4,
45
+ "dep": [-1],
46
+ "args": {
47
+ "image": "e2.png"
48
+ }
49
+ },
50
+ {
51
+ "task": "visual-question-answering",
52
+ "id": 5,
53
+ "dep": [4],
54
+ "args": {
55
+ "image": "<GENERATED>-4",
56
+ "text": "How many sheep in the picture"
57
+ }
58
+ },
59
+ {
60
+ "task": "image-to-text",
61
+ "id": 6,
62
+ "dep": [-1],
63
+ "args": {
64
+ "image": "e3.jpg"
65
+ }
66
+ },
67
+ {
68
+ "task": "object-detection",
69
+ "id": 7,
70
+ "dep": [-1],
71
+ "args": {
72
+ "image": "e3.jpg"
73
+ }
74
+ },
75
+ {
76
+ "task": "visual-question-answering",
77
+ "id": 8,
78
+ "dep": [7],
79
+ "args": {
80
+ "image": "<GENERATED>-7",
81
+ "text": "How many sheep in the picture"
82
+ }
83
+ }
84
+ ]
85
+ },
86
+
87
+
88
+ {
89
+ "role": "user",
90
+ "content": "Look at /e.jpg, can you tell me how many objects in the picture? Give me a picture and video similar to this one."
91
+ },
92
+ {
93
+ "role": "assistant",
94
+ "content": [
95
+ {
96
+ "task": "image-to-text",
97
+ "id": 0,
98
+ "dep": [-1],
99
+ "args": {
100
+ "image": "/e.jpg"
101
+ }
102
+ },
103
+ {
104
+ "task": "object-detection",
105
+ "id": 1,
106
+ "dep": [-1],
107
+ "args": {
108
+ "image": "/e.jpg"
109
+ }
110
+ },
111
+ {
112
+ "task": "visual-question-answering",
113
+ "id": 2,
114
+ "dep": [1],
115
+ "args": {
116
+ "image": "<GENERATED>-1",
117
+ "text": "how many objects in the picture?"
118
+ }
119
+ },
120
+ {
121
+ "task": "text-to-image",
122
+ "id": 3,
123
+ "dep": [0],
124
+ "args": {
125
+ "text": "<GENERATED>-0"
126
+ }
127
+ },
128
+ {
129
+ "task": "image-to-image",
130
+ "id": 4,
131
+ "dep": [-1],
132
+ "args": {
133
+ "image": "/e.jpg"
134
+ }
135
+ },
136
+ {
137
+ "task": "text-to-video",
138
+ "id": 5,
139
+ "dep": [0],
140
+ "args": {
141
+ "text": "<GENERATED>-0"
142
+ }
143
+ }
144
+ ]
145
+ },
146
+
147
+ {
148
+ "role": "user",
149
+ "content": "given a document /images/e.jpeg, answer me what is the student amount? And describe the image with your voice"
150
+ },
151
+ {
152
+ "role": "assistant",
153
+ "content": [
154
+ {
155
+ "task": "document-question-answering",
156
+ "id": 0,
157
+ "dep": [-1],
158
+ "args": {
159
+ "image": "/images/e.jpeg",
160
+ "text": "what is the student amount?"
161
+ }
162
+ },
163
+ {
164
+ "task": "visual-question-answering",
165
+ "id": 1,
166
+ "dep": [-1],
167
+ "args": {
168
+ "image": "/images/e.jpeg",
169
+ "text": "what is the student amount?"
170
+ }
171
+ },
172
+ {
173
+ "task": "image-to-text",
174
+ "id": 2,
175
+ "dep": [-1],
176
+ "args": {
177
+ "image": "/images/e.jpeg"
178
+ }
179
+ },
180
+ {
181
+ "task": "text-to-speech",
182
+ "id": 3,
183
+ "dep": [2],
184
+ "args": {
185
+ "text": "<GENERATED>-2"
186
+ }
187
+ }
188
+ ]
189
+ },
190
+
191
+
192
+ {
193
+ "role": "user",
194
+ "content": "Given an image /example.jpg, first generate a pose image, then based on the pose generate a new image where a girl is reading a book"
195
+ },
196
+ {
197
+ "role": "assistant",
198
+ "content": [
199
+ {
200
+ "task": "openpose-control",
201
+ "id": 0,
202
+ "dep": [-1],
203
+ "args": {
204
+ "image": "/example.jpg"
205
+ }
206
+ },
207
+ {
208
+ "task": "openpose-text-to-image",
209
+ "id": 1,
210
+ "dep": [0],
211
+ "args": {
212
+ "text": "a girl is reading a book",
213
+ "image": "<GENERATED>-0"
214
+ }
215
+ }
216
+ ]
217
+ },
218
+
219
+
220
+ {
221
+ "role": "user",
222
+ "content": "please show me a video and an image of (based on the text) 'a boy is running' and dub it"
223
+ },
224
+ {
225
+ "role": "assistant",
226
+ "content": [
227
+ {
228
+ "task": "text-to-video",
229
+ "id": 0,
230
+ "dep": [-1],
231
+ "args": {
232
+ "text": "a boy is running"
233
+ }
234
+ },
235
+ {
236
+ "task": "text-to-speech",
237
+ "id": 1,
238
+ "dep": [-1],
239
+ "args": {
240
+ "text": "a boy is running"
241
+ }
242
+ },
243
+ {
244
+ "task": "text-to-image",
245
+ "id": 2,
246
+ "dep": [-1],
247
+ "args": {
248
+ "text": "a boy is running"
249
+ }
250
+ }
251
+ ]
252
+ },
253
+
254
+ {
255
+ "role": "user",
256
+ "content": "please show me a joke and an image of cat"
257
+ },
258
+ {
259
+ "role": "assistant",
260
+ "content": [
261
+ {
262
+ "task": "conversational",
263
+ "id": 0,
264
+ "dep": [-1],
265
+ "args": {
266
+ "text": "please show me a joke of cat"
267
+ }
268
+ },
269
+ {
270
+ "task": "text-to-image",
271
+ "id": 1,
272
+ "dep": [-1],
273
+ "args": {
274
+ "text": "a photo of cat"
275
+ }
276
+ }
277
+ ]
278
+ },
279
+
280
+
281
+ {
282
+ "role": "user",
283
+ "content": "give me a picture about a cute dog, then describe the image to me and tell a story about it"
284
+ },
285
+ {
286
+ "role": "assistant",
287
+ "content": [
288
+ {
289
+ "task": "text-to-image",
290
+ "id": 0,
291
+ "dep": [-1],
292
+ "args": {
293
+ "text": "a picture of a cute dog"
294
+ }
295
+ },
296
+ {
297
+ "task": "image-to-text",
298
+ "id": 1,
299
+ "dep": [0],
300
+ "args": {
301
+ "image": "<GENERATED>-0"
302
+ }
303
+ },
304
+ {
305
+ "task": "text-generation",
306
+ "id": 2,
307
+ "dep": [1],
308
+ "args": {
309
+ "text": "<GENERATED>-1"
310
+ }
311
+ },
312
+ {
313
+ "task": "text-to-speech",
314
+ "id": 3,
315
+ "dep": [2],
316
+ "args": {
317
+ "text": "<GENERATED>-2"
318
+ }
319
+ }
320
+ ]
321
+ },
322
+
323
+
324
+ {
325
+ "role": "user",
326
+ "content": "give you a picture /example.jpg, what's in it and tell me a joke about it"
327
+ },
328
+ {
329
+ "role": "assistant",
330
+ "content": [
331
+ {
332
+ "task": "image-to-text",
333
+ "id": 0,
334
+ "dep": [-1],
335
+ "args": {
336
+ "image": "/example.jpg"
337
+ }
338
+ },
339
+ {
340
+ "task": "object-detection",
341
+ "id": 1,
342
+ "dep": [-1],
343
+ "args": {
344
+ "image": "/example.jpg"
345
+ }
346
+ },
347
+ {
348
+ "task": "conversational",
349
+ "id": 2,
350
+ "dep": [0],
351
+ "args": {
352
+ "text": "<GENERATED>-0"
353
+ }
354
+ },
355
+ {
356
+ "task": "text-to-speech",
357
+ "id": 3,
358
+ "dep": [2],
359
+ "args": {
360
+ "text": "<GENERATED>-2"
361
+ }
362
+ }
363
+ ]
364
+ }
365
+ ]