tricktreat committed
Commit c5ea4b9
1 Parent(s): 56994ad

update gpt4

Files changed (5)
  1. app.py +1 -1
  2. awesome_chat.py +10 -5
  3. config.gradio.yaml +28 -3
  4. demos/demo_parse_task.json +308 -16
  5. get_token_ids.py +1 -1
app.py CHANGED
@@ -60,7 +60,7 @@ class Client:
 
     def add_text(self, messages, message):
         if not self.OPENAI_KEY or not self.OPENAI_KEY.startswith("sk-") or not self.HUGGINGFACE_TOKEN or not self.HUGGINGFACE_TOKEN.startswith("hf_"):
-            return messages, "Please set your OpenAI API key and Hugging Face token first!!!"
+            return messages, "Please set your OpenAI API key and Hugging Face token first!"
         self.add_message(message, "user")
         messages = messages + [(message, None)]
         urls, image_urls, audio_urls, video_urls = self.extract_medias(message)
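
For context on the guard being reworded here: OpenAI API keys start with "sk-" and Hugging Face tokens with "hf_", so the prefix check rejects obviously malformed credentials before any network call. A minimal standalone sketch (helper name hypothetical):

def credentials_ok(openai_key, huggingface_token):
    # Cheap syntactic check only; a well-formed key can still be invalid.
    return bool(openai_key and openai_key.startswith("sk-")
                and huggingface_token and huggingface_token.startswith("hf_"))

assert credentials_ok("sk-abc123", "hf_abc123")
assert not credentials_ok(None, "hf_abc123")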
awesome_chat.py CHANGED
@@ -49,13 +49,15 @@ if LOG_HF_TOKEN:
     )
 
 logger = logging.getLogger(__name__)
-logger.setLevel(logging.CRITICAL)
+logger.setLevel(logging.INFO)
+logger.handlers = []
+logger.propagate = False
 
 handler = logging.StreamHandler()
 formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 handler.setFormatter(formatter)
-if not config["debug"]:
-    handler.setLevel(logging.INFO)
+if config["debug"]:
+    handler.setLevel(logging.DEBUG)
 logger.addHandler(handler)
 
 log_file = config["log_file"]
@@ -175,7 +177,7 @@ def replace_slot(text, entries):
     for key, value in entries.items():
         if not isinstance(value, str):
             value = str(value)
-        text = text.replace("{{" + key +"}}", value.replace('"', "'").replace('\n', ""))
+        text = text.replace("{{" + key +"}}", value.replace('"', "'").replace('\n', "").replace('\\', '\\\\'))
     return text
 
 def find_json(s):
@@ -275,6 +277,9 @@ def chitchat(messages, openaikey=None):
 def parse_task(context, input, openaikey=None):
     demos_or_presteps = parse_task_demos_or_presteps
     messages = json.loads(demos_or_presteps)
+    for message in messages:
+        if not isinstance(message["content"], str):
+            message["content"] = json.dumps(message["content"], ensure_ascii=False)
     messages.insert(0, {"role": "system", "content": parse_task_tprompt})
 
     # cut chat logs
@@ -337,7 +342,7 @@ def response_results(input, results, openaikey=None):
         "input": input,
         "processes": results
     })
-    messages = json.loads(demos_or_presteps)
+    messages = json.loads(demos_or_presteps, strict=False)
     messages.insert(0, {"role": "system", "content": response_results_tprompt})
     messages.append({"role": "user", "content": prompt})
     logger.debug(messages)
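
Two of the changes above are subtle string-handling fixes. A minimal sketch of both, mirroring the replace_slot() shape from this file (the sample values are hypothetical):

import json

def replace_slot(text, entries):
    for key, value in entries.items():
        if not isinstance(value, str):
            value = str(value)
        # Doubling backslashes keeps slot values such as Windows paths or
        # regexes from injecting invalid escape sequences when the filled
        # template is later parsed as JSON.
        text = text.replace("{{" + key + "}}",
                            value.replace('"', "'").replace('\n', "").replace('\\', '\\\\'))
    return text

filled = replace_slot('{"input": "{{input}}"}', {"input": "C:\\tmp\\cat.jpg"})
print(json.loads(filled))  # parses cleanly; without the escaping, "\c" would be an invalid JSON escape

# strict=False makes json.loads tolerate literal control characters (for
# example real newlines) inside string values, which the multi-line demo
# prompts now contain.
raw = '{"content": "line one\nline two"}'
print(json.loads(raw, strict=False)["content"])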
config.gradio.yaml CHANGED
@@ -16,8 +16,32 @@ logit_bias:
   parse_task: 0.5
   choose_model: 5
 tprompt:
-  parse_task: >-
-    #1 Task Planning Stage: The AI assistant can parse user input to several tasks: [{"task": task, "id": task_id, "dep": dependency_task_id, "args": {"text": text or <GENERATED>-dep_id, "image": image_url or <GENERATED>-dep_id, "audio": audio_url or <GENERATED>-dep_id}}]. The special tag "<GENERATED>-dep_id" refer to the one genereted text/image/audio in the dependency task (Please consider whether the dependency task generates resources of this type.) and "dep_id" must be in "dep" list. The "dep" field denotes the ids of the previous prerequisite tasks which generate a new resource that the current task relies on. The "args" field must in ["text", "image", "audio"], nothing else. The task MUST be selected from the following options: "token-classification", "text2text-generation", "summarization", "translation", "question-answering", "conversational", "text-generation", "sentence-similarity", "tabular-classification", "object-detection", "image-classification", "image-to-image", "image-to-text", "text-to-image", "text-to-video", "visual-question-answering", "document-question-answering", "image-segmentation", "depth-estimation", "text-to-speech", "automatic-speech-recognition", "audio-to-audio", "audio-classification", "canny-control", "hed-control", "mlsd-control", "normal-control", "openpose-control", "canny-text-to-image", "depth-text-to-image", "hed-text-to-image", "mlsd-text-to-image", "normal-text-to-image", "openpose-text-to-image", "seg-text-to-image". There may be multiple tasks of the same type. Think step by step about all the tasks needed to resolve the user's request. Parse out as few tasks as possible while ensuring that the user request can be resolved. Pay attention to the dependencies and order among tasks. If the user input can't be parsed, you need to reply empty JSON [].
+  parse_task: |-
+    #1 Task Planning Stage:
+    # Objective:
+    Parse the user input into a set of sub tasks.
+    # Task Structure:
+    {
+      "task": task_name,
+      "id": task_id,
+      "dep": [dependency_task_ids],
+      "args": {
+        "text": text OR <GENERATED>-dep_id,
+        "image": image_url OR <GENERATED>-dep_id,
+        "audio": audio_url OR <GENERATED>-dep_id
+      }
+    }
+    # Key Points:
+    1. <GENERATED>-dep_id Tag: This refers to a resource (text, image, audio) generated by a dependent task. Ensure the dependency task can produce that type of resource.
+    2. dep Field: Lists the IDs of prerequisite tasks. These tasks generate resources required by the current task.
+    3. args Field: Contains parameters for the task. Only "text", "image", and "audio" are accepted.
+    4. Task Options: The task must be one of the following:
+    "token-classification", "text2text-generation", "summarization", "translation", "question-answering", "conversational", "text-generation", "sentence-similarity", "tabular-classification", "object-detection", "image-classification", "image-to-image", "image-to-text", "text-to-image", "text-to-video", "visual-question-answering", "document-question-answering", "image-segmentation", "depth-estimation", "text-to-speech", "automatic-speech-recognition", "audio-to-audio", "audio-classification", "canny-control", "hed-control", "mlsd-control", "normal-control", "openpose-control", "canny-text-to-image", "depth-text-to-image", "hed-text-to-image", "mlsd-text-to-image", "normal-text-to-image", "openpose-text-to-image", "seg-text-to-image". Note: You can have multiple tasks of the same type.
+    5. Efficiency: Aim to parse the fewest tasks needed to fulfill the user's request.
+    6. Ordering: Ensure that tasks are in the correct sequence based on their dependencies.
+    # Output:
+    If the user input can be parsed into tasks, return the tasks as JSON. If not, return an empty JSON array [].
   choose_model: >-
     #2 Model Selection Stage: Given the user request and the parsed tasks, the AI assistant helps the user to select a suitable model from a list of models to process the user request. The assistant should focus more on the description of the model and find the model that has the most potential to solve requests and tasks. Also, prefer models with local inference endpoints for speed and stability.
   response_results: >-
@@ -27,7 +51,8 @@ demos_or_presteps:
   choose_model: demos/demo_choose_model.json
   response_results: demos/demo_response_results.json
 prompt:
-  parse_task: The chat log [ {{context}} ] may contain the resources I mentioned. Now I input { {{input}} }. Pay attention to the input and output types of tasks and the dependencies between tasks.
+  parse_task: >+
+    The chat log [ {{context}} ] may contain the resources I mentioned. Now I input { {{input}} }. Please parse the input into tasks, output in a strict JSON object:
   choose_model: >-
     Please choose the most suitable model from {{metas}} for the task {{task}}. The output must be in a strict JSON format: {"id": "id", "reason": "your detail reasons for the choice"}.
   response_results: >-
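
The tprompt block above also switches YAML scalar styles: ">-" folds newlines into spaces, "|-" preserves them, and ">+" keeps the trailing newline, so the restructured parse_task prompt reaches the model with its headings and numbered points on separate lines. A minimal sketch of the difference (assumes PyYAML is installed):

import yaml

doc = """
folded: >-
  line one
  line two
literal: |-
  line one
  line two
"""
data = yaml.safe_load(doc)
print(repr(data["folded"]))   # 'line one line two'  -- newlines folded
print(repr(data["literal"]))  # 'line one\nline two' -- newlines kept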
demos/demo_parse_task.json CHANGED
@@ -1,3 +1,4 @@
+
 [
   {
     "role": "user",
@@ -5,70 +6,361 @@
   },
   {
     "role": "assistant",
-    "content": "[{\"task\": \"image-to-text\", \"id\": 0, \"dep\": [-1], \"args\": {\"image\": \"e1.jpg\" }}, {\"task\": \"object-detection\", \"id\": 1, \"dep\": [-1], \"args\": {\"image\": \"e1.jpg\" }}, {\"task\": \"visual-question-answering\", \"id\": 2, \"dep\": [1], \"args\": {\"image\": \"<GENERATED>-1\", \"text\": \"How many sheep in the picture\"}} }}, {\"task\": \"image-to-text\", \"id\": 3, \"dep\": [-1], \"args\": {\"image\": \"e2.png\" }}, {\"task\": \"object-detection\", \"id\": 4, \"dep\": [-1], \"args\": {\"image\": \"e2.png\" }}, {\"task\": \"visual-question-answering\", \"id\": 5, \"dep\": [4], \"args\": {\"image\": \"<GENERATED>-4\", \"text\": \"How many sheep in the picture\"}} }}, {\"task\": \"image-to-text\", \"id\": 6, \"dep\": [-1], \"args\": {\"image\": \"e3.jpg\" }}, {\"task\": \"object-detection\", \"id\": 7, \"dep\": [-1], \"args\": {\"image\": \"e3.jpg\" }}, {\"task\": \"visual-question-answering\", \"id\": 8, \"dep\": [7], \"args\": {\"image\": \"<GENERATED>-7\", \"text\": \"How many sheep in the picture\"}}]"
+    "content": [
+      {
+        "task": "image-to-text",
+        "id": 0,
+        "dep": [-1],
+        "args": {
+          "image": "e1.jpg"
+        }
+      },
+      {
+        "task": "object-detection",
+        "id": 1,
+        "dep": [-1],
+        "args": {
+          "image": "e1.jpg"
+        }
+      },
+      {
+        "task": "visual-question-answering",
+        "id": 2,
+        "dep": [1],
+        "args": {
+          "image": "<GENERATED>-1",
+          "text": "How many sheep in the picture"
+        }
+      },
+      {
+        "task": "image-to-text",
+        "id": 3,
+        "dep": [-1],
+        "args": {
+          "image": "e2.png"
+        }
+      },
+      {
+        "task": "object-detection",
+        "id": 4,
+        "dep": [-1],
+        "args": {
+          "image": "e2.png"
+        }
+      },
+      {
+        "task": "visual-question-answering",
+        "id": 5,
+        "dep": [4],
+        "args": {
+          "image": "<GENERATED>-4",
+          "text": "How many sheep in the picture"
+        }
+      },
+      {
+        "task": "image-to-text",
+        "id": 6,
+        "dep": [-1],
+        "args": {
+          "image": "e3.jpg"
+        }
+      },
+      {
+        "task": "object-detection",
+        "id": 7,
+        "dep": [-1],
+        "args": {
+          "image": "e3.jpg"
+        }
+      },
+      {
+        "task": "visual-question-answering",
+        "id": 8,
+        "dep": [7],
+        "args": {
+          "image": "<GENERATED>-7",
+          "text": "How many sheep in the picture"
+        }
+      }
+    ]
   },
 
+
   {
-    "role":"user",
-    "content":"Look at /e.jpg, can you tell me how many objects in the picture? Give me a picture and video similar to this one."
+    "role": "user",
+    "content": "Look at /e.jpg, can you tell me how many objects in the picture? Give me a picture and video similar to this one."
   },
   {
-    "role":"assistant",
-    "content":"[{\"task\": \"image-to-text\", \"id\": 0, \"dep\": [-1], \"args\": {\"image\": \"/e.jpg\" }}, {\"task\": \"object-detection\", \"id\": 1, \"dep\": [-1], \"args\": {\"image\": \"/e.jpg\" }}, {\"task\": \"visual-question-answering\", \"id\": 2, \"dep\": [1], \"args\": {\"image\": \"<GENERATED>-1\", \"text\": \"how many objects in the picture?\" }}, {\"task\": \"text-to-image\", \"id\": 3, \"dep\": [0], \"args\": {\"text\": \"<GENERATED-0>\" }}, {\"task\": \"image-to-image\", \"id\": 4, \"dep\": [-1], \"args\": {\"image\": \"/e.jpg\" }}, {\"task\": \"text-to-video\", \"id\": 5, \"dep\": [0], \"args\": {\"text\": \"<GENERATED-0>\" }}]"
+    "role": "assistant",
+    "content": [
+      {
+        "task": "image-to-text",
+        "id": 0,
+        "dep": [-1],
+        "args": {
+          "image": "/e.jpg"
+        }
+      },
+      {
+        "task": "object-detection",
+        "id": 1,
+        "dep": [-1],
+        "args": {
+          "image": "/e.jpg"
+        }
+      },
+      {
+        "task": "visual-question-answering",
+        "id": 2,
+        "dep": [1],
+        "args": {
+          "image": "<GENERATED>-1",
+          "text": "how many objects in the picture?"
+        }
+      },
+      {
+        "task": "text-to-image",
+        "id": 3,
+        "dep": [0],
+        "args": {
+          "text": "<GENERATED-0>"
+        }
+      },
+      {
+        "task": "image-to-image",
+        "id": 4,
+        "dep": [-1],
+        "args": {
+          "image": "/e.jpg"
+        }
+      },
+      {
+        "task": "text-to-video",
+        "id": 5,
+        "dep": [0],
+        "args": {
+          "text": "<GENERATED-0>"
+        }
+      }
+    ]
   },
 
   {
-    "role":"user",
-    "content":"given a document /images/e.jpeg, answer me what is the student amount? And describe the image with your voice"
+    "role": "user",
+    "content": "given a document /images/e.jpeg, answer me what is the student amount? And describe the image with your voice"
   },
   {
-    "role":"assistant",
-    "content":"{\"task\": \"document-question-answering\", \"id\": 0, \"dep\": [-1], \"args\": {\"image\": \"/images/e.jpeg\", \"text\": \"what is the student amount?\" }}, {\"task\": \"visual-question-answering\", \"id\": 1, \"dep\": [-1], \"args\": {\"image\": \"/images/e.jpeg\", \"text\": \"what is the student amount?\" }}, {\"task\": \"image-to-text\", \"id\": 2, \"dep\": [-1], \"args\": {\"image\": \"/images/e.jpg\" }}, {\"task\": \"text-to-speech\", \"id\": 3, \"dep\": [2], \"args\": {\"text\": \"<GENERATED>-2\" }}]"
+    "role": "assistant",
+    "content": [
+      {
+        "task": "document-question-answering",
+        "id": 0,
+        "dep": [-1],
+        "args": {
+          "image": "/images/e.jpeg",
+          "text": "what is the student amount?"
+        }
+      },
+      {
+        "task": "visual-question-answering",
+        "id": 1,
+        "dep": [-1],
+        "args": {
+          "image": "/images/e.jpeg",
+          "text": "what is the student amount?"
+        }
+      },
+      {
+        "task": "image-to-text",
+        "id": 2,
+        "dep": [-1],
+        "args": {
+          "image": "/images/e.jpeg"
+        }
+      },
+      {
+        "task": "text-to-speech",
+        "id": 3,
+        "dep": [2],
+        "args": {
+          "text": "<GENERATED>-2"
+        }
+      }
+    ]
   },
 
+
   {
     "role": "user",
-    "content": "Given an image /example.jpg, first generate a hed image, then based on the hed image generate a new image where a girl is reading a book"
+    "content": "Given an image /example.jpg, first generate a pose image, then based on the pose generate a new image where a girl is reading a book"
   },
   {
     "role": "assistant",
-    "content": "[{\"task\": \"openpose-control\", \"id\": 0, \"dep\": [-1], \"args\": {\"image\": \"/example.jpg\" }}, {\"task\": \"openpose-text-to-image\", \"id\": 1, \"dep\": [0], \"args\": {\"text\": \"a girl is reading a book\", \"image\": \"<GENERATED>-0\" }}]"
+    "content": [
+      {
+        "task": "openpose-control",
+        "id": 0,
+        "dep": [-1],
+        "args": {
+          "image": "/example.jpg"
+        }
+      },
+      {
+        "task": "openpose-text-to-image",
+        "id": 1,
+        "dep": [0],
+        "args": {
+          "text": "a girl is reading a book",
+          "image": "<GENERATED>-0"
+        }
+      }
+    ]
   },
 
+
   {
     "role": "user",
     "content": "please show me a video and an image of (based on the text) 'a boy is running' and dub it"
   },
   {
     "role": "assistant",
-    "content": "[{\"task\": \"text-to-video\", \"id\": 0, \"dep\": [-1], \"args\": {\"text\": \"a boy is running\" }}, {\"task\": \"text-to-speech\", \"id\": 1, \"dep\": [-1], \"args\": {\"text\": \"a boy is running\" }}, {\"task\": \"text-to-image\", \"id\": 2, \"dep\": [-1], \"args\": {\"text\": \"a boy is running\" }}]"
+    "content": [
+      {
+        "task": "text-to-video",
+        "id": 0,
+        "dep": [-1],
+        "args": {
+          "text": "a boy is running"
+        }
+      },
+      {
+        "task": "text-to-speech",
+        "id": 1,
+        "dep": [-1],
+        "args": {
+          "text": "a boy is running"
+        }
+      },
+      {
+        "task": "text-to-image",
+        "id": 2,
+        "dep": [-1],
+        "args": {
+          "text": "a boy is running"
+        }
+      }
+    ]
   },
 
-
   {
     "role": "user",
     "content": "please show me a joke and an image of cat"
   },
   {
     "role": "assistant",
-    "content": "[{\"task\": \"conversational\", \"id\": 0, \"dep\": [-1], \"args\": {\"text\": \"please show me a joke of cat\" }}, {\"task\": \"text-to-image\", \"id\": 1, \"dep\": [-1], \"args\": {\"text\": \"a photo of cat\" }}]"
+    "content": [
+      {
+        "task": "conversational",
+        "id": 0,
+        "dep": [-1],
+        "args": {
+          "text": "please show me a joke of cat"
+        }
+      },
+      {
+        "task": "text-to-image",
+        "id": 1,
+        "dep": [-1],
+        "args": {
+          "text": "a photo of cat"
+        }
+      }
+    ]
   },
 
+
   {
     "role": "user",
     "content": "give me a picture about a cut dog, then describe the image to me and tell a story about it"
   },
   {
     "role": "assistant",
-    "content": "[{\"task\": \"text-to-image\", \"id\": 0, \"dep\": [-1], \"args\": {\"text\": \"a picture of a cut dog\" }}, {\"task\": \"image-to-text\", \"id\": 1, \"dep\": [0], \"args\": {\"image\": \"<GENERATED>-0\" }}, {\"task\": \"text-generation\", \"id\": 2, \"dep\": [1], \"args\": {\"text\": \"<GENERATED>-1\" }}, {\"task\": \"text-to-speech\", \"id\": 3, \"dep\": [2], \"args\": {\"text\": \"<GENERATED>-2\" }}]"
+    "content": [
+      {
+        "task": "text-to-image",
+        "id": 0,
+        "dep": [-1],
+        "args": {
+          "text": "a picture of a cute dog"
+        }
+      },
+      {
+        "task": "image-to-text",
+        "id": 1,
+        "dep": [0],
+        "args": {
+          "image": "<GENERATED>-0"
+        }
+      },
+      {
+        "task": "text-generation",
+        "id": 2,
+        "dep": [1],
+        "args": {
+          "text": "<GENERATED>-1"
+        }
+      },
+      {
+        "task": "text-to-speech",
+        "id": 3,
+        "dep": [2],
+        "args": {
+          "text": "<GENERATED>-2"
+        }
+      }
+    ]
   },
 
+
   {
     "role": "user",
     "content": "give you a picture /example.jpg, what's in it and tell me a joke about it"
   },
   {
     "role": "assistant",
-    "content": "[{\"task\": \"image-to-text\", \"id\": 0, \"dep\": [-1], \"args\": {\"image\": \"/example.jpg\" }}, {\"task\": \"object-detection\", \"id\": 1, \"dep\": [-1], \"args\": {\"image\": \"/example.jpg\" }}, {\"task\": \"conversational\", \"id\": 2, \"dep\": [0], \"args\": {\"text\": \"<GENERATED>-0\" }}, {\"task\": \"text-to-speech\", \"id\": 3, \"dep\": [2], \"args\": {\"text\": \"<GENERATED>-1\" }}]"
+    "content": [
+      {
+        "task": "image-to-text",
+        "id": 0,
+        "dep": [-1],
+        "args": {
+          "image": "/example.jpg"
+        }
+      },
+      {
+        "task": "object-detection",
+        "id": 1,
+        "dep": [-1],
+        "args": {
+          "image": "/example.jpg"
+        }
+      },
+      {
+        "task": "conversational",
+        "id": 2,
+        "dep": [0],
+        "args": {
+          "text": "<GENERATED>-0"
+        }
+      },
+      {
+        "task": "text-to-speech",
+        "id": 3,
+        "dep": [2],
+        "args": {
+          "text": "<GENERATED>-1"
+        }
+      }
+    ]
   }
 ]
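
The assistant "content" entries in this demo are now JSON arrays rather than escaped single-line strings, which is why parse_task() in awesome_chat.py gained the json.dumps normalization above: chat-completion messages must carry string content. A minimal sketch of the round trip:

import json

with open("demos/demo_parse_task.json") as f:
    messages = json.load(f)

# Re-serialize any structured content back into a compact string before
# sending the messages to the chat API.
for message in messages:
    if not isinstance(message["content"], str):
        message["content"] = json.dumps(message["content"], ensure_ascii=False)

print(messages[1]["content"][:80])  # the task list, one line again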
get_token_ids.py CHANGED
@@ -17,7 +17,7 @@ encodings = {
 }
 
 max_length = {
-    "gpt-4": 4096,
+    "gpt-4": 8192,
     "gpt-3.5-turbo": 4096,
     "gpt-3.5-turbo-0301": 4096,
     "text-davinci-003": 4096,