tricktreat committed
Commit c5ea4b9
1 Parent(s): 56994ad

update gpt4

Files changed (5)
  1. app.py +1 -1
  2. awesome_chat.py +10 -5
  3. config.gradio.yaml +28 -3
  4. demos/demo_parse_task.json +308 -16
  5. get_token_ids.py +1 -1
app.py CHANGED
@@ -60,7 +60,7 @@ class Client:
 
     def add_text(self, messages, message):
         if not self.OPENAI_KEY or not self.OPENAI_KEY.startswith("sk-") or not self.HUGGINGFACE_TOKEN or not self.HUGGINGFACE_TOKEN.startswith("hf_"):
-            return messages, "Please set your OpenAI API key and Hugging Face token first!!!"
+            return messages, "Please set your OpenAI API key and Hugging Face token first!"
         self.add_message(message, "user")
         messages = messages + [(message, None)]
         urls, image_urls, audio_urls, video_urls = self.extract_medias(message)
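
For context on the guard being reworded here: OpenAI API keys start with "sk-" and Hugging Face tokens with "hf_", so the prefix check rejects obviously malformed credentials before any network call. A minimal standalone sketch (helper name hypothetical):

def credentials_ok(openai_key, huggingface_token):
    # Cheap syntactic check only; a well-formed key can still be invalid.
    return bool(openai_key and openai_key.startswith("sk-")
                and huggingface_token and huggingface_token.startswith("hf_"))

assert credentials_ok("sk-abc123", "hf_abc123")
assert not credentials_ok(None, "hf_abc123")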
awesome_chat.py CHANGED
@@ -49,13 +49,15 @@ if LOG_HF_TOKEN:
     )
 
 logger = logging.getLogger(__name__)
-logger.setLevel(logging.CRITICAL)
+logger.setLevel(logging.INFO)
+logger.handlers = []
+logger.propagate = False
 
 handler = logging.StreamHandler()
 formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 handler.setFormatter(formatter)
-if not config["debug"]:
-    handler.setLevel(logging.INFO)
+if config["debug"]:
+    handler.setLevel(logging.DEBUG)
 logger.addHandler(handler)
 
 log_file = config["log_file"]
@@ -175,7 +177,7 @@ def replace_slot(text, entries):
     for key, value in entries.items():
         if not isinstance(value, str):
             value = str(value)
-        text = text.replace("{{" + key +"}}", value.replace('"', "'").replace('\n', ""))
+        text = text.replace("{{" + key +"}}", value.replace('"', "'").replace('\n', "").replace('\\', '\\\\'))
     return text
 
 def find_json(s):
@@ -275,6 +277,9 @@ def chitchat(messages, openaikey=None):
 def parse_task(context, input, openaikey=None):
     demos_or_presteps = parse_task_demos_or_presteps
     messages = json.loads(demos_or_presteps)
+    for message in messages:
+        if not isinstance(message["content"], str):
+            message["content"] = json.dumps(message["content"], ensure_ascii=False)
     messages.insert(0, {"role": "system", "content": parse_task_tprompt})
 
     # cut chat logs
@@ -337,7 +342,7 @@ def response_results(input, results, openaikey=None):
         "input": input,
         "processes": results
     })
-    messages = json.loads(demos_or_presteps)
+    messages = json.loads(demos_or_presteps, strict=False)
     messages.insert(0, {"role": "system", "content": response_results_tprompt})
     messages.append({"role": "user", "content": prompt})
     logger.debug(messages)
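
Two of the changes above are subtle string-handling fixes. A minimal sketch of both, mirroring the replace_slot() shape from this file (the sample values are hypothetical):

import json

def replace_slot(text, entries):
    for key, value in entries.items():
        if not isinstance(value, str):
            value = str(value)
        # Doubling backslashes keeps slot values such as Windows paths or
        # regexes from injecting invalid escape sequences when the filled
        # template is later parsed as JSON.
        text = text.replace("{{" + key + "}}",
                            value.replace('"', "'").replace('\n', "").replace('\\', '\\\\'))
    return text

filled = replace_slot('{"input": "{{input}}"}', {"input": "C:\\tmp\\cat.jpg"})
print(json.loads(filled))  # parses cleanly; without the escaping, "\c" would be an invalid JSON escape

# strict=False makes json.loads tolerate literal control characters (for
# example real newlines) inside string values, which the multi-line demo
# prompts now contain.
raw = '{"content": "line one\nline two"}'
print(json.loads(raw, strict=False)["content"])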
config.gradio.yaml CHANGED
@@ -16,8 +16,32 @@ logit_bias:
   parse_task: 0.5
   choose_model: 5
 tprompt:
-  parse_task: >-
-    #1 Task Planning Stage: The AI assistant can parse user input to several tasks: [{"task": task, "id": task_id, "dep": dependency_task_id, "args": {"text": text or <GENERATED>-dep_id, "image": image_url or <GENERATED>-dep_id, "audio": audio_url or <GENERATED>-dep_id}}]. The special tag "<GENERATED>-dep_id" refer to the one genereted text/image/audio in the dependency task (Please consider whether the dependency task generates resources of this type.) and "dep_id" must be in "dep" list. The "dep" field denotes the ids of the previous prerequisite tasks which generate a new resource that the current task relies on. The "args" field must in ["text", "image", "audio"], nothing else. The task MUST be selected from the following options: "token-classification", "text2text-generation", "summarization", "translation", "question-answering", "conversational", "text-generation", "sentence-similarity", "tabular-classification", "object-detection", "image-classification", "image-to-image", "image-to-text", "text-to-image", "text-to-video", "visual-question-answering", "document-question-answering", "image-segmentation", "depth-estimation", "text-to-speech", "automatic-speech-recognition", "audio-to-audio", "audio-classification", "canny-control", "hed-control", "mlsd-control", "normal-control", "openpose-control", "canny-text-to-image", "depth-text-to-image", "hed-text-to-image", "mlsd-text-to-image", "normal-text-to-image", "openpose-text-to-image", "seg-text-to-image". There may be multiple tasks of the same type. Think step by step about all the tasks needed to resolve the user's request. Parse out as few tasks as possible while ensuring that the user request can be resolved. Pay attention to the dependencies and order among tasks. If the user input can't be parsed, you need to reply empty JSON [].
+  parse_task: |-
+    #1 Task Planning Stage:
+    # Objective:
+    Parse the user input into a set of sub tasks.
+    # Task Structure:
+    {
+      "task": task_name,
+      "id": task_id,
+      "dep": [dependency_task_ids],
+      "args": {
+        "text": text OR <GENERATED>-dep_id,
+        "image": image_url OR <GENERATED>-dep_id,
+        "audio": audio_url OR <GENERATED>-dep_id
+      }
+    }
+    # Key Points:
+    1. <GENERATED>-dep_id Tag: This refers to a resource (text, image, audio) generated by a dependent task. Ensure the dependency task can produce that type of resource.
+    2. dep Field: Lists the IDs of prerequisite tasks. These tasks generate resources required by the current task.
+    3. args Field: Contains parameters for the task. Only "text", "image", and "audio" are accepted.
+    4. Task Options: The task must be one of the following:
+    "token-classification", "text2text-generation", "summarization", "translation", "question-answering", "conversational", "text-generation", "sentence-similarity", "tabular-classification", "object-detection", "image-classification", "image-to-image", "image-to-text", "text-to-image", "text-to-video", "visual-question-answering", "document-question-answering", "image-segmentation", "depth-estimation", "text-to-speech", "automatic-speech-recognition", "audio-to-audio", "audio-classification", "canny-control", "hed-control", "mlsd-control", "normal-control", "openpose-control", "canny-text-to-image", "depth-text-to-image", "hed-text-to-image", "mlsd-text-to-image", "normal-text-to-image", "openpose-text-to-image", "seg-text-to-image". Note: You can have multiple tasks of the same type.
+    5. Efficiency: Aim to parse the fewest tasks needed to fulfill the user's request.
+    6. Ordering: Ensure that tasks are in the correct sequence based on their dependencies.
+    # Output:
+    If the user input can be parsed into tasks, return the tasks as JSON. If not, return an empty JSON array [].
   choose_model: >-
     #2 Model Selection Stage: Given the user request and the parsed tasks, the AI assistant helps the user to select a suitable model from a list of models to process the user request. The assistant should focus more on the description of the model and find the model that has the most potential to solve requests and tasks. Also, prefer models with local inference endpoints for speed and stability.
   response_results: >-
@@ -27,7 +51,8 @@ demos_or_presteps:
   choose_model: demos/demo_choose_model.json
   response_results: demos/demo_response_results.json
 prompt:
-  parse_task: The chat log [ {{context}} ] may contain the resources I mentioned. Now I input { {{input}} }. Pay attention to the input and output types of tasks and the dependencies between tasks.
+  parse_task: >+
+    The chat log [ {{context}} ] may contain the resources I mentioned. Now I input { {{input}} }. Please parse the input into tasks, output in a strict JSON object:
   choose_model: >-
     Please choose the most suitable model from {{metas}} for the task {{task}}. The output must be in a strict JSON format: {"id": "id", "reason": "your detail reasons for the choice"}.
   response_results: >-
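
The tprompt block above also switches YAML scalar styles: ">-" folds newlines into spaces, "|-" preserves them, and ">+" keeps the trailing newline, so the restructured parse_task prompt reaches the model with its headings and numbered points on separate lines. A minimal sketch of the difference (assumes PyYAML is installed):

import yaml

doc = """
folded: >-
  line one
  line two
literal: |-
  line one
  line two
"""
data = yaml.safe_load(doc)
print(repr(data["folded"]))   # 'line one line two'  -- newlines folded
print(repr(data["literal"]))  # 'line one\nline two' -- newlines kept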
demos/demo_parse_task.json CHANGED
@@ -1,3 +1,4 @@
+
 [
   {
     "role": "user",
@@ -5,70 +6,361 @@
   },
   {
     "role": "assistant",
-    "content": "[{\"task\": \"image-to-text\", \"id\": 0, \"dep\": [-1], \"args\": {\"image\": \"e1.jpg\" }}, {\"task\": \"object-detection\", \"id\": 1, \"dep\": [-1], \"args\": {\"image\": \"e1.jpg\" }}, {\"task\": \"visual-question-answering\", \"id\": 2, \"dep\": [1], \"args\": {\"image\": \"<GENERATED>-1\", \"text\": \"How many sheep in the picture\"}} }}, {\"task\": \"image-to-text\", \"id\": 3, \"dep\": [-1], \"args\": {\"image\": \"e2.png\" }}, {\"task\": \"object-detection\", \"id\": 4, \"dep\": [-1], \"args\": {\"image\": \"e2.png\" }}, {\"task\": \"visual-question-answering\", \"id\": 5, \"dep\": [4], \"args\": {\"image\": \"<GENERATED>-4\", \"text\": \"How many sheep in the picture\"}} }}, {\"task\": \"image-to-text\", \"id\": 6, \"dep\": [-1], \"args\": {\"image\": \"e3.jpg\" }}, {\"task\": \"object-detection\", \"id\": 7, \"dep\": [-1], \"args\": {\"image\": \"e3.jpg\" }}, {\"task\": \"visual-question-answering\", \"id\": 8, \"dep\": [7], \"args\": {\"image\": \"<GENERATED>-7\", \"text\": \"How many sheep in the picture\"}}]"
+    "content": [
+      {
+        "task": "image-to-text",
+        "id": 0,
+        "dep": [-1],
+        "args": {
+          "image": "e1.jpg"
+        }
+      },
+      {
+        "task": "object-detection",
+        "id": 1,
+        "dep": [-1],
+        "args": {
+          "image": "e1.jpg"
+        }
+      },
+      {
+        "task": "visual-question-answering",
+        "id": 2,
+        "dep": [1],
+        "args": {
+          "image": "<GENERATED>-1",
+          "text": "How many sheep in the picture"
+        }
+      },
+      {
+        "task": "image-to-text",
+        "id": 3,
+        "dep": [-1],
+        "args": {
+          "image": "e2.png"
+        }
+      },
+      {
+        "task": "object-detection",
+        "id": 4,
+        "dep": [-1],
+        "args": {
+          "image": "e2.png"
+        }
+      },
+      {
+        "task": "visual-question-answering",
+        "id": 5,
+        "dep": [4],
+        "args": {
+          "image": "<GENERATED>-4",
+          "text": "How many sheep in the picture"
+        }
+      },
+      {
+        "task": "image-to-text",
+        "id": 6,
+        "dep": [-1],
+        "args": {
+          "image": "e3.jpg"
+        }
+      },
+      {
+        "task": "object-detection",
+        "id": 7,
+        "dep": [-1],
+        "args": {
+          "image": "e3.jpg"
+        }
+      },
+      {
+        "task": "visual-question-answering",
+        "id": 8,
+        "dep": [7],
+        "args": {
+          "image": "<GENERATED>-7",
+          "text": "How many sheep in the picture"
+        }
+      }
+    ]
   },
 
+
   {
-    "role":"user",
-    "content":"Look at /e.jpg, can you tell me how many objects in the picture? Give me a picture and video similar to this one."
+    "role": "user",
+    "content": "Look at /e.jpg, can you tell me how many objects in the picture? Give me a picture and video similar to this one."
   },
   {
-    "role":"assistant",
-    "content":"[{\"task\": \"image-to-text\", \"id\": 0, \"dep\": [-1], \"args\": {\"image\": \"/e.jpg\" }}, {\"task\": \"object-detection\", \"id\": 1, \"dep\": [-1], \"args\": {\"image\": \"/e.jpg\" }}, {\"task\": \"visual-question-answering\", \"id\": 2, \"dep\": [1], \"args\": {\"image\": \"<GENERATED>-1\", \"text\": \"how many objects in the picture?\" }}, {\"task\": \"text-to-image\", \"id\": 3, \"dep\": [0], \"args\": {\"text\": \"<GENERATED-0>\" }}, {\"task\": \"image-to-image\", \"id\": 4, \"dep\": [-1], \"args\": {\"image\": \"/e.jpg\" }}, {\"task\": \"text-to-video\", \"id\": 5, \"dep\": [0], \"args\": {\"text\": \"<GENERATED-0>\" }}]"
+    "role": "assistant",
+    "content": [
+      {
+        "task": "image-to-text",
+        "id": 0,
+        "dep": [-1],
+        "args": {
+          "image": "/e.jpg"
+        }
+      },
+      {
+        "task": "object-detection",
+        "id": 1,
+        "dep": [-1],
+        "args": {
+          "image": "/e.jpg"
+        }
+      },
+      {
+        "task": "visual-question-answering",
+        "id": 2,
+        "dep": [1],
+        "args": {
+          "image": "<GENERATED>-1",
+          "text": "how many objects in the picture?"
+        }
+      },
+      {
+        "task": "text-to-image",
+        "id": 3,
+        "dep": [0],
+        "args": {
+          "text": "<GENERATED-0>"
+        }
+      },
+      {
+        "task": "image-to-image",
+        "id": 4,
+        "dep": [-1],
+        "args": {
+          "image": "/e.jpg"
+        }
+      },
+      {
+        "task": "text-to-video",
+        "id": 5,
+        "dep": [0],
+        "args": {
+          "text": "<GENERATED-0>"
+        }
+      }
+    ]
   },
 
   {
-    "role":"user",
-    "content":"given a document /images/e.jpeg, answer me what is the student amount? And describe the image with your voice"
+    "role": "user",
+    "content": "given a document /images/e.jpeg, answer me what is the student amount? And describe the image with your voice"
   },
   {
-    "role":"assistant",
-    "content":"{\"task\": \"document-question-answering\", \"id\": 0, \"dep\": [-1], \"args\": {\"image\": \"/images/e.jpeg\", \"text\": \"what is the student amount?\" }}, {\"task\": \"visual-question-answering\", \"id\": 1, \"dep\": [-1], \"args\": {\"image\": \"/images/e.jpeg\", \"text\": \"what is the student amount?\" }}, {\"task\": \"image-to-text\", \"id\": 2, \"dep\": [-1], \"args\": {\"image\": \"/images/e.jpg\" }}, {\"task\": \"text-to-speech\", \"id\": 3, \"dep\": [2], \"args\": {\"text\": \"<GENERATED>-2\" }}]"
+    "role": "assistant",
+    "content": [
+      {
+        "task": "document-question-answering",
+        "id": 0,
+        "dep": [-1],
+        "args": {
+          "image": "/images/e.jpeg",
+          "text": "what is the student amount?"
+        }
+      },
+      {
+        "task": "visual-question-answering",
+        "id": 1,
+        "dep": [-1],
+        "args": {
+          "image": "/images/e.jpeg",
+          "text": "what is the student amount?"
+        }
+      },
+      {
+        "task": "image-to-text",
+        "id": 2,
+        "dep": [-1],
+        "args": {
+          "image": "/images/e.jpeg"
+        }
+      },
+      {
+        "task": "text-to-speech",
+        "id": 3,
+        "dep": [2],
+        "args": {
+          "text": "<GENERATED>-2"
+        }
+      }
+    ]
   },
 
+
   {
     "role": "user",
-    "content": "Given an image /example.jpg, first generate a hed image, then based on the hed image generate a new image where a girl is reading a book"
+    "content": "Given an image /example.jpg, first generate a pose image, then based on the pose generate a new image where a girl is reading a book"
   },
   {
     "role": "assistant",
-    "content": "[{\"task\": \"openpose-control\", \"id\": 0, \"dep\": [-1], \"args\": {\"image\": \"/example.jpg\" }}, {\"task\": \"openpose-text-to-image\", \"id\": 1, \"dep\": [0], \"args\": {\"text\": \"a girl is reading a book\", \"image\": \"<GENERATED>-0\" }}]"
+    "content": [
+      {
+        "task": "openpose-control",
+        "id": 0,
+        "dep": [-1],
+        "args": {
+          "image": "/example.jpg"
+        }
+      },
+      {
+        "task": "openpose-text-to-image",
+        "id": 1,
+        "dep": [0],
+        "args": {
+          "text": "a girl is reading a book",
+          "image": "<GENERATED>-0"
+        }
+      }
+    ]
   },
 
+
   {
     "role": "user",
     "content": "please show me a video and an image of (based on the text) 'a boy is running' and dub it"
   },
   {
     "role": "assistant",
-    "content": "[{\"task\": \"text-to-video\", \"id\": 0, \"dep\": [-1], \"args\": {\"text\": \"a boy is running\" }}, {\"task\": \"text-to-speech\", \"id\": 1, \"dep\": [-1], \"args\": {\"text\": \"a boy is running\" }}, {\"task\": \"text-to-image\", \"id\": 2, \"dep\": [-1], \"args\": {\"text\": \"a boy is running\" }}]"
+    "content": [
+      {
+        "task": "text-to-video",
+        "id": 0,
+        "dep": [-1],
+        "args": {
+          "text": "a boy is running"
+        }
+      },
+      {
+        "task": "text-to-speech",
+        "id": 1,
+        "dep": [-1],
+        "args": {
+          "text": "a boy is running"
+        }
+      },
+      {
+        "task": "text-to-image",
+        "id": 2,
+        "dep": [-1],
+        "args": {
+          "text": "a boy is running"
+        }
+      }
+    ]
   },
 
-
   {
     "role": "user",
     "content": "please show me a joke and an image of cat"
   },
   {
     "role": "assistant",
-    "content": "[{\"task\": \"conversational\", \"id\": 0, \"dep\": [-1], \"args\": {\"text\": \"please show me a joke of cat\" }}, {\"task\": \"text-to-image\", \"id\": 1, \"dep\": [-1], \"args\": {\"text\": \"a photo of cat\" }}]"
+    "content": [
+      {
+        "task": "conversational",
+        "id": 0,
+        "dep": [-1],
+        "args": {
+          "text": "please show me a joke of cat"
+        }
+      },
+      {
+        "task": "text-to-image",
+        "id": 1,
+        "dep": [-1],
+        "args": {
+          "text": "a photo of cat"
+        }
+      }
+    ]
   },
 
+
   {
     "role": "user",
     "content": "give me a picture about a cut dog, then describe the image to me and tell a story about it"
   },
   {
     "role": "assistant",
-    "content": "[{\"task\": \"text-to-image\", \"id\": 0, \"dep\": [-1], \"args\": {\"text\": \"a picture of a cut dog\" }}, {\"task\": \"image-to-text\", \"id\": 1, \"dep\": [0], \"args\": {\"image\": \"<GENERATED>-0\" }}, {\"task\": \"text-generation\", \"id\": 2, \"dep\": [1], \"args\": {\"text\": \"<GENERATED>-1\" }}, {\"task\": \"text-to-speech\", \"id\": 3, \"dep\": [2], \"args\": {\"text\": \"<GENERATED>-2\" }}]"
+    "content": [
+      {
+        "task": "text-to-image",
+        "id": 0,
+        "dep": [-1],
+        "args": {
+          "text": "a picture of a cute dog"
+        }
+      },
+      {
+        "task": "image-to-text",
+        "id": 1,
+        "dep": [0],
+        "args": {
+          "image": "<GENERATED>-0"
+        }
+      },
+      {
+        "task": "text-generation",
+        "id": 2,
+        "dep": [1],
+        "args": {
+          "text": "<GENERATED>-1"
+        }
+      },
+      {
+        "task": "text-to-speech",
+        "id": 3,
+        "dep": [2],
+        "args": {
+          "text": "<GENERATED>-2"
+        }
+      }
+    ]
   },
 
+
   {
     "role": "user",
     "content": "give you a picture /example.jpg, what's in it and tell me a joke about it"
   },
   {
     "role": "assistant",
-    "content": "[{\"task\": \"image-to-text\", \"id\": 0, \"dep\": [-1], \"args\": {\"image\": \"/example.jpg\" }}, {\"task\": \"object-detection\", \"id\": 1, \"dep\": [-1], \"args\": {\"image\": \"/example.jpg\" }}, {\"task\": \"conversational\", \"id\": 2, \"dep\": [0], \"args\": {\"text\": \"<GENERATED>-0\" }}, {\"task\": \"text-to-speech\", \"id\": 3, \"dep\": [2], \"args\": {\"text\": \"<GENERATED>-1\" }}]"
+    "content": [
+      {
+        "task": "image-to-text",
+        "id": 0,
+        "dep": [-1],
+        "args": {
+          "image": "/example.jpg"
+        }
+      },
+      {
+        "task": "object-detection",
+        "id": 1,
+        "dep": [-1],
+        "args": {
+          "image": "/example.jpg"
+        }
+      },
+      {
+        "task": "conversational",
+        "id": 2,
+        "dep": [0],
+        "args": {
+          "text": "<GENERATED>-0"
+        }
+      },
+      {
+        "task": "text-to-speech",
+        "id": 3,
+        "dep": [2],
+        "args": {
+          "text": "<GENERATED>-1"
+        }
+      }
+    ]
   }
 ]
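
The assistant "content" entries in this demo are now JSON arrays rather than escaped single-line strings, which is why parse_task() in awesome_chat.py gained the json.dumps normalization above: chat-completion messages must carry string content. A minimal sketch of the round trip:

import json

with open("demos/demo_parse_task.json") as f:
    messages = json.load(f)

# Re-serialize any structured content back into a compact string before
# sending the messages to the chat API.
for message in messages:
    if not isinstance(message["content"], str):
        message["content"] = json.dumps(message["content"], ensure_ascii=False)

print(messages[1]["content"][:80])  # the task list, one line again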
get_token_ids.py CHANGED
@@ -17,7 +17,7 @@ encodings = {
 }
 
 max_length = {
-    "gpt-4": 4096,
+    "gpt-4": 8192,
     "gpt-3.5-turbo": 4096,
     "gpt-3.5-turbo-0301": 4096,
     "text-davinci-003": 4096,