zwgao's picture
add file
3fdcc70
raw
history blame
No virus
21.4 kB
import os
import sys
sys.path.append(os.getcwd())
from cllm.agents.base import Action
BUILTIN_SEG_BY_POINTS = "Segment the given image based on the prompt points."
BUILTIN_SEG_BY_MASK = "Segment the given image based on the prompt mask."
# BUILTIN_REMOVE_BY_MASK = "Remove the object based on the given mask."
BUILTIN_IMAGE_TO_EDGE = "Generate the edge from the given image."
BUILTIN_GENERATE_SIMILAR_IMAGE = "Generate a new image similar to the input image"
# BUILTIN_GENERATE_SIMILAR_IMAGE2 = "Generate a similar image from the given image 2"
# BUILTIN_GENERATE_SIMILAR_IMAGE3 = "Image to image. 3"
BUILTIN_GENERATE_SIMILAR_IMAGE4 = "Generate a new image similar to image 4"
BUILTIN_GENERATE_IMAGE_HED = "Generate a new image based on HED result from input image"
BUILTIN_GENERATE_IMAGE_DEPTH = (
"Generate a new image based on depth map from input image"
)
BUILTIN_GENERATE_IMAGE_OCR = "Please extract the text from the image"
BUILTIN_TEXT_EDGE_TO_IMAGE = "Generate an image based on the given edge map."
BUILTIN_GENERATE_IMAGE = "Generate a new image that shows a woman is skiing"
BUILTIN_IMAGE_TO_VIDEO = "Generate a video from the image"
BUILTIN_COUNT_OBJECTS = "Provide me with the count of bears in the input image"
BUILTIN_VIDEO_TO_WEBPAGE = "Generate a web page for input video"
BUILTIN_TEXT_TO_MUSIC = "Please generate a piece of music based on given prompt. Here is the prompt: An 80s driving pop song with heavy drums and synth pads in the background"
BUILTIN_IMAGE_ERASING1 = "Erase the wine glass from the photo"
BUILTIN_IMAGE_ERASING2 = "Erase the cats in the photo"
BUILTIN_IMAGE_CROPPING = "Crop the cats from the photo"
BUILTIN_IMAGE_SEG = "give me the mask of elephant."
BUILTIN_IMAGE_HIGHLIGHT = "highlight the elephant."
BUILTIN_TEXT_SPEECH = "translate text into speech"
BUILTIN_DUBBING = "dub this video with the given audio"
BUILTIN_COUNT_OBJECTS2 = "Count the horse in the image."
BUILTIN_IMAGE_TO_VIDEO2 = "Generate an image that shows a serene and beautiful landscape with a calm lake reflecting the blue sky and white clouds. Then generate a video to introduce this image."
BUILTIN_IMAGE_TO_VIDEO3 = "Create a visual and auditory representation of a peaceful and scenic landscape. The image should depict a serene and beautiful landscape with a calm lake reflecting the blue sky. The music should match the image. Finally, combine the image and the music into a video that showcases the beauty of nature."
BUILTIN_VIDEO_CLS = "Recognize the action in the video"
BUILTIN_VIDEO_CLS = "Recognize the action in the video"
BUILTIN_AUDIO_CLS = "Recognize the event in this audio"
BUILTIN_IMAGE2MUSIC = "Generate a piece of music for this image"
BUILTIN_VIDEO2MUSIC = (
"Generate a piece of music for this video and dub the video with generated music"
)
BUILTIN_PLANS = {
# BUILTIN_REMOVE_BY_MASK: [
# [
# Action(
# tool_name="image_inpainting",
# inputs={"image": "image", "mask": "image.mask"},
# outputs=["<GENERATED>-0"],
# )
# ]
# ],
BUILTIN_IMAGE_TO_EDGE: [
[
Action(
tool_name="image_to_edge",
inputs={"image": "image"},
outputs=["<GENERATED>-0"],
)
]
],
BUILTIN_TEXT_EDGE_TO_IMAGE: [
[
Action(
tool_name="image_captioning",
inputs={"image": "image"},
outputs=["<TOOL-GENERATED>-prompt"],
),
Action(
tool_name="edge_text_to_image",
inputs={
"edge": "image.edge",
"text": "<TOOL-GENERATED>-prompt",
},
outputs=["<GENERATED>-0"],
),
]
],
BUILTIN_GENERATE_SIMILAR_IMAGE: [
[
Action(
tool_name="image_to_edge",
inputs={"image": "image"},
outputs=["<TOOL-GENERATED>-edge"],
),
Action(
tool_name="image_captioning",
inputs={"image": "image"},
outputs=["<TOOL-GENERATED>-prompt"],
),
Action(
tool_name="edge_text_to_image",
inputs={
"edge": "<TOOL-GENERATED>-edge",
"text": "<TOOL-GENERATED>-prompt",
},
outputs=["<GENERATED>-0"],
),
]
],
# BUILTIN_GENERATE_SIMILAR_IMAGE2: [
# [
# Action(
# tool_name="image_captioning",
# inputs={"image": "image"},
# outputs=["<TOOL-GENERATED>-prompt"],
# ),
# Action(
# tool_name="text_to_image",
# inputs={"text": "<TOOL-GENERATED>-prompt"},
# outputs=["<GENERATED>-0"],
# ),
# ]
# ],
# BUILTIN_GENERATE_SIMILAR_IMAGE3: [
# [
# Action(
# tool_name="image_to_image",
# inputs={"image": "image"},
# outputs=["<GENERATED>-0"],
# ),
# ]
# ],
BUILTIN_GENERATE_IMAGE_HED: [
[
Action(
tool_name="image_to_hed",
inputs={"image": "image"},
outputs=["<TOOL-GENERATED>-image_to_hed-hed-0"],
),
Action(
tool_name="hed_text_to_image",
inputs={
"text": "beautiful mountains and sunset",
"hed": "<TOOL-GENERATED>-image_to_hed-hed-0",
},
outputs=["<GENERATED>-0"],
),
]
],
BUILTIN_GENERATE_IMAGE_DEPTH: [
[
Action(
tool_name="image_captioning",
inputs={
"image": "image",
},
outputs=["<TOOL-GENERATED>-image_captioning-text-0"],
),
Action(
tool_name="image_to_depth",
inputs={"image": "image"},
outputs=["<TOOL-GENERATED>-image_to_depth-depth-0"],
),
Action(
tool_name="depth_text_to_image",
inputs={
"text": "<TOOL-GENERATED>-image_captioning-text-0",
"depth": "<TOOL-GENERATED>-image_to_depth-depth-0",
},
outputs=["<GENERATED>-0"],
),
]
],
BUILTIN_GENERATE_IMAGE_OCR: [
[
Action(
tool_name="optical_character_recognition",
inputs={"image": "image"},
outputs=["<GENERATED>-0"],
)
]
],
BUILTIN_COUNT_OBJECTS: [
[
Action(
tool_name="object_detection",
inputs={"image": "image"},
outputs=["<TOOL-GENERATED>-object_detection-bbox-0"],
),
Action(
tool_name="select_bbox",
inputs={
"bbox_list": "<TOOL-GENERATED>-object_detection-bbox-0",
"condition": "bear",
},
outputs=["<TOOL-GENERATED>-select_bbox-bbox-0"],
),
Action(
tool_name="count_objects",
inputs={"bbox_list": "<TOOL-GENERATED>-select_bbox-bbox-0"},
outputs=["<GENERATED>-0"],
),
],
[
Action(
tool_name="image_question_answering",
inputs={
"text": "Provide me with the count of bears in the input image",
"image": "image",
},
outputs=["<GENERATED>-1"],
)
],
],
BUILTIN_VIDEO_TO_WEBPAGE: [
[
Action(
tool_name="video_captioning",
inputs={"video": "video"},
outputs=["<TOOL-GENERATED>-text-0"],
),
Action(
tool_name="text_to_music",
inputs={"text": "<TOOL-GENERATED>-text-0"},
outputs=["<TOOL-GENERATED>-text_to_music-audio-0"],
),
Action(
tool_name="dub_video",
inputs={
"video": "video",
"audio": "<TOOL-GENERATED>-text_to_music-audio-0",
},
outputs=["<TOOL-GENERATED>-dub_video-video-0"],
),
Action(
tool_name="title_generation",
inputs={"text": "<TOOL-GENERATED>-text-0"},
outputs=["<TOOL-GENERATED>-text-1"],
),
Action(
tool_name="text_to_tags",
inputs={"text": "<TOOL-GENERATED>-text-0"},
outputs=["<TOOL-GENERATED>-tags-0"],
),
Action(
tool_name="video_to_webpage",
inputs={
"video": "<TOOL-GENERATED>-dub_video-video-0",
"title": "<TOOL-GENERATED>-text-1",
"tags": "<TOOL-GENERATED>-tags-0",
"description": "<TOOL-GENERATED>-text-0",
},
outputs=["<GENERATED>-0"],
),
]
],
BUILTIN_TEXT_TO_MUSIC: [
[
Action(
tool_name="text_to_music",
inputs={
"text": "An 80s driving pop song with heavy drums and synth pads in the background"
},
outputs=["<GENERATED>-audio-0"],
)
]
],
BUILTIN_IMAGE_ERASING1: [
[
Action(
tool_name="image_instance_segmentation",
inputs={"image": "image"},
outputs=["<TOOL-GENERATED>-image_instance_segmentation-mask-0"],
),
Action(
tool_name="select_mask",
inputs={
"mask_list": "<TOOL-GENERATED>-image_instance_segmentation-mask-0",
"condition": "wine glass",
},
outputs=["<TOOL-GENERATED>-select_mask-mask-1"],
),
Action(
tool_name="image_inpainting",
inputs={
"image": "image",
"mask": "<TOOL-GENERATED>-select_mask-mask-0",
},
outputs=["<GENERATED>-0"],
),
]
],
BUILTIN_IMAGE_ERASING2: [
[
Action(
tool_name="image_instance_segmentation",
inputs={"image": "image"},
outputs=["<TOOL-GENERATED>-image_instance_segmentation-mask-0"],
),
Action(
tool_name="select_mask",
inputs={
"mask_list": "<TOOL-GENERATED>-image_instance_segmentation-mask-0",
"condition": "cat",
},
outputs=["<TOOL-GENERATED>-select_mask-mask-0"],
),
Action(
tool_name="image_inpainting",
inputs={
"image": "image",
"mask": "<TOOL-GENERATED>-select_mask-mask-0",
},
outputs=["<GENERATED>-0"],
),
]
],
BUILTIN_IMAGE_CROPPING: [
[
Action(
tool_name="object_detection",
inputs={"image": "image"},
outputs=["<TOOL-GENERATED>-object_detection-bbox-0"],
),
Action(
tool_name="select_bbox",
inputs={
"bbox_list": "<TOOL-GENERATED>-object_detection-bbox-0",
"condition": "cat",
},
outputs=["<TOOL-GENERATED>-select_bbox-bbox-0"],
),
Action(
tool_name="image_cropping",
inputs={
"image": "image",
"object": "<TOOL-GENERATED>-select_bbox-bbox-0",
},
outputs=["<GENERATED>-0"],
),
]
],
BUILTIN_IMAGE_SEG: [
[
Action(
tool_name="image_instance_segmentation",
inputs={"image": "image"},
outputs=["<TOOL-GENERATED>-image_instance_segmentation-mask-0"],
),
Action(
tool_name="select_mask",
inputs={
"mask_list": "<TOOL-GENERATED>-image_instance_segmentation-mask-0",
"condition": "elephant",
},
outputs=["<GENERATED>-0"],
),
]
],
BUILTIN_IMAGE_HIGHLIGHT: [
[
Action(
tool_name="object_detection",
inputs={"image": "image"},
outputs=["<TOOL-GENERATED>-object_detection-bbox-0"],
),
Action(
tool_name="select_bbox",
inputs={
"bbox_list": "<TOOL-GENERATED>-object_detection-bbox-0",
"condition": "elephant",
},
outputs=["<TOOL-GENERATED>-select_bbox-bbox-0"],
),
Action(
tool_name="highlight_object_on_image",
inputs={
"image": "image",
"bbox": "<TOOL-GENERATED>-select_bbox-bbox-0",
},
outputs=["<GENERATED>-0"],
),
]
],
BUILTIN_TEXT_SPEECH: [
[
Action(
tool_name="text_to_speech",
inputs={
"text": "Hope is the thing with feathers That perches in the soul, And sings the tune without the words, And never stops at all"
},
outputs=["<GENERATED>-0"],
)
]
],
BUILTIN_DUBBING: [
[
Action(
tool_name="dub_video",
inputs={"video": "video", "audio": "audio"},
outputs=["<GENERATED>-0"],
)
]
],
BUILTIN_GENERATE_SIMILAR_IMAGE4: [
[
Action(
tool_name="segment_anything",
inputs={"image": "image"},
outputs=["<TOOL-GENERATED>-seg"],
),
Action(
tool_name="image_captioning",
inputs={"image": "image"},
outputs=["<TOOL-GENERATED>-prompt"],
),
Action(
tool_name="segmentation_text_to_image",
inputs={
"segmentation": "<TOOL-GENERATED>-seg",
"text": "<TOOL-GENERATED>-prompt",
},
outputs=["<GENERATED>-0"],
),
]
],
BUILTIN_GENERATE_IMAGE: [
[
Action(
tool_name="text_to_image",
inputs={"text": "a woman is skiing"},
outputs=["<GENERATED>-0"],
)
]
],
BUILTIN_IMAGE_TO_VIDEO: [
[
Action(
tool_name="image_to_video",
inputs={"image": "image"},
outputs=["<GENERATED>-0"],
)
]
],
BUILTIN_COUNT_OBJECTS2: [
[
Action(
tool_name="object_detection",
inputs={"image": "image"},
outputs=["<TOOL-GENERATED>-object_detection-bbox-0"],
),
Action(
tool_name="select_bbox",
inputs={
"bbox_list": "<TOOL-GENERATED>-object_detection-bbox-0",
"condition": "horse",
},
outputs=["<TOOL-GENERATED>-select_bbox-bbox-0"],
),
Action(
tool_name="count_objects",
inputs={"bbox_list": "<TOOL-GENERATED>-select_bbox-bbox-0"},
outputs=["<GENERATED>-0"],
),
],
[
Action(
tool_name="image_question_answering",
inputs={
"text": "Provide me with the count of horses in the input image",
"image": "image",
},
outputs=["<GENERATED>-1"],
)
],
],
BUILTIN_IMAGE_TO_VIDEO2: [
[
Action(
tool_name="text_to_image",
inputs={
"text": "A serene and beautiful landscape with a calm lake reflecting the blue sky and white clouds."
},
outputs=["<GENERATED>-0"],
),
],
[
Action(
tool_name="image_captioning",
inputs={"image": "<GENERATED>-0"},
outputs=["<TOOL-GENERATED>-text-0"],
),
Action(
tool_name="text_to_speech",
inputs={"text": "<TOOL-GENERATED>-text-0"},
outputs=["<TOOL-GENERATED>-text_to_speech-audio-0"],
),
Action(
tool_name="image_audio_to_video",
inputs={
"image": "<GENERATED>-0",
"audio": "<TOOL-GENERATED>-text_to_speech-audio-0",
},
outputs=["<GENERATED>-1"],
),
],
],
BUILTIN_IMAGE_TO_VIDEO3: [
[
Action(
tool_name="text_to_image",
inputs={
"text": "A serene and beautiful landscape with a calm lake reflecting the blue sky."
},
outputs=["<GENERATED>-0"],
),
],
[
Action(
tool_name="image_captioning",
inputs={"image": "<GENERATED>-0"},
outputs=["<TOOL-GENERATED>-text-0"],
),
Action(
tool_name="text_to_music",
inputs={"text": "<TOOL-GENERATED>-text-0"},
outputs=["<GENERATED>-1"],
),
],
[
Action(
tool_name="image_to_video",
inputs={
"image": "<GENERATED>-0",
},
outputs=["<TOOL-GENERATED>-image_to_video-video-0"],
),
Action(
tool_name="dub_video",
inputs={
"video": "<TOOL-GENERATED>-image_to_video-video-0",
"audio": "<GENERATED>-1",
},
outputs=["<GENERATED>-2"],
),
],
],
BUILTIN_VIDEO_CLS: [
[
Action(
tool_name="video_classification",
inputs={"video": "video"},
outputs=["<GENERATED>-0"],
)
]
],
BUILTIN_AUDIO_CLS: [
[
Action(
tool_name="audio_classification",
inputs={"audio": "audio"},
outputs=["<GENERATED>-0"],
)
]
],
BUILTIN_IMAGE2MUSIC: [
[
Action(
tool_name="image_captioning",
inputs={"image": "image"},
outputs=["<TOOL-GENERATED>-text-0"],
),
Action(
tool_name="text_to_music",
inputs={"text": "<TOOL-GENERATED>-text-0"},
outputs=["<GENERATED>-0"],
),
]
],
BUILTIN_VIDEO2MUSIC: [
[
Action(
tool_name="video_captioning",
inputs={"video": "video"},
outputs=["<TOOL-GENERATED>-text-0"],
),
Action(
tool_name="text_to_music",
inputs={"text": "<TOOL-GENERATED>-text-0"},
outputs=["<GENERATED>-0"],
),
],
[
Action(
tool_name="dub_video",
inputs={
"video": "video",
"audio": "<GENERATED>-0",
},
outputs=["<GENERATED>-1"],
),
],
],
BUILTIN_SEG_BY_POINTS: [
[
Action(
tool_name="image_segmentation_by_points",
inputs={"image": "image", "prompt_points": "prompt_points"},
outputs=["<GENERATED>-0"],
)
]
],
# BUILTIN_SEG_BY_MASK: [
# [
# Action(
# tool_name='image_segmentation_by_mask',
# inputs={'image': 'image', 'prompt_mask': 'prompt_mask'},
# outputs=['<GENERATED>-0'],
# )
# ]
# ],
}
def load_builtin_plans(path):
import json
plans = json.load(open(path, "r"))
processed_plan = {}
for query, actions in plans.items():
actions2 = []
for ac in actions[0]:
actions2.append(
Action(
tool_name=ac["tool_name"],
inputs=ac["inputs"],
outputs=ac["outputs"],
),
)
processed_plan[query] = [actions2]
return processed_plan