# Define tools such as: web search tool, image processing tool, language translation tool, video processing tool
from llama_index.tools.brave_search import BraveSearchToolSpec
from llama_index.core.tools import FunctionTool
import os
import tempfile
from llama_index.multi_modal_llms.ollama import OllamaMultiModal
from llama_index.core.schema import ImageNode
import cv2
from PIL import Image
from dotenv import load_dotenv

load_dotenv()
brave_api_key = os.getenv('BRAVE_API_KEY', 'No key')

mm_model = OllamaMultiModal(
    model='llava',
    temperature=0.7,
)

search_tool_spec = BraveSearchToolSpec(api_key=brave_api_key)
search_tool = search_tool_spec.to_tool_list()[0]

# Creating an image handling tool
def image_handling_tool(input_data: str) -> str:
    """This tool takes an image URL, processes the image based on the user
    or system prompt, and returns the model's response as a string."""
    image = ImageNode(image_url=input_data)
    result = mm_model.complete(
        prompt="Use the context prompt generated by the agent's reasoning "
               "to answer the question asked about the image",
        image_documents=[image],
    )
    return str(result)

# Creating a video handling tool
def video_handling_tool(input_url: str) -> str:
    """This tool takes a video URL, processes the video based on the agent's
    prompt and/or context, and returns the model's response as a string."""
    # Load the video
    cap = cv2.VideoCapture(input_url)
    # Read a frame at the 5-second mark
    cap.set(cv2.CAP_PROP_POS_MSEC, 5000)
    success, frame = cap.read()
    cap.release()
    if not success:
        return "Could not read a frame from the video."
    # OpenCV returns BGR frames; convert to RGB and save the frame to a
    # temporary file so the multimodal model can read it as an image document
    image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
        image.save(tmp, format='PNG')
        frame_path = tmp.name
    frame_node = ImageNode(image_path=frame_path)
    prompt = "What is happening in this frame of the video?"
    response = mm_model.complete(prompt=prompt, image_documents=[frame_node])
    return str(response)

image_tool = FunctionTool.from_defaults(fn=image_handling_tool)
video_tool = FunctionTool.from_defaults(fn=video_handling_tool)
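
# A minimal sketch of wiring the three tools into an agent, for illustration.
# Assumptions not in the original code: a local Ollama text model ('llama3'
# is an illustrative choice) and the ReActAgent API from llama_index.core.agent.
from llama_index.core.agent import ReActAgent
from llama_index.llms.ollama import Ollama

llm = Ollama(model='llama3', request_timeout=120.0)

agent = ReActAgent.from_tools(
    tools=[search_tool, image_tool, video_tool],
    llm=llm,
    verbose=True,
)

# Example query; the URL is a placeholder, not a real image
response = agent.chat(
    "Describe what is shown at https://example.com/photo.jpg"
)
print(response)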