""" General agents class """ from .common import * from .gpt4v import * from .ollama import * from .claude import * from .gemini import * from .qwen import * from .phi import * from .llama import * from .minicpm import * from .intern import * from abc import abstractmethod from typing import Union, Dict from bson import ObjectId from .event import * from .keychain import KeyChain import time import pickle class Agent(object): def __init__(self, api_key:Union[str, KeyChain], task:TaskSpec, vision_model:str="gpt-4-vision-preview", followup_func=None, session_token=None): """ Args: api_key: openAI/Claude api key task: Task specification for this agent vision_model: string identifier to the vision model used. """ self.followup_func = followup_func self.api_key = api_key # if this is a string, then self.vision_model = vision_model self.task = task ''' # # TODO: Add your own model here # elif vision_model == "{model_id of your model}": # logger.info(f"creating {Name of your model}-based agent of type: {vision_model}") # self.visual_interface = YourModel(task=task, model=vision_model) ''' if vision_model in ('gpt-4-vision-preview', 'gpt-4', 'gpt-4-turbo', 'gpt-4o-mini', "gpt-4o", "o1-preview", "o1-mini", 'o3-mini', 'o1'): # using the open ai key. logger.info(f"creating GPT-based agent of type: {vision_model}") if isinstance(api_key, KeyChain): api_key = api_key["openai"] self.visual_interface = GPTModel(api_key, task, model=vision_model) elif vision_model in ("claude-3-5-sonnet-latest", "claude-3-haiku-latest", "claude-3-5-haiku-latest", "claude-3-opus-latest", 'claude-3-7-sonnet-latest'): # using the claude key. logger.info(f"creating Claude-based agent of type: {vision_model}") if isinstance(api_key, KeyChain): api_key = api_key["claude"] self.visual_interface = ClaudeModel(api_key, task) elif vision_model in ('gemini-pro', 'gemini-pro-vision', 'gemini-2.0-flash', 'gemini-1.5-flash', 'gemini-1.5-pro'): # using the gemini key. if isinstance(api_key, KeyChain): api_key = api_key["gemini"] logger.info(f"creating Gemini-based agent of type: {vision_model}") self.visual_interface = GeminiModel(api_key=api_key, task=task, model=vision_model) elif vision_model in ('qwen', 'qwenllama'): logger.info(f"creating Qwen-based agent of type: Qwen/Qwen2-VL-7B-Instruct.") self.visual_interface = QwenModel(task=task) elif vision_model in ('phi', 'phillama'): logger.info(f"creating Phi-based agent of type: microsoft/Phi-3.5-vision-instruct.") self.visual_interface = PhiModel(task=task, model='microsoft/Phi-3.5-vision-instruct') elif vision_model == 'llama': logger.info(f"creating LLaMA-based agent of type: meta-llama/Meta-Llama-3.1-8B-Instruct.") self.visual_interface = LlamaModel(task=task, model='meta-llama/Meta-Llama-3.1-8B-Instruct') elif vision_model in ('minicpm', 'minicpmllama'): logger.info(f"creating MiniCPM-based agent of type: openbmb/MiniCPM-V-2_6-int4.") self.visual_interface = MiniCPMModel(task=task, model='openbmb/MiniCPM-V-2_6-int4') elif vision_model in ('intern', 'internllama'): logger.info(f"creating Intern-based agent of type: OpenGVLab/InternVL2-8B.") self.visual_interface = InternModel(task=task, model='OpenGVLab/InternVL2-8B') else: raise ValueError(f'{vision_model} not matched with any avalable choices.') if session_token is None: self.session_token = str(ObjectId()) self.event_buffer = EventCollection() else: raise NotImplementedError("Need to implement loading function for session_token") def save(self, to): with open(to, "wb") as f: pickle.dump(self, f) return self @staticmethod def load(fp): with open(fp, "rb") as f: agent = pickle.load(f) return agent def clear_event_buffer(self): # begins a new session, fresh session id and event_buffer objects. self.session_token = str(ObjectId()) self.event_buffer = EventCollection() def think(self, question:Question) -> ParsedAnswer: """ Adds a THINKING event to the event buffer. Args: question: The question/task instance we seek to solve. """ # make an initial guess if this is going to be the first try if len(self.event_buffer.filter_to('ACT')) == 0: p_ans, ans, meta, p = self.visual_interface.run_once(question) else: print('Into think') p_ans, ans, meta, p = self.visual_interface.rough_guess(question) ev = ThinkEvent(session_token=self.session_token, qa_sequence=[(question, p_ans)]) self.event_buffer.add_event(ev) # update events_collection return p_ans, ans, meta, p @abstractmethod def act(self, p_ans:ParsedAnswer): """ NEEDS to add an ACTION event to the event buffer. Executes the action within the environment, resulting in some state change. This code is specific to the environment/task that it operates under. """ ... @abstractmethod def observe(self, state:dict): """ Observations NEEDS to add an OBSERVE event to the event buffer. States are specific to the environment/task that it operates under. """ ... def reflect(self) -> Union[None, Question]: """ Reflections Adds a REFLECT event to the event buffer. """ # have we finished the task? # evaluator fucntion (self.task.completed) gets the agent itself. evaluation_question, evaluation_answer = self.task.completed(self) ev = EvaluateEvent(completion_question=evaluation_question, completion_eval=evaluation_answer) # logger.info(f"evaluator says: {evaluation_answer.success()} -- {evaluation_answer}") self.event_buffer.add_event(ev) if evaluation_answer.success(): return None # followup func should take in the agent itself, # with access to all the events and internal states # that it contains, and ask good followup questions # to itself. followup = self.followup_func(self) ev = FeedbackEvent(feedback=followup) self.event_buffer.add_event(ev) # otherwise make the followup. return followup def interject(self, interjection:InteractEvent): """ User interjects. Adds a INTERACT event to the event buffer Main responsibility of method is storage of user interactions. Composed of: 1) User actions 2) State transitions 3) Reasoning, and/or comments for why the agents has failed. """ self.event_buffer.add_event(interjection) return self def run(self): """ An interface to run the T/A/O/R/I loops T = think A = act O = observe R = reflect I = interaction/interjection A usual flow over the different steps might look something like: TAORTAORTAORTAORI, with an interjection at the end from the user as a way to teach the agent how to do the right thing, as well as explanations for why. """ raise NotImplementedError