"""
General agents class
"""
from .common import *
from .gpt4v import *
from .ollama import *
from .claude import *
from .gemini import *
from .qwen import *
from .phi import *
from .llama import *
from .minicpm import *
from .intern import *
from abc import abstractmethod
from typing import Union, Dict
from bson import ObjectId
from .event import *
from .keychain import KeyChain
import time
import pickle
class Agent(object):
def __init__(self, api_key:Union[str, KeyChain], task:TaskSpec,
vision_model:str="gpt-4-vision-preview",
followup_func=None,
session_token=None):
"""
Args:
api_key: openAI/Claude api key
task: Task specification for this agent
vision_model: string identifier to the vision model used.
"""
self.followup_func = followup_func
        self.api_key = api_key  # a raw key string, or a KeyChain resolved per provider below
self.vision_model = vision_model
self.task = task
        '''
        # TODO: Add your own model here
        # elif vision_model == "{model_id of your model}":
        #     logger.info(f"creating {Name of your model}-based agent of type: {vision_model}")
        #     self.visual_interface = YourModel(task=task, model=vision_model)
        '''
if vision_model in ('gpt-4-vision-preview', 'gpt-4', 'gpt-4-turbo', 'gpt-4o-mini', "gpt-4o", "o1-preview", "o1-mini", 'o3-mini', 'o1'):
# using the open ai key.
logger.info(f"creating GPT-based agent of type: {vision_model}")
if isinstance(api_key, KeyChain):
api_key = api_key["openai"]
self.visual_interface = GPTModel(api_key, task, model=vision_model)
elif vision_model in ("claude-3-5-sonnet-latest", "claude-3-haiku-latest", "claude-3-5-haiku-latest", "claude-3-opus-latest", 'claude-3-7-sonnet-latest'):
# using the claude key.
logger.info(f"creating Claude-based agent of type: {vision_model}")
if isinstance(api_key, KeyChain):
api_key = api_key["claude"]
self.visual_interface = ClaudeModel(api_key, task)
elif vision_model in ('gemini-pro', 'gemini-pro-vision', 'gemini-2.0-flash', 'gemini-1.5-flash', 'gemini-1.5-pro'):
# using the gemini key.
if isinstance(api_key, KeyChain):
api_key = api_key["gemini"]
logger.info(f"creating Gemini-based agent of type: {vision_model}")
self.visual_interface = GeminiModel(api_key=api_key, task=task, model=vision_model)
        elif vision_model in ('qwen', 'qwenllama'):
            logger.info("creating Qwen-based agent of type: Qwen/Qwen2-VL-7B-Instruct")
            self.visual_interface = QwenModel(task=task)
        elif vision_model in ('phi', 'phillama'):
            logger.info("creating Phi-based agent of type: microsoft/Phi-3.5-vision-instruct")
            self.visual_interface = PhiModel(task=task, model='microsoft/Phi-3.5-vision-instruct')
        elif vision_model == 'llama':
            logger.info("creating LLaMA-based agent of type: meta-llama/Meta-Llama-3.1-8B-Instruct")
            self.visual_interface = LlamaModel(task=task, model='meta-llama/Meta-Llama-3.1-8B-Instruct')
        elif vision_model in ('minicpm', 'minicpmllama'):
            logger.info("creating MiniCPM-based agent of type: openbmb/MiniCPM-V-2_6-int4")
            self.visual_interface = MiniCPMModel(task=task, model='openbmb/MiniCPM-V-2_6-int4')
        elif vision_model in ('intern', 'internllama'):
            logger.info("creating Intern-based agent of type: OpenGVLab/InternVL2-8B")
            self.visual_interface = InternModel(task=task, model='OpenGVLab/InternVL2-8B')
else:
            raise ValueError(f'{vision_model} did not match any available model choices.')
if session_token is None:
self.session_token = str(ObjectId())
self.event_buffer = EventCollection()
else:
raise NotImplementedError("Need to implement loading function for session_token")
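    # A hypothetical construction sketch. The KeyChain assignment syntax and
    # the TaskSpec value below are assumptions, not part of this module; only
    # the Agent signature and the per-provider KeyChain lookups are.
    #
    #   keys = KeyChain()              # assumed to map provider name -> API key
    #   keys["openai"] = "sk-..."      # hypothetical setter
    #   agent = Agent(api_key=keys, task=my_task_spec, vision_model="gpt-4o")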
def save(self, to):
with open(to, "wb") as f:
pickle.dump(self, f)
return self
@staticmethod
def load(fp):
with open(fp, "rb") as f:
agent = pickle.load(f)
return agent
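    # Round-trip sketch for save/load (the path is illustrative):
    #
    #   agent.save("agent.pkl")
    #   restored = Agent.load("agent.pkl")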
def clear_event_buffer(self):
# begins a new session, fresh session id and event_buffer objects.
self.session_token = str(ObjectId())
self.event_buffer = EventCollection()
def think(self, question:Question) -> ParsedAnswer:
"""
Adds a THINKING event to the event buffer.
Args:
question: The question/task instance we seek to solve.
"""
        # make an initial guess if this is the first try,
        # otherwise take a rough guess
        if len(self.event_buffer.filter_to('ACT')) == 0:
            p_ans, ans, meta, p = self.visual_interface.run_once(question)
        else:
            logger.debug('think: re-entering after previous actions')
            p_ans, ans, meta, p = self.visual_interface.rough_guess(question)
ev = ThinkEvent(session_token=self.session_token,
qa_sequence=[(question, p_ans)])
        self.event_buffer.add_event(ev)  # updates the event collection
return p_ans, ans, meta, p
@abstractmethod
def act(self, p_ans:ParsedAnswer):
"""
NEEDS to add an ACTION event to the event buffer.
Executes the action within the environment, resulting
in some state change.
This code is specific to the environment/task that it operates under.
"""
...
@abstractmethod
def observe(self, state:dict):
""" Observations
NEEDS to add an OBSERVE event to the event buffer.
States are specific to the environment/task that it operates under.
"""
...
def reflect(self) -> Union[None, Question]:
""" Reflections
Adds a REFLECT event to the event buffer.
"""
        # have we finished the task?
        # the evaluator function (self.task.completed) receives the agent itself.
evaluation_question, evaluation_answer = self.task.completed(self)
ev = EvaluateEvent(completion_question=evaluation_question,
completion_eval=evaluation_answer)
# logger.info(f"evaluator says: {evaluation_answer.success()} -- {evaluation_answer}")
self.event_buffer.add_event(ev)
if evaluation_answer.success():
return None
        # The followup function receives the agent itself, with access
        # to all of the events and internal state it contains, and asks
        # a good followup question of it.
        followup = self.followup_func(self)
        ev = FeedbackEvent(feedback=followup)
        self.event_buffer.add_event(ev)
        # otherwise, hand the followup question back for another pass
        return followup
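    # A minimal sketch of a followup function, assuming only what reflect()
    # requires of it: it takes the agent and returns a Question. The Question
    # constructor arguments and the 'EVALUATE' filter tag are hypothetical.
    #
    #   def simple_followup(agent: "Agent") -> Question:
    #       last_eval = agent.event_buffer.filter_to('EVALUATE')[-1]
    #       return Question(f"The last attempt did not succeed ({last_eval}). "
    #                       "What should be done differently?")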
def interject(self, interjection:InteractEvent):
""" User interjects.
Adds a INTERACT event to the event buffer
Main responsibility of method is storage of
user interactions.
Composed of:
1) User actions
2) State transitions
3) Reasoning, and/or comments for why the agents
has failed.
"""
self.event_buffer.add_event(interjection)
return self
def run(self):
""" An interface to run the T/A/O/R/I loops
T = think
A = act
O = observe
R = reflect
I = interaction/interjection
A usual flow over the different steps might look something
like: TAORTAORTAORTAORI, with an interjection at the end
from the user as a way to teach the agent how to do the right
thing, as well as explanations for why.
"""
raise NotImplementedError
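# A minimal sketch of a concrete subclass driving the T/A/O/R loop that
# run() describes. The environment handle, the ActEvent/ObserveEvent
# constructors, and task.initial_question are hypothetical; only the
# think/act/observe/reflect interface comes from the Agent class above.
#
#   class EnvAgent(Agent):
#       def act(self, p_ans: ParsedAnswer):
#           self.environment.execute(p_ans)  # hypothetical environment call
#           self.event_buffer.add_event(ActEvent(session_token=self.session_token,
#                                                action=p_ans))
#
#       def observe(self, state: dict):
#           self.event_buffer.add_event(ObserveEvent(session_token=self.session_token,
#                                                    state=state))
#
#       def run(self):
#           question = self.task.initial_question  # hypothetical attribute
#           while question is not None:
#               p_ans, ans, meta, p = self.think(question)     # T
#               self.act(p_ans)                                # A
#               self.observe(self.environment.get_state())     # O (hypothetical)
#               question = self.reflect()                      # R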