# Final_Assignment_Template / mini_agents.py
# (Hub page artifacts preserved as comments: author huytofu92, commit "mini change", 90254a0)
from smolagents import CodeAgent, InferenceClientModel
from smolagents.default_tools import PythonInterpreterTool, DuckDuckGoSearchTool
from tools import sort_list, operate_two_numbers, convert_number, load_dataframe_from_csv, load_dataframe_from_excel
from tools import tavily_search_tool, read_python_file_from_path
from tools import to_dataframe, to_json, get_dataframe_data, get_dataframe_column, get_dataframe_row, get_dataframe_groupby
from vlm_tools import image_processing, object_detection_tool, ocr_scan_tool, extract_images_from_video, get_image_from_file_path, get_video_from_file_path
from audio_tools import transcribe_audio_tool, get_audio_from_file_path, noise_reduction, audio_segmentation, speaker_diarization
from community_tools import community_tools, get_youtube_transcript_from_url, search_tools
from browser import browser_manager
import os
import logging
import yaml
from typing import List, Optional
from smolagents.tools import Tool
# Module-wide logging: DEBUG so agent planning/steps are fully traceable.
logging.basicConfig(level=logging.DEBUG)

# Hugging Face checkpoint chosen for each specialist agent role.
MODEL_CHOICES = {
    "audio": ["Qwen/Qwen2.5-Coder-32B-Instruct"],
    "vlm": ["Qwen/Qwen2.5-Coder-32B-Instruct"],
    "math": ["Qwen/Qwen2.5-Coder-7B-Instruct"],
    "context_search": ["Qwen/Qwen2.5-Coder-32B-Instruct"],
    "master": ["Qwen/Qwen2.5-Coder-32B-Instruct"]
}


def _load_yaml(path):
    """Open *path* and return its parsed YAML contents."""
    with open(path, 'r') as stream:
        return yaml.safe_load(stream)


prompt_templates = _load_yaml("prompts/prompts.yaml")
audio_prompt_templates = _load_yaml("prompts/audio_prompts.yaml")
vlm_prompt_templates = _load_yaml("prompts/vlm_prompts.yaml")
context_search_prompt_templates = _load_yaml("prompts/context_search_prompts.yaml")

# Prompt template bundle looked up by agent name when each agent is built.
PROMPT_TEMPLATE = {
    "master_agent": prompt_templates,
    "audio_agent": audio_prompt_templates,
    "vlm_agent": vlm_prompt_templates,
    "context_search_agent": context_search_prompt_templates
}
# Consolidated authorized imports for all agents.
# Each entry is a module name the CodeAgent sandbox may import.
AUTHORIZED_IMPORTS = [
    # Audio processing
    "wave", "speech_recognition", "pytube", "pytube3", "youtube_dl", "pydub", "pyAudioAnalysis",
    # Image/Video processing
    "cv2", "cv2.dnn", "cv2.imread", "pytesseract", "onnxruntime", "PIL", "PIL.Image", "bs4", "tesseract",
    # Data processing
    "numpy", "pandas", "sklearn", "scipy", "math", "hmmlearn",
    # File handling
    # BUG FIX: a missing comma after "pyxlsb" made Python concatenate the
    # adjacent literals into the bogus entry "pyxlsbpyplot", silently dropping
    # both "pyxlsb" and "pyplot" from the allow-list.
    "base64", "io", "json", "os", "pickle", "openpyxl", "pyxlsb",
    # Visualization
    "pyplot", "matplotlib", "matplotlib.pyplot",
    # Utilities
    "logging", "yaml", "datetime", "typing", "markdownify", "requests", "chess"
]
# Specialist model + agent for audio tasks (loading, denoising, segmenting,
# diarizing and transcribing audio).
audio_model = InferenceClientModel(
    token=os.getenv("HUGGINGFACE_API_KEY"),
    model_id=MODEL_CHOICES["audio"][0],
    max_tokens=18000,
)
audio_agent = CodeAgent(
    name="audio_agent",
    model=audio_model,
    tools=[
        transcribe_audio_tool,
        get_audio_from_file_path,
        noise_reduction,
        audio_segmentation,
        speaker_diarization,
    ],
    prompt_templates=PROMPT_TEMPLATE["audio_agent"],
    additional_authorized_imports=AUTHORIZED_IMPORTS,
    max_steps=4,
    planning_interval=4,
    description="This agent is responsible for processing audio, loading mp3 audio and converting it to base64, reducing noise, segmenting audio and transcribing audio (in base64 format). It cannot process videos."
)
# Specialist model + agent for vision tasks (image/video download, object
# detection, OCR, frame extraction).
vlm_model = InferenceClientModel(
    token=os.getenv("HUGGINGFACE_API_KEY"),
    model_id=MODEL_CHOICES["vlm"][0],
    max_tokens=18000,
)
vlm_agent = CodeAgent(
    name="vlm_agent",
    model=vlm_model,
    tools=[
        image_processing,
        object_detection_tool,
        ocr_scan_tool,
        extract_images_from_video,
        get_image_from_file_path,
        get_video_from_file_path,
    ],
    prompt_templates=PROMPT_TEMPLATE["vlm_agent"],
    additional_authorized_imports=AUTHORIZED_IMPORTS,
    max_steps=4,
    planning_interval=4,
    description="This agent is responsible for downloading images or videos, processing images or videos, detecting objects in them and extracting text from them. It cannot process audios."
)
# Specialist model + agent for arithmetic and dataframe manipulation.
# Uses the smaller 7B checkpoint and a lower token budget than the others.
math_model = InferenceClientModel(
    token=os.getenv("HUGGINGFACE_API_KEY"),
    model_id=MODEL_CHOICES["math"][0],
    max_tokens=6000,
)
# NOTE(review): unlike the other agents, no prompt_templates is supplied here
# (PROMPT_TEMPLATE has no "math_agent" key) — confirm this is intentional.
math_agent = CodeAgent(
    name="math_agent",
    model=math_model,
    tools=[
        operate_two_numbers,
        convert_number,
        load_dataframe_from_csv,
        load_dataframe_from_excel,
        to_dataframe,
        to_json,
        get_dataframe_data,
        get_dataframe_column,
        get_dataframe_row,
        get_dataframe_groupby,
    ],
    additional_authorized_imports=AUTHORIZED_IMPORTS,
    max_steps=4,
    planning_interval=4,
    description="This agent is responsible for performing arithmetic operations on two numbers. It can also perform dataframe operations such as converting data to a dataframe, performing calculations on such dataframe and converting the dataframe back to a json or a csv file"
)
# Specialist model + agent for web/context lookup (wikipedia, arxiv, etc.).
context_search_model = InferenceClientModel(
    token=os.getenv("HUGGINGFACE_API_KEY"),
    model_id=MODEL_CHOICES["context_search"][0],
    max_tokens=24000,
)
context_search_agent = CodeAgent(
    name="context_search_agent",
    model=context_search_model,
    tools=list(search_tools),
    prompt_templates=PROMPT_TEMPLATE["context_search_agent"],
    additional_authorized_imports=AUTHORIZED_IMPORTS,
    max_steps=4,
    planning_interval=4,
    description="This agent is responsible for searching the web for context using wikipedia for general information and arxiv for scientific information."
)
# Model backing the top-level orchestrating ("master") agent.
master_model = InferenceClientModel(
    token=os.getenv("HUGGINGFACE_API_KEY"),
    model_id=MODEL_CHOICES["master"][0],
    max_tokens=24000,
)
class MasterAgentWrapper:
    """Wrapper class to manage master agent with thread-safe browser tools"""

    def __init__(self):
        # Tools that are always available, independent of any browser session.
        self.base_tools = [
            sort_list,
            get_youtube_transcript_from_url,
            read_python_file_from_path,
            PythonInterpreterTool(),
            DuckDuckGoSearchTool(),
            tavily_search_tool,
            *community_tools,
        ]
        # Browser tools are injected per-run (see _run_with_browser_tools),
        # so the agent starts with only the base tool set.
        self.master_agent = CodeAgent(
            name="master_agent",
            model=master_model,
            managed_agents=[audio_agent, vlm_agent, math_agent],
            tools=self.base_tools,
            add_base_tools=False,
            additional_authorized_imports=AUTHORIZED_IMPORTS,
            max_steps=20,  # one final plan step, 16 intermediate steps
            planning_interval=5,
            verbosity_level=logging.INFO,
            prompt_templates=PROMPT_TEMPLATE["master_agent"],
            description="This agent is responsible for managing audio, vlm, context_search and math agents."
        )

    def _run_with_browser_tools(self, question: str, browser_tools: List[Tool]) -> str:
        """Run agent with browser tools"""
        # Snapshot the current tool registry (a dict keyed by tool name).
        saved_tools = self.master_agent.tools.copy()
        # Merge the per-session browser tools on top of the snapshot.
        merged = {**saved_tools, **{tool.name: tool for tool in browser_tools}}
        self.master_agent.tools = merged
        try:
            return self.master_agent.run(question)
        finally:
            # Always restore the original registry, even if the run raised.
            self.master_agent.tools = saved_tools

    def run(self, question: str) -> str:
        """Run the agent with thread-safe browser tools"""
        try:
            # Acquire browser tools inside their manager's context so they are
            # created and released in the correct (thread-safe) scope.
            with browser_manager.get_browser_tools() as browser_tools:
                return self._run_with_browser_tools(question, browser_tools)
        except Exception as e:
            logging.error(f"Error in master agent run: {e}")
            raise
# Module-level singleton wrapping the fully wired master agent.
master_agent = MasterAgentWrapper()


def run_master_agent(question: str) -> str:
    """Backward-compatible entry point: delegate to the wrapped master agent."""
    return master_agent.run(question)