File size: 8,129 Bytes
01186d8
b114e5e
ad0b84b
69fc68f
0db692a
915154c
2fac6a6
b114e5e
d812eb3
dde4764
01186d8
ded1403
59377f3
 
01186d8
 
dde4764
 
3a27209
 
b114e5e
 
3a27209
0db692a
dde4764
6d2e403
ded1403
6d2e403
 
 
 
b114e5e
 
ded1403
 
 
6d2e403
 
b114e5e
ded1403
 
6aa97d2
 
 
 
 
1cb8f4e
6aa97d2
1cb8f4e
6aa97d2
f864d65
6aa97d2
1cb8f4e
6aa97d2
1cb8f4e
6aa97d2
 
dde4764
92a4c5d
5ec6e01
204b035
dde4764
 
 
 
2fac6a6
ca74fff
6aa97d2
ca74fff
8d5a7ee
b114e5e
da4e402
dde4764
 
 
92a4c5d
5ec6e01
204b035
dde4764
 
 
 
915154c
ca74fff
6aa97d2
ca74fff
8d5a7ee
b114e5e
915154c
dde4764
 
b114e5e
 
5ec6e01
 
0db692a
dde4764
b114e5e
 
 
3a27209
333a8cc
6aa97d2
b114e5e
 
0db692a
dde4764
b114e5e
 
5ec6e01
b114e5e
0db692a
dde4764
b114e5e
 
3397cf1
3a27209
6aa97d2
b114e5e
 
 
3397cf1
0db692a
dde4764
92a4c5d
 
5ec6e01
 
92a4c5d
 
1a04a88
 
 
 
 
 
 
3db6b5f
b114e5e
3397cf1
b114e5e
1a04a88
 
 
 
b114e5e
1a04a88
3db6b5f
b114e5e
1a04a88
 
90254a0
1a04a88
 
b114e5e
1a04a88
 
58c2c51
beb3f4b
59377f3
ff1ad14
 
 
 
 
 
 
59377f3
 
beb3f4b
 
59377f3
 
 
 
 
1a04a88
 
 
 
d812eb3
 
 
 
1a04a88
 
 
 
 
 
 
 
 
 
 
525242c
d9f4863
525242c
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
from smolagents import CodeAgent, InferenceClientModel
from smolagents.default_tools import PythonInterpreterTool, DuckDuckGoSearchTool
from tools import sort_list, operate_two_numbers, convert_number, load_dataframe_from_csv, load_dataframe_from_excel
from tools import tavily_search_tool, read_python_file_from_path
from tools import to_dataframe, to_json, get_dataframe_data, get_dataframe_column, get_dataframe_row, get_dataframe_groupby
from vlm_tools import image_processing, object_detection_tool, ocr_scan_tool, extract_images_from_video, get_image_from_file_path, get_video_from_file_path
from audio_tools import transcribe_audio_tool, get_audio_from_file_path, noise_reduction, audio_segmentation, speaker_diarization
from community_tools import community_tools, get_youtube_transcript_from_url, search_tools
from browser import browser_manager
import os
import logging
import yaml
from typing import List, Optional
from smolagents.tools import Tool

# Configure the root logger at DEBUG level for the whole process.
logging.basicConfig(level=logging.DEBUG)

# Model ids referenced by the role table below.
_QWEN_CODER_32B = "Qwen/Qwen2.5-Coder-32B-Instruct"
_QWEN_CODER_7B = "Qwen/Qwen2.5-Coder-7B-Instruct"

# Candidate model ids per agent role. Each role gets its own list object;
# the model constructors in this module read element 0.
MODEL_CHOICES = dict(
    audio=[_QWEN_CODER_32B],
    vlm=[_QWEN_CODER_32B],
    math=[_QWEN_CODER_7B],
    context_search=[_QWEN_CODER_32B],
    master=[_QWEN_CODER_32B],
)

def _load_prompt_templates(path):
    """Parse the YAML file at ``path`` and return whatever ``yaml.safe_load`` yields.

    Args:
        path: Location of the YAML prompt file. Relative paths resolve
            against the process's current working directory.

    Returns:
        The object produced by ``yaml.safe_load`` for the file's contents.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the contents are not valid YAML.
    """
    # Explicit UTF-8 so decoding does not depend on the platform's default
    # locale encoding.
    with open(path, "r", encoding="utf-8") as stream:
        return yaml.safe_load(stream)


# Loaded once at import time; the names are kept at module level so existing
# importers of these variables keep working.
prompt_templates = _load_prompt_templates("prompts/prompts.yaml")
audio_prompt_templates = _load_prompt_templates("prompts/audio_prompts.yaml")
vlm_prompt_templates = _load_prompt_templates("prompts/vlm_prompts.yaml")
context_search_prompt_templates = _load_prompt_templates("prompts/context_search_prompts.yaml")

# Prompt templates keyed by the name of the agent they are handed to.
PROMPT_TEMPLATE = {
    "master_agent": prompt_templates,
    "audio_agent": audio_prompt_templates,
    "vlm_agent": vlm_prompt_templates,
    "context_search_agent": context_search_prompt_templates,
}

# Consolidated authorized imports for all agents.
# Every row ends with a comma: adjacent string literals are silently
# concatenated by Python, which would merge two entries into one bogus name.
# NOTE(review): "os" and "pickle" are in the allow-list handed to the agents
# as additional_authorized_imports — confirm that exposure is intended.
AUTHORIZED_IMPORTS = [
    # Audio processing
    "wave", "speech_recognition", "pytube", "pytube3", "youtube_dl", "pydub", "pyAudioAnalysis",
    # Image/Video processing
    "cv2", "cv2.dnn", "cv2.imread", "pytesseract", "onnxruntime", "PIL", "PIL.Image", "bs4", "tesseract",
    # Data processing
    "numpy", "pandas", "sklearn", "scipy", "math", "hmmlearn",
    # File handling
    "base64", "io", "json", "os", "pickle", "openpyxl", "pyxlsb",
    # Visualization
    "pyplot", "matplotlib", "matplotlib.pyplot",
    # Utilities
    "logging", "yaml", "datetime", "typing", "markdownify", "requests", "chess",
]

# Inference client for the audio agent; the model id is the first entry
# listed under the "audio" role.
audio_model = InferenceClientModel(
    token=os.getenv("HUGGINGFACE_API_KEY"),
    model_id=MODEL_CHOICES["audio"][0],
    max_tokens=18000,
)

# Tools exposed to the audio agent.
_AUDIO_TOOLS = [
    transcribe_audio_tool,
    get_audio_from_file_path,
    noise_reduction,
    audio_segmentation,
    speaker_diarization,
]

# Description string passed to the agent constructor.
_AUDIO_DESCRIPTION = (
    "This agent is responsible for processing audio, loading mp3 audio and converting it to base64, "
    "reducing noise, segmenting audio and transcribing audio (in base64 format). "
    "It cannot process videos."
)

audio_agent = CodeAgent(
    name="audio_agent",
    description=_AUDIO_DESCRIPTION,
    model=audio_model,
    tools=_AUDIO_TOOLS,
    prompt_templates=PROMPT_TEMPLATE["audio_agent"],
    additional_authorized_imports=AUTHORIZED_IMPORTS,
    max_steps=4,
    planning_interval=4,
)

# Inference client for the vision agent; the model id is the first entry
# listed under the "vlm" role.
vlm_model = InferenceClientModel(
    token=os.getenv("HUGGINGFACE_API_KEY"),
    model_id=MODEL_CHOICES["vlm"][0],
    max_tokens=18000,
)

# Tools exposed to the vision agent.
_VLM_TOOLS = [
    image_processing,
    object_detection_tool,
    ocr_scan_tool,
    extract_images_from_video,
    get_image_from_file_path,
    get_video_from_file_path,
]

# Description string passed to the agent constructor.
_VLM_DESCRIPTION = (
    "This agent is responsible for downloading images or videos, processing images or videos, "
    "detecting objects in them and extracting text from them. "
    "It cannot process audios."
)

vlm_agent = CodeAgent(
    name="vlm_agent",
    description=_VLM_DESCRIPTION,
    model=vlm_model,
    tools=_VLM_TOOLS,
    prompt_templates=PROMPT_TEMPLATE["vlm_agent"],
    additional_authorized_imports=AUTHORIZED_IMPORTS,
    max_steps=4,
    planning_interval=4,
)

# Inference client for the math agent; the model id is the first entry
# listed under the "math" role.
math_model = InferenceClientModel(
    token=os.getenv("HUGGINGFACE_API_KEY"),
    model_id=MODEL_CHOICES["math"][0],
    max_tokens=6000,
)

# Arithmetic helpers followed by the dataframe loaders and accessors.
_MATH_TOOLS = [
    operate_two_numbers,
    convert_number,
    load_dataframe_from_csv,
    load_dataframe_from_excel,
    to_dataframe,
    to_json,
    get_dataframe_data,
    get_dataframe_column,
    get_dataframe_row,
    get_dataframe_groupby,
]

# Description string passed to the agent constructor.
_MATH_DESCRIPTION = (
    "This agent is responsible for performing arithmetic operations on two numbers. "
    "It can also perform dataframe operations such as converting data to a dataframe, "
    "performing calculations on such dataframe and converting the dataframe back to a json or a csv file"
)

# No prompt_templates argument is passed for this agent, unlike its siblings.
math_agent = CodeAgent(
    name="math_agent",
    description=_MATH_DESCRIPTION,
    model=math_model,
    tools=_MATH_TOOLS,
    additional_authorized_imports=AUTHORIZED_IMPORTS,
    max_steps=4,
    planning_interval=4,
)

# Inference client for the context-search agent; the model id is the first
# entry listed under the "context_search" role.
context_search_model = InferenceClientModel(
    token=os.getenv("HUGGINGFACE_API_KEY"),
    model_id=MODEL_CHOICES["context_search"][0],
    max_tokens=24000,
)

# Description string passed to the agent constructor.
_CONTEXT_SEARCH_DESCRIPTION = (
    "This agent is responsible for searching the web for context "
    "using wikipedia for general information and arxiv for scientific information."
)

context_search_agent = CodeAgent(
    name="context_search_agent",
    description=_CONTEXT_SEARCH_DESCRIPTION,
    model=context_search_model,
    # Fresh list built from the imported search_tools collection.
    tools=list(search_tools),
    prompt_templates=PROMPT_TEMPLATE["context_search_agent"],
    additional_authorized_imports=AUTHORIZED_IMPORTS,
    max_steps=4,
    planning_interval=4,
)

# Constructor arguments for the master agent's inference client; the model id
# is the first entry listed under the "master" role.
_MASTER_MODEL_KWARGS = {
    "model_id": MODEL_CHOICES["master"][0],
    "token": os.getenv("HUGGINGFACE_API_KEY"),
    "max_tokens": 24000,
}

master_model = InferenceClientModel(**_MASTER_MODEL_KWARGS)

class MasterAgentWrapper:
    """Own the master ``CodeAgent`` and attach browser tools for each run.

    Browser tools are obtained from ``browser_manager`` inside ``run`` and are
    registered on the agent only while that call is executing.

    NOTE(review): the tools are swapped in by reassigning the shared
    ``master_agent.tools`` attribute without any locking, so overlapping
    ``run`` calls on the same wrapper could observe each other's tool set —
    confirm that callers serialize access.
    """

    def __init__(self):
        """Create the base tool list and the master agent built on top of it."""
        self.base_tools = [
            sort_list,
            get_youtube_transcript_from_url,
            read_python_file_from_path,
            PythonInterpreterTool(),
            DuckDuckGoSearchTool(),
            tavily_search_tool,
            *community_tools,
        ]

        # NOTE(review): the description string mentions context_search, but
        # context_search_agent is not part of managed_agents — confirm which
        # of the two reflects the intended setup.
        self.master_agent = CodeAgent(
            model=master_model,
            managed_agents=[audio_agent, vlm_agent, math_agent],
            tools=self.base_tools,  # Browser tools are attached per run, not here
            add_base_tools=False,
            max_steps=20,  # Step budget handed to the agent for a single run
            additional_authorized_imports=AUTHORIZED_IMPORTS,
            verbosity_level=logging.INFO,
            planning_interval=5,
            prompt_templates=PROMPT_TEMPLATE["master_agent"],
            name="master_agent",
            description="This agent is responsible for managing audio, vlm, context_search and math agents."
        )

    def _run_with_browser_tools(self, question: str, browser_tools: List[Tool]) -> str:
        """Run the master agent with ``browser_tools`` temporarily registered.

        Args:
            question: Task text forwarded to ``master_agent.run``.
            browser_tools: Tools to add for this run, keyed by their ``name``
                attribute; a tool whose name matches an existing entry
                replaces it for the duration of the run.

        Returns:
            Whatever ``master_agent.run`` returns for ``question``.
        """
        # Snapshot the current name -> tool mapping so it can be put back
        # afterwards regardless of how the run ends.
        original_tools = self.master_agent.tools.copy()
        self.master_agent.tools = {
            **original_tools,
            **{tool.name: tool for tool in browser_tools},
        }

        try:
            return self.master_agent.run(question)
        finally:
            # Always drop the browser tools again, even if the run raised.
            self.master_agent.tools = original_tools

    def run(self, question: str) -> str:
        """Answer ``question`` using the master agent plus browser tools.

        Args:
            question: Task text to hand to the master agent.

        Returns:
            The result of the agent run.

        Raises:
            Exception: Any error from acquiring the browser tools or from the
                agent run is logged and re-raised unchanged.
        """
        try:
            # The browser tools are only valid inside this context manager,
            # so the agent run has to happen within it.
            with browser_manager.get_browser_tools() as browser_tools:
                return self._run_with_browser_tools(question, browser_tools)
        except Exception as e:
            # logging.exception keeps the traceback alongside the message.
            logging.exception("Error in master agent run: %s", e)
            raise

# Module-level wrapper instance shared by every caller of this module.
master_agent = MasterAgentWrapper()


# Function-style entry point retained for backward compatibility.
def run_master_agent(question: str) -> str:
    """Forward ``question`` to ``master_agent.run`` and return its result."""
    answer = master_agent.run(question)
    return answer

#TESTING 5