22Nikk0 commited on
Commit
4001ecc
·
verified ·
1 Parent(s): f572db7

Create agentic2.py

Browse files
Files changed (1) hide show
  1. agentic2.py +974 -0
agentic2.py ADDED
@@ -0,0 +1,974 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langgraph.graph import StateGraph, START, END
2
+ from typing_extensions import TypedDict, Annotated, Literal, Optional
3
+ from langchain_core.messages import AnyMessage, HumanMessage, SystemMessage
4
+ from langgraph.graph.message import add_messages
5
+ from langchain_mistralai import ChatMistralAI
6
+ from langchain_openai import ChatOpenAI
7
+ from langgraph.prebuilt import ToolNode, tools_condition
8
+ from langchain_core.runnables.graph import MermaidDrawMethod
9
+ from langchain_community.tools import DuckDuckGoSearchRun
10
+ from langchain_community.tools import WikipediaQueryRun
11
+ from langchain_community.utilities import WikipediaAPIWrapper
12
+ from langchain_aws import ChatBedrock
13
+ from langchain_google_genai import ChatGoogleGenerativeAI
14
+ from langchain_community.document_loaders import UnstructuredExcelLoader
15
+ # from langchain_google_vertexai import ChatVertexAI
16
+
17
+ # from langfuse.callback import CallbackHandler
18
+
19
+ import base64
20
+ import json
21
+ import time
22
+ import requests
23
+
24
+
25
+ # import boto3
26
+
27
+ from yt_dlp import YoutubeDL
28
+ import os
29
+ # from urllib.parse import urlparse, parse_qs
30
+ import re
31
+ from dotenv import load_dotenv
32
+
33
+ # Load env vars from .env file
34
+ load_dotenv()
35
+
36
+ # Initialize Langfuse CallbackHandler for LangGraph/Langchain (tracing)
37
+ # langfuse_handler = CallbackHandler()
38
+
39
+ ######## STATE ########
40
class State(TypedDict):
    """
    Shared LangGraph state passed between all nodes of the agent graph.

    Each specialized node writes its answer into its own *_result key;
    format_answer_node later collects every non-empty result.
    """
    # The user question being answered.
    question: str
    # Running chat history; add_messages appends instead of overwriting.
    messages: Annotated[list[AnyMessage], add_messages]
    # Original attachment name/path supplied with the task (may be empty).
    input_file: str
    # Local path of the file fetched by download_input_file, if any.
    downloaded_file: Optional[str]
    # Identifier used to fetch the attachment from the scoring API.
    task_id: str
    # Per-node answers (each node returns its own message(s) here).
    web_search_node_result: AnyMessage
    thinking_node_result: AnyMessage
    vision_node_result: AnyMessage
    video_node_result: AnyMessage
    audio_node_result: AnyMessage
    code_node_result: AnyMessage
    excel_node_result: AnyMessage
    # Routing decision produced by entry_node (node name or "END").
    next_node: str
57
+
58
+ ########################
59
+
60
+ ######## MODELS ########
61
def get_general_model():
    """
    Build the general-purpose chat model selected by the LLM_PROVIDER env var.

    Supported providers: "mistral" (default) and "aws" (Bedrock Nova Lite).
    Any unrecognized value falls back to Mistral instead of crashing
    (the previous code raised UnboundLocalError for unknown providers).

    Returns:
        A LangChain chat model instance.
    """
    llm_provider = os.getenv("LLM_PROVIDER", "mistral")

    if llm_provider == "aws":
        return ChatBedrock(
            model_id="arn:aws:bedrock:us-east-1:416545197702:inference-profile/us.amazon.nova-lite-v1:0",
            # provider="amazon",
            temperature=0,
            region_name="eu-west-3",
            aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
            aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
        )

    # Default path: "mistral" or any unknown provider value.
    return ChatMistralAI(
        model="mistral-large-2411",  # alternatives: "ministral-8b-latest", "mistral-small-latest"
        temperature=0,
        max_retries=2,
        api_key=os.getenv("MISTRAL_API_KEY"),
    )
84
+
85
def get_big_model():
    """
    Build the larger Mistral model used for heavier tasks (e.g. excel analysis).

    Returns:
        ChatMistralAI: a deterministic (temperature 0) medium-tier model.
    """
    model_settings = {
        "model": "mistral-medium-2505",
        "temperature": 0,
        "max_retries": 2,
        "api_key": os.getenv("MISTRAL_API_KEY"),
    }
    return ChatMistralAI(**model_settings)
95
+
96
def get_vision_model():
    """
    Build the vision (image understanding) model selected by VLM_PROVIDER.

    Supported providers: "openai" (gpt-4o) and "mistral" (pixtral, default).
    Any unrecognized value falls back to Mistral instead of crashing
    (the previous code raised UnboundLocalError for unknown providers).

    Returns:
        A LangChain chat model instance with vision capability.
    """
    vlm_provider = os.getenv("VLM_PROVIDER", "mistral")

    if vlm_provider == "openai":
        print("Spawning Open AI VLM")
        return ChatOpenAI(
            model="gpt-4o",
            temperature=0,
            max_tokens=None,
            timeout=None,
            max_retries=2,
            api_key=os.getenv("OPENAI_API_KEY"),
        )

    # Default path: "mistral" or any unknown provider value.
    print("Spawning Mistral VLM")
    return ChatMistralAI(
        model="pixtral-12b-2409",  # alternatives: "mistral-small-latest", "pixtral-large-latest"
        temperature=0,
        max_retries=2,
        api_key=os.getenv("MISTRAL_API_KEY"),
    )
121
+
122
def get_video_handler_model():
    """
    Build the Gemini model used for video understanding.

    Relies on the Google API key being present in the environment
    (picked up implicitly by ChatGoogleGenerativeAI).

    Returns:
        ChatGoogleGenerativeAI: deterministic gemini-2.0-flash instance.
    """
    return ChatGoogleGenerativeAI(
        model="gemini-2.0-flash",
        temperature=0,
        max_tokens=None,
        timeout=None,
        max_retries=2,
        # other params...
    )
134
+
135
def get_audio_handler_model():
    """
    Build the OpenAI audio-capable model used for audio understanding.

    Returns:
        ChatOpenAI: deterministic gpt-4o audio-preview instance.
    """
    audio_settings = {
        "model": "gpt-4o-audio-preview-2024-12-17",  # alternative: "gpt-4o-mini-audio-preview-2024-12-17"
        "temperature": 0,
        "max_tokens": None,
        "timeout": None,
        "max_retries": 2,
        "api_key": os.getenv("OPENAI_API_KEY"),
    }
    return ChatOpenAI(**audio_settings)
146
+
147
+ ########################
148
+
149
+ ######## Functions ########
150
+
151
def download_youtube_content(url: str, output_path: Optional[str] = None) -> Optional[str]:
    """
    Download a YouTube video in MP4 format and clean up any extra files.

    Args:
        url (str): URL of the YouTube video.
        output_path (str, optional): Directory to save the download.
            Defaults to './downloads'.

    Returns:
        Optional[str]: Path of the downloaded .mp4 file, or None if exactly
        one mp4 could not be identified in the output directory.
        (The previous docstring claimed -> None, but a path was returned.)
    """
    if output_path is None:
        output_path = os.path.join(os.getcwd(), 'downloads')

    # Create output directory if it doesn't exist
    os.makedirs(output_path, exist_ok=True)

    # Configure yt-dlp options for MP4 only
    ydl_opts = {
        'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
        'merge_output_format': 'mp4',
        'ignoreerrors': True,
        'no_warnings': False,
        'extract_flat': False,
        # Disable all additional downloads
        'writesubtitles': False,
        'writethumbnail': False,
        'writeautomaticsub': False,
        'postprocessors': [{
            'key': 'FFmpegVideoConvertor',
            'preferedformat': 'mp4',
        }],
        # Clean up options
        'keepvideo': False,
        'clean_infojson': True,
        'outtmpl': os.path.join(output_path, '%(title)s.%(ext)s'),
    }

    print("Detected single video URL. Downloading video...")

    try:
        with YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
        print(f"\nDownload completed successfully! Files saved to: {output_path}")
    except Exception as e:
        # Best-effort: we still scan the directory below for a usable mp4.
        print(f"An error occurred: {str(e)}")

    entries = os.listdir(output_path)

    # str.endswith is simpler and equivalent to the old r".*\.mp4$" regex.
    video_file_names = [name for name in entries if name.endswith(".mp4")]

    if len(video_file_names) == 1:
        video_file_name = os.path.join(output_path, video_file_names[0])
    else:
        # Zero or several mp4 files: ambiguous, signal failure with None.
        video_file_name = None

    # Remove every other artifact (thumbnails, partial files, ...).
    for other_file in entries:
        full_path = os.path.join(output_path, other_file)
        if full_path != video_file_name:
            print(f"Removing file: {other_file}")
            os.remove(full_path)

    return video_file_name
215
+
216
+
217
# Search tools shared by the web-search agent: DuckDuckGo full-web search
# plus a Wikipedia query tool backed by the default API wrapper.
web_search = DuckDuckGoSearchRun()
wikipedia_search = WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper())
219
+
220
def download_input_file(task_id: str) -> str:
    """
    Download the attachment associated with a task from the scoring API.

    The server advertises the original filename via the
    Content-Disposition header; the file is saved under ./downloads.

    Args:
        task_id (str): The task_id of the file to download.

    Returns:
        str: The path to the downloaded file, or "" on failure.
    """
    output_path = os.path.join(os.getcwd(), 'downloads')

    api_url = os.getenv("DEFAULT_API_URL")

    # Create output directory if it doesn't exist
    os.makedirs(output_path, exist_ok=True)

    # Construct the full URL
    url = f"{api_url}/files/{task_id}"

    try:
        # Stream the download; timeout avoids hanging forever on a dead server.
        response = requests.get(url, stream=True, timeout=60)
        response.raise_for_status()  # Raise an error for bad status codes

        # response.headers is case-insensitive; converting it to a plain dict
        # (as the previous code did) broke lookup when the server sent
        # "Content-Disposition" with capital letters.
        attachment = response.headers.get("content-disposition", "")

        regex_result = re.search(r'filename="(.*)"', attachment)
        if regex_result:
            filename = regex_result.group(1)
        else:
            # No usable header: fall back to the task id as filename
            # (previously this crashed with AttributeError on None).
            filename = task_id

        # Define the output file path
        output_file_path = os.path.join(output_path, filename)

        # Write the file to the output path in chunks.
        with open(output_file_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)

        print(f"File downloaded successfully and saved to: {output_file_path}")

        return output_file_path

    except requests.exceptions.RequestException as e:
        print(f"An error occurred while downloading the file: {str(e)}")
        return ""
268
+
269
+ ########################
270
+
271
+ ######## LLM associations ########
272
+
273
# Module-level model instances shared by all nodes.
# general_model: routing / reasoning / formatting (provider via LLM_PROVIDER).
general_model = get_general_model()
# big_model: heavier analysis (excel node).
big_model = get_big_model()

# Modality-specific models.
vision_model = get_vision_model()
video_handler_model = get_video_handler_model()
audio_handler_model = get_audio_handler_model()
279
+
280
+ ########################
281
+
282
+ ######## Nodes Definition ########
283
+
284
# Tools exposed to the web-search agent (DuckDuckGo + Wikipedia).
search_tools = [
    web_search,
    wikipedia_search,
]

# NOTE(review): declared but not bound to any agent below — appears unused;
# file downloading is done directly in entry_node instead. Confirm before removal.
download_file_tool = [ download_input_file ]

# General model with the search tools bound; sequential tool calls only.
web_search_node_agent = general_model.bind_tools(search_tools, parallel_tool_calls=False)
292
+
293
def thinking_node(state: State) -> dict:
    """
    A powerful node to answer general questions, reflection, maths, deduction, prediction.
    This node does not handle files
    This node does not handle images or pictures
    This node does not handle videos
    This node does not handle audio
    This node does not handle code

    Args:
        state (State): A dictionary containing the current state of the agent, including the 'question' key which holds the question to be answered.

    Returns:
        dict: A dictionary containing the response from the thinking node, with the key 'thinking_node_result' holding the list of messages generated by the general model.
    """

    prompt = f"""
    You are a powerful assistant that answers general questions, reflection, maths, deduction, prediction.

    1. You need to fully understand the question
    2. You must think hard about what is relevant in the question to make the best answer
    3. If there are calculations or maths, you need to verify twice before answering.
    4. Report your thought process in detail, explaining your reasoning step-by-step.

    Here is the question {state['question']}
    Now provide your response immediately without any preamble in text but not in markdown.
    """

    sys_msg = SystemMessage(content=prompt)

    # Only append a previous result when one exists; the previous code
    # mutated state in place and passed an empty string "" as a chat
    # message on the first invocation.
    messages = [sys_msg]
    previous_result = state.get("thinking_node_result")
    if previous_result:
        messages.append(previous_result)

    thinking_node_response = [general_model.invoke(messages)]

    thinking_node_response[-1].pretty_print()

    return {
        "thinking_node_result": thinking_node_response,
    }
332
+
333
def code_node(state: State) -> dict:
    """
    A powerful node to handle and understand code.
    This node does not handle images or pictures
    This node does not handle videos
    This node does not handle audio
    This node does not access the web

    Args:
        state (State): A dictionary containing the current state of the agent, including the 'question' key which holds the question to be answered and 'downloaded_file' pointing at the source file to analyze.

    Returns:
        dict: A dictionary containing the response from the code node, with the key 'code_node_result' holding the list of messages generated by the general model.
    """

    # Explicit encoding: source files are assumed UTF-8; the previous code
    # used the platform default encoding, which breaks on Windows.
    with open(state["downloaded_file"], "r", encoding="utf-8") as code_file:
        code = code_file.read()

    prompt = f"""
    You are a powerful assistant that handle and understand code.

    1. You need to fully understand the question.
    2. You must think hard about the code and predict the result to answer the question.
    3. Report your thought process in detail, explaining your reasoning step-by-step.

    Here is the question : {state['question']}
    Here is the code : {code}

    Now provide your response immediately without any preamble in text but not in markdown.
    """

    sys_msg = SystemMessage(content=prompt)

    code_node_response = [general_model.invoke([sys_msg])]

    code_node_response[-1].pretty_print()

    return {
        "code_node_result": code_node_response,
    }
373
+
374
def web_search_node(state: State) -> dict:
    """
    A powerful node to answer questions and make research on the web based on the question provided in the state.
    This node does not handle files
    This node does not handle images or pictures
    This node does not handle videos
    This node does not handle audio
    This node does not handle code

    Args:
        state (State): A dictionary containing the current state of the agent, including the 'question' key which holds the question to be answered.

    Returns:
        dict: A dictionary containing the response from the web search node, with the key 'web_search_node_result' holding the list of messages generated by the general model.
    """

    prompt = f"""
    You are a powerful assistant that makes research on the web in order to give the best answer to the question.

    1. You need to fully understand the question
    2. You must think hard about what is relevant in the question to make the best search with write words
    3. You must use the best of the tools you have to answer the question precisly
    4. Report your thought process in detail, explaining your reasoning step-by-step.
    5. You must not change the way words or identifiers are written in the web search results.

    Here are the tools available:
    web_search:
    {web_search.description}
    Args:
    {web_search.args_schema}
    Returns:
    {web_search.response_format}

    wikipedia_search:
    {wikipedia_search.description}
    Args:
    {wikipedia_search.args_schema}
    Returns:
    {wikipedia_search.response_format}

    Here is the question {state['question']}
    Now provide your response immediately without any preamble in text but not in markdown.
    """

    sys_msg = SystemMessage(content=prompt)

    # Only append a previous result when one exists; the previous code
    # mutated state in place and passed an empty string "" as a chat
    # message on the first invocation.
    messages = [sys_msg]
    previous_result = state.get("web_search_node_result")
    if previous_result:
        messages.append(previous_result)

    web_search_node_response = [web_search_node_agent.invoke(messages)]

    web_search_node_response[-1].pretty_print()

    return {
        "web_search_node_result": web_search_node_response,
    }
429
+
430
def vision_node(state: State) -> dict:
    """
    Vision model that can analyze images and pictures and answer questions about them.
    This node does not handle videos.
    This node does not handle audio.
    This node does not handle code.

    Args:
        state (State): A dictionary containing the current state of the agent, including the 'question' key which holds the question to be answered and the 'downloaded_file' key which holds the path to the image file.
    Returns:
        dict: A dictionary containing the response from the vision node, with the key 'vision_node_result' holding the list of messages generated by the vision model. Returns an empty dict on error (state left unchanged).
    """

    prompt = f"""
    You are a powerful vision assistant, you can analyze images and answer question about the picture

    1. You need to fully understand the question.
    2. You must think hard about what is relevant in the image to make the best answer to the question.
    3. Report your thought process in detail, explaining your reasoning step-by-step.

    Here is the question {state['question']}
    Now provide your response immediately without any preamble in text but not in markdown.
    """

    try:
        with open(state["downloaded_file"], "rb") as image_file:
            image_bytes = image_file.read()

        image_base64 = base64.b64encode(image_bytes).decode("utf-8")

        # Mistral and OpenAI expect different image payload shapes.
        mistral_image_handling = {
            "type": "image_url",
            "image_url": f"data:image/png;base64,{image_base64}",
        }

        openai_image_handling = {
            "type": "image",
            "source_type": "base64",
            "mime_type": "image/png",  # or image/jpeg, etc. — assumed PNG; TODO confirm from the file
            "data": image_base64,
        }

        vision_provider = os.getenv("VLM_PROVIDER", "mistral")

        if vision_provider == "openai":
            image_handling = openai_image_handling
        else:
            image_handling = mistral_image_handling

        message = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt,
                    },
                    image_handling
                ]
            }
        ]

        vision_node_response = [vision_model.invoke(
            input=message,
            # config={
            #     "callbacks": [langfuse_handler]
            # }
        )]

        vision_node_response[-1].pretty_print()

        return {
            "vision_node_result": vision_node_response
        }

    except Exception as e:
        # Best-effort node: log and leave state untouched so the graph
        # still reaches the formatting node.
        # (Previous message said "Error extracting text", which was misleading.)
        error_msg = f"Error analyzing image: {str(e)}"
        print(error_msg)
        return {}
511
+
512
def video_node(state: State) -> dict:
    """
    Video handler model that can analyze videos and answer questions about them.
    This node does not handle images or pictures.
    This node does not handle audio.
    This node does not handle code.

    Args:
        state (State): A dictionary containing the current state of the agent, including the 'question' key which holds the question to be answered.

    Returns:
        dict: A dictionary containing the response from the video handler node, with the key 'video_node_result' holding the list of messages generated by the video handler model.
        (The previous annotation claimed -> str; a dict is returned.)
    """

    prompt = f"""
    You are a highly capable video analysis assistant. Your task is to watch and analyze the provided video content and answer the user's question as accurately and concisely as possible.

    1. You need to fully understand the question.
    2. Carefully observe the video, paying attention to relevant details, actions, and context.
    3. Focus on the user's question.
    4. If the question requires counting, identifying, or describing, be precise and clear in your response.
    5. If you are unsure, state what you can infer from the video.
    6. Do not make up information that is not visible or inferable from the video.

    Here is the question {state['question']}
    Now provide your response immediately without any preamble in text but not in markdown.
    """

    if re.search(r'youtube\.com', state["question"]):
        # More flexible regex pattern to match YouTube URLs
        regex_result = re.search(r"(?P<youtube_url>https://(?:www\.)?youtube\.com/watch\?v=[a-zA-Z0-9_-]+)", state["question"])
        if regex_result:
            video_url = regex_result.group("youtube_url")
            downloaded_video = download_youtube_content(url=video_url)
        else:
            # Fallback if regex doesn't match
            print("Could not extract YouTube URL from question. Using question as fallback.")
            downloaded_video = state["downloaded_file"]
    else:
        downloaded_video = state["downloaded_file"]

    print(f"Downloaded video: {downloaded_video}")

    video_mime_type = "video/mp4"

    with open(downloaded_video, "rb") as video_file:
        encoded_video = base64.b64encode(video_file.read()).decode("utf-8")

    # Video is now fully held in memory as base64; drop the file.
    os.remove(downloaded_video)

    message = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": prompt,
                },
                {
                    "type": "media",
                    "data": encoded_video,  # Use base64 string directly
                    "mime_type": video_mime_type,
                },
            ]
        }
    ]

    video_node_response = [video_handler_model.invoke(
        input=message,
        # config={
        #     "callbacks": [langfuse_handler]
        # }
    )]

    video_node_response[-1].pretty_print()

    return {
        "video_node_result": video_node_response
    }
591
+
592
def audio_node(state: State) -> dict:
    """
    Audio handler model that can analyze audio and answer questions about it.
    This node does not handle images or pictures.
    This node does not handle video.
    This node does not handle code.

    Args:
        state (State): with question key inside

    Returns:
        dict: A dictionary containing the response from the audio handler node, with the key 'audio_node_result' holding the list of messages generated by the audio handler model.
        (The previous annotation claimed -> str; a dict is returned.)

    Raises:
        ValueError: if the downloaded file has no extension, so the audio
        format cannot be determined.
    """

    prompt = f"""
    You are a highly capable audio analysis assistant. Your task is to listen to and analyze the provided audio content and answer the user's question as accurately and concisely as possible.

    1. You need to fully understand the question.
    2. Carefully listen to the audio, paying attention to relevant details, actions, and context.
    3. Focus on the user's question.
    4. If the question requires counting, identifying, or describing, be precise and clear in your response.
    5. If you are unsure, state what you can infer from the audio.
    6. Do not make up information that is not audible or inferable from the audio.

    Here is the question {state['question']}
    Now provide your response immediately without any preamble in text but not in markdown.
    """

    downloaded_audio = state["downloaded_file"]

    print(f"Downloaded audio: {downloaded_audio}")

    # Guard the regex: the previous code called .group(1) on a possible
    # None and crashed with AttributeError for extension-less paths.
    extension_match = re.search(r'\.(\w+)$', downloaded_audio)
    if not extension_match:
        raise ValueError(f"Cannot determine audio format of: {downloaded_audio}")
    audio_format = extension_match.group(1)

    with open(downloaded_audio, "rb") as audio_file:
        encoded_audio = base64.b64encode(audio_file.read()).decode()

    # Audio is now fully held in memory as base64; drop the file.
    os.remove(downloaded_audio)

    message = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": prompt,
                },
                {
                    "type": "input_audio",
                    "input_audio": {
                        "data": encoded_audio,
                        "format": audio_format,
                    }
                },
            ]
        }
    ]

    audio_node_response = [audio_handler_model.invoke(
        input=message,
        # config={
        #     "callbacks": [langfuse_handler]
        # }
    )]

    audio_node_response[-1].pretty_print()

    return {
        "audio_node_result": audio_node_response
    }
662
+
663
def excel_node(state: State) -> dict:
    """
    Excel handler model that can analyze excel files and answer questions about it.
    This node does not handle images or pictures.
    This node does not handle video.
    This node does not handle code.
    This node does not handle audio.

    Args:
        state (State): with question key inside

    Returns:
        dict: A dictionary containing the response from the excel handler node, with the key 'excel_node_result' holding the message generated by the excel handler model.
    """

    # mode="elements" yields one Document per table element, including an
    # HTML rendering under metadata key 'text_as_html'.
    loader = UnstructuredExcelLoader(state["downloaded_file"], mode="elements")
    docs = loader.load()

    prompt = f"""
    You are a powerful assistant which handles excel files.

    1. You need to fully understand the question.
    2. You must analyze the excel file to answer the question.
    3. If the question requires counting, identifying, or describing, be precise and clear in your response.
    4. Do not make up information that is not in the excel file.

    Here is the question {state['question']}
    Here is the excel file loaded in a Document object: {docs}. You will find html content of the file in the 'text_as_html' key.

    Now provide your response immediately without any preamble in text but not in markdown.
    """
    # (Fixed prompt typo "htlm" -> "html" so the model looks up the right key.)

    response = big_model.invoke(
        input=prompt,
        # config={
        #     "callbacks": [langfuse_handler]
        # }
    )

    response.pretty_print()

    return {
        "excel_node_result": response
    }
707
+
708
def format_answer_node(state: State) -> dict:
    """
    Format answer node that formats the answer of the last node.
    This node does not handle images or pictures.
    This node does not handle video.
    This node does not handle audio.
    This node does not handle code.

    Args:
        state (State): with question key inside, and all other nodes results

    Returns:
        dict: A dictionary with key 'messages' holding the formatted final answer message.
    """

    prompt = """
    You are the best assistant for final answer formating.

    1. You must not change the content of the response of the last node.
    2. You must fully understand the question
    3. You must return the answer by following hard the format and the constraints
    4. Report your thought process in detail, explaining your reasoning step-by-step.

    5. Conclude your answer with the following template:
    FINAL ANSWER: [YOUR FINAL ANSWER]

    ## Response Format
    - If asked for a number:
        For exemple 'How many' or a question asking for a number result
        - Provide the number without commas, dollar signs, percent signs, or any units (unless specified).
        - Provide digits, not words
    - If asked for a string:
        - Write the string without articles (a, an, the).
        - Don't answer a full sentence when a short version is enough.
        - Do not use abbreviations (e.g., for cities).
        - Write digits in text but (e.g., "one" instead of "1") unless specified otherwise.
        - Start the first word with a capital letter.
    - If asked for a comma-separated list:
        - Apply the above rules for numbers and strings to each element in the list.
        - And take care of having a space after each comma.

    ## Constraints
    - You must not answer if the constraints above are not respected.
    - Your final answer should be provided in the format: FINAL ANSWER: [YOUR FINAL ANSWER]
    - Your final answer should be a number, a string, or a comma-separated list of numbers and/or strings, following the specified formatting rules.

    Now provide your response immediately without any preamble in text but not in markdown.
    """

    nodes_response = [HumanMessage(content="Here are the results of the previous nodes")]

    question = [HumanMessage(content=state["question"])]

    def _as_text(result) -> str:
        """Flatten a node result (message, list of messages, or raw value) to plain text."""
        # Nodes return lists of messages, so the common case is a list;
        # the previous code str()-ed the list and fed message reprs
        # (e.g. "[AIMessage(...)]") to the model.
        if isinstance(result, list):
            return "\n".join(_as_text(item) for item in result)
        if hasattr(result, "content"):
            return result.content
        return str(result)

    for node_result in ["web_search_node_result", "vision_node_result", "video_node_result", "audio_node_result", "thinking_node_result", "code_node_result", "excel_node_result"]:
        result = state.get(node_result, "")
        if result:
            nodes_response.append(HumanMessage(content=_as_text(result)))

    sys_msg = SystemMessage(content=prompt)

    response = [general_model.invoke([sys_msg] + state["messages"] + question + nodes_response)]

    return {
        "messages": response,
    }
778
+
779
+ ########################
780
+
781
+ ######## Entry Node ########
782
def entry_node(state: State) -> dict:
    """
    Router node: reads the question (and attachment extension, if any),
    downloads the attachment, and picks the specialized node to run next.

    Args:
        state (State): needs 'question', 'messages', and optionally
            'input_file' / 'task_id' for attachment handling.

    Returns:
        dict: {'next_node': <chosen node name or "END">,
               'downloaded_file': <local path or "">}
        (The previous annotation claimed -> str; a dict is returned.)
    """

    system_prompt = f"""
    You are a powerful assistant that handle the user message and manage other nodes in order to provide the best answer to the question.
    You do not handle images or pictures
    You do not handle videos
    You do not handle audio
    You do not handle code
    You do not handle excel files

    1. You need to fully understand the subject of the question
    2. You need to understand the subject of the question with the question itself and the file extension
    For example of extensions:
    - .py is for code
    - .wav or .mp3 is for audio
    - a youtube url is for video
    - a .jpg, .png, .jpeg is for image
    - a .xlsx or .xls is for excel
    3. You must think hard about what is relevant in the question to make the best choice for the next node
    4. You must not answer the question by yourself
    5. Report your thought process in detail, explaining your reasoning step-by-step.

    Here are the nodes you can choose:
    - thinking_node: {thinking_node.__doc__}
    - web_search_node: {web_search_node.__doc__}
    - vision_node: {vision_node.__doc__}
    - video_node: {video_node.__doc__}
    - audio_node: {audio_node.__doc__}
    - code_node: {code_node.__doc__}
    - excel_node: {excel_node.__doc__}

    Here is the question : {state['question']}
    Here is the file : {state.get("input_file", "no file to handle")}

    Now provide your response immediately.
    You must always respect this format in lower case: next node <the node name you choose>.
    """

    downloaded = ""
    # If there's an input file, download it directly:
    if state.get("input_file", None):
        downloaded = download_input_file(state.get("task_id"))

    sys_msg = SystemMessage(content=system_prompt)

    entry_node_response = [general_model.invoke([sys_msg] + state["messages"])]

    entry_node_response[-1].pretty_print()

    # Parse the "next node <name>" directive out of the model's free text.
    regex_result = re.search(r'.*next.*(?P<next_node>thinking_node|web_search_node|vision_node|video_node|audio_node|code_node|excel_node)', entry_node_response[-1].content, re.IGNORECASE)

    next_node = "END"
    if regex_result:
        # Extract the node name; lower-case to match the graph's node ids.
        next_node = regex_result.group("next_node").lower()

    print(f"Next node to invoke: {next_node}")

    return {
        "next_node": next_node,
        "downloaded_file": downloaded
    }
846
+
847
+ ########################
848
+
849
+ ######## Build Graph ########
850
+
851
def buildweb_search_graph():
    """Compile the web-search subgraph: an agent node wired to its tools.

    ``web_search_node`` may emit tool calls; ``tools_condition`` routes those
    to the ``tools`` node, which loops back so the agent can incorporate the
    tool output. When the model makes no tool call, ``tools_condition``
    routes to END.

    Returns:
        A compiled LangGraph runnable, usable as a single node in a parent
        graph (see ``build_graph``).
    """
    builder = StateGraph(State)
    builder.add_node("web_search_node", web_search_node)
    builder.add_node("tools", ToolNode(search_tools))

    builder.add_edge(START, "web_search_node")
    builder.add_conditional_edges(
        "web_search_node",
        tools_condition,
    )
    builder.add_edge("tools", "web_search_node")
    # NOTE: no direct web_search_node -> END edge here. tools_condition
    # already routes to END when there is no tool call; an unconditional
    # edge to END alongside the conditional ones would fire in parallel
    # with the "tools" branch and could end the run before tool results
    # come back.

    return builder.compile()
865
+
866
def build_graph():
    """Assemble and compile the top-level agent graph.

    ``entry_node`` inspects the question (and any attached file), records its
    routing decision in ``state["next_node"]``, and the graph dispatches to
    the matching specialized node. Every specialized node then hands off to
    ``format_answer_node``, which ends the run.
    """
    # Names of the specialized nodes entry_node can route to; the routing
    # map and the fan-in edges below are both derived from this tuple.
    specialized = (
        "web_search_node",
        "vision_node",
        "video_node",
        "audio_node",
        "code_node",
        "excel_node",
        "thinking_node",
    )

    builder = StateGraph(State)
    builder.add_node("entry_node", entry_node)
    builder.add_node("web_search_node", buildweb_search_graph())
    builder.add_node("vision_node", vision_node)
    builder.add_node("video_node", video_node)
    builder.add_node("audio_node", audio_node)
    builder.add_node("code_node", code_node)
    builder.add_node("thinking_node", thinking_node)
    builder.add_node("excel_node", excel_node)
    builder.add_node("format_answer_node", format_answer_node)

    builder.add_edge(START, "entry_node")

    # Route from entry_node using the decision it stored in state.
    builder.add_conditional_edges(
        "entry_node",
        lambda state: state["next_node"],
        {name: name for name in specialized},
    )

    # Every specialized node feeds the answer formatter, which ends the run.
    for name in specialized:
        builder.add_edge(name, "format_answer_node")
    builder.add_edge("format_answer_node", END)

    return builder.compile()
906
+
907
+ ########################
908
+
909
if __name__ == "__main__":

    agent_graph = build_graph()

    # Rendering the Mermaid diagram to PNG is deliberately disabled: it
    # relies on Pyppeteer launching a headless browser, which is unreliable
    # in this environment.
    # with open("graph.png", "wb") as f:
    #     f.write(agent_graph.get_graph(xray=True).draw_mermaid_png())

    # Expected answers, keyed by task_id, used to grade each run.
    with open("./responses.json", "r", encoding="utf-8") as responses:
        json_responses = json.load(responses)

    with open("questions.json", "r", encoding="utf-8") as questions:
        json_questions = json.load(questions)

    # Compiled once, outside the loop: extracts the text after the
    # "FINAL ANSWER:" marker emitted by format_answer_node.
    final_answer_re = re.compile(r"FINAL ANSWER:\s*(?P<answer>.*)$")

    # `item` (not `input`) so the builtin is not shadowed.
    for item in json_questions:

        question = item.get("question", "No question found")
        file_name = item.get("file_name", "")
        task_id = item.get("task_id", "")

        print(f"QUESTION : {question}")
        print(f"FILE: {file_name}")

        user_prompt = [HumanMessage(content="Can you answer the question please ?")]

        user_input = {"messages": user_prompt, "question": question, "input_file": file_name, "task_id": task_id}

        messages = agent_graph.invoke(
            input=user_input,
            config={
                "recursion_limit": 10,
                # "callbacks": [langfuse_handler]
            },
        )

        for m in messages["messages"]:
            m.pretty_print()

        # Grade the run: prefer the text after "FINAL ANSWER:", falling back
        # to the last line of the final message when the marker is absent.
        # (Explicit match check instead of the old bare `except:` around
        # .group(), which silently swallowed unrelated errors.)
        final_content = messages["messages"][-1].content
        match = final_answer_re.search(final_content)
        if match:
            answer = match.group("answer")
        else:
            answer = re.search(r"\s*(?P<answer>.*)$", final_content).group("answer")

        print(answer)
        if answer == json_responses.get(task_id, ""):
            print("The answer is correct !")
        else:
            print("The answer is incorrect !")
            print(f"Expected: {json_responses.get(task_id, '')}")
            print(f"Got: {answer}")
974
+