Spaces · Chris committed · Commit 6afa67b
Parent(s): b55bafd
Final 7.6.3

Files changed:
- src/agents/router.py (+319, -27)
- src/agents/web_researcher.py (+193, -121)
- src/tools/final_answer_tool.py (+183, -205)
- src/tools/web_search_tool.py (+78, -64)
src/agents/router.py
CHANGED
@@ -24,49 +24,43 @@ class RouterAgent:
 
     def process(self, state: GAIAAgentState) -> GAIAAgentState:
         """
-        Enhanced
         """
-        logger.info("🧭 Router: Starting multi-phase
-        state.add_processing_step("Router:
 
         try:
-            #
-
-            state.add_processing_step(f"Router: Structure = {structural_analysis['type']}")
 
-
-
-            state.add_processing_step(f"Router: Needs = {info_requirements['primary_need']}")
 
-            #
-
-            state.
 
-            #
-            agent_sequence = self._select_agent_sequence(execution_strategy, info_requirements)
-
-            # Store analysis in state for agents to use
             state.router_analysis = {
-                '
-                '
-                '
-                'sequence': agent_sequence
             }
 
-            logger.info(f"✅
-            state.add_processing_step(f"Router: Selected agents = {agent_sequence}")
 
-            # Set agent sequence for workflow
-            state.agent_sequence = agent_sequence
             return state
 
         except Exception as e:
-            error_msg = f"
             logger.error(error_msg)
             state.add_error(error_msg)
 
             # Fallback to basic routing
-            state.
             return state
 
     def route_question(self, state: GAIAAgentState) -> GAIAAgentState:

@@ -826,4 +820,302 @@ REASONING: [brief explanation]
         sequence.remove('synthesizer')
         sequence.append('synthesizer')
 
-        return sequence
@@ -24,49 +24,43 @@ class RouterAgent:
 
     def process(self, state: GAIAAgentState) -> GAIAAgentState:
         """
+        Enhanced router processing with improved classification and planning
         """
+        logger.info("🧭 Router: Starting enhanced multi-phase analysis")
+        state.add_processing_step("Router: Enhanced multi-phase question analysis")
 
         try:
+            # Enhanced classification
+            classification_result = self._classify_question_enhanced(state.question)
 
+            state.question_type = classification_result['question_type']
+            state.routing_decision = classification_result['reasoning']
 
+            # Select agents based on enhanced classification
+            agents = self._select_agents_for_type(classification_result)
+            state.selected_agents = agents
 
+            # Store enhanced analysis for downstream agents
             state.router_analysis = {
+                'classification': classification_result,
+                'selected_agents': [a.value for a in agents],
+                'confidence': classification_result['confidence']
             }
 
+            logger.info(f"✅ Enhanced routing: {classification_result['type']} -> {[a.value for a in agents]}")
 
             return state
 
         except Exception as e:
+            error_msg = f"Enhanced router analysis failed: {str(e)}"
             logger.error(error_msg)
             state.add_error(error_msg)
 
             # Fallback to basic routing
+            state.question_type = QuestionType.GENERAL_INQUIRY
+            state.selected_agents = [AgentRole.WEB_RESEARCHER, AgentRole.REASONING_AGENT, AgentRole.SYNTHESIZER]
+            state.routing_decision = f"Enhanced routing failed, using fallback: {error_msg}"
+
             return state
 
     def route_question(self, state: GAIAAgentState) -> GAIAAgentState:

@@ -826,4 +820,302 @@ REASONING: [brief explanation]
         sequence.remove('synthesizer')
         sequence.append('synthesizer')
 
+        return sequence
+
+    def _classify_question_enhanced(self, question: str) -> Dict[str, Any]:
+        """Enhanced question classification using better pattern matching and LLM analysis"""
+
+        question_lower = question.lower()
+
+        # Enhanced pattern classification
+        pattern_classification = self._classify_by_enhanced_patterns(question_lower, question)
+
+        # LLM-based classification for complex cases
+        llm_classification = self._classify_with_llm(question)
+
+        # Combine both approaches
+        final_classification = self._combine_classifications(pattern_classification, llm_classification, question)
+
+        logger.info(f"🤖 Enhanced classification: Pattern={pattern_classification['type']}, LLM={llm_classification['type']}, Final={final_classification['type']}")
+
+        return final_classification
+
+    def _classify_by_enhanced_patterns(self, question_lower: str, original_question: str) -> Dict[str, Any]:
+        """Enhanced pattern-based classification with better accuracy"""
+
+        # Mathematical/counting questions (high confidence patterns)
+        mathematical_patterns = [
+            r'\bhow many\b',
+            r'\bcount\b.*\b(of|the)\b',
+            r'\bnumber of\b',
+            r'\btotal\b.*\b(of|number)\b',
+            r'\bcalculate\b',
+            r'\bsum\b.*\bof\b',
+            r'\bhow much\b',
+            r'\bquantity\b'
+        ]
+
+        if any(re.search(pattern, question_lower) for pattern in mathematical_patterns):
+            # Check for temporal constraints
+            temporal_indicators = ['between', 'from', 'during', 'in', r'\b(19|20)\d{2}\b']
+            has_temporal = any(re.search(indicator, question_lower) for indicator in temporal_indicators)
+
+            return {
+                'type': 'mathematical',
+                'confidence': 0.9,
+                'subtype': 'temporal_counting' if has_temporal else 'general_counting',
+                'reasoning': 'Strong mathematical/counting indicators found'
+            }
+
+        # Text manipulation questions
+        text_manipulation_patterns = [
+            r'\bopposite\b',
+            r'\breverse\b',
+            r'\bbackwards\b',
+            r'\bdecode\b',
+            r'\btranslate\b',
+            r'\bconvert\b',
+            r'\.rewsna',  # Common in reversed text questions
+            r'\bcipher\b',
+            r'\bencrypt\b'
+        ]
+
+        if any(re.search(pattern, question_lower) for pattern in text_manipulation_patterns):
+            return {
+                'type': 'text_manipulation',
+                'confidence': 0.85,
+                'subtype': 'text_processing',
+                'reasoning': 'Text manipulation patterns detected'
+            }
+
+        # File/code processing questions
+        file_patterns = [
+            r'\battached\b.*\b(file|image|document|excel|csv|python|code)\b',
+            r'\bfile\b.*\b(contains|attached|uploaded)\b',
+            r'\b(image|photo|picture)\b.*\b(shows|contains|attached)\b',
+            r'\bcode\b.*\b(attached|file|script)\b',
+            r'\bspreadsheet\b',
+            r'\b\.py\b|\b\.csv\b|\b\.xlsx\b|\b\.png\b|\b\.jpg\b'
+        ]
+
+        if any(re.search(pattern, question_lower) for pattern in file_patterns):
+            return {
+                'type': 'file_processing',
+                'confidence': 0.9,
+                'subtype': 'file_analysis',
+                'reasoning': 'File processing indicators found'
+            }
+
+        # Web research questions (specific indicators)
+        web_research_patterns = [
+            r'\bwikipedia\b.*\barticle\b',
+            r'\bfeatured article\b',
+            r'\bpromoted\b.*\b(in|during)\b.*\b(19|20)\d{2}\b',
+            r'\bnominated\b.*\bby\b',
+            r'\byoutube\b.*\bvideo\b',
+            r'\bwatch\?v=\b',
+            r'\bhttps?://\b',
+            r'\bwebsite\b|\burl\b'
+        ]
+
+        if any(re.search(pattern, question_lower) for pattern in web_research_patterns):
+            return {
+                'type': 'web_research',
+                'confidence': 0.8,
+                'subtype': 'specific_lookup',
+                'reasoning': 'Web-specific content indicators found'
+            }
+
+        # Reasoning/analysis questions
+        reasoning_patterns = [
+            r'\banalyze\b|\banalysis\b',
+            r'\bcompare\b|\bcomparison\b',
+            r'\bexplain\b|\bexplanation\b',
+            r'\bwhy\b.*\b(is|are|was|were|do|does|did)\b',
+            r'\bhow\b.*\b(does|do|did|can|could|would)\b',
+            r'\bwhat.*difference\b',
+            r'\bwhat.*relationship\b'
+        ]
+
+        if any(re.search(pattern, question_lower) for pattern in reasoning_patterns):
+            return {
+                'type': 'reasoning',
+                'confidence': 0.7,
+                'subtype': 'analytical_reasoning',
+                'reasoning': 'Reasoning/analysis patterns detected'
+            }
+
+        # General factual questions
+        factual_patterns = [
+            r'\bwho\b.*\b(is|was|are|were)\b',
+            r'\bwhat\b.*\b(is|was|are|were)\b',
+            r'\bwhen\b.*\b(did|was|were|is|are)\b',
+            r'\bwhere\b.*\b(is|was|are|were)\b',
+            r'\bwhich\b.*\b(is|was|are|were)\b'
+        ]
+
+        if any(re.search(pattern, question_lower) for pattern in factual_patterns):
+            return {
+                'type': 'factual_lookup',
+                'confidence': 0.6,
+                'subtype': 'general_factual',
+                'reasoning': 'General factual question patterns'
+            }
+
+        # Default classification
+        return {
+            'type': 'general',
+            'confidence': 0.4,
+            'subtype': 'unclassified',
+            'reasoning': 'No specific patterns matched'
+        }
+
+    def _classify_with_llm(self, question: str) -> Dict[str, Any]:
+        """LLM-based classification for complex questions"""
+
+        classification_prompt = f"""
+Analyze this question and classify it into one of these categories:
+
+Categories:
+- mathematical: Questions asking for counts, calculations, quantities
+- text_manipulation: Questions involving text reversal, encoding, word puzzles
+- file_processing: Questions about attached files, images, code, data
+- web_research: Questions requiring web search, Wikipedia lookup, current information
+- reasoning: Questions requiring analysis, comparison, logical deduction
+- factual_lookup: Simple fact-based questions about people, places, events
+
+Question: {question}
+
+Respond with just the category name and a brief reason (max 10 words).
+Format: category_name: reason
+
+Classification:"""
+
+        try:
+            llm_result = self.llm_client.generate(
+                classification_prompt,
+                tier=ModelTier.ROUTER,  # Use fast model for classification
+                max_tokens=50
+            )
+
+            if llm_result.success:
+                response = llm_result.response.strip().lower()
+
+                # Parse the response
+                if ':' in response:
+                    category, reason = response.split(':', 1)
+                    category = category.strip()
+                    reason = reason.strip()
+                else:
+                    category = response.split()[0] if response.split() else 'general'
+                    reason = 'llm classification'
+
+                # Validate category
+                valid_categories = ['mathematical', 'text_manipulation', 'file_processing', 'web_research', 'reasoning', 'factual_lookup']
+                if category not in valid_categories:
+                    category = 'general'
+
+                return {
+                    'type': category,
+                    'confidence': 0.7,
+                    'reasoning': f'LLM: {reason}'
+                }
+            else:
+                return {
+                    'type': 'general',
+                    'confidence': 0.3,
+                    'reasoning': 'LLM classification failed'
+                }
+
+        except Exception as e:
+            logger.warning(f"LLM classification failed: {e}")
+            return {
+                'type': 'general',
+                'confidence': 0.3,
+                'reasoning': 'LLM classification error'
+            }
+
+    def _combine_classifications(self, pattern_result: Dict[str, Any], llm_result: Dict[str, Any], question: str) -> Dict[str, Any]:
+        """Combine pattern and LLM classifications for final decision"""
+
+        pattern_type = pattern_result['type']
+        pattern_confidence = pattern_result['confidence']
+        llm_type = llm_result['type']
+        llm_confidence = llm_result['confidence']
+
+        # If pattern matching has high confidence, trust it
+        if pattern_confidence >= 0.8:
+            final_type = pattern_type
+            final_confidence = pattern_confidence
+            reasoning = f"High confidence pattern match: {pattern_result['reasoning']}"
+
+        # If both agree, boost confidence
+        elif pattern_type == llm_type:
+            final_type = pattern_type
+            final_confidence = min(0.95, (pattern_confidence + llm_confidence) / 2 + 0.1)
+            reasoning = f"Pattern and LLM agree: {pattern_type}"
+
+        # If they disagree, use the one with higher confidence
+        elif pattern_confidence > llm_confidence:
+            final_type = pattern_type
+            final_confidence = pattern_confidence * 0.9  # Slight penalty for disagreement
+            reasoning = f"Pattern-based: {pattern_result['reasoning']}"
+        else:
+            final_type = llm_type
+            final_confidence = llm_confidence * 0.9  # Slight penalty for disagreement
+            reasoning = f"LLM-based: {llm_result['reasoning']}"
+
+        # Map to question types
+        type_mapping = {
+            'mathematical': QuestionType.QUANTITATIVE_ANALYSIS,
+            'text_manipulation': QuestionType.TEXT_MANIPULATION,
+            'file_processing': QuestionType.FILE_PROCESSING,
+            'web_research': QuestionType.WEB_RESEARCH,
+            'reasoning': QuestionType.COMPLEX_REASONING,
+            'factual_lookup': QuestionType.FACTUAL_LOOKUP,
+            'general': QuestionType.GENERAL_INQUIRY
+        }
+
+        question_type = type_mapping.get(final_type, QuestionType.GENERAL_INQUIRY)
+
+        return {
+            'type': final_type,
+            'question_type': question_type,
+            'confidence': final_confidence,
+            'reasoning': reasoning,
+            'pattern_result': pattern_result,
+            'llm_result': llm_result
+        }
+
+    def _select_agents_for_type(self, classification_result: Dict[str, Any]) -> List[AgentRole]:
+        """Select appropriate agents based on enhanced classification"""
+
+        question_type = classification_result['type']
+        confidence = classification_result['confidence']
+
+        # Agent selection based on question type
+        if question_type == 'mathematical':
+            agents = [AgentRole.WEB_RESEARCHER, AgentRole.REASONING_AGENT]
+        elif question_type == 'text_manipulation':
+            agents = [AgentRole.REASONING_AGENT]
+        elif question_type == 'file_processing':
+            agents = [AgentRole.FILE_PROCESSOR, AgentRole.REASONING_AGENT]
+        elif question_type == 'web_research':
+            agents = [AgentRole.WEB_RESEARCHER]
+        elif question_type == 'reasoning':
+            agents = [AgentRole.REASONING_AGENT, AgentRole.WEB_RESEARCHER]
+        elif question_type == 'factual_lookup':
+            agents = [AgentRole.WEB_RESEARCHER]
+        else:
+            # General questions - try multiple approaches
+            agents = [AgentRole.WEB_RESEARCHER, AgentRole.REASONING_AGENT]
+
+        # Always add synthesizer
+        agents.append(AgentRole.SYNTHESIZER)
+
+        # If confidence is low, add more agents for better coverage
+        if confidence < 0.6:
+            if AgentRole.WEB_RESEARCHER not in agents:
+                agents.insert(-1, AgentRole.WEB_RESEARCHER)  # Insert before synthesizer
+
+        return agents
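The heart of the new router is the decision rule in _combine_classifications: a high-confidence pattern match is trusted outright, agreement between the pattern and LLM classifiers boosts confidence, and on disagreement the more confident side wins with a small penalty. A minimal standalone sketch of that rule, with illustrative function names that are not from the repository:

def combine(pattern_type, pattern_conf, llm_type, llm_conf):
    # High-confidence pattern matches are trusted outright.
    if pattern_conf >= 0.8:
        return pattern_type, pattern_conf
    # Agreement between the two classifiers boosts confidence.
    if pattern_type == llm_type:
        return pattern_type, min(0.95, (pattern_conf + llm_conf) / 2 + 0.1)
    # On disagreement, the more confident side wins, with a small penalty.
    if pattern_conf > llm_conf:
        return pattern_type, pattern_conf * 0.9
    return llm_type, llm_conf * 0.9

print(combine('mathematical', 0.9, 'reasoning', 0.7))  # ('mathematical', 0.9)
print(combine('reasoning', 0.7, 'reasoning', 0.7))     # agreement lifts confidence to ~0.8

One consequence of this design is that the LLM classifier can never override a strong pattern match, which keeps cheap regex signals authoritative for well-structured questions.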
src/agents/web_researcher.py
CHANGED
@@ -589,23 +589,165 @@ class WebResearchAgent:
             return self._create_failure_result("YouTube research failed")
 
     def _research_web_general(self, state: GAIAAgentState) -> AgentResult:
-        """General web
 
         search_terms = self._extract_search_terms(state.question)
 
-        logger.info(f"Web
 
-        #
-
-
-            "action": "search",
-            "limit": 5
-        })
 
         if web_result.success and web_result.result.get('found'):
-
         else:
-            return self._create_failure_result("Web search failed")
 
     def _research_url_content(self, state: GAIAAgentState) -> AgentResult:
         """Extract and analyze content from specific URLs"""

@@ -760,128 +902,58 @@ class WebResearchAgent:
 
         return ' '.join(topic_words[:3]) if topic_words else "topic"
 
-    def _extract_search_terms(self, question: str, max_length: int =
         """
-
-        Prioritizes
         """
 
-        #
-
-        words = clean_question.split()
-
-        # Remove common stop words but keep question words
-        stop_words = {
-            'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
-            'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
-            'should', 'may', 'might', 'must', 'shall', 'can', 'to', 'of', 'in',
-            'on', 'at', 'by', 'for', 'with', 'from', 'as', 'but', 'or', 'and',
-            'if', 'then', 'than', 'this', 'that', 'these', 'those', 'i', 'you',
-            'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them'
-        }
 
-        #
-
 
-        #
-
 
-        # Extract
-
-
-            if len(phrase.strip()) > 0:
-                priority_terms.append(phrase.strip())
 
-        # Extract
-
 
-        #
-
-
-
-
-
-
-
 
-        #
-
-
-        number_matches = re.findall(r'\b\d{1,4}\b', question)
-        for num in number_matches:
-            # Skip very common/meaningless numbers and years already captured
-            if (num not in ['1', '2', '3', '4', '5', '10', '20', '19', '21', '22', '23', '24', '25'] and
-                num not in years and
-                len(num) > 1):  # Require at least 2 digits for meaningful numbers
-                # Only include if it appears in a meaningful context
-                if any(context in question.lower() for context in [
-                    f'{num} albums', f'{num} songs', f'{num} years', f'{num} people',
-                    f'{num} times', f'{num} days', f'{num} months', f'episode {num}',
-                    f'season {num}', f'volume {num}', f'part {num}'
-                ]):
-                    meaningful_numbers.append(num)
-
-        # Build search terms with priority
-        search_terms = []
-
-        # Add quoted phrases (highest priority)
-        search_terms.extend(priority_terms)
-
-        # Add proper nouns (high priority)
-        search_terms.extend(proper_nouns[:5])  # Limit to avoid duplication
-
-        # Add question words if present
-        for word in words:
-            if word in question_words and word not in search_terms:
-                search_terms.append(word)
 
-        #
-        search_terms
 
-        #
-
-
-
-            len(word) > 2 and
-            not word.isdigit()):  # Avoid random numbers
-                search_terms.append(word)
-
-            # Stop if we have enough terms
-            if len(' '.join(search_terms)) > max_length - 20:
-                break
-
-        # Add a few important numbers if space allows
-        if len(' '.join(search_terms)) < max_length - 10:
-            search_terms.extend(meaningful_numbers[:2])
-
-        # Join and clean up
-        search_query = ' '.join(search_terms)
-
-        # Remove duplicates while preserving order
-        seen = set()
-        unique_terms = []
-        for term in search_terms:
-            if term.lower() not in seen:
-                seen.add(term.lower())
-                unique_terms.append(term)
-
-        # Final cleanup and length check
-        final_query = ' '.join(unique_terms)
-        if len(final_query) > max_length:
-            # Truncate to fit
-            truncated_terms = []
-            current_length = 0
-            for term in unique_terms:
-                if current_length + len(term) + 1 <= max_length:
-                    truncated_terms.append(term)
-                    current_length += len(term) + 1
-                else:
-                    break
-            final_query = ' '.join(truncated_terms)
-
-        logger.info(f"📝 Optimized search terms: '{final_query}' from question: '{question[:50]}...'")
-        return final_query
 
     def _extract_youtube_info(self, question: str) -> str:
         """Extract YouTube URL or search terms"""
@@ -589,23 +589,165 @@ class WebResearchAgent:
             return self._create_failure_result("YouTube research failed")
 
     def _research_web_general(self, state: GAIAAgentState) -> AgentResult:
+        """General web research with enhanced result analysis"""
 
+        # Extract optimized search terms
         search_terms = self._extract_search_terms(state.question)
 
+        logger.info(f"Web research for: {search_terms}")
 
+        # Search the web
+        search_query = {"query": search_terms, "action": "search", "limit": 5}
+        web_result = self.web_search_tool.execute(search_query)
 
         if web_result.success and web_result.result.get('found'):
+            search_data = web_result.result
+
+            # Enhanced analysis with focused LLM processing
+            analysis_prompt = self._create_enhanced_analysis_prompt(state.question, search_data, search_terms)
+
+            # Use appropriate model tier based on complexity
+            model_tier = ModelTier.COMPLEX if state.complexity_assessment == "complex" else ModelTier.MAIN
+            llm_result = self.llm_client.generate(analysis_prompt, tier=model_tier, max_tokens=600)
+
+            if llm_result.success:
+                # Parse the LLM response for better confidence assessment
+                confidence = self._assess_answer_confidence(llm_result.response, state.question, search_data)
+
+                return AgentResult(
+                    agent_role=AgentRole.WEB_RESEARCHER,
+                    success=True,
+                    result=llm_result.response,
+                    confidence=confidence,
+                    reasoning=f"Enhanced web search analysis of {len(search_data.get('results', []))} sources for '{search_terms}'",
+                    tools_used=[ToolResult(
+                        tool_name="web_search",
+                        success=True,
+                        result=search_data,
+                        execution_time=web_result.execution_time
+                    )],
+                    model_used=llm_result.model_used,
+                    processing_time=web_result.execution_time + llm_result.response_time,
+                    cost_estimate=llm_result.cost_estimate
+                )
+            else:
+                # Fallback to best search result
+                results = search_data.get('results', [])
+                best_result = results[0] if results else {"title": "No results", "snippet": "No information found"}
+
+                return AgentResult(
+                    agent_role=AgentRole.WEB_RESEARCHER,
+                    success=True,
+                    result=f"Found: {best_result.get('title', 'Unknown')} - {best_result.get('snippet', 'No description')}",
+                    confidence=0.4,
+                    reasoning="Web search completed but analysis failed",
+                    tools_used=[ToolResult(
+                        tool_name="web_search",
+                        success=True,
+                        result=search_data,
+                        execution_time=web_result.execution_time
+                    )],
+                    model_used="fallback",
+                    processing_time=web_result.execution_time,
+                    cost_estimate=0.0
+                )
         else:
+            return self._create_failure_result(f"Web search failed for '{search_terms}': {web_result.result.get('message', 'Unknown error')}")
+
+    def _create_enhanced_analysis_prompt(self, question: str, search_data: Dict[str, Any], search_terms: str) -> str:
+        """Create enhanced analysis prompt for better result processing"""
+
+        results = search_data.get('results', [])
+        search_source = search_data.get('source', 'web')
+
+        # Format search results concisely
+        formatted_results = []
+        for i, result in enumerate(results[:4], 1):  # Limit to top 4 results
+            title = result.get('title', 'No title')
+            snippet = result.get('snippet', 'No description')
+            url = result.get('url', '')
+            source = result.get('source', search_source)
+
+            formatted_results.append(f"""
+Result {i} ({source}):
+Title: {title}
+Content: {snippet}
+URL: {url}
+""")
+
+        # Create focused analysis prompt
+        prompt = f"""
+You are analyzing web search results to answer a specific question. Provide a direct, accurate answer based on the search findings.
+
+Question: {question}
+
+Search Terms Used: {search_terms}
+
+Search Results:
+{''.join(formatted_results)}
+
+Instructions:
+1. Carefully read through all the search results
+2. Look for information that directly answers the question
+3. If you find a clear answer, state it concisely
+4. If the information is incomplete, state what you found and what's missing
+5. If you find no relevant information, clearly state that
+6. For questions asking for specific numbers, dates, or names, be precise
+7. Always base your answer on the search results provided
+
+Provide your analysis and answer:"""
+
+        return prompt
+
+    def _assess_answer_confidence(self, answer: str, question: str, search_data: Dict[str, Any]) -> float:
+        """Assess confidence in the answer based on various factors"""
+
+        # Base confidence factors
+        confidence = 0.5  # Start with medium confidence
+
+        # Factor 1: Search result quality
+        results = search_data.get('results', [])
+        if len(results) >= 3:
+            confidence += 0.1  # More results = higher confidence
+
+        # Factor 2: Source quality
+        source = search_data.get('source', 'unknown')
+        if source == 'Wikipedia':
+            confidence += 0.15  # Wikipedia is generally reliable
+        elif source == 'DuckDuckGo':
+            confidence += 0.1  # General web search
+
+        # Factor 3: Answer specificity
+        answer_lower = answer.lower()
+        if any(indicator in answer_lower for indicator in [
+            'no information', 'not found', 'unclear', 'unable to determine',
+            'cannot find', 'no clear answer', 'insufficient information'
+        ]):
+            confidence -= 0.2  # Reduce confidence for uncertain answers
+
+        # Factor 4: Answer contains specific details
+        detail_patterns = [
+            re.compile(r'\b\d{4}\b'),        # Years
+            re.compile(r'\b\d+\b'),          # Numbers
+            re.compile(r'\b[A-Z][a-z]+\b')   # Proper nouns
+        ]
+        if any(p.search(answer) for p in detail_patterns):
+            confidence += 0.1  # Specific details increase confidence
+
+        # Factor 5: Answer length (very short answers might be incomplete)
+        if len(answer.split()) < 5:
+            confidence -= 0.1
+        elif len(answer.split()) > 50:
+            confidence += 0.05  # Detailed answers
+
+        # Factor 6: Question type matching
+        question_lower = question.lower()
+        if 'how many' in question_lower and re.search(r'\b\d+\b', answer):
+            confidence += 0.15  # Numerical answer to numerical question
+        elif any(q_word in question_lower for q_word in ['who', 'what', 'when', 'where']) and len(answer.split()) > 3:
+            confidence += 0.1  # Substantial answer to factual question
+
+        # Ensure confidence stays within bounds
+        return max(0.1, min(0.95, confidence))
 
     def _research_url_content(self, state: GAIAAgentState) -> AgentResult:
         """Extract and analyze content from specific URLs"""

@@ -760,128 +902,58 @@ class WebResearchAgent:
 
         return ' '.join(topic_words[:3]) if topic_words else "topic"
 
+    def _extract_search_terms(self, question: str, max_length: int = 180) -> str:
         """
+        Improved search term extraction for better web search results
+        Prioritizes entities, dates, and specific terms
        """
+        # Remove common question words first
+        question_clean = re.sub(r'\b(what|who|when|where|why|how|is|are|was|were|did|do|does|can|could|should|would|please|tell|me|find|about)\b', '', question.lower())
 
+        # Extract key patterns first
+        entities = []
 
+        # Extract quoted phrases (highest priority)
+        quoted_phrases = re.findall(r'"([^"]+)"', question)
+        entities.extend(quoted_phrases)
 
+        # Extract proper nouns (names, places, organizations)
+        proper_nouns = re.findall(r'\b[A-Z][a-zA-Z]*(?:\s+[A-Z][a-zA-Z]*)*\b', question)
+        # Filter out common question words that might be capitalized
+        filtered_nouns = [noun for noun in proper_nouns if noun.lower() not in {'you', 'i', 'me', 'my', 'the', 'a', 'an'}]
+        entities.extend(filtered_nouns[:4])  # Limit to top 4
 
+        # Extract years and dates (high priority for temporal questions)
+        years = re.findall(r'\b(?:19|20)\d{2}\b', question)  # non-capturing group so findall returns full years
+        entities.extend(years)
 
+        # Extract important numbers that might be quantities
+        numbers = re.findall(r'\b\d+\b', question)
+        entities.extend(numbers[:2])  # Limit to first 2 numbers
 
+        # If we have good entities, use them primarily
+        if entities:
+            search_terms = ' '.join(entities[:8])  # Use top 8 entities
+        else:
+            # Fallback: clean the question and extract key words
+            words = question_clean.split()
+            # Remove very common words
+            stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'from', 'up', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'between', 'among', 'this', 'that', 'these', 'those', 'many', 'some', 'all', 'any', 'most', 'other', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 'just', 'now', 'here', 'there', 'then', 'them', 'they', 'their', 'would', 'could', 'should', 'will', 'can', 'may', 'might', 'must'}
+            filtered_words = [w for w in words if w.lower() not in stop_words and len(w) > 2]
+            search_terms = ' '.join(filtered_words[:10])  # Use top 10 content words
 
+        # Clean up the search terms
+        search_terms = re.sub(r'\s+', ' ', search_terms)  # Remove multiple spaces
+        search_terms = search_terms.strip()
 
+        # Ensure we don't exceed max length
+        if len(search_terms) > max_length:
+            search_terms = search_terms[:max_length].rsplit(' ', 1)[0]  # Cut at word boundary
 
+        # Log the extraction for debugging
+        logger.info(f"📝 Optimized search terms: '{search_terms}' from question: '{question[:100]}...'")
+
+        return search_terms.strip()
 
     def _extract_youtube_info(self, question: str) -> str:
         """Extract YouTube URL or search terms"""
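The rewritten _extract_search_terms is entity-first: quoted phrases, proper nouns, years, and numbers are collected in priority order, and the stop-word fallback only runs when no entities are found. A trimmed, runnable sketch of that extraction (question-word removal and the number/stop-word fallback are omitted here for brevity; the question is an illustrative example):

import re

def extract_terms(question: str, max_length: int = 180) -> str:
    entities = []
    entities.extend(re.findall(r'"([^"]+)"', question))               # quoted phrases first
    nouns = re.findall(r'\b[A-Z][a-zA-Z]*(?:\s+[A-Z][a-zA-Z]*)*\b', question)
    entities.extend([n for n in nouns if n.lower() not in {'you', 'i', 'the'}][:4])
    entities.extend(re.findall(r'\b(?:19|20)\d{2}\b', question))      # full years via non-capturing group
    terms = ' '.join(entities[:8]).strip()
    if len(terms) > max_length:
        terms = terms[:max_length].rsplit(' ', 1)[0]                  # cut at a word boundary
    return terms

print(extract_terms('How many studio albums were released by Queen between 1973 and 1980?'))
# -> 'How Queen 1973 1980'

Note that a capturing group like (19|20) would make re.findall return only '19'/'20'; the non-capturing (?:19|20) is what keeps full four-digit years in the entity list.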
src/tools/final_answer_tool.py
CHANGED
@@ -21,248 +21,226 @@ class FinalAnswerTool:
     def __init__(self, llm_client: QwenClient):
         self.llm_client = llm_client
 
-    def extract_final_answer(self, question: str, agent_results: str, question_type: str = "") -> Dict[str, Any]:
         """
-        Extract
-
-        Args:
-            question: The original GAIA question
-            agent_results: Combined results from multiple agents
-            question_type: Type of question (for specialized extraction)
-
-        Returns:
-            Dict with extracted answer, confidence, and reasoning
         """
         try:
-
-
-            # Create specialized extraction prompt
             extraction_prompt = self._create_extraction_prompt(question, agent_results, question_type)
 
             # Use 72B model for precise extraction
-
                 extraction_prompt,
-                tier=ModelTier.COMPLEX,  #
-                max_tokens=
             )
 
-            if
-
                 return {
-                    "answer":
-                    "confidence":
-                    "reasoning": f"
                 }
-
-
-
-
-            # Validate answer format
-            validation_result = self._validate_answer(extracted_answer, question_type)
-
-            logger.info(f"✅ Final answer extracted: '{extracted_answer}'")
-
-            return {
-                "answer": extracted_answer,
-                "confidence": validation_result["confidence"],
-                "reasoning": f"Extracted using 72B model. Validation: {validation_result['status']}"
-            }
-
         except Exception as e:
-
-
-            return {
-                "answer": "Extraction error",
-                "confidence": 0.0,
-                "reasoning": error_msg
-            }
 
     def _create_extraction_prompt(self, question: str, agent_results: str, question_type: str) -> str:
         """Create specialized extraction prompt based on question type"""
 
-
-        CRITICAL:
-        Your response must be ONLY the
-
-        Question: {question}
-
-
-        {agent_results}
-
-
-
 
-        # Add type-specific rules
-        if "mathematical" in question_type.lower() or any(word in question.lower() for word in ["how many", "count", "number", "albums"]):
-            base_prompt += """
-            - If asking for a count/number: respond with ONLY the number (e.g., "5", "23", "0")
-            - If asking for calculation: respond with ONLY the result (e.g., "42", "3.14", "100")
-            - No units unless specifically requested in the question
-            """
-        elif "text_manipulation" in question_type.lower() or "reverse" in question.lower():
-            base_prompt += """
-            - If text is reversed: provide the corrected text
-            - If asking for opposite: provide ONLY the opposite word (e.g., "right" for opposite of "left")
-            - If asking to decode: provide ONLY the decoded answer
-            """
-        elif "yes" in question.lower() or "true" in question.lower() or "false" in question.lower():
-            base_prompt += """
-            - If yes/no question: respond with ONLY "yes" or "no" (lowercase)
-            - If true/false question: respond with ONLY "true" or "false" (lowercase)
-            """
-        elif any(word in question.lower() for word in ["name", "who", "which person"]):
-            base_prompt += """
-            - If asking for a name: provide ONLY the name (e.g., "John Smith", "Einstein")
-            - If asking for first name only: provide ONLY first name (e.g., "John")
-            - If asking for last name only: provide ONLY last name (e.g., "Smith")
-            """
-        elif any(word in question.lower() for word in ["where", "location", "city", "country"]):
-            base_prompt += """
-            - If asking for location: provide ONLY the location name (e.g., "Paris", "USA", "New York")
-            - No additional descriptors unless specifically requested
-            """
         else:
-
-
-
-
-            """
 
-
-
-
-        - Question: "How many albums?" → Answer: "5"
-        - Question: "What is the opposite of left?" → Answer: "right"
-        - Question: "True or false?" → Answer: "true"
-        - Question: "Who discovered X?" → Answer: "Einstein"
-        - Question: "Which city?" → Answer: "London"
-
-        Extract the precise answer NOW:"""
 
-        return
 
-    def
-        """Clean and
 
-        # Remove common
         answer = raw_answer.strip()
 
-        # Remove common prefixes
         prefixes_to_remove = [
-            "the answer is",
-            "
-            "final answer
-            "result:",
-            "response:",
-            "conclusion:",
-            "based on",
-            "according to",
-            "from the",
         ]
 
         for prefix in prefixes_to_remove:
-            if
                 answer = answer[len(prefix):].strip()
 
         # Remove quotes if they wrap the entire answer
         if answer.startswith('"') and answer.endswith('"'):
             answer = answer[1:-1]
-
             answer = answer[1:-1]
 
-        #
-
-
-        # For different question types, extract differently
-        if "mathematical" in question_type.lower() or any(word in question.lower() for word in ["how many", "count", "number", "albums"]):
-            # Extract just the number for mathematical questions
-            number_match = re.search(r'-?\d+(?:\.\d+)?', answer)
-            if number_match:
-                answer = number_match.group()
-        elif "name" in question_type.lower() or any(word in question.lower() for word in ["who", "name"]):
-            # Extract just the name (first few words)
-            words = answer.split()
-            if len(words) > 3:
-                answer = ' '.join(words[:3])  # Keep only first 3 words for names
-        elif "location" in question_type.lower() or any(word in question.lower() for word in ["where", "city", "country"]):
-            # Extract just the location name
-            words = answer.split()
-            if len(words) > 2:
-                answer = ' '.join(words[:2])  # Keep only first 2 words for locations
-        elif "yes_no" in question_type.lower() or any(word in answer.lower() for word in ["yes", "no", "true", "false"]):
-            # Extract yes/no/true/false
-            if any(word in answer.lower() for word in ["yes", "no", "true", "false"]):
-                for word in answer.lower().split():
-                    if word in ["yes", "no", "true", "false"]:
-                        answer = word
-                        break
-        else:
-            # For other types, take first sentence or clause
-            sentences = re.split(r'[.!?]', answer)
-            if sentences:
-                answer = sentences[0].strip()
-                # If still too long, take first clause
-                if len(answer) > 30:
-                    clauses = re.split(r'[,;:]', answer)
-                    if clauses:
-                        answer = clauses[0].strip()
-
-        # Handle specific formatting based on question type
-        if "text_manipulation" in question_type.lower():
-            # For reversed text questions, ensure clean output
-            if len(answer.split()) == 1:  # Single word answer
-                answer = answer.lower()
-
-        # Final aggressive truncation if still too long
-        if len(answer) > 40:
-            # Split into words and take as many as fit
-            words = answer.split()
-            truncated_words = []
-            current_length = 0
-            for word in words:
-                if current_length + len(word) + 1 <= 40:
-                    truncated_words.append(word)
-                    current_length += len(word) + 1
-                else:
-                    break
-            if truncated_words:
-                answer = ' '.join(truncated_words)
-            else:
-                # Last resort - take first 40 characters
-                answer = answer[:40].strip()
-
-        # Remove any trailing punctuation that's not part of the answer
-        answer = answer.rstrip('.,!?;:')
 
-
-
-
-
-
-
-            return {"status": "empty_answer", "confidence": 0.0}
 
-
-
-
 
-        #
-
-        if re.match(r'^-?\d+(?:\.\d+)?$', answer):
-            return {"status": "valid_number", "confidence": 0.9}
-        else:
-            return {"status": "invalid_number_format", "confidence": 0.5}
 
-
-
-
-
-
 
-
-
-
-
-
-
-
@@ -21,248 +21,226 @@ class FinalAnswerTool:
     def __init__(self, llm_client: QwenClient):
         self.llm_client = llm_client
 
+    def extract_final_answer(self, question: str, agent_results: str, question_type: str = "general") -> Dict[str, Any]:
         """
+        Extract GAIA-compliant final answer with enhanced accuracy
         """
+        logger.info("🎯 Extracting GAIA-compliant final answer")
+
         try:
+            # Create specialized extraction prompt based on question type
            extraction_prompt = self._create_extraction_prompt(question, agent_results, question_type)
 
            # Use 72B model for precise extraction
+            llm_result = self.llm_client.generate(
                extraction_prompt,
+                tier=ModelTier.COMPLEX,  # Always use most capable model
+                max_tokens=100,  # Keep answer concise
+                temperature=0.1  # Lower temperature for consistency
            )
 
+            if llm_result.success:
+                # Clean and validate the extracted answer
+                raw_answer = llm_result.response.strip()
+                final_answer = self._clean_and_validate_answer(raw_answer, question, question_type)
+
+                # Assess answer quality
+                confidence = self._assess_answer_quality(final_answer, question, agent_results, question_type)
+
                return {
+                    "answer": final_answer,
+                    "confidence": confidence,
+                    "reasoning": f"Extracted from {question_type} analysis using 72B model",
+                    "raw_response": raw_answer,
+                    "validation_passed": len(final_answer) <= 100 and len(final_answer) > 0
                }
+            else:
+                # Fallback to simple extraction
+                return self._fallback_extraction(question, agent_results)
+
        except Exception as e:
+            logger.error(f"Final answer extraction failed: {e}")
+            return self._fallback_extraction(question, agent_results)
 
     def _create_extraction_prompt(self, question: str, agent_results: str, question_type: str) -> str:
         """Create specialized extraction prompt based on question type"""
 
+        base_instructions = """
+CRITICAL: Extract the exact answer for GAIA benchmark evaluation.
+Your response must be ONLY the answer - no explanations, no prefixes, no extra text.
+
+Question: {question}
+
+Analysis from agents:
+{agent_results}
+
+"""
+
+        # Specialized instructions based on question type
+        if question_type == "mathematical" or "how many" in question.lower():
+            type_instructions = """
+This is a counting/mathematical question. Respond with ONLY the number.
+Examples of correct responses: "5", "42", "0"
+Do NOT include words like "albums", "songs", "items", etc.
+"""
+
+        elif question_type == "yes_no":
+            type_instructions = """
+This is a yes/no question. Respond with ONLY "yes" or "no".
+"""
+
+        elif question_type == "name" or any(word in question.lower() for word in ["who", "name"]):
+            type_instructions = """
+This is asking for a name. Respond with ONLY the name requested.
+Examples: "John Smith", "Mike102", "Einstein"
+"""
+
+        elif question_type == "location":
+            type_instructions = """
+This is asking for a location. Respond with ONLY the location name.
+Examples: "Paris", "New York", "LIE", "Hanoi"
+"""
+
+        elif question_type == "text_manipulation":
+            type_instructions = """
+This involves text manipulation. Respond with ONLY the processed text result.
+Examples: "right", "hello", "12345"
+"""
 
        else:
+            type_instructions = """
+Respond with ONLY the direct answer requested.
+Keep it concise and specific.
+"""
 
+        ending_instructions = """
+
+EXTRACT ONLY THE ANSWER:"""
 
+        return base_instructions.format(
+            question=question,
+            agent_results=agent_results[:2000]  # Limit input length
+        ) + type_instructions + ending_instructions
 
+    def _clean_and_validate_answer(self, raw_answer: str, question: str, question_type: str) -> str:
+        """Clean and validate the extracted answer"""
 
+        # Remove common prefixes and suffixes
        answer = raw_answer.strip()
 
+        # Remove common answer prefixes
        prefixes_to_remove = [
+            "final answer:", "answer:", "the answer is:", "result:", "conclusion:",
+            "based on", "according to", "therefore", "thus", "so", "hence",
+            "final answer is", "the result is", "it is", "this is"
        ]
 
+        answer_lower = answer.lower()
        for prefix in prefixes_to_remove:
+            if answer_lower.startswith(prefix):
                answer = answer[len(prefix):].strip()
+                answer_lower = answer.lower()
 
        # Remove quotes if they wrap the entire answer
        if answer.startswith('"') and answer.endswith('"'):
            answer = answer[1:-1]
+        elif answer.startswith("'") and answer.endswith("'"):
            answer = answer[1:-1]
 
+        # Remove trailing punctuation that's not part of the answer
+        while answer and answer[-1] in '.!?:;':
+            answer = answer[:-1]
 
+        # Special handling for different question types
+        if question_type == "mathematical" or "how many" in question.lower():
+            # Extract just the number
+            numbers = re.findall(r'\b\d+\b', answer)
+            if numbers:
+                answer = numbers[0]
 
+        elif question_type == "yes_no":
+            # Normalize yes/no answers
+            if any(word in answer.lower() for word in ['yes', 'true', 'correct', 'right']):
+                answer = "yes"
+            elif any(word in answer.lower() for word in ['no', 'false', 'incorrect', 'wrong']):
+                answer = "no"
 
+        # Final cleanup
+        answer = answer.strip()
 
+        # Ensure answer is not empty
+        if not answer:
+            # Try to extract from the original raw answer
+            words = raw_answer.split()
+            if words:
+                answer = words[-1]  # Take the last word as fallback
 
+        return answer
+
+    def _assess_answer_quality(self, answer: str, question: str, agent_results: str, question_type: str) -> float:
+        """Assess the quality/confidence of the extracted answer"""
+
+        confidence = 0.7  # Base confidence
+
+        # Factor 1: Answer length appropriateness
+        if len(answer) == 0:
+            return 0.1  # Very low confidence for empty answers
+        elif len(answer) > 100:
+            confidence -= 0.2  # Too long for GAIA
+        elif 1 <= len(answer) <= 50:
+            confidence += 0.1  # Good length
+
+        # Factor 2: Question type matching
+        question_lower = question.lower()
+
+        if ("how many" in question_lower or question_type == "mathematical") and re.match(r'^\d+$', answer):
+            confidence += 0.15  # Numeric answer to counting question
+        elif ("who" in question_lower or "name" in question_lower) and len(answer.split()) <= 3:
+            confidence += 0.1  # Name-like answer to who question
+        elif ("where" in question_lower) and len(answer.split()) <= 2:
+            confidence += 0.1  # Location-like answer
+        elif ("yes or no" in question_lower) and answer.lower() in ["yes", "no"]:
+            confidence += 0.15  # Perfect yes/no answer
+
+        # Factor 3: Answer appears in agent results (indicates it was found)
+        if answer.lower() in agent_results.lower():
+            confidence += 0.1
+
+        # Factor 4: Answer specificity
+        if re.search(r'\b\d{4}\b', answer):  # Contains year
+            confidence += 0.05
+        if re.search(r'\b[A-Z][a-z]+\b', answer):  # Contains proper noun
+            confidence += 0.05
+
+        # Factor 5: Common failure patterns
+        failure_indicators = ['unknown', 'unclear', 'not found', 'unable to determine', 'no information']
+        if any(indicator in answer.lower() for indicator in failure_indicators):
+            confidence -= 0.3
+
+        return max(0.1, min(0.95, confidence))
+
+    def _fallback_extraction(self, question: str, agent_results: str) -> Dict[str, Any]:
+        """Simple fallback when LLM extraction fails"""
+
+        # Try to extract a reasonable answer from agent results
+        lines = agent_results.split('\n')
+
+        # Look for lines that might contain answers
+        potential_answers = []
+        for line in lines:
+            line = line.strip()
+            if len(line) > 0 and len(line) < 100:
+                # Skip lines that are clearly explanatory
+                if not any(word in line.lower() for word in ['according', 'based on', 'however', 'therefore', 'because']):
+                    potential_answers.append(line)
+
+        # Use the first reasonable answer or a fallback
+        answer = potential_answers[0] if potential_answers else "Unable to determine"
+
+        return {
+            "answer": answer,
+            "confidence": 0.3,
+            "reasoning": "Fallback extraction due to LLM failure",
+            "raw_response": agent_results[:100],
+            "validation_passed": False
+        }
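The cleaning pipeline in _clean_and_validate_answer is essentially: strip lead-in phrases, unwrap quotes, drop trailing punctuation, then apply a type-specific rule such as keeping only the number for counting questions. A quick simplified illustration of those steps (trimmed to three prefixes and the counting rule; inputs are illustrative):

import re

def clean(raw: str, question: str) -> str:
    answer = raw.strip()
    # Strip common lead-ins the model tends to add.
    for prefix in ["final answer:", "the answer is:", "answer:"]:
        if answer.lower().startswith(prefix):
            answer = answer[len(prefix):].strip()
    # Drop trailing punctuation.
    while answer and answer[-1] in '.!?:;':
        answer = answer[:-1]
    # Counting questions keep only the first number.
    if "how many" in question.lower():
        numbers = re.findall(r'\b\d+\b', answer)
        if numbers:
            answer = numbers[0]
    return answer.strip()

print(clean("The answer is: 5 albums.", "How many albums?"))  # -> '5'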
src/tools/web_search_tool.py
CHANGED
@@ -126,105 +126,119 @@ class WebSearchTool(BaseTool):
| 126 |           """Check if text is a URL"""
| 127 |           return bool(re.match(r'https?://', text))
| 128 |
| 129 | -     def _extract_search_terms(self, …
| 130 |           """
| 131 | -         Extract …
| 132 |           """
| 133 | -         # …
| 134 | -
| 135 | -             return query
| 136 | -
| 137 | -         # Remove common stop words and extract key terms
| 138 | -         stop_words = {
| 139 | -             'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
| 140 | -             'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did',
| 141 | -             'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those',
| 142 | -             'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them',
| 143 | -             'what', 'where', 'when', 'why', 'how', 'which', 'who', 'whose', 'whom',
| 144 | -             'please', 'could', 'you', 'tell', 'me', 'find', 'search', 'for', 'about'
| 145 | -         }
| 146 | -
| 147 | -         # Split into words and filter
| 148 | -         words = re.findall(r'\b\w+\b', query.lower())
| 149 | -         key_words = [word for word in words if word not in stop_words and len(word) > 2]
| 150 | -
| 151 | -         # Keep important phrases and entities
| 152 | -         # Look for quoted phrases, proper nouns, numbers, dates
| 153 | -         important_patterns = [
| 154 | -             r'"[^"]*"',  # Quoted phrases
| 155 | -             r'\b[A-Z][a-z]*(?:\s+[A-Z][a-z]*)*\b',  # Proper nouns
| 156 | -             r'\b\d{4}\b',  # Years
| 157 | -             r'\b\d+\b',  # Numbers
| 158 | -         ]
| 159 |
| 160-161 | -     (loop setup lost in extraction)
| 162 | -             matches = re.findall(pattern, query)
| 163 | -             important_terms.extend(matches)
| 164 |
| 165-175 | -     (old term-assembly logic largely lost in extraction; a surviving fragment is search_terms.append(term))
| 176 | -             if len(potential_query) <= max_length:
| 177 | -                 search_terms.append(word)
| 178 | -             else:
| 179 | -                 break
| 180-194 | -     (remaining assembly and return logic lost in extraction)
| 195 |
| 196 |       def _search_web(self, query: str, limit: int = 5, extract_content: bool = False) -> Dict[str, Any]:
| 197 |           """
| 198 | -         Search the web using available search engines in priority order
| 199 |           """
| 200 |
| 201 | -         # Extract search terms
| 202 | -         search_query = self._extract_search_terms(query, max_length=…
| 203 |
| 204 |           # Try DuckDuckGo first (most comprehensive for general web search)
| 205 |           if self.use_duckduckgo:
| 206 |               try:
| 207 | -             (old DuckDuckGo call and early return lost in extraction)
| 208 |               except Exception as e:
| 209 |                   logger.warning(f"DuckDuckGo search failed, trying Tavily: {e}")
| 210 |
| 211 |           # Try Tavily if DuckDuckGo fails and API key is available
| 212 |           if self.use_tavily:
| 213 |               try:
| 214 | -             (old Tavily call and early return lost in extraction)
| 215 |               except Exception as e:
| 216 |                   logger.warning(f"Tavily search failed, trying Wikipedia: {e}")
| 217 |
| 218 |           # Fallback to Wikipedia search
| 219 |           if self.use_wikipedia:
| 220 | -             (old Wikipedia block lost in extraction)
| 221 |
| 222 | -         # No search engines available
| 223 |           return {
| 224 |               "query": query,
| 225 |               "found": False,
| 226-227 | -     (old message fields truncated in extraction)
| 228 |           }
| 229 |
| 129 | +     def _extract_search_terms(self, question: str, max_length: int = 200) -> str:
| 130 |           """
| 131 | +         Extract focused search terms from a question
| 132 | +         Prioritizes key entities, dates, and specific information
| 133 |           """
| 134 | +         # Remove common question words first
| 135 | +         question_clean = re.sub(r'\b(what|who|when|where|why|how|is|are|was|were|did|do|does|can|could|should|would)\b', '', question.lower())
| 136 |
| 137 | +         # Extract key patterns first
| 138 | +         entities = []
| 139 |
| 140 | +         # Extract quoted phrases (highest priority)
| 141 | +         quoted_phrases = re.findall(r'"([^"]+)"', question)
| 142 | +         entities.extend(quoted_phrases)
| 143 |
| 144 | +         # Extract proper nouns (names, places, organizations)
| 145 | +         proper_nouns = re.findall(r'\b[A-Z][a-zA-Z]*(?:\s+[A-Z][a-zA-Z]*)*\b', question)
| 146 | +         entities.extend(proper_nouns[:3])  # Limit to top 3
| 147 |
| 148 | +         # Extract years and dates
| 149 | +         years = re.findall(r'\b(?:19|20)\d{2}\b', question)
| 150 | +         entities.extend(years)
| 151 |
| 152 | +         # Extract numbers that might be important
| 153 | +         numbers = re.findall(r'\b\d+\b', question)
| 154 | +         entities.extend(numbers[:2])  # Limit to first 2 numbers
| 155 |
| 156 | +         # If we have good entities, use them primarily
| 157 | +         if entities:
| 158 | +             search_terms = ' '.join(entities[:6])  # Use top 6 entities
| 159 | +         else:
| 160 | +             # Fallback: clean the question and extract key words
| 161 | +             words = question_clean.split()
| 162 | +             # Remove very common words
| 163 | +             stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'from', 'up', 'about', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'between', 'among', 'this', 'that', 'these', 'those', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves'}
| 164 | +             filtered_words = [w for w in words if w.lower() not in stop_words and len(w) > 2]
| 165 | +             search_terms = ' '.join(filtered_words[:8])  # Use top 8 content words
| 166 |
| 167 | +         # Ensure we don't exceed max length
| 168 | +         if len(search_terms) > max_length:
| 169 | +             search_terms = search_terms[:max_length].rsplit(' ', 1)[0]  # Cut at word boundary
| 170 |
| 171 | +         # Log the extraction for debugging
| 172 | +         logger.info(f"📝 Extracted search terms: '{search_terms}' from question: '{question[:100]}...'")
| 173 |
| 174 | +         return search_terms.strip()
| 175 |
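A quick standalone trace shows what the entity-first pass produces. It uses the non-capturing `(?:19|20)` year pattern as fixed above; with the original capturing group `(19|20)`, `re.findall` would return only the bare '19'/'20' prefixes instead of full years. Two quirks are visible in the output: a sentence-initial question word can slip through the proper-noun pattern, and a year is collected twice, once by the year pass and once by the number pass.

```python
# Standalone trace of the entity extraction above; not an import of the tool.
import re

question = 'When was "Bohemian Rhapsody" released, and did it chart in 1976?'

entities = []
entities.extend(re.findall(r'"([^"]+)"', question))                   # quoted phrases
entities.extend(re.findall(r'\b[A-Z][a-zA-Z]*(?:\s+[A-Z][a-zA-Z]*)*\b',
                           question)[:3])                              # proper nouns
entities.extend(re.findall(r'\b(?:19|20)\d{2}\b', question))           # years
entities.extend(re.findall(r'\b\d+\b', question)[:2])                  # numbers

print(' '.join(entities[:6]))
# -> Bohemian Rhapsody When Bohemian Rhapsody 1976 1976
```

Duplicated and leaked terms are mostly harmless for web search, since engines tolerate redundant keywords, but they do consume slots in the six-entity budget.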
| 176 |       def _search_web(self, query: str, limit: int = 5, extract_content: bool = False) -> Dict[str, Any]:
| 177 |           """
| 178 | +         Search the web using available search engines in priority order with improved search terms
| 179 |           """
| 180 |
| 181 | +         # Extract clean search terms from the query
| 182 | +         search_query = self._extract_search_terms(query, max_length=200)
| 183 |
| 184 |           # Try DuckDuckGo first (most comprehensive for general web search)
| 185 |           if self.use_duckduckgo:
| 186 |               try:
| 187 | +                 ddg_result = self._search_with_duckduckgo(search_query, limit, extract_content)
| 188 | +                 if ddg_result.get('success') and ddg_result.get('count', 0) > 0:
| 189 | +                     return {
| 190 | +                         'success': True,
| 191 | +                         'found': True,
| 192 | +                         'results': [r.to_dict() if hasattr(r, 'to_dict') else r for r in ddg_result['results']],
| 193 | +                         'query': query,
| 194 | +                         'source': 'DuckDuckGo',
| 195 | +                         'total_found': ddg_result['count']
| 196 | +                     }
| 197 |               except Exception as e:
| 198 |                   logger.warning(f"DuckDuckGo search failed, trying Tavily: {e}")
| 199 |
| 200 |           # Try Tavily if DuckDuckGo fails and API key is available
| 201 |           if self.use_tavily:
| 202 |               try:
| 203 | +                 tavily_result = self._search_with_tavily(search_query, limit, extract_content)
| 204 | +                 if tavily_result.get('success') and tavily_result.get('count', 0) > 0:
| 205 | +                     return {
| 206 | +                         'success': True,
| 207 | +                         'found': True,
| 208 | +                         'results': [r.to_dict() if hasattr(r, 'to_dict') else r for r in tavily_result['results']],
| 209 | +                         'query': query,
| 210 | +                         'source': 'Tavily',
| 211 | +                         'total_found': tavily_result['count']
| 212 | +                     }
| 213 |               except Exception as e:
| 214 |                   logger.warning(f"Tavily search failed, trying Wikipedia: {e}")
| 215 |
| 216 |           # Fallback to Wikipedia search
| 217 |           if self.use_wikipedia:
| 218 | +             try:
| 219 | +                 wiki_result = self._search_with_wikipedia(search_query, limit)
| 220 | +                 if wiki_result.get('success') and wiki_result.get('count', 0) > 0:
| 221 | +                     return {
| 222 | +                         'success': True,
| 223 | +                         'found': True,
| 224 | +                         'results': [r.to_dict() if hasattr(r, 'to_dict') else r for r in wiki_result['results']],
| 225 | +                         'query': query,
| 226 | +                         'source': 'Wikipedia',
| 227 | +                         'total_found': wiki_result['count']
| 228 | +                     }
| 229 | +             except Exception as e:
| 230 | +                 logger.warning(f"Wikipedia search failed: {e}")
| 231 |
| 232 | +         # No search engines available or all failed
| 233 | +         logger.warning("All search engines failed, returning empty results")
| 234 |           return {
| 235 |               "query": query,
| 236 |               "found": False,
| 237 | +             "success": False,
| 238 | +             "message": "❌ All search engines failed or returned no results.",
| 239 | +             "results": [],
| 240 | +             "source": "none",
| 241 | +             "total_found": 0
| 242 |           }
| 243 |
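Distilled, `_search_web` is a priority cascade: try each engine in order, accept the first one that reports hits, and fall through to a uniform failure payload. The sketch below captures that pattern; the engine callables are stand-ins, not the tool's real methods, which additionally carry the extracted search terms and per-engine logging.

```python
# The cascade distilled: the first engine with hits wins; per-engine
# failures fall through; a uniform failure payload closes the chain.
# Sketch only; the engine callables are stand-ins for the real methods.
from typing import Any, Callable, Dict, List, Tuple

Engine = Callable[[str], List[Dict[str, Any]]]

def search_cascade(query: str, engines: List[Tuple[str, Engine]]) -> Dict[str, Any]:
    for name, search in engines:
        try:
            results = search(query)
            if results:
                return {"success": True, "found": True, "source": name,
                        "results": results, "total_found": len(results)}
        except Exception:
            continue  # mirror the tool: warn and try the next engine
    return {"success": False, "found": False, "source": "none",
            "results": [], "total_found": 0,
            "message": "All search engines failed or returned no results."}
```

Keeping the success and failure payloads shape-compatible (same `found`, `results`, and `source` keys) is what lets callers branch on `result["found"]` without caring which engine answered.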
| 244 |       def _search_with_duckduckgo(self, query: str, limit: int = 5, extract_content: bool = False) -> Dict[str, Any]:
|