Commit e0deddf
Parent(s): 2d9d8bd
fix agent memory stuff
Files changed:
- app.py     +49 -49
- config.py  +10 -0
- geo_bot.py +130 -67
app.py  CHANGED

@@ -60,16 +60,33 @@ st.markdown("### *The all-knowing AI that sees everything, knows everything*")
 with st.sidebar:
     st.header("Configuration")
 
-    …
+    # Get available datasets and ensure we have a valid default
+    available_datasets = get_available_datasets()
+    default_dataset = available_datasets[0] if available_datasets else "default"
+
+    dataset_choice = st.selectbox("Dataset", available_datasets, index=0)
     model_choice = st.selectbox("Model", list(MODELS_CONFIG.keys()))
     steps_per_sample = st.slider("Max Steps", 3, 20, 10)
 
-    # Load dataset
+    # Load dataset with error handling
    data_paths = get_data_paths(dataset_choice)
-    with open(data_paths["golden_labels"], "r") as f:
-        golden_labels = json.load(f).get("samples", [])
+    try:
+        with open(data_paths["golden_labels"], "r") as f:
+            golden_labels = json.load(f).get("samples", [])
+
+        st.info(f"Dataset '{dataset_choice}' has {len(golden_labels)} samples")
+        if len(golden_labels) == 0:
+            st.error(f"Dataset '{dataset_choice}' contains no samples!")
+            st.stop()
+
+    except FileNotFoundError:
+        st.error(f"❌ Dataset '{dataset_choice}' not found at {data_paths['golden_labels']}")
+        st.info("💡 Available datasets: " + ", ".join(available_datasets))
+        st.stop()
+    except Exception as e:
+        st.error(f"❌ Error loading dataset '{dataset_choice}': {str(e)}")
+        st.stop()
 
-    st.info(f"Dataset has {len(golden_labels)} samples")
     num_samples = st.slider(
         "Samples to Test", 1, len(golden_labels), min(3, len(golden_labels))
     )

@@ -102,7 +119,7 @@ if start_button:
 
         with sample_container:
             # Initialize step tracking
-            history = …
+            history = bot.init_history()
             final_guess = None
 
             for step in range(steps_per_sample):

@@ -126,35 +143,19 @@ if start_button:
                     )
 
                 with col2:
-                    # …
-                    current_step = {
-                        "screenshot_b64": bot.pil_to_base64(
-                            Image.open(BytesIO(screenshot_bytes))
-                        ),
-                        "action": "N/A",
-                    }
-                    history.append(current_step)
-
-                    available_actions = bot.controller.get_available_actions()
-                    history_text = "\n".join(
-                        [
-                            f"Step {j + 1}: {h['action']}"
-                            for j, h in enumerate(history[:-1])
-                        ]
-                    )
-                    if not history_text:
-                        history_text = "First step."
-
-                    prompt = AGENT_PROMPT_TEMPLATE.format(
-                        remaining_steps=steps_per_sample - step,
-                        history_text=history_text,
-                        available_actions=json.dumps(available_actions),
+                    # Get current screenshot as base64
+                    current_screenshot_b64 = bot.pil_to_base64(
+                        Image.open(BytesIO(screenshot_bytes))
                     )
+
+                    available_actions = bot.controller.get_available_actions()
 
                     # Show AI context
                     st.write("**Available Actions:**")
                     st.code(json.dumps(available_actions, indent=2))
 
+                    # Generate and display history
+                    history_text = bot.generate_history_text(history)
                     st.write("**AI Context:**")
                     st.text_area(
                         "History",

@@ -168,21 +169,22 @@ if start_button:
                     if step_num == steps_per_sample:
                         action = "GUESS"
                         st.warning("Max steps reached. Forcing GUESS.")
+                        # Create a forced decision for consistency
+                        decision = {
+                            "reasoning": "Maximum steps reached, forcing final guess with fallback coordinates.",
+                            "action_details": {"action": "GUESS", "lat": 0.0, "lon": 0.0}
+                        }
                     else:
-                        # …
-                        message = bot._create_message_with_history(
-                            …
+                        # Use the bot's agent step execution
+                        remaining_steps = steps_per_sample - step
+                        decision = bot.execute_agent_step(
+                            history, remaining_steps, current_screenshot_b64, available_actions
                         )
-                        response = bot.model.invoke(message)
-                        decision = bot._parse_agent_response(response)
 
                         if decision is None:
-                            raise ValueError(
-                                f"Failed to parse AI response: {response.content}"
-                            )
+                            raise ValueError("Failed to get AI decision")
 
                         action = decision["action_details"]["action"]
-                        history[-1]["action"] = action
 
                         # Show AI decision
                         st.write("**AI Reasoning:**")

@@ -191,9 +193,12 @@ if start_button:
                         st.write("**AI Action:**")
                         st.success(f"`{action}`")
 
-                        # Show raw response
-                        with st.expander(…):
-                            st.…
+                        # Show raw response for debugging
+                        with st.expander("Decision Details"):
+                            st.json(decision)
+
+                    # Add step to history using the bot's method
+                    bot.add_step_to_history(history, current_screenshot_b64, decision)
 
                 # Execute action
                 if action == "GUESS":

@@ -209,14 +214,9 @@ if start_button:
                         final_guess = (lat, lon)
                         st.success(f"Final Guess: {lat:.4f}, {lon:.4f}")
                     break
-                elif action == "MOVE_FORWARD":
-                    bot.controller.move("forward")
-                elif action == "MOVE_BACKWARD":
-                    bot.controller.move("backward")
-                elif action == "PAN_LEFT":
-                    bot.controller.pan_view("left")
-                elif action == "PAN_RIGHT":
-                    bot.controller.pan_view("right")
+                else:
+                    # Use bot's execute_action method
+                    bot.execute_action(action)
 
                 # Auto scroll to bottom
                 st.empty()  # Force refresh to show latest content
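
The new sidebar logic in app.py treats three failure modes separately: a missing golden_labels file, unreadable JSON, and a dataset whose "samples" list is empty, stopping the Streamlit run with a clear message in each case. A minimal standalone sketch of the same guard, with a hypothetical file path standing in for get_data_paths(dataset_choice)["golden_labels"]:

    import json
    from pathlib import Path

    def load_golden_labels(path: str) -> list:
        """Load the 'samples' list from a golden-labels JSON file, failing loudly."""
        labels_file = Path(path)
        if not labels_file.exists():
            raise FileNotFoundError(f"Dataset file not found: {labels_file}")
        samples = json.loads(labels_file.read_text(encoding="utf-8")).get("samples", [])
        if not samples:
            raise ValueError(f"Dataset at {labels_file} contains no samples")
        return samples

    if __name__ == "__main__":
        # "datasets/default/golden_labels.json" is a made-up example path.
        samples = load_golden_labels("datasets/default/golden_labels.json")
        print(f"Loaded {len(samples)} samples")

In the app itself the same conditions are surfaced with st.error/st.info and the run is halted with st.stop() rather than by raising.
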
    	
config.py  CHANGED

@@ -48,6 +48,16 @@ MODELS_CONFIG = {
         "model_name": "gemini-1.5-pro-latest",
         "description": "Google Gemini 1.5 Pro",
     },
+    "gemini-2.0-flash-exp": {
+        "class": "ChatGoogleGenerativeAI",
+        "model_name": "gemini-2.0-flash-exp",
+        "description": "Google Gemini 2.0 Flash Exp",
+    },
+    "gemini-2.5-pro": {
+        "class": "ChatGoogleGenerativeAI",
+        "model_name": "gemini-2.5-pro-preview-06-05",
+        "description": "Google Gemini 2.5 Pro",
+    },
     "qwen2-vl-7b": {
         "class": "HuggingFaceChat",
         "model_name": "Qwen/Qwen2-VL-7B-Instruct",
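
The config.py hunk only registers the two new Gemini entries; how a MODELS_CONFIG entry becomes a model object is not part of this diff. A hedged sketch of one way such an entry could be resolved, assuming the "class" field names a LangChain chat class (the factory below is illustrative, not code from this repo):

    from typing import Any, Dict

    # Subset of MODELS_CONFIG copied from the diff above.
    MODELS_CONFIG: Dict[str, Dict[str, str]] = {
        "gemini-2.0-flash-exp": {
            "class": "ChatGoogleGenerativeAI",
            "model_name": "gemini-2.0-flash-exp",
            "description": "Google Gemini 2.0 Flash Exp",
        },
    }

    def create_model(choice: str) -> Any:
        """Hypothetical factory: map a config entry to a chat model instance."""
        cfg = MODELS_CONFIG[choice]
        if cfg["class"] == "ChatGoogleGenerativeAI":
            # Requires the langchain-google-genai package and GOOGLE_API_KEY in the environment.
            from langchain_google_genai import ChatGoogleGenerativeAI
            return ChatGoogleGenerativeAI(model=cfg["model_name"])
        raise ValueError(f"Unsupported model class: {cfg['class']}")

Note that the "gemini-2.5-pro" key pins the dated preview name gemini-2.5-pro-preview-06-05, so the selectbox label and the underlying model_name are allowed to differ.
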
    	
geo_bot.py  CHANGED

@@ -15,38 +15,48 @@ from hf_chat import HuggingFaceChat
 
 from mapcrunch_controller import MapCrunchController
 
-# The "Golden" Prompt (…)
+# The "Golden" Prompt (v7): add more descriptions in context and task
 AGENT_PROMPT_TEMPLATE = """
-**Mission:** You are an expert geo-location agent. Your goal is to …
+**Mission:** You are an expert geo-location agent. Your goal is to pinpoint our position in as few moves as possible.
 
-**Current Status…
-…
-…
+**Current Status**
+• Remaining Steps: {remaining_steps}
+• Actions You Can Take *this* turn: {available_actions}
 
-…
-**Core Principles…
+────────────────────────────────
+**Core Principles**
 
-1.  **…
+1.  **Observe → Orient → Act**
+    Start each turn with a structured three-part reasoning block:
+    **(1) Visual Clues –** plainly describe what you see (signs, text language, road lines, vegetation, building styles, vehicles, terrain, weather, etc.).
+    **(2) Potential Regions –** list the most plausible regions/countries those clues suggest.
+    **(3) Most Probable + Plan –** pick the single likeliest region and explain the next action (move/pan or guess).
 
-2.  **…
-    - …
-    - …
+2.  **Navigate with Labels:**
+    - `MOVE_FORWARD` follows the green **UP** arrow.
+    - `MOVE_BACKWARD` follows the red **DOWN** arrow.
+    - No arrow → you cannot move that way.
 
-3.  **…
+3.  **Efficient Exploration:**
+    - **Pan Before You Move:** At fresh spots/intersections, use `PAN_LEFT` / `PAN_RIGHT` first.
+    - After ~2 or 3 fruitless moves in repetitive scenery, turn around.
 
-4.  **…
+4.  **Be Decisive:** A unique, definitive clue (full address, rare town name, etc.) → `GUESS` immediately.
 
-…
-**Context & Task:**
-Analyze your full journey history and current view, apply the Core Principles, and decide your next action in the required JSON format.
+5.  **Final-Step Rule:** If **Remaining Steps = 1**, you **MUST** `GUESS`.
 
-…
+────────────────────────────────
+**Action History**
 {history_text}
 
+────────────────────────────────
+**OUTPUT FORMAT**
+
+Return **one** JSON object wrapped in ```json … ```:
+
 **JSON Output Format:**
-Your response MUST be a valid JSON object wrapped in …
-…
-- For the final guess: `{{"reasoning": "...", "action_details": {{"action": "GUESS", "lat": <float>, "lon": <float>}} }}`
+Your response MUST be a valid JSON object wrapped in json ... .
+{{"reasoning": "...", "action_details": {{"action": "GUESS", "lat": <float>, "lon": <float>}} }}
 """
 
 BENCHMARK_PROMPT = """

@@ -149,8 +159,100 @@ class GeoBot:
             print(f"Invalid JSON from LLM: {e}\nFull response was:\n{response.content}")
             return None
 
+    def init_history(self) -> List[Dict[str, Any]]:
+        """Initialize an empty history list for agent steps."""
+        return []
+
+    def add_step_to_history(
+        self,
+        history: List[Dict[str, Any]],
+        screenshot_b64: str,
+        decision: Optional[Dict[str, Any]] = None
+    ) -> Dict[str, Any]:
+        """
+        Add a step to the history with proper structure.
+        Returns the step dictionary that was added.
+        """
+        step = {
+            "screenshot_b64": screenshot_b64,
+            "reasoning": decision.get("reasoning", "N/A") if decision else "N/A",
+            "action_details": decision.get("action_details", {"action": "N/A"}) if decision else {"action": "N/A"}
+        }
+        history.append(step)
+        return step
+
+    def generate_history_text(self, history: List[Dict[str, Any]]) -> str:
+        """Generate formatted history text for prompt."""
+        if not history:
+            return "No history yet. This is the first step."
+
+        history_text = ""
+        for i, h in enumerate(history):
+            history_text += f"--- History Step {i + 1} ---\n"
+            history_text += f"Reasoning: {h.get('reasoning', 'N/A')}\n"
+            history_text += f"Action: {h.get('action_details', {}).get('action', 'N/A')}\n\n"
+        return history_text
+
+    def get_history_images(self, history: List[Dict[str, Any]]) -> List[str]:
+        """Extract image base64 strings from history."""
+        return [h["screenshot_b64"] for h in history]
+
+    def execute_agent_step(
+        self,
+        history: List[Dict[str, Any]],
+        remaining_steps: int,
+        current_screenshot_b64: str,
+        available_actions: Dict[str, Any]
+    ) -> Optional[Dict[str, Any]]:
+        """
+        Execute a single agent step: generate prompt, get AI decision, return decision.
+        This is the core step logic extracted for reuse.
+        """
+        history_text = self.generate_history_text(history)
+        image_b64_for_prompt = self.get_history_images(history) + [current_screenshot_b64]
+
+        prompt = AGENT_PROMPT_TEMPLATE.format(
+            remaining_steps=remaining_steps,
+            history_text=history_text,
+            available_actions=json.dumps(available_actions),
+        )
+
+        try:
+            message = self._create_message_with_history(prompt, image_b64_for_prompt[-1:])
+            response = self.model.invoke(message)
+            decision = self._parse_agent_response(response)
+        except Exception as e:
+            print(f"Error during model invocation: {e}")
+            decision = None
+
+        if not decision:
+            print("Response parsing failed or model error. Using default recovery action: PAN_RIGHT.")
+            decision = {
+                "reasoning": "Recovery due to parsing failure or model error.",
+                "action_details": {"action": "PAN_RIGHT"},
+            }
+
+        return decision
+
+    def execute_action(self, action: str) -> bool:
+        """
+        Execute the given action using the controller.
+        Returns True if action was executed, False if it was GUESS.
+        """
+        if action == "GUESS":
+            return False
+        elif action == "MOVE_FORWARD":
+            self.controller.move("forward")
+        elif action == "MOVE_BACKWARD":
+            self.controller.move("backward")
+        elif action == "PAN_LEFT":
+            self.controller.pan_view("left")
+        elif action == "PAN_RIGHT":
+            self.controller.pan_view("right")
+        return True
+
     def run_agent_loop(self, max_steps: int = 10) -> Optional[Tuple[float, float]]:
-        history = …
+        history = self.init_history()
 
         for step in range(max_steps, 0, -1):
             print(f"\n--- Step {max_steps - step + 1}/{max_steps} ---")

@@ -169,46 +271,13 @@ class GeoBot:
             available_actions = self.controller.get_available_actions()
             print(f"Available actions: {available_actions}")
 
-            history_text = ""
-            image_b64_for_prompt = []
-            if not history:
-                history_text = "No history yet. This is the first step."
-            else:
-                for i, h in enumerate(history):
-                    history_text += f"--- History Step {i + 1} ---\n"
-                    history_text += f"Reasoning: {h.get('reasoning', 'N/A')}\n"
-                    history_text += f"Action: {h.get('action_details', {}).get('action', 'N/A')}\n\n"
-                    image_b64_for_prompt.append(h["screenshot_b64"])
-
-            image_b64_for_prompt.append(current_screenshot_b64)
-
-            prompt = AGENT_PROMPT_TEMPLATE.format(
-                remaining_steps=step,
-                history_text=history_text,
-                available_actions=json.dumps(available_actions),
+            # Use the extracted step execution method
+            decision = self.execute_agent_step(
+                history, step, current_screenshot_b64, available_actions
             )
 
-            try:
-                message = self._create_message_with_history(
-                    prompt, image_b64_for_prompt
-                )
-                response = self.model.invoke(message)
-                decision = self._parse_agent_response(response)
-            except Exception as e:
-                print(f"Error during model invocation: {e}")
-                decision = None
-
-            if not decision:
-                print(
-                    "Response parsing failed or model error. Using default recovery action: PAN_RIGHT."
-                )
-                decision = {
-                    "reasoning": "Recovery due to parsing failure or model error.",
-                    "action_details": {"action": "PAN_RIGHT"},
-                }
-
-            decision["screenshot_b64"] = current_screenshot_b64
-            history.append(decision)
+            # Add step to history
+            self.add_step_to_history(history, current_screenshot_b64, decision)
 
             action_details = decision.get("action_details", {})
             action = action_details.get("action")

@@ -219,14 +288,8 @@ class GeoBot:
                 lat, lon = action_details.get("lat"), action_details.get("lon")
                 if lat is not None and lon is not None:
                     return lat, lon
-            elif action == "MOVE_FORWARD":
-                self.controller.move("forward")
-            elif action == "MOVE_BACKWARD":
-                self.controller.move("backward")
-            elif action == "PAN_LEFT":
-                self.controller.pan_view("left")
-            elif action == "PAN_RIGHT":
-                self.controller.pan_view("right")
+            else:
+                self.execute_action(action)
 
         print("Max steps reached. Agent did not make a final guess.")
         return None
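
After this commit the agent's memory is a plain list of step dicts with screenshot_b64, reasoning, and action_details keys, built by init_history/add_step_to_history and flattened into the prompt's {history_text} slot by generate_history_text. A small self-contained illustration of that bookkeeping, using hard-coded decisions in place of parsed model output (no browser or model involved):

    from typing import Any, Dict, List, Optional

    def add_step_to_history(history: List[Dict[str, Any]], screenshot_b64: str,
                            decision: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        # Same step shape as GeoBot.add_step_to_history in this commit.
        step = {
            "screenshot_b64": screenshot_b64,
            "reasoning": decision.get("reasoning", "N/A") if decision else "N/A",
            "action_details": decision.get("action_details", {"action": "N/A"}) if decision else {"action": "N/A"},
        }
        history.append(step)
        return step

    def generate_history_text(history: List[Dict[str, Any]]) -> str:
        # Mirrors GeoBot.generate_history_text: one block per past step.
        if not history:
            return "No history yet. This is the first step."
        text = ""
        for i, h in enumerate(history):
            text += f"--- History Step {i + 1} ---\n"
            text += f"Reasoning: {h.get('reasoning', 'N/A')}\n"
            text += f"Action: {h.get('action_details', {}).get('action', 'N/A')}\n\n"
        return text

    history: List[Dict[str, Any]] = []  # equivalent of bot.init_history()
    add_step_to_history(history, "<b64 screenshot 1>", {
        "reasoning": "Signs are in French; plausible regions: France, Belgium, Quebec.",
        "action_details": {"action": "PAN_RIGHT"},
    })
    add_step_to_history(history, "<b64 screenshot 2>", {
        "reasoning": "A town-name sign narrows it down; ready to guess.",
        "action_details": {"action": "GUESS", "lat": 48.2, "lon": -2.9},
    })
    print(generate_history_text(history))  # becomes the {history_text} block of the prompt

In GeoBot itself, execute_agent_step formats this text into AGENT_PROMPT_TEMPLATE, invokes the model, and falls back to a PAN_RIGHT decision when parsing fails, so the history gains exactly one entry per step.
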
