Commit e0deddf
Parent(s): 2d9d8bd
fix agent memory stuff
Files changed:
- app.py     +49 -49
- config.py  +10 -0
- geo_bot.py +130 -67
app.py  CHANGED

@@ -60,16 +60,33 @@ st.markdown("### *The all-knowing AI that sees everything, knows everything*")
 with st.sidebar:
     st.header("Configuration")
 
-    …
+    # Get available datasets and ensure we have a valid default
+    available_datasets = get_available_datasets()
+    default_dataset = available_datasets[0] if available_datasets else "default"
+
+    dataset_choice = st.selectbox("Dataset", available_datasets, index=0)
     model_choice = st.selectbox("Model", list(MODELS_CONFIG.keys()))
     steps_per_sample = st.slider("Max Steps", 3, 20, 10)
 
-    # Load dataset
+    # Load dataset with error handling
    data_paths = get_data_paths(dataset_choice)
-    with open(data_paths["golden_labels"], "r") as f:
-        golden_labels = json.load(f).get("samples", [])
+    try:
+        with open(data_paths["golden_labels"], "r") as f:
+            golden_labels = json.load(f).get("samples", [])
+
+        st.info(f"Dataset '{dataset_choice}' has {len(golden_labels)} samples")
+        if len(golden_labels) == 0:
+            st.error(f"Dataset '{dataset_choice}' contains no samples!")
+            st.stop()
+
+    except FileNotFoundError:
+        st.error(f"❌ Dataset '{dataset_choice}' not found at {data_paths['golden_labels']}")
+        st.info("💡 Available datasets: " + ", ".join(available_datasets))
+        st.stop()
+    except Exception as e:
+        st.error(f"❌ Error loading dataset '{dataset_choice}': {str(e)}")
+        st.stop()
 
-    st.info(f"Dataset has {len(golden_labels)} samples")
     num_samples = st.slider(
         "Samples to Test", 1, len(golden_labels), min(3, len(golden_labels))
     )

@@ -102,7 +119,7 @@ if start_button:
 
         with sample_container:
             # Initialize step tracking
-            history = …
+            history = bot.init_history()
             final_guess = None
 
             for step in range(steps_per_sample):

@@ -126,35 +143,19 @@ if start_button:
                     )
 
                 with col2:
-                    # …
-                    current_step = {
-                        "screenshot_b64": bot.pil_to_base64(
-                            Image.open(BytesIO(screenshot_bytes))
-                        ),
-                        "action": "N/A",
-                    }
-                    history.append(current_step)
-
-                    available_actions = bot.controller.get_available_actions()
-                    history_text = "\n".join(
-                        [
-                            f"Step {j + 1}: {h['action']}"
-                            for j, h in enumerate(history[:-1])
-                        ]
-                    )
-                    if not history_text:
-                        history_text = "First step."
-
-                    prompt = AGENT_PROMPT_TEMPLATE.format(
-                        remaining_steps=steps_per_sample - step,
-                        history_text=history_text,
-                        available_actions=json.dumps(available_actions),
+                    # Get current screenshot as base64
+                    current_screenshot_b64 = bot.pil_to_base64(
+                        Image.open(BytesIO(screenshot_bytes))
                     )
+
+                    available_actions = bot.controller.get_available_actions()
 
                     # Show AI context
                     st.write("**Available Actions:**")
                     st.code(json.dumps(available_actions, indent=2))
 
+                    # Generate and display history
+                    history_text = bot.generate_history_text(history)
                     st.write("**AI Context:**")
                     st.text_area(
                         "History",

@@ -168,21 +169,22 @@ if start_button:
                     if step_num == steps_per_sample:
                         action = "GUESS"
                         st.warning("Max steps reached. Forcing GUESS.")
+                        # Create a forced decision for consistency
+                        decision = {
+                            "reasoning": "Maximum steps reached, forcing final guess with fallback coordinates.",
+                            "action_details": {"action": "GUESS", "lat": 0.0, "lon": 0.0}
+                        }
                     else:
-                        # …
-                        message = bot._create_message_with_history(
-                            …
+                        # Use the bot's agent step execution
+                        remaining_steps = steps_per_sample - step
+                        decision = bot.execute_agent_step(
+                            history, remaining_steps, current_screenshot_b64, available_actions
                         )
-                        response = bot.model.invoke(message)
-                        decision = bot._parse_agent_response(response)
 
                         if decision is None:
-                            raise ValueError(
-                                f"Failed to parse AI response: {response.content}"
-                            )
+                            raise ValueError("Failed to get AI decision")
 
                         action = decision["action_details"]["action"]
-                        history[-1]["action"] = action
 
                         # Show AI decision
                         st.write("**AI Reasoning:**")

@@ -191,9 +193,12 @@ if start_button:
                         st.write("**AI Action:**")
                         st.success(f"`{action}`")
 
-                        # Show raw response
-                        with st.expander(…):
-                            st.…
+                        # Show raw response for debugging
+                        with st.expander("Decision Details"):
+                            st.json(decision)
+
+                    # Add step to history using the bot's method
+                    bot.add_step_to_history(history, current_screenshot_b64, decision)
 
                 # Execute action
                 if action == "GUESS":

@@ -209,14 +214,9 @@ if start_button:
                         final_guess = (lat, lon)
                         st.success(f"Final Guess: {lat:.4f}, {lon:.4f}")
                     break
-                elif action == "MOVE_FORWARD":
-                    bot.controller.move("forward")
-                elif action == "MOVE_BACKWARD":
-                    bot.controller.move("backward")
-                elif action == "PAN_LEFT":
-                    bot.controller.pan_view("left")
-                elif action == "PAN_RIGHT":
-                    bot.controller.pan_view("right")
+                else:
+                    # Use bot's execute_action method
+                    bot.execute_action(action)
 
                 # Auto scroll to bottom
                 st.empty()  # Force refresh to show latest content
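
The new sidebar logic in app.py treats three failure modes separately: a missing golden_labels file, unreadable JSON, and a dataset whose "samples" list is empty, stopping the Streamlit run with a clear message in each case. A minimal standalone sketch of the same guard, with a hypothetical file path standing in for get_data_paths(dataset_choice)["golden_labels"]:

    import json
    from pathlib import Path

    def load_golden_labels(path: str) -> list:
        """Load the 'samples' list from a golden-labels JSON file, failing loudly."""
        labels_file = Path(path)
        if not labels_file.exists():
            raise FileNotFoundError(f"Dataset file not found: {labels_file}")
        samples = json.loads(labels_file.read_text(encoding="utf-8")).get("samples", [])
        if not samples:
            raise ValueError(f"Dataset at {labels_file} contains no samples")
        return samples

    if __name__ == "__main__":
        # "datasets/default/golden_labels.json" is a made-up example path.
        samples = load_golden_labels("datasets/default/golden_labels.json")
        print(f"Loaded {len(samples)} samples")

In the app itself the same conditions are surfaced with st.error/st.info and the run is halted with st.stop() rather than by raising.
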
    	
config.py  CHANGED

@@ -48,6 +48,16 @@ MODELS_CONFIG = {
         "model_name": "gemini-1.5-pro-latest",
         "description": "Google Gemini 1.5 Pro",
     },
+    "gemini-2.0-flash-exp": {
+        "class": "ChatGoogleGenerativeAI",
+        "model_name": "gemini-2.0-flash-exp",
+        "description": "Google Gemini 2.0 Flash Exp",
+    },
+    "gemini-2.5-pro": {
+        "class": "ChatGoogleGenerativeAI",
+        "model_name": "gemini-2.5-pro-preview-06-05",
+        "description": "Google Gemini 2.5 Pro",
+    },
     "qwen2-vl-7b": {
         "class": "HuggingFaceChat",
         "model_name": "Qwen/Qwen2-VL-7B-Instruct",
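
The config.py hunk only registers the two new Gemini entries; how a MODELS_CONFIG entry becomes a model object is not part of this diff. A hedged sketch of one way such an entry could be resolved, assuming the "class" field names a LangChain chat class (the factory below is illustrative, not code from this repo):

    from typing import Any, Dict

    # Subset of MODELS_CONFIG copied from the diff above.
    MODELS_CONFIG: Dict[str, Dict[str, str]] = {
        "gemini-2.0-flash-exp": {
            "class": "ChatGoogleGenerativeAI",
            "model_name": "gemini-2.0-flash-exp",
            "description": "Google Gemini 2.0 Flash Exp",
        },
    }

    def create_model(choice: str) -> Any:
        """Hypothetical factory: map a config entry to a chat model instance."""
        cfg = MODELS_CONFIG[choice]
        if cfg["class"] == "ChatGoogleGenerativeAI":
            # Requires the langchain-google-genai package and GOOGLE_API_KEY in the environment.
            from langchain_google_genai import ChatGoogleGenerativeAI
            return ChatGoogleGenerativeAI(model=cfg["model_name"])
        raise ValueError(f"Unsupported model class: {cfg['class']}")

Note that the "gemini-2.5-pro" key pins the dated preview name gemini-2.5-pro-preview-06-05, so the selectbox label and the underlying model_name are allowed to differ.
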
    	
geo_bot.py  CHANGED

@@ -15,38 +15,48 @@ from hf_chat import HuggingFaceChat
 
 from mapcrunch_controller import MapCrunchController
 
-# The "Golden" Prompt (…)
+# The "Golden" Prompt (v7): add more descriptions in context and task
 AGENT_PROMPT_TEMPLATE = """
-**Mission:** You are an expert geo-location agent. Your goal is to …
+**Mission:** You are an expert geo-location agent. Your goal is to pinpoint our position in as few moves as possible.
 
-**Current Status…
-…
-…
+**Current Status**
+• Remaining Steps: {remaining_steps}
+• Actions You Can Take *this* turn: {available_actions}
 
-…
-**Core Principles…
+────────────────────────────────
+**Core Principles**
 
-1.  **…
+1.  **Observe → Orient → Act**
+    Start each turn with a structured three-part reasoning block:
+    **(1) Visual Clues –** plainly describe what you see (signs, text language, road lines, vegetation, building styles, vehicles, terrain, weather, etc.).
+    **(2) Potential Regions –** list the most plausible regions/countries those clues suggest.
+    **(3) Most Probable + Plan –** pick the single likeliest region and explain the next action (move/pan or guess).
 
-2.  **…
-    - …
-    - …
+2.  **Navigate with Labels:**
+    - `MOVE_FORWARD` follows the green **UP** arrow.
+    - `MOVE_BACKWARD` follows the red **DOWN** arrow.
+    - No arrow → you cannot move that way.
 
-3.  **…
+3.  **Efficient Exploration:**
+    - **Pan Before You Move:** At fresh spots/intersections, use `PAN_LEFT` / `PAN_RIGHT` first.
+    - After ~2 or 3 fruitless moves in repetitive scenery, turn around.
 
-4.  **…
+4.  **Be Decisive:** A unique, definitive clue (full address, rare town name, etc.) → `GUESS` immediately.
 
-…
-**Context & Task:**
-Analyze your full journey history and current view, apply the Core Principles, and decide your next action in the required JSON format.
+5.  **Final-Step Rule:** If **Remaining Steps = 1**, you **MUST** `GUESS`.
 
-…
+────────────────────────────────
+**Action History**
 {history_text}
 
+────────────────────────────────
+**OUTPUT FORMAT**
+
+Return **one** JSON object wrapped in ```json … ```:
+
 **JSON Output Format:**
-Your response MUST be a valid JSON object wrapped in …
-…
-- For the final guess: `{{"reasoning": "...", "action_details": {{"action": "GUESS", "lat": <float>, "lon": <float>}} }}`
+Your response MUST be a valid JSON object wrapped in json ... .
+{{"reasoning": "...", "action_details": {{"action": "GUESS", "lat": <float>, "lon": <float>}} }}
 """
 
 BENCHMARK_PROMPT = """

@@ -149,8 +159,100 @@ class GeoBot:
             print(f"Invalid JSON from LLM: {e}\nFull response was:\n{response.content}")
             return None
 
+    def init_history(self) -> List[Dict[str, Any]]:
+        """Initialize an empty history list for agent steps."""
+        return []
+
+    def add_step_to_history(
+        self,
+        history: List[Dict[str, Any]],
+        screenshot_b64: str,
+        decision: Optional[Dict[str, Any]] = None
+    ) -> Dict[str, Any]:
+        """
+        Add a step to the history with proper structure.
+        Returns the step dictionary that was added.
+        """
+        step = {
+            "screenshot_b64": screenshot_b64,
+            "reasoning": decision.get("reasoning", "N/A") if decision else "N/A",
+            "action_details": decision.get("action_details", {"action": "N/A"}) if decision else {"action": "N/A"}
+        }
+        history.append(step)
+        return step
+
+    def generate_history_text(self, history: List[Dict[str, Any]]) -> str:
+        """Generate formatted history text for prompt."""
+        if not history:
+            return "No history yet. This is the first step."
+
+        history_text = ""
+        for i, h in enumerate(history):
+            history_text += f"--- History Step {i + 1} ---\n"
+            history_text += f"Reasoning: {h.get('reasoning', 'N/A')}\n"
+            history_text += f"Action: {h.get('action_details', {}).get('action', 'N/A')}\n\n"
+        return history_text
+
+    def get_history_images(self, history: List[Dict[str, Any]]) -> List[str]:
+        """Extract image base64 strings from history."""
+        return [h["screenshot_b64"] for h in history]
+
+    def execute_agent_step(
+        self,
+        history: List[Dict[str, Any]],
+        remaining_steps: int,
+        current_screenshot_b64: str,
+        available_actions: Dict[str, Any]
+    ) -> Optional[Dict[str, Any]]:
+        """
+        Execute a single agent step: generate prompt, get AI decision, return decision.
+        This is the core step logic extracted for reuse.
+        """
+        history_text = self.generate_history_text(history)
+        image_b64_for_prompt = self.get_history_images(history) + [current_screenshot_b64]
+
+        prompt = AGENT_PROMPT_TEMPLATE.format(
+            remaining_steps=remaining_steps,
+            history_text=history_text,
+            available_actions=json.dumps(available_actions),
+        )
+
+        try:
+            message = self._create_message_with_history(prompt, image_b64_for_prompt[-1:])
+            response = self.model.invoke(message)
+            decision = self._parse_agent_response(response)
+        except Exception as e:
+            print(f"Error during model invocation: {e}")
+            decision = None
+
+        if not decision:
+            print("Response parsing failed or model error. Using default recovery action: PAN_RIGHT.")
+            decision = {
+                "reasoning": "Recovery due to parsing failure or model error.",
+                "action_details": {"action": "PAN_RIGHT"},
+            }
+
+        return decision
+
+    def execute_action(self, action: str) -> bool:
+        """
+        Execute the given action using the controller.
+        Returns True if action was executed, False if it was GUESS.
+        """
+        if action == "GUESS":
+            return False
+        elif action == "MOVE_FORWARD":
+            self.controller.move("forward")
+        elif action == "MOVE_BACKWARD":
+            self.controller.move("backward")
+        elif action == "PAN_LEFT":
+            self.controller.pan_view("left")
+        elif action == "PAN_RIGHT":
+            self.controller.pan_view("right")
+        return True
+
     def run_agent_loop(self, max_steps: int = 10) -> Optional[Tuple[float, float]]:
-        history = …
+        history = self.init_history()
 
         for step in range(max_steps, 0, -1):
             print(f"\n--- Step {max_steps - step + 1}/{max_steps} ---")

@@ -169,46 +271,13 @@ class GeoBot:
             available_actions = self.controller.get_available_actions()
             print(f"Available actions: {available_actions}")
 
-            history_text = ""
-            image_b64_for_prompt = []
-            if not history:
-                history_text = "No history yet. This is the first step."
-            else:
-                for i, h in enumerate(history):
-                    history_text += f"--- History Step {i + 1} ---\n"
-                    history_text += f"Reasoning: {h.get('reasoning', 'N/A')}\n"
-                    history_text += f"Action: {h.get('action_details', {}).get('action', 'N/A')}\n\n"
-                    image_b64_for_prompt.append(h["screenshot_b64"])
-
-            image_b64_for_prompt.append(current_screenshot_b64)
-
-            prompt = AGENT_PROMPT_TEMPLATE.format(
-                remaining_steps=step,
-                history_text=history_text,
-                available_actions=json.dumps(available_actions),
+            # Use the extracted step execution method
+            decision = self.execute_agent_step(
+                history, step, current_screenshot_b64, available_actions
             )
 
-            try:
-                message = self._create_message_with_history(
-                    prompt, image_b64_for_prompt
-                )
-                response = self.model.invoke(message)
-                decision = self._parse_agent_response(response)
-            except Exception as e:
-                print(f"Error during model invocation: {e}")
-                decision = None
-
-            if not decision:
-                print(
-                    "Response parsing failed or model error. Using default recovery action: PAN_RIGHT."
-                )
-                decision = {
-                    "reasoning": "Recovery due to parsing failure or model error.",
-                    "action_details": {"action": "PAN_RIGHT"},
-                }
-
-            decision["screenshot_b64"] = current_screenshot_b64
-            history.append(decision)
+            # Add step to history
+            self.add_step_to_history(history, current_screenshot_b64, decision)
 
             action_details = decision.get("action_details", {})
             action = action_details.get("action")

@@ -219,14 +288,8 @@ class GeoBot:
                 lat, lon = action_details.get("lat"), action_details.get("lon")
                 if lat is not None and lon is not None:
                     return lat, lon
-            elif action == "MOVE_FORWARD":
-                self.controller.move("forward")
-            elif action == "MOVE_BACKWARD":
-                self.controller.move("backward")
-            elif action == "PAN_LEFT":
-                self.controller.pan_view("left")
-            elif action == "PAN_RIGHT":
-                self.controller.pan_view("right")
+            else:
+                self.execute_action(action)
 
         print("Max steps reached. Agent did not make a final guess.")
         return None
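
After this commit the agent's memory is a plain list of step dicts with screenshot_b64, reasoning, and action_details keys, built by init_history/add_step_to_history and flattened into the prompt's {history_text} slot by generate_history_text. A small self-contained illustration of that bookkeeping, using hard-coded decisions in place of parsed model output (no browser or model involved):

    from typing import Any, Dict, List, Optional

    def add_step_to_history(history: List[Dict[str, Any]], screenshot_b64: str,
                            decision: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        # Same step shape as GeoBot.add_step_to_history in this commit.
        step = {
            "screenshot_b64": screenshot_b64,
            "reasoning": decision.get("reasoning", "N/A") if decision else "N/A",
            "action_details": decision.get("action_details", {"action": "N/A"}) if decision else {"action": "N/A"},
        }
        history.append(step)
        return step

    def generate_history_text(history: List[Dict[str, Any]]) -> str:
        # Mirrors GeoBot.generate_history_text: one block per past step.
        if not history:
            return "No history yet. This is the first step."
        text = ""
        for i, h in enumerate(history):
            text += f"--- History Step {i + 1} ---\n"
            text += f"Reasoning: {h.get('reasoning', 'N/A')}\n"
            text += f"Action: {h.get('action_details', {}).get('action', 'N/A')}\n\n"
        return text

    history: List[Dict[str, Any]] = []  # equivalent of bot.init_history()
    add_step_to_history(history, "<b64 screenshot 1>", {
        "reasoning": "Signs are in French; plausible regions: France, Belgium, Quebec.",
        "action_details": {"action": "PAN_RIGHT"},
    })
    add_step_to_history(history, "<b64 screenshot 2>", {
        "reasoning": "A town-name sign narrows it down; ready to guess.",
        "action_details": {"action": "GUESS", "lat": 48.2, "lon": -2.9},
    })
    print(generate_history_text(history))  # becomes the {history_text} block of the prompt

In GeoBot itself, execute_agent_step formats this text into AGENT_PROMPT_TEMPLATE, invokes the model, and falls back to a PAN_RIGHT decision when parsing fails, so the history gains exactly one entry per step.
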
