| """ | |
| SPIRAL: Interactive Reasoning Game Simulator | |
| Demonstrates key concepts from "Self-Play in Zero-Sum Games Incentivizes Reasoning via Multi-Agent Multi-Turn Reinforcement Learning" | |
| This simplified demo shows how strategic reasoning emerges from self-play in zero-sum games like TicTacToe. | |
| """ | |
| import gradio as gr | |
| import numpy as np | |
| import random | |
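
# Board encoding used throughout this file: 1 = human player (X), -1 = AI
# player (O), 0 = empty cell. The environment follows the classic Gym-style
# convention of step(action) returning (observation, reward, done, info),
# which keeps the demo close to a multi-turn RL setup.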


class TicTacToeEnv:
    """Simple TicTacToe environment for SPIRAL demonstration."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Reset the game to initial state."""
        self.board = np.zeros((3, 3), dtype=np.int8)
        self.current_player = 1  # Player 1 starts (X)
        self.game_over = False
        self.winner = None
        self.move_count = 0
        return self.board.copy()

    def step(self, action):
        """Execute one step in the environment."""
        if self.game_over:
            return self.board.copy(), 0, True, {}

        # Convert action to row, col
        row, col = divmod(action, 3)

        # Check if move is valid
        if self.board[row, col] != 0:
            return self.board.copy(), -1, True, {"invalid_move": True}

        # Make the move
        self.board[row, col] = self.current_player
        self.move_count += 1

        # Check for win
        winner = self._check_winner()
        if winner is not None:
            self.game_over = True
            self.winner = winner
            reward = 1 if winner == self.current_player else -1
            return self.board.copy(), reward, True, {}
        elif self.move_count >= 9:
            # Draw
            self.game_over = True
            return self.board.copy(), 0, True, {}
        else:
            # Game continues
            self.current_player *= -1  # Switch player
            return self.board.copy(), 0, False, {}

    def _check_winner(self):
        """Check if there's a winner."""
        # Check rows
        for row in range(3):
            if abs(self.board[row, :].sum()) == 3:
                return self.board[row, 0]
        # Check columns
        for col in range(3):
            if abs(self.board[:, col].sum()) == 3:
                return self.board[0, col]
        # Check diagonals
        if abs(self.board.diagonal().sum()) == 3:
            return self.board[0, 0]
        if abs(np.fliplr(self.board).diagonal().sum()) == 3:
            return self.board[0, 2]
        return None

    def get_valid_actions(self):
        """Get list of valid actions (empty positions)."""
        valid_actions = []
        for i in range(9):
            row, col = divmod(i, 3)
            if self.board[row, col] == 0:
                valid_actions.append(i)
        return valid_actions


# Global game environment
tictactoe_env = TicTacToeEnv()


def check_winner(board):
    """Check if there's a winner on the given board."""
    # Check rows
    for row in range(3):
        if abs(board[row, :].sum()) == 3:
            return board[row, 0]
    # Check columns
    for col in range(3):
        if abs(board[:, col].sum()) == 3:
            return board[0, col]
    # Check diagonals
    if abs(board.diagonal().sum()) == 3:
        return board[0, 0]
    if abs(np.fliplr(board).diagonal().sum()) == 3:
        return board[0, 2]
    return None


def get_valid_moves(board):
    """Get valid moves for the given board."""
    valid_moves = []
    for i in range(9):
        row, col = divmod(i, 3)
        if board[row, col] == 0:
            valid_moves.append(i)
    return valid_moves


def minimax(board, player, depth=0):
    """Minimax algorithm - demonstrates strategic reasoning."""
    # Base cases
    winner = check_winner(board)
    if winner == 1:  # Human wins
        return -10 + depth, None
    elif winner == -1:  # AI wins
        return 10 - depth, None
    elif len(get_valid_moves(board)) == 0:  # Draw
        return 0, None

    best_move = None
    if player == -1:  # AI is the maximizing player
        best_score = -float('inf')
        for move in get_valid_moves(board):
            row, col = divmod(move, 3)
            board[row, col] = -1
            score, _ = minimax(board.copy(), 1, depth + 1)
            board[row, col] = 0  # Undo move
            if score > best_score:
                best_score = score
                best_move = move
    else:  # Human is the minimizing player
        best_score = float('inf')
        for move in get_valid_moves(board):
            row, col = divmod(move, 3)
            board[row, col] = 1
            score, _ = minimax(board.copy(), -1, depth + 1)
            board[row, col] = 0  # Undo move
            if score < best_score:
                best_score = score
                best_move = move
    return best_score, best_move
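
# Scoring convention, as a worked example: an AI win found at depth 2 scores
# 10 - 2 = 8, while a win found at depth 4 scores only 6, so the search
# prefers faster wins; symmetrically, -10 + depth makes it prefer slower
# losses. This plain minimax explores the full game tree (no alpha-beta
# pruning), which is fine for TicTacToe's small state space.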


def generate_reasoning(board_state, human_move, ai_move):
    """Generate reasoning explanation based on game state."""
    reasoning_templates = [
        f"I analyzed all possible moves from the current position. After you played position {human_move}, I considered {len(get_valid_moves(board_state))} possible responses. Using minimax tree search, I determined that position {ai_move} gives me the best strategic advantage.",
        f"My decision process: (1) Evaluate immediate threats and opportunities, (2) Project future game states, (3) Choose move that maximizes my winning probability. Position {ai_move} emerged as optimal after analyzing the full game tree.",
        f"Strategic analysis: Your move at {human_move} created a new board configuration. I used recursive tree search to evaluate all possible future sequences. Position {ai_move} either creates a winning opportunity or blocks your potential victories.",
        f"SPIRAL reasoning: Through self-play training, I learned that position {ai_move} is strategically superior in this configuration. This demonstrates how strategic reasoning emerges from multi-agent interaction in zero-sum games."
    ]
    return random.choice(reasoning_templates)
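
# Note: these explanations are pre-written templates chosen at random for
# illustration; they describe the minimax search above rather than being
# produced by a trained SPIRAL model.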


def create_interface():
    """Create the main Gradio interface."""
    # Custom CSS to style the TicTacToe board
    css = """
    .ttt-board {
        display: flex;
        flex-direction: column;
        align-items: center;
        max-width: 300px;
        margin: 0 auto;
    }
    .ttt-board > div {
        display: flex;
        flex-direction: row;
        justify-content: center;
        gap: 8px;
        margin: 4px 0;
    }
    .ttt-board button {
        width: 80px !important;
        height: 80px !important;
        min-width: 80px !important;
        min-height: 80px !important;
        max-width: 80px !important;
        max-height: 80px !important;
        font-size: 24px !important;
        font-weight: bold !important;
        border: 2px solid #374151 !important;
        border-radius: 8px !important;
        background: #1f2937 !important;
        color: white !important;
        display: flex !important;
        align-items: center !important;
        justify-content: center !important;
    }
    .ttt-board button:hover {
        background: #374151 !important;
        border-color: #6b7280 !important;
    }
    .ttt-board button:disabled {
        opacity: 0.8 !important;
        cursor: not-allowed !important;
    }
    .ttt-stats {
        text-align: center !important;
        margin: 20px 0 !important;
        font-size: 16px !important;
    }
    .ttt-stats p {
        margin: 0 !important;
        color: #9ca3af !important;
    }
    """

    with gr.Blocks(title="SPIRAL: Self-Play Reasoning Demo", theme=gr.themes.Soft(), css=css) as demo:
        gr.Markdown("# 🎮 SPIRAL: Self-Play Reasoning Demo")
        gr.Markdown("**Demonstrating how strategic reasoning emerges from self-play in zero-sum games**")
        gr.Markdown("*Based on: \"Self-Play in Zero-Sum Games Incentivizes Reasoning via Multi-Agent Multi-Turn Reinforcement Learning\"*")

        def update_board_buttons():
            """Create a list of gr.Button updates from the current board state."""
            updates = []
            for i in range(9):
                row, col = divmod(i, 3)
                cell = tictactoe_env.board[row, col]
                val = ""
                interactive = True
                if cell == 1:
                    val = '❌'
                    interactive = False
                elif cell == -1:
                    val = '⭕'
                    interactive = False
                if tictactoe_env.game_over:
                    interactive = False
                updates.append(gr.Button(value=val, interactive=interactive))
            return updates
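
        # Note: in recent Gradio versions, returning a gr.Button instance with
        # new `value`/`interactive` settings (as update_board_buttons does)
        # updates the matching output component in place.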

        ttt_stats = gr.State({'wins': 0, 'losses': 0, 'draws': 0})

        def play_tictactoe(position, stats):
            """Play a TicTacToe move and demonstrate AI reasoning."""
            # Work on a copy so the updated stats dict is a new object;
            # mutating the gr.State value in place may not register as a change.
            stats = dict(stats)
            if tictactoe_env.game_over:
                yield *update_board_buttons(), "Game is over! Click 'New Game' to start again.", "", stats
                return
            try:
                position = int(position)

                # Human move
                board_state, reward, done, info = tictactoe_env.step(position)
                if done:
                    if info.get("invalid_move"):
                        yield *update_board_buttons(), "Invalid move! Try again.", "", stats
                        return
                    winner = "You" if tictactoe_env.winner == 1 else "AI" if tictactoe_env.winner == -1 else "Draw"
                    if winner == "You": stats['wins'] += 1
                    elif winner == "AI": stats['losses'] += 1
                    else: stats['draws'] += 1
                    result = "It's a draw!" if winner == "Draw" else f"{winner} won!"
                    yield *update_board_buttons(), f"Game Over! {result}", "", stats
                    return

                # Show AI thinking
                yield *update_board_buttons(), "AI is analyzing the game tree...", "🧠 Strategic reasoning in progress...", stats

                # AI move using minimax
                _, ai_action = minimax(tictactoe_env.board.copy(), -1)
                if ai_action is None:
                    valid_actions = tictactoe_env.get_valid_actions()
                    if not valid_actions:
                        yield *update_board_buttons(), "Game is a draw!", "", stats
                        return
                    ai_action = random.choice(valid_actions)

                # Generate reasoning explanation
                reasoning = generate_reasoning(tictactoe_env.board.copy(), position, ai_action)

                # AI makes move
                board_state, reward, done, info = tictactoe_env.step(ai_action)
                if done:
                    winner = "You" if tictactoe_env.winner == 1 else "AI" if tictactoe_env.winner == -1 else "Draw"
                    if winner == "You": stats['wins'] += 1
                    elif winner == "AI": stats['losses'] += 1
                    else: stats['draws'] += 1
                    result = "It's a draw!" if winner == "Draw" else f"{winner} won!"
                    yield *update_board_buttons(), f"Game Over! {result} AI played position {ai_action}.", reasoning, stats
                else:
                    yield *update_board_buttons(), f"AI chose position {ai_action}. Your turn!", reasoning, stats
            except Exception as e:
                yield *update_board_buttons(), f"Error: {str(e)}", "", stats

        def reset_tictactoe(stats):
            """Reset TicTacToe game."""
            tictactoe_env.reset()
            return *update_board_buttons(), "New game started! You are ❌ (X). Click a square to demonstrate strategic reasoning.", "The AI will explain its strategic decision-making process...", stats

        # Initialize the board
        tictactoe_env.reset()

        # Game interface
        with gr.Row():
            gr.Markdown("### Strategic TicTacToe")
            gr.Markdown("")  # spacer
            ttt_reset_btn = gr.Button("🔄 New Game", variant="secondary", size="sm")
        gr.Markdown("**You are ❌ (X)** - The AI uses minimax tree search to demonstrate strategic reasoning")

        # Game board
        with gr.Column(elem_classes=["ttt-board"]):
            board_buttons = []
            for i in range(3):
                with gr.Row(elem_classes=["ttt-row"]):
                    for j in range(3):
                        pos = i * 3 + j
                        button = gr.Button(value="", elem_id=f"ttt-cell-{pos}", size="lg")
                        board_buttons.append(button)

        # Stats display
        with gr.Row():
            ttt_stats_display = gr.Markdown(value="**Wins: 0 | Losses: 0 | Draws: 0**", elem_classes=["ttt-stats"])

        # Game status and AI reasoning
        ttt_message = gr.Textbox(
            label="🎯 Game Status",
            value="Click a square to start! Watch how the AI reasons strategically.",
            lines=2,
            interactive=False
        )
        ttt_reasoning = gr.Textbox(
            label="🧠 AI Strategic Reasoning",
            value="The AI will explain its strategic decision-making process here, demonstrating how reasoning emerges from self-play training in zero-sum games.",
            lines=4,
            interactive=False
        )

        # Event handlers
        def on_board_click(pos, stats):
            yield from play_tictactoe(pos, stats)
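
        # Because play_tictactoe is a generator, each `yield` streams an
        # intermediate UI state to the browser, so the "AI is analyzing..."
        # message appears before the AI's final move is rendered.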
        for i in range(9):
            board_buttons[i].click(
                fn=on_board_click,
                inputs=[gr.State(i), ttt_stats],
                outputs=[*board_buttons, ttt_message, ttt_reasoning, ttt_stats]
            )
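
        # In the loop above, gr.State(i) supplies each button's board position
        # as a constant input, so all nine cells share the same click handler.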
        ttt_reset_btn.click(
            fn=reset_tictactoe,
            inputs=[ttt_stats],
            outputs=[*board_buttons, ttt_message, ttt_reasoning, ttt_stats]
        )

        # Update stats display
        ttt_stats.change(
            fn=lambda s: f"**Wins: {s['wins']} | Losses: {s['losses']} | Draws: {s['draws']}**",
            inputs=ttt_stats,
            outputs=ttt_stats_display
        )

        # Initialize board display on load
        demo.load(
            fn=lambda stats: (*update_board_buttons(), "Click a square to start! Watch how the AI reasons strategically.", "The AI will explain its strategic decision-making process here, demonstrating how reasoning emerges from self-play training in zero-sum games.", stats),
            inputs=[ttt_stats],
            outputs=[*board_buttons, ttt_message, ttt_reasoning, ttt_stats]
        )

        # Key concepts section
        gr.Markdown("---")
        gr.Markdown("## 🧠 Key SPIRAL Concepts Demonstrated")
        with gr.Row():
            with gr.Column():
                gr.Markdown("""
                **🎯 Strategic Reasoning**
                - The AI uses minimax tree search
                - It evaluates all possible future moves
                - It chooses the strategically optimal action
                """)
            with gr.Column():
                gr.Markdown("""
                **🔄 Self-Play Learning**
                - Strategic patterns emerge from competition
                - Zero-sum games incentivize reasoning
                - Multi-agent interaction drives the emergence of strategic behavior
                """)
        gr.Markdown("""
        ### About SPIRAL

        This demo illustrates key findings from the SPIRAL research:

        - **Zero-sum games** like TicTacToe create competitive pressure that incentivizes strategic thinking
        - **Self-play training** allows AI agents to discover strong strategies through repeated interaction
        - **Multi-turn reasoning** emerges naturally from the need to plan ahead in strategic environments
        - **Tree search algorithms** like minimax demonstrate how strategic reasoning can be formalized and executed

        The AI's explanations show how it evaluates different moves, considers future possibilities, and makes strategic decisions - core capabilities that transfer to general reasoning tasks.
        """)

    return demo


if __name__ == "__main__":
    demo = create_interface()
    demo.launch()