\documentclass{beamer}
% Theme and Color
\usetheme{Madrid}
\usecolortheme{default}
% Packages
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{amsmath, amssymb, amsfonts}
\usepackage{booktabs}
\usepackage{graphicx}
\usepackage{hyperref}
\usepackage{bm} % For bold math symbols
% Custom commands from the source text for consistency
\newcommand{\KL}{D_{\mathrm{KL}}}
\def\figref#1{Figure~\ref{#1}}
\title[Meta-Safe RL]{A CMDP-within-online framework for Meta-Safe Reinforcement Learning}
\author{Vanshaj Khattar\inst{1} \and Yuhao Ding\inst{2} \and Bilgehan Sel\inst{1} \and Javad Lavaei\inst{2} \and Ming Jin\inst{1}}
\institute[VT \& UCB]{
\inst{1} Virginia Tech \\
\inst{2} UC Berkeley
}
\date{\today}
\setbeamerfont{caption}{size=\scriptsize}
\begin{document}
\begin{frame}
\titlepage
\end{frame}
\begin{frame}{Outline}
\tableofcontents
\end{frame}
\section{Motivation}
\begin{frame}{Motivation: Why Meta-Safe RL?}
\begin{block}{Background: Meta-Reinforcement Learning (Meta-RL)}
\footnotesize
\begin{itemize}
\item Meta-RL enables agents to learn new tasks quickly with limited experience.
\item It is a ``learning-to-learn'' framework that has proven successful in robotics, federated learning, and other domains.
\end{itemize}
\end{block}
\begin{block}{The Problem: Safety is Critical}
\footnotesize
\begin{itemize}
\item Many real-world applications have \alert{safety constraints} that must not be violated (e.g., robotics, autonomous driving).
\item Existing Meta-RL methods do not adequately address these constraints.
\item Safe RL problems are often modeled as \alert{Constrained Markov Decision Processes (CMDPs)}, but standard CMDP algorithms don't generalize efficiently to new tasks.
\end{itemize}
\end{block}
\begin{block}{Our Goal}
\footnotesize
\begin{itemize}
\item Develop a principled framework, \alert{Meta-Safe RL (Meta-SRL)}, that combines the fast adaptation of meta-learning with the safety guarantees of Safe RL.
\item Provide the \alert{first provable guarantees} for learning across multiple safe RL tasks.
\end{itemize}
\end{block}
\end{frame}
\section{Related Work}
\begin{frame}{Related Work}
\begin{itemize}
\item \textbf{Meta-Reinforcement Learning:}
\begin{itemize}
\item Focuses on learning initial conditions, hyperparameters, etc., for fast adaptation.
\item Most work is for \alert{unconstrained} environments.
\end{itemize}
\item \textbf{Online Meta-Learning:}
\begin{itemize}
\item Provides theoretical frameworks, often for convex and decomposable loss functions.
\item Our work extends this to the \alert{nonconvex and complex} setting of CMDPs.
\end{itemize}
\item \textbf{Safe RL and CMDPs:}
\begin{itemize}
\item A rich field with many algorithms (e.g., primal-dual, policy-based like \alert{CRPO}).
\item However, these are designed for a \alert{single task} and are not built to generalize or adapt quickly to unseen tasks.
\end{itemize}
\end{itemize}
\end{frame}
\section{Method}
\begin{frame}{Method: CMDP-within-Online Framework}
\begin{block}{Core Idea}
\footnotesize
\begin{itemize}
\item A \alert{meta-learner} (online algorithm) operates over a sequence of CMDP tasks.
\item For each task $t$, the meta-learner provides an initial policy $\alert{\pi_{t,0}}$ and a learning rate $\alert{\alpha_t}$ to a \alert{within-task} Safe RL algorithm (e.g., CRPO).
\item The goal is to minimize the \textbf{Task-Averaged Optimality Gap (TAOG)} and \textbf{Task-Averaged Constraint Violation (TACV)} (sketched on the next slide).
\end{itemize}
\end{block}
\begin{figure}
\centering
\includegraphics[width=0.3\textwidth]{illustrate.pdf}
\caption{Conceptual illustration of the meta-learning process.}
\label{fig:method_concept}
\end{figure}
\end{frame}
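% Added illustrative slide: a sketch of the task-averaged metrics named on the previous slide.
% The per-task quantities R_{t,0}, R_{t,i} and their exact normalization are stated schematically
% (an exposition-level assumption), not copied verbatim from the paper.
\begin{frame}{Sketch: Task-Averaged Metrics}
\begin{block}{TAOG and TACV as task averages (schematic)}
\footnotesize
With $R_{t,0}$ the optimality gap and $R_{t,i}$ the violation of constraint $i$ incurred by the within-task algorithm on task $t$, the meta-learner targets the task averages
\begin{equation*}
\overline{R}_0 = \frac{1}{T}\sum_{t=1}^{T} R_{t,0},
\qquad
\overline{R}_i = \frac{1}{T}\sum_{t=1}^{T} R_{t,i}, \quad i = 1,\dots,p.
\end{equation*}
Intuitively, a good meta-initialization $\pi_{t,0}$ and learning rate $\alpha_t$ shrink every per-task term, so both averages improve as the tasks become more similar.
\end{block}
\end{frame}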
\begin{frame}{Method: The Within-Task Algorithm (CRPO)}
% Corresponds to the source text Section 2.1
\begin{block}{Constrained Markov Decision Process (CMDP)}
\footnotesize
For each task $t$, the agent aims to solve:
\begin{equation*}
\underset{\pi}{\max} \hspace{0.1cm} J_{t,0}(\pi) \hspace{0.3cm} \text{s.t.} \hspace{0.2cm} \alert{J_{t,i}(\pi) \leq d_{t,i}}, \hspace{0.3cm} \forall i = 1,...,p
\end{equation*}
where $J_{t,0}$ is the expected reward and $J_{t,i}$ are expected costs.
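% Added illustrative sentence (assumption): an explicit discounted form of the objectives, for concreteness.
For instance, in the discounted setting one may take $J_{t,i}(\pi) = \mathbb{E}_{\pi}\big[\sum_{k \geq 0}\gamma^{k} c_{t,i}(s_k,a_k)\big]$, with $c_{t,0}$ the reward signal and $c_{t,i}$, $i \geq 1$, the per-step costs.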
\end{block}
\begin{block}{CRPO Algorithm \& Regret}
\footnotesize
\begin{itemize}
\item We use the Constraint-Rectified Policy Optimization (\alert{CRPO}) algorithm.
\item The single-task optimality gap ($R_0$) and constraint violation ($R_i$) are bounded by:
\begin{equation*}
R_0, R_i \leq \mathcal{O}\left( \frac{\mathbb{E}_{s \sim \nu_t^*}[\alert{\KL(\pi_t^*|\pi_{t,0})}]}{\alpha_t M} + \alpha_t \right)
\end{equation*}
\item \textbf{Key Insight:} The performance depends heavily on the KL-divergence between the optimal policy $\pi_t^*$ and the initial policy $\pi_{t,0}$.
\item Our meta-learner optimizes this upper bound by choosing a good $\alert{\pi_{t,0}}$ and $\alert{\alpha_t}$ (a back-of-the-envelope calculation follows on the next slide).
\end{itemize}
\end{block}
\end{frame}
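% Added illustrative slide: a back-of-the-envelope minimization of the CRPO bound over the
% learning rate, with constants suppressed. This is an exposition-level calculation, not a
% result quoted from the paper.
\begin{frame}{Sketch: Why Tune $\pi_{t,0}$ and $\alpha_t$?}
\begin{block}{Minimizing the upper bound (constants suppressed)}
\footnotesize
Write $D_t := \mathbb{E}_{s \sim \nu_t^*}[\KL(\pi_t^*|\pi_{t,0})]$, so the single-task bound has the form
\begin{equation*}
R_0, R_i \;\lesssim\; \frac{D_t}{\alpha_t M} + \alpha_t .
\end{equation*}
\begin{itemize}
\item Optimizing over $\alpha_t$ gives $\alpha_t^* = \sqrt{D_t/M}$ and the value $2\sqrt{D_t/M}$.
\item Hence a good initialization (small $D_t$) directly tightens the bound, while the best $\alpha_t$ depends on $D_t$, which is unknown a priori; this motivates learning both quantities online.
\end{itemize}
\end{block}
\end{frame}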
\begin{frame}{Method: The Inexact Framework}
% Corresponds to the source text Section 3.1
\begin{block}{Challenge: Unknown Optimal Policies}
\footnotesize
\begin{itemize}
\item In practice, the optimal policy $\alert{\pi_t^*}$ and its state distribution $\alert{\nu_t^*}$ are unknown.
\item We only have access to a suboptimal policy $\alert{\hat{\pi}_t}$ and collected trajectory data $\alert{\mathcal{D}_t}$.
\end{itemize}
\end{block}
\begin{block}{Solution: Estimate and Bound the Error}
\footnotesize
\begin{itemize}
\item \textbf{Estimate:} Use the suboptimal policy $\hat{\pi}_t$ and estimate its state distribution $\hat{\nu}_t$ from data $\mathcal{D}_t$ using \alert{DualDICE}.
\item \textbf{Inexact Loss:} The meta-learner optimizes an inexact loss function (an empirical form is sketched on the next slide):
$$ \hat{f}_{t}(\phi) = \mathbb{E}_{\hat{\nu}_t}[\KL(\hat{\pi}_t|\phi)] $$
\item \textbf{Bound the Error:} We prove a bound on the estimation error:
$$ |\mathbb{E}_{\nu_t^*}[\KL(\pi_t^*|\phi)] - \mathbb{E}_{\hat{\nu}_t}[\KL(\hat{\pi}_t|\phi)]| \leq \alert{\epsilon_t} $$
This bound (Thm. 3.1) is derived using novel techniques from \alert{tame geometry}.
\end{itemize}
\end{block}
\end{frame}
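% Added illustrative slide (assumption): one plausible empirical form of the inexact loss using
% DualDICE-style distribution-correction ratios. The weights \hat{w}_t, the data distribution
% \mu_t, and the estimator below are illustrative placeholders, not the paper's exact estimator.
\begin{frame}{Sketch: Computing the Inexact Loss from Data}
\begin{block}{Empirical surrogate with distribution correction (schematic)}
\footnotesize
\begin{itemize}
\item DualDICE-style estimators return correction ratios $\hat{w}_t(s) \approx \hat{\nu}_t(s)/\mu_t(s)$, where $\mu_t$ is the distribution that generated the off-policy data $\mathcal{D}_t$.
\item A plausible empirical surrogate of the meta-loss is then
\begin{equation*}
\hat{f}_t(\phi) \;\approx\; \frac{1}{|\mathcal{D}_t|}\sum_{s \in \mathcal{D}_t} \hat{w}_t(s)\, \KL\big(\hat{\pi}_t(\cdot|s)\,\big|\,\phi(\cdot|s)\big).
\end{equation*}
\item The error $\epsilon_t$ then aggregates the suboptimality of $\hat{\pi}_t$, the distribution-estimation error, and finite-sample effects.
\end{itemize}
\end{block}
\end{frame}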
\begin{frame}{Method: Dynamic Regret \& Adaptive Learning Rates}
% Corresponds to the source text Section 3.3
\begin{block}{Challenge: Adapting to Dynamic Environments}
\footnotesize
\begin{itemize}
\item A fixed meta-initialization may not be optimal if the environment changes over time.
\item Setting the learning rate $\alpha_t$ optimally requires knowledge of future tasks.
\end{itemize}
\end{block}
\begin{block}{Solution: Separate Online Learners}
\footnotesize
\begin{itemize}
\item We decompose the regret upper bound into two components.
\item We use two parallel Online Gradient Descent (OGD) algorithms:
\begin{enumerate}
\item \textbf{INIT}: Learns the policy initialization $\alert{\pi_{t,0}}$ by minimizing $\hat{f}_{t}^{init}(\phi) = \mathbb{E}_{\hat{\nu}_t}[\KL(\hat{\pi}_t|\phi)]$.
\item \textbf{SIM}: Learns the learning rate $\alert{\alpha_t}$ by minimizing its own loss term $\hat{f}_t^{sim}(\kappa)$.
\end{enumerate}
\item This allows the framework to adapt both the policy initialization and the learning rate online, without knowing task properties in advance (schematic updates on the next slide).
\end{itemize}
\end{block}
\end{frame}
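% Added illustrative slide (assumption): generic projected online gradient descent updates for
% the two meta-learners. The step sizes \eta^{init}, \eta^{sim} and the projection sets \Phi, K
% are illustrative placeholders.
\begin{frame}{Sketch: The Two Online Learners}
\begin{block}{Projected OGD updates (schematic)}
\footnotesize
After task $t$, each learner takes a projected gradient step on its own (inexact) loss:
\begin{align*}
\phi_{t+1} &= \Pi_{\Phi}\!\left(\phi_t - \eta^{init}\,\nabla \hat{f}_t^{init}(\phi_t)\right)
&& \text{(INIT: policy initialization)}\\
\kappa_{t+1} &= \Pi_{K}\!\left(\kappa_t - \eta^{sim}\,\nabla \hat{f}_t^{sim}(\kappa_t)\right)
&& \text{(SIM: learning-rate parameter)}
\end{align*}
Because the gradients are computed from $\hat{\pi}_t$ and $\hat{\nu}_t$, they are biased; the dynamic-regret analysis accounts for this inexactness.
\end{block}
\end{frame}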
\section{Innovation}
\begin{frame}{Our Innovations}
\begin{block}{Novel Framework and Guarantees}
\footnotesize
\begin{itemize}
\item The \alert{first provable guarantees} for Meta-Safe RL, establishing bounds on task-averaged optimality gap (TAOG) and constraint violation (TACV).
\item The regret bounds explicitly improve with \alert{task-similarity} ($\hat{D}^*$) or \alert{task-relatedness} ($\hat{V}_\psi$) (one formalization is sketched on the next slide).
\end{itemize}
\end{block}
\begin{block}{Practical and Adaptive Algorithm}
\footnotesize
\begin{itemize}
\item \textbf{Inexact framework}: Works with suboptimal policies and estimates distributions using \alert{DualDICE}, making it practical.
\item \textbf{Adaptive learning}: The meta-learner adapts both policy initialization and learning rates for each task, handling dynamic environments.
\end{itemize}
\end{block}
\begin{block}{Technical Contributions}
\footnotesize
\begin{itemize}
\item New analysis of the \alert{optimization landscape of CMDPs} using tame geometry to bound the distance between optimal and suboptimal policies.
\item Extended analysis for \alert{inexact online gradient descent} to handle dynamic regret with biased gradient estimates.
\end{itemize}
\end{block}
\end{frame}
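% Added illustrative slide (assumption): one common way to formalize task similarity in online
% meta-learning, stated only for intuition; the paper's exact definitions of \hat{D}^* and
% \hat{V}_\psi may differ in constants and normalization.
\begin{frame}{Sketch: What Does Task-Similarity Mean?}
\begin{block}{An intuition-level formalization (assumed for exposition)}
\footnotesize
\begin{itemize}
\item A natural static notion: the smallest average meta-loss achievable by a single fixed initialization,
\begin{equation*}
(\hat{D}^*)^2 \;\sim\; \min_{\phi}\, \frac{1}{T}\sum_{t=1}^{T} \mathbb{E}_{\hat{\nu}_t}\!\left[\KL(\hat{\pi}_t|\phi)\right].
\end{equation*}
\item If all tasks admit nearby (near-)optimal policies, this quantity is small and the TAOG/TACV bounds tighten; the dynamic analogue ($\hat{V}_\psi$) compares against a slowly varying sequence of initializations instead of a single fixed one.
\end{itemize}
\end{block}
\end{frame}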
\begin{frame}
\centering
\Huge
Experimental Evaluation
\end{frame}
\section{Experimental Method}
\begin{frame}{Experimental Method}
\begin{block}{Objective}
\footnotesize
\begin{itemize}
\item To empirically validate the effectiveness of our \alert{Meta-SRL} framework against standard meta-learning baselines.
\end{itemize}
\end{block}
\begin{block}{Baselines for Comparison}
\footnotesize
\begin{itemize}
\item \alert{Random Initialization}: Standard CRPO with a new random policy for each task.
\item \alert{Pre-trained}: Initialize with the final policy from the previous task.
\item \alert{Simple Averaging}: Offline average of all previously learned policies.
\item \alert{Follow the Average Leader (FAL)}: Online average of all previously learned policies.
\end{itemize}
\end{block}
\begin{block}{Task Generation}
\footnotesize
\begin{itemize}
\item We generate a sequence of related CMDP tasks by sampling from a distribution over environment parameters (e.g., transition dynamics, reward functions).
\item We test under two conditions: \alert{high task-similarity} and \alert{low task-similarity}.
\end{itemize}
\end{block}
\end{frame}
\section{Experimental Setting}
\begin{frame}{Experimental Setting}
\begin{block}{Environments}
\footnotesize
We use a range of classic control environments with added safety constraints:
\begin{itemize}
\item \textbf{OpenAI Gym:}
\begin{itemize}
\item \alert{FrozenLake}: Discrete state space, $T=10$ tasks.
\item \alert{Acrobot}: Continuous state space, $T=50$ tasks.
\end{itemize}
\item \textbf{MuJoCo:}
\begin{itemize}
\item \alert{Half-Cheetah}: High-dimensional continuous control, $T=100$ tasks. Constraint on head height.
\item \alert{Humanoid}: Very high-dimensional, $T=250$ tasks. Constraint on joint angles for smooth motion.
\end{itemize}
\end{itemize}
\end{block}
\end{frame}
\section{Experimental Results}
\begin{frame}{Experimental Results: Low Task-Similarity}
\begin{columns}[T]
\begin{column}{0.5\textwidth}
\centering
\textbf{FrozenLake}
\includegraphics[width=1\textwidth]{FrozenLake/FrozenLakeLowSimilarity.pdf}
\end{column}
\begin{column}{0.5\textwidth}
\centering
\textbf{Acrobot}
\includegraphics[width=1\textwidth]{Acrobot/Acrobot_low_similarity2.pdf}
\end{column}
\end{columns}
\begin{block}{Observations}
\footnotesize
\begin{itemize}
\item In settings with low task similarity, \alert{Meta-SRL} (our method) consistently learns faster and more safely.
\item It achieves higher rewards while rapidly satisfying the safety constraints (driving constraint violation to zero).
\item Simpler baselines like \alert{FAL} and \alert{Pre-trained} struggle to satisfy constraints or learn good policies.
\end{itemize}
\end{block}
\end{frame}
\begin{frame}{Experimental Results: MuJoCo Environments}
\centering
\textbf{Half-Cheetah (Low Task-Similarity)}
\begin{figure}
\includegraphics[width=0.6\textwidth]{HalfCheetah/HalfCheetahReward_low_task_similarity_broken_axis.pdf}
\includegraphics[width=0.6\textwidth]{HalfCheetah/HalfCheetahCost_low_task_similarity.pdf}
\caption{Reward (top) and constraint violation (bottom) for Half-Cheetah. Our method (Meta-SRL) learns a high-reward policy while keeping the constraint violation below the threshold (blue line).}
\label{fig:halfcheetah}
\end{figure}
\end{frame}
\section{Ablation Experiment}
\begin{frame}{Ablation Analysis}
Although no dedicated ablation study was conducted, comparing Meta-SRL against the baselines isolates the contribution of its key components.
\begin{block}{Meta-SRL vs. FAL / Simple Averaging}
\footnotesize
\begin{itemize}
\item \textbf{Ablated Component:} The intelligent meta-update (using \alert{DualDICE} estimates and \alert{OGD} on the regret bound).
\item \textbf{Result:} Meta-SRL significantly outperforms simple averaging, showing that a weighted, adaptive update is crucial and superior to naive averaging.
\end{itemize}
\end{block}
\begin{block}{Meta-SRL vs. Pre-trained}
\footnotesize
\begin{itemize}
\item \textbf{Ablated Component:} Learning from a history of multiple tasks. The pre-trained baseline only uses the most recent task.
\item \textbf{Result:} Meta-SRL is more robust, especially in low-similarity settings, demonstrating the benefit of aggregating knowledge from diverse past experiences.
\end{itemize}
\end{block}
\begin{block}{Conclusion}
\footnotesize
The full \alert{Meta-SRL} model, with its inexact estimation and adaptive learning, is critical for achieving strong performance and safety.
\end{block}
\end{frame}
\section{Deficiencies}
\begin{frame}{Limitations of the Current Method}
\begin{itemize}
\item \textbf{Algorithm-Specific Guarantees:}
\begin{itemize}
\item Our theoretical framework is built upon the \alert{CRPO} algorithm.
\item Extending it to other within-task Safe RL algorithms (e.g., primal-dual methods) would require a new analysis of their specific regret bounds.
\end{itemize}
\bigskip
\item \textbf{No Hard Safety Guarantees During Learning:}
\begin{itemize}
\item The framework minimizes task-averaged constraint violation, achieving safety \textit{on average} and \textit{asymptotically}.
\item It does not guarantee \alert{zero constraint violation} at every step during the learning process, which may be a requirement for highly critical systems.
\end{itemize}
\end{itemize}
\end{frame}
\section{Future Research}
\begin{frame}{Future Research Directions}
\begin{itemize}
\item \textbf{Meta-SRL with Zero-Violation Guarantees:}
\begin{itemize}
\item Designing frameworks that can provide hard safety constraints throughout the learning phase, possibly by integrating pessimistic or certified approaches.
\end{itemize}
\bigskip
\item \textbf{Extension to More Complex Scenarios:}
\begin{itemize}
\item \alert{Non-stationary environments} where the task distribution itself may shift over time.
\item \alert{Multi-agent settings}, where agents must learn to coordinate safely and adapt to each other's policies.
\end{itemize}
\bigskip
\item \textbf{Fairness and Socially Responsible AI:}
\begin{itemize}
\item Adapting the framework to handle \alert{fairness constraints}, ensuring that RL agents do not produce biased or discriminatory outcomes in non-stationary environments.
\end{itemize}
\end{itemize}
\end{frame}
\section{End}
\begin{frame}
\centering
\Huge
Thank You!
\vfill
\Large
Questions?
\end{frame}
\end{document}