\documentclass{beamer}
% Theme and Color
\usetheme{Madrid}
\usecolortheme{default}
% Packages
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{amsmath, amssymb, amsfonts}
\usepackage{booktabs}
\usepackage{graphicx}
\usepackage{hyperref}
\usepackage{bm} % For bold math symbols
% Custom commands from the source text for consistency
\newcommand{\KL}{D_{\mathrm{KL}}}
\def\figref#1{Figure~\ref{#1}}
\title[Meta-Safe RL]{A CMDP-within-online framework for Meta-Safe Reinforcement Learning}
\author{Vanshaj Khattar\inst{1} \and Yuhao Ding\inst{2} \and Bilgehan Sel\inst{1} \and Javad Lavaei\inst{2} \and Ming Jin\inst{1}}
\institute[VT \& UCB]{
\inst{1} Virginia Tech \\
\inst{2} UC Berkeley
}
\date{\today}
\setbeamerfont{caption}{size=\scriptsize}
\begin{document}
\begin{frame}
\titlepage
\end{frame}
\begin{frame}{Outline}
\tableofcontents
\end{frame}
\section{Motivation}
\begin{frame}{Motivation: Why Meta-Safe RL?}
\begin{block}{Background: Meta-Reinforcement Learning (Meta-RL)}
\footnotesize
\begin{itemize}
\item Meta-RL enables agents to learn new tasks quickly with limited experience.
\item It is a ``learning-to-learn'' framework that has proven successful in robotics, federated learning, and other domains.
\end{itemize}
\end{block}
\begin{block}{The Problem: Safety is Critical}
\footnotesize
\begin{itemize}
\item Many real-world applications have \alert{safety constraints} that must not be violated (e.g., robotics, autonomous driving).
\item Existing Meta-RL methods do not adequately address these constraints.
\item Safe RL problems are often modeled as \alert{Constrained Markov Decision Processes (CMDPs)}, but standard CMDP algorithms don't generalize efficiently to new tasks.
\end{itemize}
\end{block}
\begin{block}{Our Goal}
\footnotesize
\begin{itemize}
\item Develop a principled framework, \alert{Meta-Safe RL (Meta-SRL)}, that combines the fast adaptation of meta-learning with the safety guarantees of Safe RL.
\item Provide the \alert{first provable guarantees} for learning across multiple safe RL tasks.
\end{itemize}
\end{block}
\end{frame}
\section{Related Work}
\begin{frame}{Related Work}
\begin{itemize}
\item \textbf{Meta-Reinforcement Learning:}
\begin{itemize}
\item Focuses on learning initial conditions, hyperparameters, etc., for fast adaptation.
\item Most work is for \alert{unconstrained} environments.
\end{itemize}
\item \textbf{Online Meta-Learning:}
\begin{itemize}
\item Provides theoretical frameworks, often for convex and decomposable loss functions.
\item Our work extends this to the \alert{nonconvex and complex} setting of CMDPs.
\end{itemize}
\item \textbf{Safe RL and CMDPs:}
\begin{itemize}
\item A rich field with many algorithms (e.g., primal-dual, policy-based like \alert{CRPO}).
\item However, these are designed for a \alert{single task} and are not built to generalize or adapt quickly to unseen tasks.
\end{itemize}
\end{itemize}
\end{frame}
\section{Method}
\begin{frame}{Method: CMDP-within-Online Framework}
\begin{block}{Core Idea}
\footnotesize
\begin{itemize}
\item A \alert{meta-learner} (online algorithm) operates over a sequence of CMDP tasks.
\item For each task $t$, the meta-learner provides an initial policy $\alert{\pi_{t,0}}$ and a learning rate $\alert{\alpha_t}$ to a \alert{within-task} Safe RL algorithm (e.g., CRPO).
\item The goal is to minimize the \textbf{Task-Averaged Optimality Gap (TAOG)} and \textbf{Task-Averaged Constraint Violation (TACV)} (sketched on the next slide).
\end{itemize}
\end{block}
\begin{figure}
\centering
\includegraphics[width=0.3\textwidth]{illustrate.pdf}
\caption{Conceptual illustration of the meta-learning process.}
\label{fig:method_concept}
\end{figure}
\end{frame}
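% Added illustrative slide: a sketch of the task-averaged metrics named on the previous slide.
% The per-task quantities R_{t,0}, R_{t,i} and their exact normalization are stated schematically
% (an exposition-level assumption), not copied verbatim from the paper.
\begin{frame}{Sketch: Task-Averaged Metrics}
\begin{block}{TAOG and TACV as task averages (schematic)}
\footnotesize
With $R_{t,0}$ the optimality gap and $R_{t,i}$ the violation of constraint $i$ incurred by the within-task algorithm on task $t$, the meta-learner targets the task averages
\begin{equation*}
\overline{R}_0 = \frac{1}{T}\sum_{t=1}^{T} R_{t,0},
\qquad
\overline{R}_i = \frac{1}{T}\sum_{t=1}^{T} R_{t,i}, \quad i = 1,\dots,p.
\end{equation*}
Intuitively, a good meta-initialization $\pi_{t,0}$ and learning rate $\alpha_t$ shrink every per-task term, so both averages improve as the tasks become more similar.
\end{block}
\end{frame}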
\begin{frame}{Method: The Within-Task Algorithm (CRPO)}
% Corresponds to the source text Section 2.1
\begin{block}{Constrained Markov Decision Process (CMDP)}
\footnotesize
For each task $t$, the agent aims to solve:
\begin{equation*}
\underset{\pi}{\max} \hspace{0.1cm} J_{t,0}(\pi) \hspace{0.3cm} \text{s.t.} \hspace{0.2cm} \alert{J_{t,i}(\pi) \leq d_{t,i}}, \hspace{0.3cm} \forall i = 1,...,p
\end{equation*}
where $J_{t,0}$ is the expected reward and $J_{t,i}$ are expected costs.
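% Added illustrative sentence (assumption): an explicit discounted form of the objectives, for concreteness.
For instance, in the discounted setting one may take $J_{t,i}(\pi) = \mathbb{E}_{\pi}\big[\sum_{k \geq 0}\gamma^{k} c_{t,i}(s_k,a_k)\big]$, with $c_{t,0}$ the reward signal and $c_{t,i}$, $i \geq 1$, the per-step costs.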
\end{block}
\begin{block}{CRPO Algorithm \& Regret}
\footnotesize
\begin{itemize}
\item We use the Constraint-Rectified Policy Optimization (\alert{CRPO}) algorithm.
\item The single-task optimality gap ($R_0$) and constraint violation ($R_i$) are bounded by:
\begin{equation*}
R_0, R_i \leq \mathcal{O}\left( \frac{\mathbb{E}_{s \sim \nu_t^*}[\alert{\KL(\pi_t^*|\pi_{t,0})}]}{\alpha_t M} + \alpha_t \right)
\end{equation*}
\item \textbf{Key Insight:} The performance depends heavily on the KL-divergence between the optimal policy $\pi_t^*$ and the initial policy $\pi_{t,0}$.
\item Our meta-learner optimizes this upper bound by choosing a good $\alert{\pi_{t,0}}$ and $\alert{\alpha_t}$ (a back-of-the-envelope calculation follows on the next slide).
\end{itemize}
\end{block}
\end{frame}
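% Added illustrative slide: a back-of-the-envelope minimization of the CRPO bound over the
% learning rate, with constants suppressed. This is an exposition-level calculation, not a
% result quoted from the paper.
\begin{frame}{Sketch: Why Tune $\pi_{t,0}$ and $\alpha_t$?}
\begin{block}{Minimizing the upper bound (constants suppressed)}
\footnotesize
Write $D_t := \mathbb{E}_{s \sim \nu_t^*}[\KL(\pi_t^*|\pi_{t,0})]$, so the single-task bound has the form
\begin{equation*}
R_0, R_i \;\lesssim\; \frac{D_t}{\alpha_t M} + \alpha_t .
\end{equation*}
\begin{itemize}
\item Optimizing over $\alpha_t$ gives $\alpha_t^* = \sqrt{D_t/M}$ and the value $2\sqrt{D_t/M}$.
\item Hence a good initialization (small $D_t$) directly tightens the bound, while the best $\alpha_t$ depends on $D_t$, which is unknown a priori; this motivates learning both quantities online.
\end{itemize}
\end{block}
\end{frame}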
\begin{frame}{Method: The Inexact Framework}
% Corresponds to the source text Section 3.1
\begin{block}{Challenge: Unknown Optimal Policies}
\footnotesize
\begin{itemize}
\item In practice, the optimal policy $\alert{\pi_t^*}$ and its state distribution $\alert{\nu_t^*}$ are unknown.
\item We only have access to a suboptimal policy $\alert{\hat{\pi}_t}$ and collected trajectory data $\alert{\mathcal{D}_t}$.
\end{itemize}
\end{block}
\begin{block}{Solution: Estimate and Bound the Error}
\footnotesize
\begin{itemize}
\item \textbf{Estimate:} Use the suboptimal policy $\hat{\pi}_t$ and estimate its state distribution $\hat{\nu}_t$ from data $\mathcal{D}_t$ using \alert{DualDICE}.
\item \textbf{Inexact Loss:} The meta-learner optimizes an inexact loss function (an empirical form is sketched on the next slide):
$$ \hat{f}_{t}(\phi) = \mathbb{E}_{\hat{\nu}_t}[\KL(\hat{\pi}_t|\phi)] $$
\item \textbf{Bound the Error:} We prove a bound on the estimation error:
$$ |\mathbb{E}_{\nu_t^*}[\KL(\pi_t^*|\phi)] - \mathbb{E}_{\hat{\nu}_t}[\KL(\hat{\pi}_t|\phi)]| \leq \alert{\epsilon_t} $$
This bound (Thm. 3.1) is derived using novel techniques from \alert{tame geometry}.
\end{itemize}
\end{block}
\end{frame}
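% Added illustrative slide (assumption): one plausible empirical form of the inexact loss using
% DualDICE-style distribution-correction ratios. The weights \hat{w}_t, the data distribution
% \mu_t, and the estimator below are illustrative placeholders, not the paper's exact estimator.
\begin{frame}{Sketch: Computing the Inexact Loss from Data}
\begin{block}{Empirical surrogate with distribution correction (schematic)}
\footnotesize
\begin{itemize}
\item DualDICE-style estimators return correction ratios $\hat{w}_t(s) \approx \hat{\nu}_t(s)/\mu_t(s)$, where $\mu_t$ is the distribution that generated the off-policy data $\mathcal{D}_t$.
\item A plausible empirical surrogate of the meta-loss is then
\begin{equation*}
\hat{f}_t(\phi) \;\approx\; \frac{1}{|\mathcal{D}_t|}\sum_{s \in \mathcal{D}_t} \hat{w}_t(s)\, \KL\big(\hat{\pi}_t(\cdot|s)\,\big|\,\phi(\cdot|s)\big).
\end{equation*}
\item The error $\epsilon_t$ then aggregates the suboptimality of $\hat{\pi}_t$, the distribution-estimation error, and finite-sample effects.
\end{itemize}
\end{block}
\end{frame}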
\begin{frame}{Method: Dynamic Regret \& Adaptive Learning Rates}
% Corresponds to the source text Section 3.3
\begin{block}{Challenge: Adapting to Dynamic Environments}
\footnotesize
\begin{itemize}
\item A fixed meta-initialization may not be optimal if the environment changes over time.
\item Setting the learning rate $\alpha_t$ optimally requires knowledge of future tasks.
\end{itemize}
\end{block}
\begin{block}{Solution: Separate Online Learners}
\footnotesize
\begin{itemize}
\item We decompose the regret upper bound into two components.
\item We use two parallel Online Gradient Descent (OGD) algorithms:
\begin{enumerate}
\item \textbf{INIT}: Learns the policy initialization $\alert{\pi_{t,0}}$ by minimizing $\hat{f}_{t}^{init}(\phi) = \mathbb{E}_{\hat{\nu}_t}[\KL(\hat{\pi}_t|\phi)]$.
\item \textbf{SIM}: Learns the learning rate $\alert{\alpha_t}$ by minimizing its own loss term $\hat{f}_t^{sim}(\kappa)$.
\end{enumerate}
\item This allows the framework to adapt both the policy initialization and the learning rate online, without knowing task properties in advance (schematic updates on the next slide).
\end{itemize}
\end{block}
\end{frame}
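% Added illustrative slide (assumption): generic projected online gradient descent updates for
% the two meta-learners. The step sizes \eta^{init}, \eta^{sim} and the projection sets \Phi, K
% are illustrative placeholders.
\begin{frame}{Sketch: The Two Online Learners}
\begin{block}{Projected OGD updates (schematic)}
\footnotesize
After task $t$, each learner takes a projected gradient step on its own (inexact) loss:
\begin{align*}
\phi_{t+1} &= \Pi_{\Phi}\!\left(\phi_t - \eta^{init}\,\nabla \hat{f}_t^{init}(\phi_t)\right)
&& \text{(INIT: policy initialization)}\\
\kappa_{t+1} &= \Pi_{K}\!\left(\kappa_t - \eta^{sim}\,\nabla \hat{f}_t^{sim}(\kappa_t)\right)
&& \text{(SIM: learning-rate parameter)}
\end{align*}
Because the gradients are computed from $\hat{\pi}_t$ and $\hat{\nu}_t$, they are biased; the dynamic-regret analysis accounts for this inexactness.
\end{block}
\end{frame}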
\section{Innovation}
\begin{frame}{Our Innovations}
\begin{block}{Novel Framework and Guarantees}
\footnotesize
\begin{itemize}
\item The \alert{first provable guarantees} for Meta-Safe RL, establishing bounds on task-averaged optimality gap (TAOG) and constraint violation (TACV).
\item The regret bounds explicitly improve with \alert{task-similarity} ($\hat{D}^*$) or \alert{task-relatedness} ($\hat{V}_\psi$) (one formalization is sketched on the next slide).
\end{itemize}
\end{block}
\begin{block}{Practical and Adaptive Algorithm}
\footnotesize
\begin{itemize}
\item \textbf{Inexact framework}: Works with suboptimal policies and estimates distributions using \alert{DualDICE}, making it practical.
\item \textbf{Adaptive learning}: The meta-learner adapts both policy initialization and learning rates for each task, handling dynamic environments.
\end{itemize}
\end{block}
\begin{block}{Technical Contributions}
\footnotesize
\begin{itemize}
\item New analysis of the \alert{optimization landscape of CMDPs} using tame geometry to bound the distance between optimal and suboptimal policies.
\item Extended analysis for \alert{inexact online gradient descent} to handle dynamic regret with biased gradient estimates.
\end{itemize}
\end{block}
\end{frame}
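% Added illustrative slide (assumption): one common way to formalize task similarity in online
% meta-learning, stated only for intuition; the paper's exact definitions of \hat{D}^* and
% \hat{V}_\psi may differ in constants and normalization.
\begin{frame}{Sketch: What Does Task-Similarity Mean?}
\begin{block}{An intuition-level formalization (assumed for exposition)}
\footnotesize
\begin{itemize}
\item A natural static notion: the smallest average meta-loss achievable by a single fixed initialization,
\begin{equation*}
(\hat{D}^*)^2 \;\sim\; \min_{\phi}\, \frac{1}{T}\sum_{t=1}^{T} \mathbb{E}_{\hat{\nu}_t}\!\left[\KL(\hat{\pi}_t|\phi)\right].
\end{equation*}
\item If all tasks admit nearby (near-)optimal policies, this quantity is small and the TAOG/TACV bounds tighten; the dynamic analogue ($\hat{V}_\psi$) compares against a slowly varying sequence of initializations instead of a single fixed one.
\end{itemize}
\end{block}
\end{frame}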
\begin{frame}
\centering
\Huge
Experimental Evaluation
\end{frame}
\section{Experimental Method}
\begin{frame}{Experimental Method}
\begin{block}{Objective}
\footnotesize
\begin{itemize}
\item To empirically validate the effectiveness of our \alert{Meta-SRL} framework against standard meta-learning baselines.
\end{itemize}
\end{block}
\begin{block}{Baselines for Comparison}
\footnotesize
\begin{itemize}
\item \alert{Random Initialization}: Standard CRPO with a new random policy for each task.
\item \alert{Pre-trained}: Initialize with the final policy from the previous task.
\item \alert{Simple Averaging}: Offline average of all previously learned policies.
\item \alert{Follow the Average Leader (FAL)}: Online average of all previously learned policies.
\end{itemize}
\end{block}
\begin{block}{Task Generation}
\footnotesize
\begin{itemize}
\item We generate a sequence of related CMDP tasks by sampling from a distribution over environment parameters (e.g., transition dynamics, reward functions).
\item We test under two conditions: \alert{high task-similarity} and \alert{low task-similarity}.
\end{itemize}
\end{block}
\end{frame}
\section{Experimental Setting}
\begin{frame}{Experimental Setting}
\begin{block}{Environments}
\footnotesize
We use a range of classic control environments with added safety constraints:
\begin{itemize}
\item \textbf{OpenAI Gym:}
\begin{itemize}
\item \alert{FrozenLake}: Discrete state space, $T=10$ tasks.
\item \alert{Acrobot}: Continuous state space, $T=50$ tasks.
\end{itemize}
\item \textbf{MuJoCo:}
\begin{itemize}
\item \alert{Half-Cheetah}: High-dimensional continuous control, $T=100$ tasks. Constraint on head height.
\item \alert{Humanoid}: Very high-dimensional, $T=250$ tasks. Constraint on joint angles for smooth motion.
\end{itemize}
\end{itemize}
\end{block}
\end{frame}
\section{Experimental Results}
\begin{frame}{Experimental Results: Low Task-Similarity}
\begin{columns}[T]
\begin{column}{0.5\textwidth}
\centering
\textbf{FrozenLake}
\includegraphics[width=1\textwidth]{FrozenLake/FrozenLakeLowSimilarity.pdf}
\end{column}
\begin{column}{0.5\textwidth}
\centering
\textbf{Acrobot}
\includegraphics[width=1\textwidth]{Acrobot/Acrobot_low_similarity2.pdf}
\end{column}
\end{columns}
\begin{block}{Observations}
\footnotesize
\begin{itemize}
\item In settings with low task similarity, \alert{Meta-SRL} (our method) consistently learns faster and more safely.
\item It achieves higher rewards while rapidly satisfying the safety constraints (driving constraint violation to zero).
\item Simpler baselines like \alert{FAL} and \alert{Pre-trained} struggle to satisfy constraints or learn good policies.
\end{itemize}
\end{block}
\end{frame}
\begin{frame}{Experimental Results: MuJoCo Environments}
\centering
\textbf{Half-Cheetah (Low Task-Similarity)}
\begin{figure}
\includegraphics[width=0.6\textwidth]{HalfCheetah/HalfCheetahReward_low_task_similarity_broken_axis.pdf}
\includegraphics[width=0.6\textwidth]{HalfCheetah/HalfCheetahCost_low_task_similarity.pdf}
\caption{Reward (top) and constraint violation (bottom) for Half-Cheetah. Our method (Meta-SRL) learns a high-reward policy while keeping the constraint violation below the threshold (blue line).}
\label{fig:halfcheetah}
\end{figure}
\end{frame}
\section{Ablation Experiment}
\begin{frame}{Ablation Analysis}
Although no dedicated ablation study was conducted, comparing Meta-SRL against the baselines isolates the contribution of its key components.
\begin{block}{Meta-SRL vs. FAL / Simple Averaging}
\footnotesize
\begin{itemize}
\item \textbf{Ablated Component:} The intelligent meta-update (using \alert{DualDICE} estimates and \alert{OGD} on the regret bound).
\item \textbf{Result:} Meta-SRL significantly outperforms simple averaging, showing that a weighted, adaptive update is crucial and superior to naive averaging.
\end{itemize}
\end{block}
\begin{block}{Meta-SRL vs. Pre-trained}
\footnotesize
\begin{itemize}
\item \textbf{Ablated Component:} Learning from a history of multiple tasks. The pre-trained baseline only uses the most recent task.
\item \textbf{Result:} Meta-SRL is more robust, especially in low-similarity settings, demonstrating the benefit of aggregating knowledge from diverse past experiences.
\end{itemize}
\end{block}
\begin{block}{Conclusion}
\footnotesize
The full \alert{Meta-SRL} model, with its inexact estimation and adaptive learning, is critical for achieving strong performance and safety.
\end{block}
\end{frame}
\section{Deficiencies}
\begin{frame}{Limitations of the Current Method}
\begin{itemize}
\item \textbf{Algorithm-Specific Guarantees:}
\begin{itemize}
\item Our theoretical framework is built upon the \alert{CRPO} algorithm.
\item Extending it to other within-task Safe RL algorithms (e.g., primal-dual methods) would require a new analysis of their specific regret bounds.
\end{itemize}
\bigskip
\item \textbf{No Hard Safety Guarantees During Learning:}
\begin{itemize}
\item The framework minimizes task-averaged constraint violation, achieving safety \textit{on average} and \textit{asymptotically}.
\item It does not guarantee \alert{zero constraint violation} at every step during the learning process, which may be a requirement for highly critical systems.
\end{itemize}
\end{itemize}
\end{frame}
\section{Future Research}
\begin{frame}{Future Research Directions}
\begin{itemize}
\item \textbf{Meta-SRL with Zero-Violation Guarantees:}
\begin{itemize}
\item Designing frameworks that can provide hard safety constraints throughout the learning phase, possibly by integrating pessimistic or certified approaches.
\end{itemize}
\bigskip
\item \textbf{Extension to More Complex Scenarios:}
\begin{itemize}
\item \alert{Non-stationary environments} where the task distribution itself may shift over time.
\item \alert{Multi-agent settings}, where agents must learn to coordinate safely and adapt to each other's policies.
\end{itemize}
\bigskip
\item \textbf{Fairness and Socially Responsible AI:}
\begin{itemize}
\item Adapting the framework to handle \alert{fairness constraints}, ensuring that RL agents do not produce biased or discriminatory outcomes in non-stationary environments.
\end{itemize}
\end{itemize}
\end{frame}
\section{End}
\begin{frame}
\centering
\Huge
Thank You!
\vfill
\Large
Questions?
\end{frame}
\end{document}