diff --git a/app/scripts/latex-to-markdown/mdx-converter.mjs b/app/scripts/latex-to-markdown/mdx-converter.mjs index d52532920cc29f943053167be3481c7691c50589..4868c0915f58c1ef77db7463df6651283444dc9b 100644 --- a/app/scripts/latex-to-markdown/mdx-converter.mjs +++ b/app/scripts/latex-to-markdown/mdx-converter.mjs @@ -62,33 +62,62 @@ Examples: */ const usedComponents = new Set(); +/** + * Track individual image imports needed + */ +const imageImports = new Map(); // src -> varName + /** * Add required component imports to the frontmatter * @param {string} content - MDX content * @returns {string} - Content with component imports */ +/** + * Generate a variable name from image path + * @param {string} src - Image source path + * @returns {string} - Valid variable name + */ +function generateImageVarName(src) { + // Extract filename without extension and make it a valid JS variable + const filename = src.split('/').pop().replace(/\.[^.]+$/, ''); + return filename.replace(/[^a-zA-Z0-9]/g, '_').replace(/^[0-9]/, 'img_$&'); +} + function addComponentImports(content) { - console.log(' 📦 Adding component imports...'); + console.log(' 📦 Adding component and image imports...'); - if (usedComponents.size === 0) { - console.log(' ℹ️ No components to import'); - return content; + let imports = []; + + // Add component imports + if (usedComponents.size > 0) { + const componentImports = Array.from(usedComponents) + .map(component => `import ${component} from '../components/${component}.astro';`); + imports.push(...componentImports); + console.log(` ✅ Importing components: ${Array.from(usedComponents).join(', ')}`); + } + + // Add image imports + if (imageImports.size > 0) { + const imageImportStatements = Array.from(imageImports.entries()) + .map(([src, varName]) => `import ${varName} from '${src}';`); + imports.push(...imageImportStatements); + console.log(` ✅ Importing ${imageImports.size} image(s)`); } - // Create import statements - const imports = Array.from(usedComponents) - .map(component => `import ${component} from '../components/${component}.astro';`) - .join('\n'); + if (imports.length === 0) { + console.log(' ℹ️ No imports needed'); + return content; + } - console.log(` ✅ Importing: ${Array.from(usedComponents).join(', ')}`); + const importBlock = imports.join('\n'); // Insert imports after frontmatter const frontmatterEnd = content.indexOf('---', 3) + 3; if (frontmatterEnd > 2) { - return content.slice(0, frontmatterEnd) + '\n\n' + imports + '\n' + content.slice(frontmatterEnd); + return content.slice(0, frontmatterEnd) + '\n\n' + importBlock + '\n' + content.slice(frontmatterEnd); } else { // No frontmatter, add at beginning - return imports + '\n\n' + content; + return importBlock + '\n\n' + content; } } @@ -98,54 +127,150 @@ function addComponentImports(content) { * @param {string} content - MDX content * @returns {string} - Content with ResponsiveImage components */ +/** + * Create ResponsiveImage component with import + * @param {string} src - Clean image source + * @param {string} alt - Alt text + * @param {string} id - Element ID + * @param {string} caption - Figure caption + * @param {string} width - Optional width + * @returns {string} - ResponsiveImage component markup + */ +function createResponsiveImageComponent(src, alt = '', id = '', caption = '', width = '') { + const varName = generateImageVarName(src); + imageImports.set(src, varName); + usedComponents.add('ResponsiveImage'); + + const props = []; + props.push(`src={${varName}}`); + props.push('zoomable'); + 
props.push('downloadable'); + if (id) props.push(`id="${id}"`); + props.push('layout="fixed"'); + if (alt) props.push(`alt="${alt}"`); + if (caption) props.push(`caption={'${caption}'}`); + + return ``; +} + function transformImages(content) { - console.log(' 🖼️ Transforming images to ResponsiveImage components...'); + console.log(' 🖼️ Transforming images to ResponsiveImage components with imports...'); let hasImages = false; - // Transform HTML figure/img to ResponsiveImage + // Helper function to clean source paths + const cleanSrcPath = (src) => { + return src.replace(/.*\/output\/assets\//, '../assets/') + .replace(/\/Users\/[^\/]+\/[^\/]+\/[^\/]+\/[^\/]+\/[^\/]+\/app\/scripts\/latex-to-markdown\/output\/assets\//, '../assets/'); + }; + + // Helper to clean caption text + const cleanCaption = (caption) => { + return caption + .replace(/<[^>]*>/g, '') // Remove HTML tags + .replace(/\n/g, ' ') // Replace newlines with spaces + .replace(/\r/g, ' ') // Replace carriage returns with spaces + .replace(/\s+/g, ' ') // Replace multiple spaces with single space + .replace(/'/g, "\\'") // Escape quotes + .trim(); // Trim whitespace + }; + + // Helper to clean alt text + const cleanAltText = (alt, maxLength = 100) => { + const cleaned = alt + .replace(/<[^>]*>/g, '') // Remove HTML tags + .replace(/\n/g, ' ') // Replace newlines with spaces + .replace(/\r/g, ' ') // Replace carriage returns with spaces + .replace(/\s+/g, ' ') // Replace multiple spaces with single space + .trim(); // Trim whitespace + + return cleaned.length > maxLength + ? cleaned.substring(0, maxLength) + '...' + : cleaned; + }; + + // 1. Transform complex HTML figures with style attributes + content = content.replace( + /
\s*\s*
\s*(.*?)\s*<\/figcaption>\s*<\/figure>/gs, + (match, id, src, style, caption) => { + const cleanSrc = cleanSrcPath(src); + const cleanCap = cleanCaption(caption); + const altText = cleanAltText(cleanCap); + hasImages = true; + + return createResponsiveImageComponent(cleanSrc, altText, id, cleanCap); + } + ); + + // 2. Transform standalone img tags with style + content = content.replace( + //g, + (match, src, style, alt) => { + const cleanSrc = cleanSrcPath(src); + const cleanAlt = cleanAltText(alt || 'Figure'); + hasImages = true; + + return createResponsiveImageComponent(cleanSrc, cleanAlt); + } + ); + + // 3. Transform images within wrapfigure divs + content = content.replace( + /
\s*r[\d.]+\s*]*\/>\s*<\/div>/gs, + (match, src) => { + const cleanSrc = cleanSrcPath(src); + hasImages = true; + + return createResponsiveImageComponent(cleanSrc, 'Figure'); + } + ); + + // 4. Transform simple HTML figure/img without style content = content.replace( /
\s*\s*
\s*(.*?)\s*<\/figcaption>\s*<\/figure>/gs, (match, id, src, caption) => { - // Clean up the source path for web - const cleanSrc = src.replace(/.*\/output\/assets\//, '/assets/'); + const cleanSrc = cleanSrcPath(src); + const cleanCap = cleanCaption(caption); + const altText = cleanAltText(cleanCap); + hasImages = true; + + return createResponsiveImageComponent(cleanSrc, altText, id, cleanCap); + } + ); + + // 5. Clean up figures with minipage divs + content = content.replace( + /
\s*
\s*]*\/>\s*<\/div>\s*]*>(.*?)<\/figcaption>\s*<\/figure>/gs, + (match, id, src, caption) => { + const cleanSrc = cleanSrcPath(src); + const cleanCap = cleanCaption(caption); + const altText = cleanAltText(cleanCap); hasImages = true; - usedComponents.add('ResponsiveImage'); - return ``; + return createResponsiveImageComponent(cleanSrc, altText, id, cleanCap); } ); - // Transform Pandoc-style images: ![alt](src){#id attr="value"} + // 6. Transform Pandoc-style images: ![alt](src){#id attr="value"} content = content.replace( /!\[([^\]]*)\]\(([^)]+)\)(?:\{([^}]+)\})?/g, (match, alt, src, attributes) => { - // Clean up the source path for web - const cleanSrc = src.replace(/.*\/output\/assets\//, '/assets/'); + const cleanSrc = cleanSrcPath(src); + const cleanAlt = cleanAltText(alt || 'Figure'); hasImages = true; - usedComponents.add('ResponsiveImage'); - - let props = []; - props.push(`src="${cleanSrc}"`); - if (alt) props.push(`alt="${alt}"`); - // Parse attributes if present + let id = ''; if (attributes) { const idMatch = attributes.match(/#([\w-]+)/); - if (idMatch) props.push(`id="${idMatch[1]}"`); - - const widthMatch = attributes.match(/width="([^"]+)"/); - if (widthMatch && widthMatch[1] !== '\\linewidth') { - props.push(`width="${widthMatch[1]}"`); - } + if (idMatch) id = idMatch[1]; } - return ``; + return createResponsiveImageComponent(cleanSrc, cleanAlt, id); } ); if (hasImages) { - console.log(' âś… ResponsiveImage component will be imported'); + console.log(' âś… ResponsiveImage components with imports will be created'); } return content; @@ -258,8 +383,9 @@ function cleanMdxSyntax(content) { function processMdxContent(content) { console.log('đź”§ Processing for Astro MDX compatibility...'); - // Clear previous component tracking + // Clear previous tracking usedComponents.clear(); + imageImports.clear(); let processedContent = content; diff --git a/app/scripts/latex-to-markdown/output/main.mdx b/app/scripts/latex-to-markdown/output/main.mdx index 2db77979a81e83c45bf249d78d665bc7b293ea0c..72aeaf2efec2a3eb357c4c7f2dee43f4ed58241f 100644 --- a/app/scripts/latex-to-markdown/output/main.mdx +++ b/app/scripts/latex-to-markdown/output/main.mdx @@ -5,6 +5,46 @@ date: "2025-09-18" --- import ResponsiveImage from '../components/ResponsiveImage.astro'; +import ch1_lerobot_figure1 from '../assets/image/figures/ch1/ch1-lerobot-figure1.png'; +import ch2_approaches from '../assets/image/figures/ch2/ch2-approaches.png'; +import ch2_platforms from '../assets/image/figures/ch2/ch2-platforms.png'; +import ch2_cost_accessibility from '../assets/image/figures/ch2/ch2-cost-accessibility.png'; +import ch2_so100_to_planar_manipulator from '../assets/image/figures/ch2/ch2-so100-to-planar-manipulator.png'; +import ch2_planar_manipulator_free from '../assets/image/figures/ch2/ch2-planar-manipulator-free.png'; +import ch2_planar_manipulator_floor from '../assets/image/figures/ch2/ch2-planar-manipulator-floor.png'; +import ch2_planar_manipulator_floor_shelf from '../assets/image/figures/ch2/ch2-planar-manipulator-floor-shelf.png'; +import ch2_classical_limitations from '../assets/image/figures/ch2/ch2-classical-limitations.png'; +import ch3_learning_benefits from '../assets/image/figures/ch3/ch3-learning-benefits.png'; +import ch3_learning_atlas from '../assets/image/figures/ch3/ch3-learning-atlas.png'; +import ch3_rl_examples from '../assets/image/figures/ch3/ch3-rl-examples.png'; +import ch3_agent_env from '../assets/image/figures/ch3/ch3-agent-env.png'; +import ch3_rl_algorithms_atlas from 
'../assets/image/figures/ch3/ch3-rl-algorithms-atlas.png'; +import ch3_duck_sim_vs_real from '../assets/image/figures/ch3/ch3-duck-sim-vs-real.png'; +import ch3_many_ducks from '../assets/image/figures/ch3/ch3-many-ducks.png'; +import ch3_hil_serl_examples from '../assets/image/figures/ch3/ch3-hil-serl-examples.png'; +import ch4_bc_trajectories from '../assets/image/figures/ch4/ch4-bc-trajectories.png'; +import ch4_observation_action_mapping from '../assets/image/figures/ch4/ch4-observation-action-mapping.png'; +import ch4_issues_with_bc from '../assets/image/figures/ch4/ch4-issues-with-bc.png'; +import ch4_task_effect_on_pairs from '../assets/image/figures/ch4/ch4-task-effect-on-pairs.png'; +import ch4_latent_variable_model from '../assets/image/figures/ch4/ch4-latent-variable-model.png'; +import ch4_many_latents from '../assets/image/figures/ch4/ch4-many-latents.png'; +import ch4_diffusion_robot_actions from '../assets/image/figures/ch4/ch4-diffusion-robot-actions.png'; +import ch4_action_vs_observation_distribution from '../assets/image/figures/ch4/ch4-action-vs-observation-distribution.png'; +import ch4_normalizing_flows from '../assets/image/figures/ch4/ch4-normalizing-flows.png'; +import ch4_diffusion_vs_flowmatching from '../assets/image/figures/ch4/ch4-diffusion-vs-flowmatching.png'; +import ch4_act from '../assets/image/figures/ch4/ch4-act.png'; +import ch4_act_encoder from '../assets/image/figures/ch4/ch4-act-encoder.png'; +import ch4_act_decoder from '../assets/image/figures/ch4/ch4-act-decoder.png'; +import ch4_diffusion_policy from '../assets/image/figures/ch4/ch4-diffusion-policy.png'; +import ch5_ml_vs_robotics_foundation from '../assets/image/figures/ch5/ch5-ml-vs-robotics-foundation.png'; +import ch5_generalist_policies_timeline from '../assets/image/figures/ch5/ch5-generalist-policies-timeline.png'; +import ch5_trends from '../assets/image/figures/ch5/ch5-trends.png'; +import ch5_pi0 from '../assets/image/figures/ch5/ch5-pi0.png'; +import ch5_smolvla from '../assets/image/figures/ch5/ch5-smolvla.png'; +import ch2_planar_manipulator_floor_box from '../assets/image/figures/ch2/ch2-planar-manipulator-floor-box.png'; +import ch4_async_inference from '../assets/image/figures/ch4/ch4-async-inference.png'; +import ch4_queues from '../assets/image/figures/ch4/ch4-queues.png'; +import ch5_pi0_sampling_timesteps from '../assets/image/figures/ch5/ch5-pi0-sampling-timesteps.png'; # Foreword @@ -27,7 +67,15 @@ We sincerely hope this tutorial serves as a valuable starting point for your jou # Introduction - + Autonomous robotics holds the premise of relieving humans from repetitive, tiring or dangerous manual tasks. Consequently, the field of robotics has been widely studied since its first inception in the 1950s. Lately, advancements in Machine Learning (ML) have sparked the development of a relatively new class of methods used to tackle robotics problems, leveraging large amounts of data and computation rather than human expertise and modeling skills to develop autonomous systems. @@ -160,10 +208,15 @@ TL;DR Learning-based approaches to robotics are motivated by the need to (1) gen ## Explicit and Implicit Models -
Figure: Overview of methods to generate motion (clearly non-exhaustive, see @bekrisStateRobotMotion2024). The different methods can be grouped based on whether they explicitly (dynamics-based) or implicitly (learning-based) model robot-environment interactions.
Robotics is concerned with producing artificial motion in the physical world in a useful, reliable and safe fashion. Thus, robotics is an inherently multi-disciplinary domain: producing autonomous motion in the physical world requires, at the very least, interfacing different software (motion planners) and hardware (motion executors) components. Knowledge of mechanical, electrical, and software engineering, as well as rigid-body mechanics and control theory, has therefore proven quintessential in robotics since the field first developed in the 1950s. More recently, Machine Learning (ML) has also proved effective in robotics, complementing these more traditional disciplines @connellRobotLearning1993. As a direct consequence of its multi-disciplinary nature, robotics has developed into a rather wide array of methods, all concerned with the main purpose of producing artificial motion in the physical world.

@@ -171,10 +224,15 @@ Methods to produce robotics motion range from traditional *explicit* models--
Figure: Different kinds of motions are achieved with potentially very different robotic platforms. From left to right, top to bottom: ViperX, SO-100, Boston Dynamics’ Spot, Open-Duck, 1X’s NEO, Boston Dynamics’ Atlas. This is an example list of robotic platforms and is (very) far from being exhaustive.
In the vast majority of instances, robotics deals with producing motion via actuating joints connecting nearly entirely rigid links. A key distinction between focus areas in robotics is based on whether the generated motion modifies (1) the absolute state of the environment (via dexterity), (2) the relative state of the robot with respect to its environment (exercising mobility skills), or (3) a combination of the two (Figure 3).

@@ -188,17 +246,27 @@ Robot manipulators typically consist of a series of links and joints, articulate

Recently, the development of low-cost manipulators like the ALOHA @zhaoLearningFineGrainedBimanual2023, ALOHA-2 @aldacoALOHA2Enhanced and SO-100/SO-101 @knightStandardOpenSO100 platforms significantly lowered the barrier to entry to robotics, considering the increased accessibility of these robots compared to more traditional platforms like the Franka Emika Panda arm (Figure 4).
Figure: Cheaper, more accessible robots are starting to rival traditional platforms like the Panda arm in adoption in resource-constrained scenarios. The SO-100, in particular, has a cost in the 100s of Euros, and can be entirely 3D-printed in hours, while the industrially-manufactured Panda arm costs tens of thousands of Euros and is not openly available.
Deriving an intuition as to why learning-based approaches are gaining popularity in the robotics community requires briefly analyzing traditional approaches for manipulation, leveraging tools like forward and inverse kinematics (FK, IK) and control theory. Providing a detailed overview of these methods falls (well) outside the scope of this tutorial, and we refer the reader to works including @sicilianoSpringerHandbookRobotics2016, @lynchModernRoboticsMechanics2017, @tedrakeRoboticManipulationPerception, @tedrakeUnderactuatedRoboticsAlgorithms for a much more comprehensive description of these techniques. Here, we mostly wish to highlight the benefits of ML over these traditional techniques.
Figure: The SO-100 is a 6-dof manipulator arm. Preventing some of its joints (shoulder pan, wrist flex and wrist roll) from actuating, it can be represented as a traditional 2-dof planar manipulator (the gripper joint in the end-effector is not counted towards the degrees of freedom used to produce motion).
Consider the (simple) case where an SO-100 is restrained from actuating (1) the shoulder pan and (2) the wrist flex and roll motors. This effectively reduces the degrees of freedom of the SO-100 from the original 5+1 (5 joints + 1 gripper) to 2+1 (shoulder lift, elbow flex + gripper). As the end-effector does not impact motion in this model, the SO-100 is effectively reduced to the planar manipulator robot presented in Figure 5, where spheres represent actuators, and solid lines indicate length-$l$ links from the base of the SO-100 to the end-effector (*ee*).

@@ -207,18 +275,33 @@ Further, let us make the simplifying assumption that actuators can produce rotat

All these simplifying assumptions leave us with the planar manipulator of Figure 6, free to move its end-effector by controlling the angles $\theta_1$ and $\theta_2$, jointly referred to as the robot’s *configuration*, and indicated with $q = [\theta_1, \theta_2 ] \in [-\pi, +\pi]^2$. The axes attached to the joints indicate the associated reference frames, whereas circular arrows indicate the maximal feasible rotation allowed at each joint. In this tutorial, we do not cover topics related to spatial algebra, and we instead refer the reader to and for excellent explanations of the mechanics and theoretical foundations of producing motion on rigid bodies.
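For concreteness, the forward kinematics of this simplified 2-dof planar model can be written down in a few lines (an illustrative sketch using standard planar-arm geometry; the equal link length $l$ and its numeric value are assumptions, not values from the tutorial):

```python
import numpy as np

def forward_kinematics(q: np.ndarray, l: float = 0.1) -> np.ndarray:
    """End-effector position of a planar 2-dof arm with equal link lengths l.

    q = [theta_1, theta_2] are the joint angles in radians, with theta_2
    measured relative to the first link.
    """
    theta1, theta2 = q
    x = l * np.cos(theta1) + l * np.cos(theta1 + theta2)
    y = l * np.sin(theta1) + l * np.sin(theta1 + theta2)
    return np.array([x, y])

ee = forward_kinematics(np.array([np.pi / 4, -np.pi / 6]))
```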
Figure (three panels): (a) Free to move; (b) Constrained by the surface; (c) Constrained by surface and (fixed) obstacle.
Figure: Planar, 2-dof schematic representation of the SO-100 manipulator under diverse deployment settings. From left to right: completely free to move; constrained by the presence of the surface; constrained by the surface and presence of obstacles. Circular arrows around each joint indicate the maximal rotation feasible at that joint.
@@ -264,7 +347,13 @@ While very effective when a goal trajectory has been well specified, the perform
One such case is presented in Figure [fig:planar-manipulator-box-velocity], where a rigid body other than the manipulator is moving in the environment along the horizontal axis, with velocity $\dot x_B$. Accounting analytically for the presence of this disturbance--for instance, to prevent the midpoint of the link from ever colliding with the object--requires access to $\dot x_B$ at the very least, to derive the equation characterizing the motion of the environment.

@@ -281,10 +370,15 @@ We point the interested reader to , , and  for extended coverage of FK, IK, di

Despite the last 60+ years of robotics research, autonomous robots are still largely incapable of performing tasks in the physical world at human-level performance while generalizing across (1) robot embodiments (different manipulators, different locomotion platforms, etc.) and (2) tasks (tying shoe-laces, manipulating a diverse set of objects). While essential in the early development of robotics, the aforementioned methods require significant human expertise to be used in practice, and are typically specific to a particular application.
Figure: Dynamics-based approaches to robotics suffer from several limitations: (1) orchestrating multiple components poses integration challenges; (2) the need to develop custom processing pipelines for the sensing modalities and tasks considered hinders scalability; (3) simplified analytical models of physical phenomena (here friction at the gripper; credits to @antonovaReinforcementLearningPivoting2017) limit real-world performance. Lastly, (4) dynamics-based methods overlook trends in the availability and growth of robotics data.
Dynamics-based robotics pipelines have historically been developed sequentially, engineering the different blocks now found within most architectures for specific purposes. That is, sensing, state estimation, mapping, planning, (diff-)IK, and low-level control have traditionally been developed as distinct modules with fixed interfaces. Pipelining these specific modules proved error-prone, and brittleness emerges--alongside compounding errors--whenever changes occur (e.g., changes in lighting for sensing, occlusion/failure of sensors, control failures). Adapting such a stack to new tasks or robotic platforms often entails re-specifying objectives, constraints, and heuristics at multiple stages, incurring significant engineering overhead.

@@ -310,28 +404,43 @@ Richard Sutton

TL;DR The need for expensive high-fidelity simulators can be obviated by learning from real-world data, using sample-efficient algorithms that can safely train directly on hardware.
Figure: Learning-based robotics streamlines perception-to-action by learning a (1) unified high-level controller capable of taking (2) high-dimensional, unstructured sensorimotor information. Learning (3) does not require a dynamics model and instead focuses on interaction data, and (4) empirically correlates with the scale of the data used.
Learning-based techniques for robotics naturally address the limitations presented in Section 2 (Figure 11). Learning-based techniques typically rely on perception-to-action mappings (*visuomotor policies*), thereby directly mapping sensorimotor inputs to predicted actions, streamlining control policies by removing the need to interface multiple components. Mapping sensorimotor inputs to actions directly also makes it possible to add diverse input modalities, leveraging the automatic feature extraction characteristic of most modern learning systems. Further, learning-based approaches can in principle entirely bypass modeling efforts and instead rely exclusively on interaction data, proving transformative when dynamics are challenging to model or even entirely unknown. Lastly, learning for robotics (*robot learning*) is naturally well placed to leverage the growing amount of robotics data openly available, just as computer vision first and natural language processing later historically benefited from large-scale corpora of (possibly non-curated) data, in great part overlooked by dynamics-based approaches. Being a field at a relatively nascent stage, no single technique has yet proved distinctly better in robot learning. Still, two major classes of methods gained prominence: reinforcement learning (RL) and Behavioral Cloning (BC) (Figure 12). In this section, we provide a conceptual overview of applications of the former to robotics, as well as introduce practical examples of how to use RL within `lerobot`. We then discuss the major limitations RL suffers from, before introducing BC techniques in the next sections ([sec:learning-bc-single, sec:learning-bc-generalist]).
Figure: Overview of the robot learning methods implemented in lerobot.
In Figure 12 we decided to include generalist robot models @blackp0VisionLanguageActionFlow2024, @shukorSmolVLAVisionLanguageActionModel2025 alongside task-specific BC methods. While significantly different in spirit--*generalist* models are language-conditioned and use instructions to generate motion valid across many tasks, while *task-specific* models are typically not language-conditioned and used to perform a single task--foundation models are largely trained to reproduce trajectories contained in a large training set of input demonstrations. Thus, we argue generalist policies can indeed be grouped alongside task-specific BC methods, as they both leverage similar training data and schemas. Figure 12 illustrates this categorization graphically, explicitly listing all the robot learning policies currently available in `lerobot`: Action Chunking with Transformers (ACT) @zhaoLearningFineGrainedBimanual2023, Diffusion Policy @chiDiffusionPolicyVisuomotor2024, Vector-Quantized Behavior Transformer (VQ-BeT) @leeBehaviorGenerationLatent2024, $\pi_0$ @blackp0VisionLanguageActionFlow2024, SmolVLA @shukorSmolVLAVisionLanguageActionModel2025, Human-in-the-loop Sample-efficient RL (HIL-SERL) @luoPreciseDexterousRobotic2024 and TD-MPC @hansenTemporalDifferenceLearning2022.
Figure: Examples of two different robotics tasks performed using RL. In the manipulation task (A) an agent learns to reach for a yellow plastic block in its environment, and to put it inside a box. In the locomotion task (B) an agent learns to move its center of mass sideways without falling.
Applications of RL to robotics have long been studied, to the point that the relationship between these two disciplines has been compared to that between physics and mathematics @koberReinforcementLearningRobotics. Indeed, due to their interactive and sequential nature, many robotics problems can be directly mapped to RL problems. Figure 13 depicts two such cases. Reaching for an object to move it somewhere else in the scene is indeed a sequential problem, where at each cycle the controller needs to adjust the position of the robotic arm based on its current configuration and the (possibly varying) position of the object. Figure 13 also shows an example of a locomotion problem, where sequentiality is inherent in the problem formulation. While sliding to the side, the controller has to constantly keep adjusting to the robot’s proprioception to avoid failure (falling).

@@ -339,10 +448,15 @@ The RL framework @suttonReinforcementLearningIntroduction2018, which we briefly introduce here, has often been used to model robotics problems @koberReinforcementLearningRobotics. RL is a subfield within ML fundamentally concerned with the development of autonomous systems (*agents*) learning how to *continuously behave* in an evolving environment, developing (ideally, well-performing) control strategies (*policies*). Crucially for robotics, RL agents can improve via trial-and-error only, thus entirely bypassing the need to develop explicit models of the problem dynamics, and rather exploiting interaction data only. In RL, this feedback loop (Figure 14) between actions and outcomes is established through the agent sensing a scalar quantity (*reward*).
Figure: Agent-Environment interaction diagram (image credits to @suttonReinforcementLearningIntroduction2018).
Formally, interactions between an agent and its environment are typically modeled via a Markov Decision Process (MDP) @bellmanMarkovianDecisionProcess1957. Representing robotics problems via MDPs offers several advantages, including (1) incorporating uncertainty through the MDP’s inherently stochastic formulation and (2) providing a theoretically sound framework for learning *without* an explicit dynamics model. While also accommodating a continuous-time formulation, MDPs are typically considered in discrete time in RL, thus assuming interactions take place atomically over discrete *timesteps* $t=0,1,2,3, \dots, T$. MDPs allowing for an unbounded number of interactions ( $T \to + \infty$ ) are typically termed *infinite-horizon*, as opposed to *finite-horizon* MDPs in which $T$ cannot grow unbounded. Unless otherwise specified, we will only be referring to discrete-time finite-horizon (*episodic*) MDPs here.

@@ -405,10 +519,15 @@ $$
$$

Inducing an ordering over states and state-action pairs under $\pi$, value functions are central to most RL algorithms. A variety of methods have been developed in RL as standalone attempts to find (approximate) solutions to the problem of maximizing cumulative reward (Figure 15).
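The value-function definitions referenced above fall outside this hunk; as a hedged restatement in standard form (assuming a discount factor $\gamma \in [0,1]$ and the episodic setting introduced earlier):

``` math
G_t = \sum_{k=0}^{T-t-1} \gamma^k r_{t+k+1}, \qquad
V^\pi(s) = \mathbb{E}_\pi \left[ G_t \mid s_t = s \right], \qquad
Q^\pi(s, a) = \mathbb{E}_\pi \left[ G_t \mid s_t = s, a_t = a \right]
```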
Figure: Popular RL algorithms. See @SpinningUp2018 for a complete list of citations.
Popular approaches to continuous state and action spaces--such as those studied within robotics--include @schulmanTrustRegionPolicy2017, @schulmanProximalPolicyOptimization2017, @haarnojaSoftActorCriticOffPolicy2018. Across manipulation @akkayaSolvingRubiksCube2019 and locomotion @leeLearningQuadrupedalLocomotion2020 problems, RL proved extremely effective in providing a platform to (1) adopt a unified, streamlined perception-to-action pipeline, (2) natively integrate proprioception with multi-modal high-dimensional sensor streams, (3) disregard a description of the environment dynamics, by focusing on observed interaction data rather than modeling, and (4) anchor policies in the experience collected and stored in datasets. For a more complete survey of applications of RL to robotics, we refer the reader to @koberReinforcementLearningRobotics, @tangDeepReinforcementLearning2024.

@@ -420,17 +539,27 @@ First, especially early in training, actions are

Second, learning with a limited number of samples remains problematic in RL, limiting the applicability of RL in real-world robotics due to the consequently prohibitive timescales of training. Even strong algorithms such as SAC @haarnojaSoftActorCriticOffPolicy2018 typically require a large number of transitions $\{ (s_t, a_t, r_t, s_{t+1})\}_{t=1}^N$. On hardware, generating these data is time-consuming and can even be prohibitive.
Figure: Simulated (left) vs. real-world (right) OpenDuck. Discrepancies in the simulation dynamics (reality gap) pose risks to policy transfer.
Training RL policies in simulation @tobinDomainRandomizationTransferring2017 addresses both issues: it eliminates physical risk and dramatically increases throughput. Yet, simulators require significant modeling effort, and rely on assumptions (simplified physical modeling, instantaneous actuation, static environmental conditions, etc.) that limit the transfer of policies learned in simulation due to the discrepancy between real and simulated environments (*reality gap*, Figure 16). *Domain randomization* (DR) is a popular technique to overcome the reality gap, consisting of randomizing parameters of the simulated environment during training to induce robustness to specific disturbances. In turn, DR is employed to increase the diversity of scenarios over the course of training, improving the chances of sim-to-real transfer @akkayaSolvingRubiksCube2019, @antonovaReinforcementLearningPivoting2017, @jiDribbleBotDynamicLegged2023. In practice, DR is performed by further parametrizing the *simulator*’s dynamics $\mathcal D \equiv \mathcal D_\xi$ with a *dynamics* (random) vector $\xi$ drawn from an arbitrary distribution, $\xi \sim \Xi$. Over the course of training--typically at each episode’s reset--a new $\xi$ is drawn, and used to specify the environment’s dynamics for that episode. For instance, one could decide to randomize the friction coefficient of the surface in a locomotion task (Figure 17), or the center of mass of an object for a manipulation task.
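A minimal sketch of this per-episode resampling loop (purely illustrative: `ToySim`, the `friction` parameter and its range, and the placeholder `policy` are assumptions rather than any specific simulator or `lerobot` API):

```python
import numpy as np

rng = np.random.default_rng(0)

def sample_dynamics() -> dict:
    """Draw xi ~ Xi. Here a single randomized parameter: surface friction."""
    return {"friction": rng.uniform(0.3, 1.2)}

class ToySim:
    """Stand-in for a real simulator; only the reset/step interface matters."""
    def __init__(self, dynamics: dict):
        self.friction = dynamics["friction"]
    def reset(self):
        return np.zeros(3)                      # dummy observation
    def step(self, action):
        obs = rng.standard_normal(3) * self.friction
        return obs, 0.0, True, {}               # obs, reward, done, info

def policy(obs):
    return np.zeros(2)                          # placeholder controller

for episode in range(3):
    env = ToySim(sample_dynamics())             # new xi at every episode reset
    obs, done = env.reset(), False
    while not done:
        obs, reward, done, info = env.step(policy(obs))
```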
Figure: The same locomotion task can be carried out in different (simulated) domains (exemplified by the difference in terrains) at training time, resulting in increased robustness over diverse environment dynamics.
While effective in transferring policies across the reality gap in real-world robotics @tobinDomainRandomizationTransferring2017, @akkayaSolvingRubiksCube2019, @jiDribbleBotDynamicLegged2023, @tiboniDomainRandomizationEntropy2024, DR often requires extensive manual engineering. First, identifying which parameters to randomize--i.e., the *support* $\text{supp} (\Xi)$ of $\Xi$--is an inherently task-specific process. When locomoting over different terrains, choosing to randomize the friction coefficient is a reasonable choice, yet not sufficient on its own, as other factors (lighting conditions, external temperature, joints’ fatigue, etc.) may prove just as important, making the selection of these parameters yet another source of brittleness.

@@ -517,10 +646,15 @@ Despite the possibility to leverage offline data for learning, the effectiveness

Lastly, in order to improve the robustness of their approach to different goals while maintaining practical scalability, @luoSERLSoftwareSuite2025 introduced a modified state and action space, expressing proprioceptive configurations $q$ and actions $\dot q$ in the frame of the end-effector pose at $t=0$. By randomizing the initial pose of the end-effector ($s_0$), @luoSERLSoftwareSuite2025 achieved a similar result to manually randomizing the environment at every timestep, but with the benefit of keeping the environment in the same condition across multiple training episodes, achieving higher scalability thanks to the increased practicality of their approach.
Figure: (A) HIL-SERL allows for real-world training of high-performance RL agents by building on top of advancements presented by SAC, RLPD and SERL. (B) Example of human intervention during a HIL-SERL training process on an SO-100.
Building on off-policy deep Q-learning with replay buffers, entropy regularization for better exploration and performance, expert demonstrations to guide learning, and a series of tools and recommendations for real-world training using reward classifiers (Figure 18), @luoPreciseDexterousRobotic2024 introduce human interventions during training, learning near-optimal policies in challenging real-world manipulation tasks in 1-2 hours.

@@ -554,19 +688,29 @@ Norbert Wiener

TL;DR Behavioral Cloning provides a natural platform to learn from real-world interactions without the need to design any reward function, and generative models prove more effective than point-wise policies at dealing with multimodal demonstration datasets.
Figure: (A) Average (with standard deviation) evolution of the actuation levels over the first 5 recorded episodes in lerobot/svla_so101_pickplace. Proprioceptive states prove invaluable in determining the robot’s state during an episode. (B) Camera frames are also recorded alongside measurements of the robot’s state, capturing information about the robot’s interaction with its environment.
Learning from human demonstrations provides a pragmatic alternative to the reinforcement-learning pipeline discussed in Section 3. Indeed, in real-world robotics online exploration is typically costly and potentially unsafe, and designing (dense) reward signals is a brittle and task-specific process. In general, success detection itself may often require bespoke instrumentation, while episodic training demands reliable resets--all factors complicating training RL algorithms on hardware at scale. Behavioral Cloning (BC) sidesteps these constraints by casting control as an imitation learning problem, leveraging previously collected expert demonstrations. Most notably, by learning to imitate, autonomous systems naturally adhere to the objectives, preferences, and success criteria implicitly encoded in the data, which reduces early-stage exploratory failures and obviates hand-crafted reward shaping altogether. Formally, let $\mathcal D = \{ \tau^{(i)} \}_{i=1}^N$ be a set of expert trajectories, with $\tau^{(i)} = \{(o_t^{(i)}, a_t^{(i)})\}_{t=0}^{T_i}$ representing the $i$-th trajectory in $\mathcal D$, $o_t \in \mathcal O$ denoting observations (e.g., images and proprioception altogether), and $a_t \in \mathcal A$ the expert actions. Typically, observations $o \in \mathcal O$ consist of both image and proprioceptive information, while actions $a \in \mathcal A$ represent control specifications for the robot to execute, e.g. a joint configuration. Note that, differently from Section 3, in the imitation learning context $\mathcal D$ denotes an offline dataset collecting $N$ length-$T_i$ reward-free (expert) human trajectories $\tau^{(i)}$, and *not* the environment dynamics. Similarly, in this section $\tau^{(i)}$ represents a length-$T_i$ trajectory of observation-action pairs, which crucially *omits any reward information entirely*. Figure 19 graphically shows trajectories in terms of the average evolution of the actuation on the 6 joints over a group of teleoperated episodes for the SO-100 manipulator. Notice how proprioceptive states are captured jointly with camera frames over the course of the recorded episodes, providing a unified, high-frame-rate collection of teleoperation data. Figure 20 shows $(o_t, a_t)$-pairs for the same dataset, with the actions performed by the human expert illustrated alongside the corresponding observation. In principle, (expert) trajectories $\tau^{(i)}$ can have different lengths since demonstrations might exhibit multi-modal strategies to attain the same goal, resulting in possibly multiple, different behaviors.
Figure: Sample observation and action pairs over the course of a given trajectory recorded in lerobot/svla_so101_pickplace. Observations, comprising both proprioceptive and visual information, are recorded alongside the configuration of a second, leader robot controlled by a human expert, providing complete information for regressing actions given observations.
Behavioral Cloning (BC) @pomerleauALVINNAutonomousLand1988a aims at synthesizing behaviors by learning the mapping from observations to actions, and in its most natural formulation can be effectively tackled as a *supervised* learning problem, consisting of learning the (deterministic) mapping $f: \mathcal O\mapsto \mathcal A, \ a_t = f(o_t)$ by solving

``` math
@@ -581,10 +725,15 @@ Typically, the expert’s joint observation-action distribution $p: \mathcal O\t

Despite the inherent challenges of learning on non-i.i.d. data, the BC formulation affords several operational advantages in robotics. First, training happens offline and typically uses expert human demonstration data, thereby severely limiting exploration risks by preventing the robot from performing dangerous actions altogether. Second, reward design is entirely unnecessary in BC, as demonstrations already reflect human intent and task completion. This also mitigates the risk of misalignment and specification gaming (*reward hacking*), otherwise inherent in purely reward-based RL @heessEmergenceLocomotionBehaviours2017. Third, because expert trajectories encode terminal conditions, success detection and resets are implicit in the dataset. Finally, BC scales naturally with growing corpora of demonstrations collected across tasks, embodiments, and environments. However, BC can in principle only learn behaviors that are, at most, as good as those exhibited by the demonstrator, and thus critically provides no mitigation for suboptimal decision making that might be enacted by humans. Still, while this is problematic in sequential decision-making problems for which expert demonstrations are not generally available--data might be expensive to collect, or human performance may be inherently suboptimal--many robotics applications benefit from relatively cheap pipelines to acquire high-quality trajectories generated by humans, thus justifying BC approaches.
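A minimal sketch of this supervised objective (an illustrative stand-in rather than the `lerobot` training code; the network, dimensions, and synthetic data are made up):

```python
import torch
import torch.nn as nn

obs_dim, act_dim = 18, 6                          # made-up dimensions
policy = nn.Sequential(nn.Linear(obs_dim, 256), nn.ReLU(), nn.Linear(256, act_dim))
optim = torch.optim.Adam(policy.parameters(), lr=1e-3)

# Toy stand-in for expert (o_t, a_t) pairs flattened across trajectories.
observations = torch.randn(1024, obs_dim)
actions = torch.randn(1024, act_dim)

for step in range(100):
    idx = torch.randint(0, observations.shape[0], (64,))
    o, a = observations[idx], actions[idx]
    loss = ((policy(o) - a) ** 2).mean()          # pointwise regression a_t ≈ f(o_t)
    optim.zero_grad()
    loss.backward()
    optim.step()
```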
Figure: Point-wise policies suffer from limitations due to (A) covariate shift and poor approximation of (B) multimodal demonstrations. (A) Initially small errors may drive the policy out of distribution, incurring a vicious circle ultimately resulting in failure. (B) Both modes of reaching for a target object in a scene, either left-first or right-first, are equally good and thus equally likely to be present in a dataset of human demonstrations, ultimately resulting in multimodal demonstrations.
While conceptually elegant, point-estimate policies $f : \mathcal O\mapsto \mathcal A$ learned by solving [eq:loss-minimization-SL] have been observed to suffer from (1) compounding errors @rossReductionImitationLearning2011 and (2) poor fit to multimodal distributions @florenceImplicitBehavioralCloning2022, @keGraspingChopsticksCombating2020. Figure 21 illustrates these two key issues related to learning *explicit policies* @florenceImplicitBehavioralCloning2022. Besides sequentiality in $\mathcal D$, compounding errors due to *covariate shift* may also prove catastrophic, as even small $\epsilon$-prediction errors $0 < \Vert \mu(o_t) - a_t \Vert \leq \epsilon$ can quickly drive the policy into out-of-distribution states, incurring less confident generations and thus compounding errors (Figure 21, left). Moreover, point-estimate policies typically fail to learn *multimodal* targets, which are very common in human demonstrations solving robotics problems, since multiple trajectories can be equally good at accomplishing a goal (e.g., symmetric grasps, Figure 21, right). In particular, unimodal regressors tend to average across modes, yielding indecisive or even unsafe commands @florenceImplicitBehavioralCloning2022. To address poor multimodal fitting, @florenceImplicitBehavioralCloning2022 propose learning the generative model $p(o, a)$ underlying the samples in $\mathcal D$, rather than explicitly learning a prediction function $f(o) = a$.

@@ -594,10 +743,15 @@ Generative Models (GMs) aim to learn the stochastic process underlying the very

### Variational Auto-Encoders
Figure: Intuitively, the latent variable in a single-latent model may contain information regarding the task being performed, which directly results in the likelihood of the same observation-action pair being different for two different tasks. When (A) picking a block the likelihood of a wide gripper opening should be higher than a narrower one, while it should be the opposite when (B) pushing the block.
+ A common inductive bias used in GM posits samples $(o,a)$ are influenced from an unobservable latent variable $z \in Z$, resulting in ``` math @@ -608,11 +762,15 @@ A common inductive bias used in GM posits samples $(o,a)$ are influenced from an ``` Intuitively, in the case of observation-action pairs $(o, a)$ for a robotics application, $z$ could be some high level representation of the underlying task being performed by the human demonstrator. In such case, treating $p(o,a)$ as a marginalization over $\text{supp}({Z})$ of the complete joint distribution $p(o,a,z)$ natively captures the effect different tasks have on the likelihood of observation-action pairs. Figure 22 graphically illustrates this concept in the case of a (A) picking and (B) pushing task, for which, nearing the target object, the likelihood of actions resulting in opening the gripper--the higher $q_6$, the wider the gripper’s opening--should intuitively be (A) high or (B) low, depending on the task performed. While the latent space $Z$ typically has a much richer structure than the set of all actual tasks performed, [eq:BC-latent-variable] still provides a solid framework to learn joint distribution conditioned on unobservable yet relevant factors. Figure 23 represents this framework of latent-variable for a robotics application: the true, $z$-conditioned generative process on assigns *likelihood* $p((o,a) \vert z)$ to the single $(o,a)$-pair. Using Bayes’ theorem, one can reconstruct the *posterior* distribution on $\text{supp}({Z})$, $q_\theta(z \vert o,a)$ from the likelihood $p_\theta(o,a \vert z)$, *prior* $p_\theta(z)$ and *evidence* $p_\theta(o,a)$. VAEs approximate the latent variable model presented in [eq:BC-latent-variable]) using an *approximate posterior* $q_\phi(z \vert o,a)$ while regressing parameters for a parametric likelihood, $p_\theta(o,a \vert z)$ (Figure 23). -
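The corresponding variational objective is not visible inside this hunk; as a reference, the standard single-latent evidence lower bound (ELBO) for this model, in the notation above, reads:

``` math
\log p_\theta(o, a) \;\geq\; \mathbb{E}_{q_\phi(z \mid o, a)} \big[ \log p_\theta(o, a \mid z) \big] \;-\; D_{\mathrm{KL}} \big( q_\phi(z \mid o, a) \,\Vert\, p_\theta(z) \big)
```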
Figure: (A) The latent variable model in a robotics application regulates the influence between observed $(o, a)$ variables and an unobservable latent variable. (B) VAEs approximate exact latent variable models by means of variational inference.
+ Given a dataset $\mathcal D$ consisting of $N$ i.i.d. observation-action pairs, the log-likelihood of all datapoints under $\theta$ (in Bayesian terms, the *evidence* $p_\theta(\mathcal D)$) can thus be written as: $$ @@ -673,10 +831,15 @@ $$ $$ where we explicitly showed the marginalization over the multiple latents in [eq:BC-multi-latent-model-1], and used the law of conditional probability and Markov property in [eq:BC-multi-latent-model-2]. -
Figure: HMLV models posit that the data generation process is influenced by a stack of Markov-dependent latent variables, with samples from the posterior distribution being progressively higher up in the hierarchy.
Similarly to VAEs, providing an exact interpretation for the latent variables is typically not possible. Still, one fairly reasonable application-driven intuition is that, by providing a model of the hierarchical, decoupled interaction of latent variables, Hierarchical Markov Latent Variable (HMLV) models attempt to capture the different resolutions at which different conditioning factors intervene, so that in a robotics application, for instance, one could naturally distinguish between early-stage trajectory planning ($t \to T$) and fine-grained adjustments ($t \to 0$). In that, HMLV models thus provide a framework to perform variational inference via multiple, sequential sampling steps from different higher-level distributions instead of approximating the generative process with a single-latent variable model. DMs are a particular instantiation of HMLV models for which the posterior is $q(z_t \vert z_{t-1}) = \mathcal N(\sqrt{1-\beta_t}\, z_{t-1}, \beta_t \mathbf{I})$ for a given $\beta_t \in \mathbb R^+$, thereby iteratively reducing the signal-to-noise ratio as $\beta_t$ increases along the latents hierarchy.

@@ -693,17 +856,27 @@ $$

In their seminal work on using DMs for variational inference, @hoDenoisingDiffusionProbabilistic2020 introduce major contributions regarding solving $\min_\theta -\log p_\theta(o,a)$. In particular, @hoDenoisingDiffusionProbabilistic2020 exclusively adopt a fixed *Gaussian* posterior in the form of $q(z_t \vert z_{t-1}) = \mathcal{N}(\sqrt{1-\beta_t}z_{t-1}, \beta_t \mathbf I)$. The choice of adopting Gaussians has profound implications on the generative process modeled. Indeed, under the (mild) assumption that the variance is sufficiently small, $\beta_t \leq \eta, \eta \in \mathbb R^+$, @sohl-dicksteinDeepUnsupervisedLearning2015 proved that the likelihood $p(z_{t-1} \vert z_t)$ is Gaussian as well, which allows for the particularly convenient parametrization of the approximate likelihood $p_\theta (z_{t-1} \vert z_t) = \mathcal N(\mu_\theta(z_t, t), \Sigma_\theta(z_t,t)), \ t \in [1,T]$, as well as for closed-form tractability of the KL-divergence terms in [eq:diffusion-likelihood]. Further, the posterior’s structure also enables an analytical description of the distribution of the $t$-th latent variable, $q(z_t \vert z_0) = \mathcal N (\sqrt{\bar{\alpha}_t}z_0, (1-\bar{\alpha}_t) \mathbf{I})$, with $\alpha_t = 1-\beta_t, \ \bar \alpha_t = \prod_{k=1}^t \alpha_k$, which conveniently avoids the need for iterative posterior sampling.
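A small numeric sketch of why this closed form is convenient: the $t$-th latent can be sampled in one shot from $z_0$, with no iteration over intermediate latents (illustrative NumPy only; the linear $\beta$ schedule and its endpoints are assumptions):

```python
import numpy as np

T = 1000
betas = np.linspace(1e-4, 0.02, T)          # made-up linear schedule
alphas = 1.0 - betas
alpha_bar = np.cumprod(alphas)              # \bar{alpha}_t = prod_k alpha_k

def noise_to_level(z0: np.ndarray, t: int, rng=np.random.default_rng(0)):
    """Sample z_t ~ q(z_t | z_0) = N(sqrt(abar_t) z_0, (1 - abar_t) I) directly."""
    eps = rng.standard_normal(z0.shape)
    return np.sqrt(alpha_bar[t]) * z0 + np.sqrt(1.0 - alpha_bar[t]) * eps

z0 = np.array([0.4, -0.4])                  # e.g., a normalized (o, a) pair
z_mid, z_late = noise_to_level(z0, 100), noise_to_level(z0, 900)
```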
Figure: DMs iteratively corrupt samples (left) from an unknown distribution into a quasi-standard Gaussian (center), learning the displacement field (right) that permits reconstructing samples from the unknown target distribution by iteratively denoising samples of a tractable, easy-to-sample distribution.
Finally, adopting Gaussian posteriors permits a particularly pleasing interpretation of the dynamics of training DMs @permenterInterpretingImprovingDiffusion2024. By using Gaussian posteriors, the hierarchical latent variables effectively lose increasingly more information about the original (unknown) distribution’s sample, $z_0$, progressively distributing according to a standard Gaussian and thus containing no information at all (Figure 25). Figure 25 illustrates this procedure on a simplified, bidimensional observation-action distribution, where we considered $o=q_2$ and $a=q^h_2$, with $q_2$ representing the robot’s *elbow flex* actuation and $q^h_2$ the elbow flex commanded by the human teleoperator.
Figure: A joint action-observation distribution, in the simplified case where the observation is the elbow-flex actuation in an SO-100, and the action is the recorded position for the same joint in the teleoperator arm. Since the recorded motion is teleoperated, the points distribute along the diagonal.
Because the recorded behavior is teleoperated, measurements mostly distribute along the line $a = o + \eta, \eta \sim N(0,1)$, with the $\eta$-variability accounting for minor control inconsistencies (Figure 26). Using Gaussian posteriors--i.e., adding Gaussian noise--effectively simulates a *Brownian motion* for the elements in the distribution’s support (in Figure 25, $\mathcal O\times \mathcal A$), whereby information *diffuses away* from the samples, and by comparing the diffused samples to the original data points one can derive an estimate of the total displacement induced by diffusion. Under the sole assumption that the likelihood of the diffused samples is low under the original unknown data distribution, one can effectively approximate the unknown distribution by learning to *reverse* such displacement. This key intuition allows writing a simplified training objective: $ \mathcal L(\theta) = \mathbb{E}_{t, z_0, \epsilon} \big[

@@ -733,15 +906,27 @@ FM proved very effective in a variety of applications, ranging from image @esse
```

Note that the traditional discrete-time noise-scheduler ${\beta_t}_{t=0}^T$ is now generalized to a continuous map $\beta : [0,1] \mapsto \mathbb R^+$. Crucially, @lipmanFlowMatchingGenerative2023 prove that by optimizing the vector field for individual data points $z_0 \in \mathcal D$, one also retrieves the optimal flow to morph the entire support of the initial distribution $p_0$ into $p_1 \ \text{s.t.} \mathcal D \sim p_1$.
Figure: Probability distributions can be modified by applying vector fields, resulting in a flow of mass in the support. When acting over time, vector fields can effectively change the distribution’s structure.
While the noising schedule of DMs results in a stochastic process that resembles a random walk, FM allows for more general--potentially, deterministic--likelihood and posterior parametrizations. In the FM literature the likelihood and posterior probability densities defined along an HMLV model are typically jointly referred to as a *probability path*, where the distributions for successive adjacent transitions in the HMLV model are related by the (normalized) flow between them (Figure 27). The inherent flexibility of FM is one of its key advantages over DMs, as it opens up the possibility of *learning* more efficient paths. For instance, one can design probability paths inspired by Optimal Transport (OT)--a subdiscipline studying the problem of finding the most efficient way to morph one probability distribution into another. Probability paths obtained through OT tend to be *straighter* than diffusion paths (Figure 28), which can lead to faster and more stable training, as well as higher-quality sample generation with fewer steps at inference time. By avoiding the unnecessary backtracking associated with the inherent stochastic nature of both the noising and denoising process in DMs, test-time compute is typically significantly reduced, while retaining comparable results @lipmanFlowMatchingGenerative2023.

In practice, FM can be applied to generative modeling by learning a vector field regressor $v_\theta(z, t)$ to approximate a given target vector field $u(t, z)$. In the particular case of DMs, $u(t, z)$ is defined as in [eq:fm-diffusion-vector-field], while in principle the target vector field can be learned to induce a particular transportation, or fixed according to OT. Given a sample from the data distribution $z_1 \sim p_1$ and a sample from an easy-to-sample prior $z_0 \sim p_0$, CFM defines a simple path between them using *linear interpolation* between samples, $z_t = (1-t)z_0 + t z_1$, resulting in the target vector field $u(t, z_t) = z_1 - z_0$. Then, an FM model can be trained with the simple regression objective defined as $ \mathcal L(\theta) = \mathbb{E}_{t, z_0, z_1} \big[

@@ -761,32 +946,39 @@ In practice, when learning from demonstrations adopting CVAEs results in a sligh

In their work, @zhaoLearningFineGrainedBimanual2023 ablated using a GM to learn from human demonstrations compared to a simpler, supervised objective, $\mathcal L_1(a,a^\prime) = \Vert a - a^\prime \Vert_1$. Interestingly, they found the performance of these two approaches to be comparable when learning from *scripted* demonstrations. That is, when learning from data collected by rolling out a predetermined set of commands $[q^c_0, q^c_1, \dots]$, GMs did *not* prove competitive compared to standard supervised learning. However, when learning from human demonstrations--i.e., from data collected executing commands coming from a human controller $[q^h_0, q^h_1, \dots]$--they found performance (success rate on a downstream task) to be severely (-33.3%) hindered when adopting a standard supervised learning objective compared to a richer, potentially more complex-to-learn variational objective, in keeping with the multimodal nature of human demonstration data and the findings presented in @florenceImplicitBehavioralCloning2022. The authors also ablate the action chunking paradigm, reporting significant performance gains for performing action chunking (1% vs. 44% success rate).
To avoid acting open-loop, @zhaoLearningFineGrainedBimanual2023 design an inference process that performs inference at every timestep $t$ and then aggregates overlapping chunks using an exponential moving average over the chunks’ predictions.
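A minimal sketch of that aggregation step (not the authors' implementation; `predict_chunk`, the horizon `H`, and the exponential decay `m` are illustrative assumptions):

```python
import numpy as np

H, m = 10, 0.1                                  # chunk horizon and decay (assumed values)
act_dim = 6

def predict_chunk(obs: np.ndarray) -> np.ndarray:
    """Hypothetical stand-in for the policy: returns H future actions."""
    return np.tile(obs[:act_dim], (H, 1))

history: list[tuple[int, np.ndarray]] = []      # (timestep the chunk was emitted, chunk)
obs = np.zeros(32)
for t in range(25):
    history.append((t, predict_chunk(obs)))     # inference at every timestep
    # Gather every prediction that targets the current timestep t.
    preds = [chunk[t - t0] for t0, chunk in history if 0 <= t - t0 < H]
    w = np.exp(-m * np.arange(len(preds)))      # exponential weights over overlapping chunks
    action = np.average(preds, axis=0, weights=w)
    # obs = env.step(action)  # environment interaction omitted in this sketch
```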
Figure: Action Chunking with Transformer (ACT), as in @zhaoLearningFineGrainedBimanual2023. ACT introduces an action chunking paradigm to cope with high-dimensional multi-modal demonstration data, and a transformer-based CVAE architecture.
In ACT (Figure 29), inference for a given observation $o \in \mathcal O$ could be performed by (1) computing a prior $p_\omega(z \vert o)$ for the latent and (2) decoding an action chunk from a sampled latent $z \sim p_\omega(\bullet \vert o)$, similarly to how standard VAEs generate samples, with the exception that vanilla VAEs typically pose $p(z\vert o) \equiv p(z) \sim N(\mathbf{0}, \mathbf{I})$ and thus skip (1).
Figure: The CVAE encoder used in ACT. Input action chunks are first embedded and aggregated with positional embeddings, before being processed alongside embedded proprioceptive information and a learned [CLS] token used to aggregate input-level information and predict the style variable $z$. The encoder is entirely disregarded at inference time.
However, the authors claim using a deterministic procedure to derive $z$ may benefit policy evaluation, and thus avoid sampling from the conditional prior at all. At test time, instead, they simply use $z = \mathbf{0}$, as the conditional prior on $z$ used in training is set to be the unit Gaussian. Conditioning on the observation $o$ is instead achieved at test time by explicitly feeding proprioceptive and visual observations to the decoder, $p_\theta(a \vert z, o)$, while during training $z$ is indeed sampled from the approximate posterior distribution $q_\phi(z \vert o, a)$, which, however, disregards image observations and exclusively uses proprioceptive states to form $o$ for efficiency reasons (as the posterior $q_\phi$ is completely disregarded at test time).
Figure: The CVAE decoder used in ACT, comprising a full encoder-decoder Transformer architecture. Camera observations from all $n$ camera views are first embedded using pre-trained visual encoders, and then concatenated to the corresponding positional embeddings. Then, alongside embeddings for the available proprioceptive information and the style variable $z$ retrieved from the CVAE encoder, the Transformer encoder shares the matrices $K, Q$ with the Transformer decoder, trained to decode fixed position embeddings into valid action chunks.
### Code Example: Learning ACT

@@ -804,21 +996,15 @@ $$
$$

Notice how in [eq:diffusion-policy-objective] the noise regressor is conditioned both on the latent variable rank $t$ *and* on a stack of previous observations $o_{t-T_o:t}$. @chiDiffusionPolicyVisuomotor2024 claim the combination of (1) conditioning on a horizon of previous observations and (2) predicting multiple actions into the future allows DP to *commit to specific modes* in the data at inference time, which proves essential for good performance and avoiding indecisiveness.
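As an illustrative sketch of such a conditional noise-regression objective (a generic DDPM-style training step, not the `lerobot` implementation; the network, horizons, and $\beta$ schedule are made up):

```python
import torch
import torch.nn as nn

T, H_a, act_dim, obs_dim = 100, 16, 6, 64          # made-up sizes
betas = torch.linspace(1e-4, 0.02, T)
alpha_bar = torch.cumprod(1.0 - betas, dim=0)

# eps_theta(z_t, t, o): predicts the noise added to the action chunk,
# conditioned on the diffusion step and on (an embedding of) past observations.
eps_theta = nn.Sequential(
    nn.Linear(H_a * act_dim + 1 + obs_dim, 256), nn.ReLU(),
    nn.Linear(256, H_a * act_dim),
)
optim = torch.optim.Adam(eps_theta.parameters(), lr=1e-4)

def training_step(a_chunk: torch.Tensor, obs_emb: torch.Tensor) -> torch.Tensor:
    B = a_chunk.shape[0]
    t = torch.randint(0, T, (B,))
    eps = torch.randn_like(a_chunk)
    abar = alpha_bar[t].unsqueeze(-1)
    z_t = abar.sqrt() * a_chunk + (1 - abar).sqrt() * eps   # noised action chunk
    inp = torch.cat([z_t, t.unsqueeze(-1).float() / T, obs_emb], dim=-1)
    loss = ((eps_theta(inp) - eps) ** 2).mean()             # regress the injected noise
    optim.zero_grad(); loss.backward(); optim.step()
    return loss

loss = training_step(torch.randn(8, H_a * act_dim), torch.randn(8, obs_dim))
```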
- -
The Diffusion Policy archicture, as in @chiDiffusionPolicyVisuomotor2024. A stack of -H - -o - - previous observations is used as external conditioning to denoise a group of -H - -a - - actions. Conditioning is used at every layer of a U-Net block, and in practice allows to obtain fully-formed action chunks with as little as -T = 10 denoising steps.
-
+ Figure 32 shows the convolution-based version of the architecture proposed by @chiDiffusionPolicyVisuomotor2024, illustrating inference on a single sample from $\mathcal D$ for simplicity. An arbitrarily noisy chunk of $H_a$ actions $\tilde a_{t:t+H_a}$ is mapped to a learned high-dimensional space. Similarly, both image observations and poses are embedded before being aggregated with the action embeddings. Then, a U-Net @ronnebergerUNetConvolutionalNetworks2015 is trained to regress the noise added to $\tilde a_{t:t+H_a}$, using observation conditioning information at every layer and seeking to optimize [eq:diffusion-policy-objective]. At inference time, the noise predictor is used to predict the quantity of noise at every $t \in [T, \dots, 0 ]$ and iteratively subtract it from $\tilde a_{t:t+H_a}$, reversing the diffusion process simulated in training, conditioned on $o_{t-T_o:t}$, to predict $a_{t:t+H_a}$.
@@ -838,7 +1024,13 @@ We directly assess the lack of adaptiveness of robot systems due to acting open-
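A sketch of this reverse (denoising) loop, using a plain DDPM-style update for clarity; `eps_model` and the `betas` schedule are hypothetical placeholders, and the exact sampler used in practice may differ:

```python
import torch

@torch.no_grad()
def denoise_action_chunk(eps_model, betas, obs_stack, horizon, action_dim):
    """Reverse the diffusion process to produce an action chunk a_{t:t+H_a}."""
    alphas = 1.0 - betas
    alphas_cumprod = torch.cumprod(alphas, dim=0)
    B = obs_stack.shape[0]
    a = torch.randn(B, horizon, action_dim)                 # start from pure noise
    for t in reversed(range(len(betas))):
        eps_hat = eps_model(a, torch.full((B,), t), obs_stack)
        # Remove the predicted noise (DDPM posterior mean)
        a = (a - betas[t] / (1 - alphas_cumprod[t]).sqrt() * eps_hat) / alphas[t].sqrt()
        if t > 0:
            a = a + betas[t].sqrt() * torch.randn_like(a)   # stochastic DDPM term
    return a
```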
- +
Asynchronous inference. Illustration of the asynchronous inference stack. Note that the policy can be run on a remote server, possibly with GPUs.
@@ -873,7 +1065,13 @@ Interestingly, the behavior of async inference can be studied analytically. Firs
- +
Action queue size evolution at runtime for various levels of g @@ -900,22 +1098,29 @@ TL;DR Openly available large scale datasets and the development of stable, expre The advent of large models trained on internet-scale datasets has drastically influenced fields like Computer Vision (CV) and Natural Language Processing (NLP), shifting the paradigm towards combining (1) an initial, task-agnostic large-scale pre-training stage and a (2) task-specific, adjustment phase. The pre-training/adaptation paradigm has now largely replaced more classic approaches consisting of task-specific data collection, curation and model training in many subdomains within CV and NLP, motivated by the main drawback of limited scalability for *task-specific approaches*, traditionally labor intensive. Factors including (1) the advancements in generalist models learned with self-supervision for perception @oquabDINOv2LearningRobust2024 or semantic understanding @devlinBERTPretrainingDeep2019 and (2) the popularization collective efforts to aggregate large-scale openly available datasets @collaborationOpenXEmbodimentRobotic2025, @khazatskyDROIDLargeScaleInTheWild2025 are increasingly pushing the field of robot learning towards the pre-train-and-adapt paradigm. This shift taps into the long-standing challenge of developing generalist robot policies, and holds the premise to surpass traditionally siloed approaches to robotics problems and develop a *foundation robotics model*. While Section [sec:learning-bc-single] introduced methods for learning *single-task policies* such as ACT or Diffusion Policy, in this section we present advancements in developing *generalist, multi-task, policies*, capable of performing a wide range of tasks across different environments and embodiments, and guided by unstructured instructions given via natural language. -
- -
Fields within ML such as Computer Vision and NLP converged on the development of foundation models, trained on a variety of large scale models and capable to perform multiple downstream tasks (top). Conversely, robotics suffered from limited standardization in terms of the architectures used, and siloed, task specific datasets, incurring in a high degree of fragmentation which traditionally hindered the development of generalist models for robotics in favour of task-specific models (bottom).
-
+ ## Preliminaries: Models and Data The remarkable success of foundation models in NLP and CV is predicated on two core principles: architectural innovation and joint data-compute scaling. The transformer architecture proved instrumental in capturing long-range dependencies in sequential data such as text, and its stability and expressivity made it the *de facto* standard for modern large-scale models trained on internet-scale amounts of data. In stark contrast with popular NLP @raffelExploringLimitsTransfer2023 and CV @ImageNet_VSS09 general-purpose datasets, the field of robotics has historically developed around task-specific datasets which hinders scalability across problems, resulting in a concrete data deficit for general-purpose robot learning. Unlike the wealth of relatively readily available text and images on the internet, robotics data is intrinsically embodied--datasets collected for a manipulation robot typically differ entirely from locomotion datasets. Further, datasets consisting of expert demonstrations are (1) intrinsically expensive to collect (2) and notoriously heterogeneous--different human experts may perform the same task optimally yet in very different ways. In particular, since each expert trajectory is tied to a specific robot platform and the operating conditions of its environment and task, data heterogeneity has long posed a *methodological* challenge for scaling robotics datasets via aggregation. Beyond this, heterogeneity also raises *conceptual* issues: naively mixing data across embodiments can induce negative transfer, as control strategies developed in isolation for different robot systems in different environments may even conflict when combined. Thus, the high degree of fragmentation of robotics datasets and tasks has traditionally led to the development of *specialist* policies, trained on small, task-specific datasets, and which excel at their designated task but fail to generalize to new situations (Figure 35). -
- -
Early efforts in the development of generalist models for robotics include BC-Zero @jangBCZZeroShotTask2022, RT-1 @brohanRT1RoboticsTransformer2023, and RT-2 @brohanRT2VisionLanguageActionModels2023: large scale models trained on thousands of demonstrations. The open release of the Open-X @collaborationOpenXEmbodimentRobotic2025 and DROID datasets @khazatskyDROIDLargeScaleInTheWild2025 fostered the development of open source models: OpenVLA @kimOpenVLAOpenSourceVisionLanguageAction2024, -π -0 - @blackp0VisionLanguageActionFlow2024 and SmolVLA @shukorSmolVLAVisionLanguageActionModel2025.
-
+
Motivated by the pursuit of generalist robot policies, the research community started investigating what to integrate from other domains within ML, and how. Figure 36 shows a timeline of some of the most popular contributions attempting to develop generalist policies. Starting from BC-Zero, a latent variable model trained on 25K+ demonstrations, the field has now evolved into $\pi_0$, a transformer-based model trained on 10M+ demonstrations and exhibiting strong few-shot capabilities across tasks and embodiments. For starters, Robotics Transformer 1 (RT-1) @brohanRT1RoboticsTransformer2023 represented a significant step in the direction of developing generalist robot policies over prior work including (1) BC-Zero @jangBCZZeroShotTask2022 and (2) Gato @reedGeneralistAgent2022, in that @brohanRT1RoboticsTransformer2023 uses a much larger and more diverse set of training tasks compared to both BC-Zero and Gato. In particular, RT-1 uses a transformer architecture, and is trained on as many as 130k human-recorded trajectories collected on 13 robots over the span of 17 months. RT-1 learns to process a history of camera images and a natural language instruction, and feeds the resulting sequence of high-dimensional tokens to a transformer, trained using a *classification loss on a discretized action space* consisting of 256 bins for each joint of a 6-dof robotic arm.
@@ -925,10 +1130,15 @@ Traditionally, research involved not only training the model but also collecting
The success of large, proprietary models like RT-1 and RT-2 highlighted a growing accessibility gap in robotics research, as training and deploying large-scale models requires computational resources simply unattainable for most research institutions. The OpenVLA project @kimOpenVLAOpenSourceVisionLanguageAction2024 emerged in direct contrast to its closed-source counterparts, as a community-driven effort to create powerful, openly available VLAs. In particular, @kimOpenVLAOpenSourceVisionLanguageAction2024 trained OpenVLA by exclusively leveraging openly available data (970K+ trajectories from the Open-X dataset), and shared training recipes alongside the model weights. Architecturally, OpenVLA integrates a pre-trained vision encoder to project visual tokens into the embedding space of a Llama2-7B @touvronLlama2Open2023 language model backbone. The language model backbone is then used to predict *discrete action tokens* over 256 activation levels.
-
+
Figure 37 illustrates graphically the two most relevant trends in modern robot learning. As datasets collected via centralized, cross-institution cooperation of increasing size are made available to the research community, decentralized datasets collected by individual researchers and practitioners have also gained traction recently, closing the gap with academic benchmarks thanks to community-contributed datasets. Further, models used across tasks and embodiments are also becoming much more compute-efficient, and as a result model sizes have been consistently decreasing over time, with consequent gains for autonomous robots in real-world, resource-constrained environments.
@@ -948,13 +1158,15 @@ Recently, compute efficiency has also become a central focus in VLM research. Se
$\pi_0$ @blackp0VisionLanguageActionFlow2024 introduce a VLA built on a MoE architecture consisting of (1) a pre-trained VLM backbone (Gemma 2.6B @teamGemma2Improving2024) and (2) a dedicated action expert used to generate continuous actions via flow matching.
Images and language are embedded with a late-fusion VLM (PaliGemma), while proprioceptive state and action chunks are routed to a smaller action expert, initialized from scratch. The two separate experts communicate via self-attention layers, but maintain disjoint weights to obtain query, key and value matrices at each layer, maintaining specialization while efficiently allocating computation.
-
- -
The -π -0 -architecture, as in @blackp0VisionLanguageActionFlow2024. Vision and language tokens are routed to a VLM backbone which is prevented from attending robot proprioperceptive states and action tokens, which are instead routed to a smaller subset of weights within the architecture. The architecture is trained with Flow Matching on 10M+ trajectories from a mixture of closed and openly available datasets.
-
+
Concretely, $\pi_0$ is a unified transformer with two disjoint sets of weights $\phi, \theta$. A larger VLM backbone $p_\phi$ initialized from Gemma 2.6B processes multiple image frames obtained from multiple camera viewpoints $[\{ I_t \}_{t=1}^n]$, as well as a language instruction $[\ell_t]$ used to describe the task considered. Concurrently, a 300M-parameter *action expert* based on a similar transformer architecture processes the robot proprioceptive state $q_t$ and an action chunk $a_{t:t+H_a}$ (Figure 38). The two experts operate separately in processing their respective inputs and turning them into query, key and value matrices, and only share information with each other via self-attention layers. The outputs from the VLM backbone are disregarded, while the vector field regressed by the action expert is used to iteratively refine the action chunk. In particular, $\pi_0$ uses a *blockwise causal attention mask* over tokens belonging to three separate blocks: (1) image and language tokens $\mathcal T_i$ obtained from $[\{ I_t \}_{t=1}^n, \ell_t]$, (2) proprioceptive tokens $\mathcal T_q$ obtained from $q_t$, and (3) the action tokens $\mathcal T_a$ for items in the chunk $a^{\tau}_{t:t+H_a}$ at time $\tau$ in the flow-matching process. Notably, *within* each block the attention operations are bidirectional, while across blocks, future blocks are masked out. Formally, this corresponds to using the attention mask
$\mathbf{A} =
\bordermatrix{
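In code, a boolean version of such a blockwise causal mask can be sketched as follows (`True` marks positions a token may attend to; block sizes are purely illustrative):

```python
import torch

def blockwise_causal_mask(n_img_lang: int, n_state: int, n_action: int) -> torch.Tensor:
    """Build a blockwise causal attention mask over [T_i, T_q, T_a].

    Attention is bidirectional *within* each block, while tokens may only
    attend to blocks that come earlier in the ordering (image/language ->
    proprioceptive state -> actions); future blocks are masked out.
    """
    sizes = [n_img_lang, n_state, n_action]
    total = sum(sizes)
    mask = torch.zeros(total, total, dtype=torch.bool)
    start = 0
    for size in sizes:
        end = start + size
        mask[start:end, :end] = True   # attend to own block and all previous blocks
        start = end
    return mask

# Example: 3 image/language tokens, 1 state token, 2 action tokens
print(blockwise_causal_mask(3, 1, 2).int())
```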
-r0.4 image +r0.4
Using such a Beta distribution emphasizes higher noise levels during training, a choice @blackp0VisionLanguageActionFlow2024 argue allows $\pi_0$ to focus on learning the mean of the data distribution $\mathbb E[a_{t:t+H_a} \vert o_t]$, in keeping with @esserScalingRectifiedFlow2024. To further optimize performance and reduce inference time, @blackp0VisionLanguageActionFlow2024 propose reducing the support of the timestep distribution to $[0,s], \ s < 1$, as for any forward-integration step size $\delta = 1-s$, timesteps above $s$ are never sampled at inference time.
@@ -1004,13 +1222,15 @@ Lastly, @blackp0VisionLanguageActionFlow2024 present cross-embodiment experimen
VLAs remain in an early stage of development and are not yet as mature or widely adopted as LLMs and VLMs. Further, much of the impactful VLA progress remains proprietary, with many models sharing only weights while withholding full training details and essential methodological components. SmolVLA @shukorSmolVLAVisionLanguageActionModel2025 is an entirely open-source research effort, aiming to democratize the development of robotics foundation models by open-sourcing the model, training recipes and data used.
- -
The SmolVLA architecture, as in @shukorSmolVLAVisionLanguageActionModel2025. SmolVLA is a compact MoE model trained with flow matching to denoise action chunks. Vision and language tokens are fed to a VLM backbone, and share information with the proprioperceptive and action tokens via the attention mechanism. The attention expert interleaves SA and CA layers for further conditioning on the visual features from the VLM backbone. SmolVLA skips computations and reduces the visual tokens, resulting in 6x less memory usage than -π -0 -.
-
+ While encouraging efforts like $\pi_0$ @blackp0VisionLanguageActionFlow2024 demonstrate the feasibility of open VLA systems, they remain (1) large and compute-intensive and (2) dependent on closed datasets collected via centralized efforts on costly robotic platforms, ultimately hindering accessibility. SmolVLA mitigates both these accessibility issues by (1) prioritizing a compact, compute-efficient VLA design and (2) targeting community-contributed datasets on accessible robotic platforms such as the SO-100 and SO-101 arms. Similarly to $\pi_0$, SmolVLA (Figure 39) employs a MoE architecture combining a pretrained VLM backbone with a dedicated action expert, and trains with flow matching. To ensure efficiency and accessibility, SmolVLA adopts SmolVLM-2 @marafiotiSmolVLMRedefiningSmall2025 as its VLM backbone, considering SmolVLM-2’s reduced size and capability to process multiple image inputs alongside text items. SmolVLM-2 uses SigLIP @zhaiSigmoidLossLanguage2023 as vision encoder, producing visual features for a SmolLM2 language decoder @allalSmolLM2WhenSmol2025. Further, SmolVLA adopts a smaller action expert consisting of $\sim$100M parameters and an interleaved stack of self and cross-attention layers. To improve efficiency, the action expert adopts a reduced embedding dimension compared to the VLM backbone, resulting in $d_{v_\theta} = 0.75 d_{\text{VLM}}$. @shukorSmolVLAVisionLanguageActionModel2025’s design choices thus result in a much smaller size model compared to $\pi_0$, consisting of around 450M parameters versus $\pi_0$’s 3.3B parameters. diff --git a/app/src/content/article.mdx b/app/src/content/article.mdx index 7d5ee15090d9190bf9d4c933c9ab7b287922af6e..09078980ba69d62f3e3316b1fe159b520f8ac8a7 100644 --- a/app/src/content/article.mdx +++ b/app/src/content/article.mdx @@ -4,7 +4,48 @@ description: "Converted from LaTeX to MDX" date: "2025-09-18" --- -import ResponsiveImage from "../components/ResponsiveImage.astro"; +import ResponsiveImage from '../components/ResponsiveImage.astro'; +import ch1_lerobot_figure1 from '../assets/image/figures/ch1/ch1-lerobot-figure1.png'; +import ch2_approaches from '../assets/image/figures/ch2/ch2-approaches.png'; +import ch2_platforms from '../assets/image/figures/ch2/ch2-platforms.png'; +import ch2_cost_accessibility from '../assets/image/figures/ch2/ch2-cost-accessibility.png'; +import ch2_so100_to_planar_manipulator from '../assets/image/figures/ch2/ch2-so100-to-planar-manipulator.png'; +import ch2_planar_manipulator_free from '../assets/image/figures/ch2/ch2-planar-manipulator-free.png'; +import ch2_planar_manipulator_floor from '../assets/image/figures/ch2/ch2-planar-manipulator-floor.png'; +import ch2_planar_manipulator_floor_shelf from '../assets/image/figures/ch2/ch2-planar-manipulator-floor-shelf.png'; +import ch2_classical_limitations from '../assets/image/figures/ch2/ch2-classical-limitations.png'; +import ch3_learning_benefits from '../assets/image/figures/ch3/ch3-learning-benefits.png'; +import ch3_learning_atlas from '../assets/image/figures/ch3/ch3-learning-atlas.png'; +import ch3_rl_examples from '../assets/image/figures/ch3/ch3-rl-examples.png'; +import ch3_agent_env from '../assets/image/figures/ch3/ch3-agent-env.png'; +import ch3_rl_algorithms_atlas from '../assets/image/figures/ch3/ch3-rl-algorithms-atlas.png'; +import ch3_duck_sim_vs_real from '../assets/image/figures/ch3/ch3-duck-sim-vs-real.png'; +import ch3_many_ducks from '../assets/image/figures/ch3/ch3-many-ducks.png'; +import ch3_hil_serl_examples from 
'../assets/image/figures/ch3/ch3-hil-serl-examples.png'; +import ch4_bc_trajectories from '../assets/image/figures/ch4/ch4-bc-trajectories.png'; +import ch4_observation_action_mapping from '../assets/image/figures/ch4/ch4-observation-action-mapping.png'; +import ch4_issues_with_bc from '../assets/image/figures/ch4/ch4-issues-with-bc.png'; +import ch4_task_effect_on_pairs from '../assets/image/figures/ch4/ch4-task-effect-on-pairs.png'; +import ch4_latent_variable_model from '../assets/image/figures/ch4/ch4-latent-variable-model.png'; +import ch4_many_latents from '../assets/image/figures/ch4/ch4-many-latents.png'; +import ch4_diffusion_robot_actions from '../assets/image/figures/ch4/ch4-diffusion-robot-actions.png'; +import ch4_action_vs_observation_distribution from '../assets/image/figures/ch4/ch4-action-vs-observation-distribution.png'; +import ch4_normalizing_flows from '../assets/image/figures/ch4/ch4-normalizing-flows.png'; +import ch4_diffusion_vs_flowmatching from '../assets/image/figures/ch4/ch4-diffusion-vs-flowmatching.png'; +import ch4_act from '../assets/image/figures/ch4/ch4-act.png'; +import ch4_act_encoder from '../assets/image/figures/ch4/ch4-act-encoder.png'; +import ch4_act_decoder from '../assets/image/figures/ch4/ch4-act-decoder.png'; +import ch4_diffusion_policy from '../assets/image/figures/ch4/ch4-diffusion-policy.png'; +import ch5_ml_vs_robotics_foundation from '../assets/image/figures/ch5/ch5-ml-vs-robotics-foundation.png'; +import ch5_generalist_policies_timeline from '../assets/image/figures/ch5/ch5-generalist-policies-timeline.png'; +import ch5_trends from '../assets/image/figures/ch5/ch5-trends.png'; +import ch5_pi0 from '../assets/image/figures/ch5/ch5-pi0.png'; +import ch5_smolvla from '../assets/image/figures/ch5/ch5-smolvla.png'; +import ch2_planar_manipulator_floor_box from '../assets/image/figures/ch2/ch2-planar-manipulator-floor-box.png'; +import ch4_async_inference from '../assets/image/figures/ch4/ch4-async-inference.png'; +import ch4_queues from '../assets/image/figures/ch4/ch4-queues.png'; +import ch5_pi0_sampling_timesteps from '../assets/image/figures/ch5/ch5-pi0-sampling-timesteps.png'; + # Foreword @@ -14,11 +55,11 @@ Nonetheless, we also hold that the wealth of research from both academia and ind This tutorial... -- Does *not* aim to be a comprehensive guide to general field of robotics, manipulation or underactuated systems: [@sicilianoSpringerHandbookRobotics2016](#bibliography) and [@tedrakeRoboticManipulationPerception](#bibliography), [@tedrakeUnderactuatedRoboticsAlgorithms](#bibliography) do this better than we ever could. +- Does *not* aim to be a comprehensive guide to general field of robotics, manipulation or underactuated systems: @sicilianoSpringerHandbookRobotics2016 and @tedrakeRoboticManipulationPerception, @tedrakeUnderactuatedRoboticsAlgorithms do this better than we ever could. -- Does *not* aim to be an introduction to statistical or deep learning: [@shalev-shwartzUnderstandingMachineLearning2014](#bibliography) and [@prince2023understanding](#bibliography) cover these subjects better than we ever could. +- Does *not* aim to be an introduction to statistical or deep learning: @shalev-shwartzUnderstandingMachineLearning2014 and @prince2023understanding cover these subjects better than we ever could. 
-- Does *not* aim to be a deep dive into Reinforcement Learning, Diffusion Models, or Flow Matching: invaluable works such as [@suttonReinforcementLearningIntroduction2018](#bibliography), [@nakkiranStepbyStepDiffusionElementary2024](#bibliography), and [@lipmanFlowMatchingGuide2024](#bibliography) do this better than we ever could. +- Does *not* aim to be a deep dive into Reinforcement Learning, Diffusion Models, or Flow Matching: invaluable works such as @suttonReinforcementLearningIntroduction2018, @nakkiranStepbyStepDiffusionElementary2024, and @lipmanFlowMatchingGuide2024 do this better than we ever could. Instead, our goal here is to provide an intuitive explanation as per why these disparate ideas have converged to form the exciting field of modern robot learning, driving the unprecedented progress we see today. In this spirit, we follow the adage: "a jack of all trades is a master of none, *but oftentimes better than a master of one*." @@ -26,4 +67,611 @@ We sincerely hope this tutorial serves as a valuable starting point for your jou # Introduction - + + +Autonomous robotics holds the premise of relieving humans from repetitive, tiring or dangerous manual tasks. Consequently, the field of robotics has been widely studied since its first inception in the 1950s. Lately, advancements in Machine Learning (ML) have sparked the development of a relatively new class of methods used to tackle robotics problems, leveraging large amounts of data and computation rather than human expertise and modeling skills to develop autonomous systems. + +The frontier of robotics research is indeed increasingly moving away from classical model-based control paradigm, embracing the advancements made in ML, aiming to unlock (1) monolithic perception-to-action control pipelines and (2) multi-modal data-driven feature extraction strategies, together with (3) reduced reliance on precise models of the world and (4) a better positioning to benefit from the growing availability of open robotics data. While central problems in manipulation, locomotion and whole-body control demand knowledge of rigid-body dynamics, contact modeling, planning under uncertainty, recent results seem to indicate learning can prove just as effective as explicit modeling, sparking interest in the field of *robot learning*. This interest can be largely justified considering the significant challenges related to deriving accurate models of robot-environment interactions. + +Moreover, since end-to-end learning on ever-growing collections of text and image data has historically been at the core of the development of *foundation models* capable of semantic reasoning across multiple modalities (images, text, audio, etc.), deriving robotics methods grounded in learning appears particularly consequential, especially as the number of openly available datasets continues to grow. + +Robotics is, at its core, an inherently multidisciplinary field, requiring a wide range of expertise in both *software* and *hardware*. The integration of learning-based techniques further broadens this spectrum of skills, raising the bar for both research and practical applications. `lerobot` is an open-source library designed to integrate end-to-end with the entire robotics stack. With a strong focus on accessible, real-world robots (1) `lerobot` supports many, openly available, robotic platforms for manipulation, locomotion and even whole-body control. 
`lerobot` also implements (2) a unified, low-level approach to reading/writing robot configurations to extend support for other robot platforms with relatively low effort. The library introduces `LeRobotDataset`, (3) a native robotics dataset format currently being used by the community to efficiently record and share datasets. `lerobot` also supports many state-of-the-art (SOTA) algorithms in robot learning--mainly based on Reinforcement Learning (RL) and Behavioral Cloning (BC) techniques--with efficient implementations in PyTorch, and extended support for experimentation and experiment tracking. Lastly, `lerobot` defines a custom, optimized inference stack for robotic policies decoupling action planning from action execution, which proves effective in guaranteeing more adaptability at runtime.
+
+This tutorial serves the double purpose of providing useful references for the science behind--and practical use of--common robot learning techniques. To this aim, we strive to provide a rigorous yet concise overview of the core concepts behind the techniques presented, paired with practical examples of how to use such techniques concretely, with code examples in `lerobot`, for researchers and practitioners interested in the field of robot learning. This tutorial is structured as follows:
+
+- Section 2 reviews classical robotics foundations, introducing the limitations of dynamics-based approaches to robotics.
+
+- Section 3 elaborates on the limitations of dynamics-based methods, and introduces RL as a practical approach to solving robotics problems, considering its upsides and potential limitations.
+
+- Section 4 further describes robot learning techniques that aim at single-task learning, leveraging BC techniques to autonomously reproduce specific expert demonstrations.
+
+- Section 5 presents recent contributions to developing generalist models for robotics applications, learning from large corpora of multi-task, multi-robot data (*robotics foundation models*).
+
+Our goal with this tutorial is to provide an intuitive explanation of the reasons various disparate ideas from Machine Learning (ML) have converged and are powering the current evolution of robotics, driving the unprecedented progress we see today. We complement our presentation of the most common and recent approaches in robot learning with practical code implementations using `lerobot`, and start here by presenting the dataset format introduced with `lerobot`.
+
+## `LeRobotDataset`
+
+`LeRobotDataset` is a standardized dataset format designed to address the specific needs of robot learning research: it provides unified and convenient access to robotics data across modalities, including sensorimotor readings, multiple camera feeds and teleoperation status. `LeRobotDataset` also accommodates storing general information regarding the data being collected, including textual descriptions of the task being performed by the teleoperator, the kind of robot used, and relevant measurement specifics such as the frames per second at which both image and robot-state streams are recorded.
+
+In doing so, `LeRobotDataset` provides a unified interface for handling multi-modal, time-series data, and it is designed to seamlessly integrate with the PyTorch and Hugging Face ecosystems.
`LeRobotDataset` can be easily extended and customized by users, and it already supports openly available data coming from a variety of embodiments supported in `lerobot`, ranging from manipulator platforms like the SO-100 arm and ALOHA-2 setup, to real-world humanoid arms and hands, as well as entirely simulation-based datasets, and self-driving cars. This dataset format is built to be both efficient for training and flexible enough to accommodate the diverse data types encountered in robotics, while promoting reproducibility and ease of use.
+
+### The dataset class design
+
+A core design choice behind `LeRobotDataset` is separating the underlying data storage from the user-facing API. This allows for efficient storage while presenting the data in an intuitive, ready-to-use format.
+
+Datasets are always organized into three main components:
+
+- **Tabular Data**: Low-dimensional, high-frequency data such as joint states and actions are stored in efficient memory-mapped files, and typically offloaded to the more mature `datasets` library by Hugging Face, providing fast access with limited memory consumption.
+
+- **Visual Data**: To handle large volumes of camera data, frames are concatenated and encoded into MP4 files. Frames from the same episode are always grouped together into the same video, and multiple videos are grouped together by camera. To reduce stress on the file system, groups of videos for the same camera view are also broken into multiple sub-directories once a given threshold number is reached.
+
+- **Metadata**: A collection of JSON files which describes the dataset’s structure, serving as the relational counterpart to both the tabular and visual dimensions of data. Metadata includes the different feature schemas, frame rates, normalization statistics, and episode boundaries.
+
+For scalability, and to support datasets with potentially millions of trajectories (resulting in hundreds of millions or billions of individual camera frames), we merge data from different episodes into the same high-level structure. Concretely, this means that any given tabular collection and video will not typically contain information about one episode only, but rather a concatenation of the information available in multiple episodes. This keeps the pressure on the file system limited, both locally and on remote storage providers like Hugging Face, though at the expense of leaning more heavily on the relational, metadata parts of the dataset, which are used to reconstruct information such as the position, in a given file, at which an episode starts or ends. An example structure for a given `LeRobotDataset` would appear as follows:
+
+- `meta/info.json`: This is the central metadata file. It contains the complete dataset schema, defining all features (e.g., `observation.state`, `action`), their shapes, and data types. It also stores crucial information like the dataset’s frames-per-second (`fps`), `lerobot`’s version at the time of capture, and the path templates used to locate data and video files.
+
+- `meta/stats.json`: This file stores aggregated statistics (mean, std, min, max) for each feature across the entire dataset, used for data normalization for most policy models and accessible externally via `dataset.meta.stats`.
+
+- `meta/tasks.jsonl`: This file contains the mapping from natural language task descriptions to integer task indices, which are useful for task-conditioned policy training.
+
+- `meta/episodes/*`: This directory contains metadata about each individual episode, such as its length, the corresponding task, and pointers to where its data is stored in the dataset’s files. For scalability, this information is stored in files rather than a single large JSON file.
+
+- `data/*`: Contains the core frame-by-frame tabular data, using parquet files to allow for fast, memory-mapped access. To improve performance and handle large datasets, data from multiple episodes are concatenated into larger files. These files are organized into chunked subdirectories to keep the size of directories manageable. A single file typically contains data for more than one episode.
+
+- `videos/*`: Contains the MP4 video files for all visual observation streams. Similar to the `data/` directory, the video footage from multiple episodes is concatenated into single MP4 files. This strategy significantly reduces the number of files in the dataset, which is more efficient for modern filesystems.
+
+## Code Example: Batching a (Streaming) Dataset
+
+This section provides an overview of how to access datasets hosted on Hugging Face using the `LeRobotDataset` class. Every dataset on the Hugging Face Hub contains the three main pillars presented above (tabular data, visual data and relational metadata), and can be accessed with a single instruction.
+
+In practice, most reinforcement learning (RL) and behavioral cloning (BC) algorithms tend to operate on stacks of observations and actions. For the sake of brevity, we will refer to joint-space readings and camera frames with the single term *frame*. For instance, RL algorithms may use a history of previous frames $o_{t-H_o:t}$ to mitigate partial observability, and BC algorithms are in practice trained to regress chunks of multiple actions ($a_{t:t+H_a}$) rather than single controls. To accommodate these specifics of robot learning training, `LeRobotDataset` provides a native windowing operation, whereby users can define the *seconds* of a given window (before and after) around any given frame, by using the `delta_timestamps` functionality. Unavailable frames are appropriately padded, and a padding mask is also returned to filter out the padded frames. Notably, this all happens within the `LeRobotDataset`, and is entirely transparent to higher-level wrappers commonly used in training ML models such as `torch.utils.data.DataLoader`.
+
+Conveniently, by using `LeRobotDataset` with a PyTorch `DataLoader`, one can automatically collate the individual sample dictionaries from the dataset into a single dictionary of batched tensors for downstream training or inference. `LeRobotDataset` also natively supports streaming mode for datasets: users can stream data from a large dataset hosted on the Hugging Face Hub with a one-line change in their implementation. Streaming datasets support high-performance batch processing (ca. 80-100 it/s, depending on connectivity) and high levels of frame randomization, key features for practical BC algorithms which may otherwise be slow or operate on highly non-i.i.d. data. This feature is designed to improve accessibility, so that large datasets can be processed by users without requiring large amounts of memory and storage.
+ +Batching a (Streaming) Dataset + +```python +import torch +from lerobot.datasets.lerobot_dataset import LeRobotDataset +from lerobot.datasets.streaming_dataset import StreamingLeRobotDataset + +delta_timestamps = { + "observation.images.wrist_camera": [-0.2, -0.1, 0.0] # 0.2, and 0.1 seconds *before* each frame +} + +# Optionally, use StreamingLeRobotDataset to avoid downloading the dataset +dataset = LeRobotDataset( + "lerobot/svla_so101_pickplace", + delta_timestamps=delta_timestamps +) + +# Streams frames from the Hugging Face Hub without loading into memory +streaming_dataset = StreamingLeRobotDataset( + "lerobot/svla_so101_pickplace", + delta_timestamps=delta_timestamps +) + +# Get the 100th frame in the dataset by +sample = dataset[100] +print(sample) +# + +batch_size=16 +# wrap the dataset in a DataLoader to use process it batches for training purposes +data_loader = torch.utils.data.DataLoader( + dataset, + batch_size=batch_size +) + +# Iterate over the DataLoader in a training loop +num_epochs = 1 +device = "cuda" if torch.cuda.is_available() else "cpu" + +for epoch in range(num_epochs): + for batch in data_loader: + # Move data to the appropriate device (e.g., GPU) + observations = batch["observation.state"].to(device) + actions = batch["action"].to(device) + images = batch["observation.images.wrist_camera"].to(device) + + # Next, you can do amazing_model.forward(batch) + ... +``` + +
+ +# Classical Robotics + +
+ +*Know your enemy* \[...\] + +Sun Tzu + +
+
+ +TL;DR Learning-based approaches to robotics are motivated by the need to (1) generalize across tasks and embodiments (2) reduce dependency on human expertise (3) leverage historical trends on the production of data--all traditionally overlooked by dynamics-based techniques. + +
+ +## Explicit and Implicit Models + + + +Robotics is concerned with producing artificial motion in the physical world in useful, reliable and safe fashion. Thus, robotics is an inherently multi-disciplinar domain: producing autonomous motion in the physical world requires, to the very least, interfacing different software (motion planners) and hardware (motion executioners) components. Further, knowledge of mechanical, electrical, and software engineering, as well as rigid-body mechanics and control theory have therefore proven quintessential in robotics since the field first developed in the 1950s. More recently, Machine Learning (ML) has also proved effective in robotics, complementing these more traditional disciplines @connellRobotLearning1993. As a direct consequence of its multi-disciplinar nature, robotics has developed as a rather wide array of methods, all concerned with the main purpose of producing artificial motion in the physical world. + +Methods to produce robotics motion range from traditional *explicit* models--dynamics-based[^1] methods, leveraging precise descriptions of the mechanics of robots’ rigid bodies and their interactions with eventual obstacles in the environment--to *implicit* models--learning-based methods, treating artificial motion as a statistical pattern to learn given multiple sensorimotor readings @agrawalComputationalSensorimotorLearning, @bekrisStateRobotMotion2024. A variety of methods have been developed between these two extrema. For instance,  @hansenTemporalDifferenceLearning2022 show how learning-based systems can benefit from information on the physics of problems, complementing a traditional learning method such as Temporal Difference (TD)-learning @suttonReinforcementLearningIntroduction2018 with Model-Predictive Control (MPC). Conversely, as explicit models may be relying on assumptions proving overly simplistic--or even unrealistic--in practice, learning can prove effective to improve modeling of complex phenomena or complement perception @mccormacSemanticFusionDense3D2016. Such examples aim at demonstrating the richness of approaches to robotics, and Figure 2 graphically illustrates some of the most relevant techniques. Such a list is clearly far from being exhaustive, and we refer to @bekrisStateRobotMotion2024 for a more comprehensive overview of both general and application-specific methods for motion generation. In this section, we wish to introduce the inherent benefits of learning-based approaches to robotics--the core focus on this tutorial. + +## Different Types of Motion + + + +In the vast majority of instances, robotics deals with producing motion via actuating joints connecting nearly entirely-rigid links. A key distinction between focus areas in robotics is based on whether the generated motion modifies (1) the absolute state of the environment (via dexterity), (2) the relative state of the robot with respect to its environment (exercising mobility skills), or (3) a combination of the two (Figure 3). + +Effects such as (1) are typically achieved *through* the robot, i.e. generating motion to perform an action inducing a desirable modification, effectively *manipulating* the environment (manipulation). Motions like (2) may result in changes in the robot’s physical location within its environment. 
Generally, modifications to a robot’s location within its environment may be considered instances of the general *locomotion* problem, further specified as *wheeled* or *legged* locomotion based on whether a robot makes use of wheels or leg(s) to move in the environment. Lastly, an increased level of dynamism in the robot-environment interactions can be obtained combining (1) and (2), thus designing systems capable of interacting with *and* moving within their environment. This category of problems is typically termed *mobile manipulation*, and is characterized by a typically much larger set of control variables compared to either locomotion or manipulation alone.
+
+The traditional body of work developed since the very inception of robotics is increasingly complemented by learning-based approaches. ML has indeed proven particularly transformative across the entire robotics stack, first empowering planning-based techniques with improved state estimation used for traditional planning @tangPerceptionNavigationAutonomous2023 and then end-to-end replacing controllers, effectively yielding perception-to-action methods @koberReinforcementLearningRobotics. Work in producing robots capable of navigating a diverse set of terrains demonstrated the promise of both dynamics- and learning-based approaches for locomotion @griffinWalkingStabilizationUsing2017, @jiDribbleBotDynamicLegged2023, @leeLearningQuadrupedalLocomotion2020, @margolisRapidLocomotionReinforcement2022, and recent works on whole-body control indicated the promise of learning-based approaches to generate rich motion on complex robots, including humanoids @zhangWoCoCoLearningWholeBody2024, @bjorckGR00TN1Open2025. Manipulation has also been widely studied, particularly considering its relevance for many impactful use-cases ranging from high-risk applications for humans @fujitaDevelopmentRobotsNuclear2020, @alizadehComprehensiveSurveySpace2024 to manufacturing @sannemanStateIndustrialRobotics2020. While explicit models have proven fundamental in achieving important milestones towards the development of modern robotics, recent works leveraging implicit models proved particularly promising in surpassing scalability and applicability challenges via learning @koberReinforcementLearningRobotics.
+
+## Example: Planar Manipulation
+
+Robot manipulators typically consist of a series of links and joints, articulated in a chain finally connected to an *end-effector*. Actuated joints are responsible for generating motion of the links, while the end-effector is instead used to perform specific actions at the target location (e.g., grasping/releasing objects via closing/opening a gripper end-effector, using a specialized tool like a screwdriver, etc.).
+
+Recently, the development of low-cost manipulators like the ALOHA @zhaoLearningFineGrainedBimanual2023, ALOHA-2 @aldacoALOHA2Enhanced and SO-100/SO-101 @knightStandardOpenSO100 platforms significantly lowered the barrier to entry to robotics, considering the increased accessibility of these robots compared to more traditional platforms like the Franka Emika Panda arm (Figure 4).
+
+Deriving an intuition as to why learning-based approaches are gaining popularity in the robotics community requires briefly analyzing traditional approaches for manipulation, leveraging tools like forward and inverse kinematics (FK, IK) and control theory.
Providing a detailed overview of these methods falls (well) out of the scope of this tutorial, and we refer the reader to works including @sicilianoSpringerHandbookRobotics2016, @lynchModernRoboticsMechanics2017, @tedrakeRoboticManipulationPerception, @tedrakeUnderactuatedRoboticsAlgorithms for a much more comprehensive description of these techniques. Here, we mostly wish to highlight the benefits of ML over these traditional techniques.
+
+Consider the (simple) case where an SO-100 is restrained from actuating (1) the shoulder pan and (2) the wrist flex and roll motors. This effectively reduces the degrees of freedom of the SO-100 from the original 5+1 (5 joints + 1 gripper) to 2+1 (shoulder lift, elbow flex + gripper). As the end-effector does not impact motion in this model, the SO-100 is effectively reduced to the planar manipulator robot presented in Figure 5, where spheres represent actuators, and solid lines indicate length-$l$ links from the base of the SO-100 to the end-effector (*ee*).
+
+Further, let us make the simplifying assumption that actuators can produce rotations up to $2 \pi$ radians. In practice, this is seldom the case due to movement obstructions caused by the robot body itself (for instance, the shoulder lift cannot produce counter-clockwise movement due to the presence of the robot’s base used to secure the SO-100 to its support and host the robot bus), but we will introduce movement obstruction at a later stage.
+
+All these simplifying assumptions leave us with the planar manipulator of Figure 6, free to move its end-effector by controlling the angles $\theta_1$ and $\theta_2$, jointly referred to as the robot’s *configuration*, and indicated with $q = [\theta_1, \theta_2 ] \in [-\pi, +\pi]^2$. The axes attached to the joints indicate the associated reference frames, whereas circular arrows indicate the maximal feasible rotation allowed at each joint. In this tutorial, we do not cover topics related to spatial algebra, and we instead refer the reader to and for excellent explanations of the mechanics and theoretical foundations of producing motion on rigid bodies.
+ + + +
Planar, 2-dof schematic representation of the SO-100 manipulator under diverse deployment settings. From left to right: completely free to move; constrained by the presence of the surface; constrained by the surface and the presence of obstacles. Circular arrows around each joint indicate the maximal rotation feasible at that joint.
+
+
+Considering the (toy) example presented in Figure 6, we can analytically write the end-effector’s position $p \in \mathbb R^2$ as a function of the robot’s configuration, $p = p(q), p: \mathcal Q \mapsto \mathbb R^2$. In particular, we have:
+
+$$
+p(q) =
+\begin{pmatrix}
+p_x(\theta_1, \theta_2)\\
+p_y(\theta_1, \theta_2)
+\end{pmatrix}
+=
+\begin{pmatrix}
+l \cos(\theta_1) + l \cos(\theta_1 + \theta_2)\\
+l \sin(\theta_1) + l \sin(\theta_1 + \theta_2)
+\end{pmatrix}
+\in S^{n=2}_{l_1+l_2} = \{ p(q) \in \mathbb R^2: \Vert p(q) \Vert_2^2 \leq (2l)^2, \ \forall q \in \mathcal Q \}
+$$
+
+Deriving the end-effector’s *pose*--position *and* orientation--in some $m$-dimensional space $\boldsymbol{p} \in \mathcal{P} \subset \mathbb{R}^{m}$ starting from the configuration $q \in \mathcal Q \subset \mathbb R^n$ of an $n$-joint robot is referred to as *forward kinematics* (FK), whereas identifying the configuration corresponding to any given target pose is termed *inverse kinematics* (IK). In short, FK is used to map a robot configuration into the corresponding end-effector pose, whereas IK is used to reconstruct the configuration(s) given an end-effector pose.
+
+In the simplified case here considered (for which $\boldsymbol{p} \equiv p$, as the orientation of the end-effector is disregarded for simplicity), one can solve the problem of controlling the end-effector’s location to reach a goal position $p^*$ by solving analytically for $q: p(q) = f_{\text{FK}}(q) = p^*$. However, in the general case, one might not be able to solve this problem analytically, and can typically resort to iterative optimization methods comparing candidate solutions using a loss function (in the simplest case, $\Vert p(q) - p^* \Vert_2^2$ is a natural candidate), yielding:
+
+$$
+\min_{q \in \mathcal Q} \Vert p(q) - p^* \Vert_2^2 \, .
+$$
+
+Exact analytical solutions to IK are even less appealing when one considers the presence of obstacles in the robot’s workspace, resulting in constraints on the possible values of $q \in \mathcal Q \subseteq [-\pi, +\pi]^n \subset \mathbb R^n$ in the general case of $n$-link robots.
+
+For instance, the robot in Figure 7 is (very naturally) obstructed by the presence of the surface upon which it rests: $\theta_1$ can now exclusively vary within $[0, \pi]$, while possible variations in $\theta_2$ depend on $\theta_1$ (when $\theta_1 \to 0$ or $\theta_1 \to \pi$, further downwards movements are restricted). Even for a simplified kinematic model, developing techniques to solve eq. [eq:ik_problem] is in general non-trivial in the presence of constraints, particularly considering that the feasible set of solutions $\mathcal Q$ may change across problems. Figure 9 provides an example of how the environment influences the feasible set considered, with a new set of constraints deriving from the position of a new obstacle.
+
+However, IK--solving eq. [eq:ik_problem] for a feasible $q$--only proves useful in determining information regarding the robot’s configuration in the goal pose, and crucially does not provide information on the *trajectory* to follow over time to reach a target pose. Expert-defined trajectories obviate this problem by providing a length-$K$ succession of goal poses $\tau_K = [p^*_0, p^*_1, \dots p^*_K]$ for tracking. In practice, trajectories can also be obtained automatically through *motion planning* algorithms, thus avoiding expensive trajectory definition from human experts.
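To make this concrete, the following is a minimal numerical sketch of the forward kinematics above and of solving the IK problem by iterative optimization (plain gradient descent on the squared position error; the link length, target, step size and iteration count are arbitrary illustrative choices):

```python
import numpy as np

L = 0.1  # link length l (meters), arbitrary for illustration

def fk(q: np.ndarray) -> np.ndarray:
    """Forward kinematics of the planar 2-link arm: p(q) = (p_x, p_y)."""
    t1, t12 = q[0], q[0] + q[1]
    return np.array([L * np.cos(t1) + L * np.cos(t12),
                     L * np.sin(t1) + L * np.sin(t12)])

def jacobian(q: np.ndarray) -> np.ndarray:
    """Analytical Jacobian dp/dq of the planar 2-link arm."""
    t1, t12 = q[0], q[0] + q[1]
    return np.array([[-L * np.sin(t1) - L * np.sin(t12), -L * np.sin(t12)],
                     [ L * np.cos(t1) + L * np.cos(t12),  L * np.cos(t12)]])

def ik(p_star: np.ndarray, q0: np.ndarray, lr: float = 5.0, iters: int = 500) -> np.ndarray:
    """Minimize ||p(q) - p*||^2 by gradient descent (gradient = 2 J(q)^T (p(q) - p*)).

    A simple fixed-step scheme: it converges for this toy, reachable target,
    but it is not a robust general-purpose IK solver.
    """
    q = q0.astype(float).copy()
    for _ in range(iters):
        q -= lr * 2.0 * jacobian(q).T @ (fk(q) - p_star)
    return q

q_goal = ik(np.array([0.12, 0.08]), q0=np.array([0.3, 0.3]))
print(q_goal, fk(q_goal))  # recovered configuration and the position it reaches
```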
However, tracking $\tau_K$ via IK can prove prohibitively expensive, as tracking would require $K$ resolutions of eq. [eq:ik_problem] (one for each target pose). *Differential* inverse kinematics (diff-IK) complements IK via closed-form solution of a variant of eq. [eq:ik_problem]. Let $J(q)$ denote the Jacobian matrix of (partial) derivatives of the FK-function $f_\text{FK}: \mathcal Q \mapsto \mathcal P$, such that $J(q) = \frac{\partial f_{FK}(q)}{\partial q }$. Then, one can apply the chain rule to any $p(q) = f_{\text{FK}}(q)$, deriving $\dot p = J(q) \dot q$, and thus finally relating variations in the robot configurations to variations in pose, thereby providing a platform for control. + +Given a desired end-effector trajectory $\dot {p}^*(t)$ (1) indicating anchor regions in space and (2) how much time to spend in each region, diff-IK finds $\dot q(t)$ solving for joints’ *velocities* instead of *configurations*, $\dot q(t) = \arg\min_\nu \; \lVert J(q(t)) \nu - \dot {p}^*(t) \rVert_2^2 +$ + +Unlike eq. [eq:ik_problem], solving for $\dot q$ is much less dependent on the environment (typically, variations in velocity are constrained by physical limits on the actuators). Conveniently, eq. [eq:reg_ik_velocity] also often admits the closed-form solution $\dot q = J(q)^+ \dot {p}^*$, where $J^+(q)$ denotes the Moore-Penrose pseudo-inverse of $J(q)$. Finally, discrete-time joint configurations $q$ can be reconstructed from joint velocities $\dot q$ using forward-integration on the continuous-time joint velocity , $q_{t+1} = q_t + \Delta t\,\dot q_t$ for a given $\Delta t$, resulting in tracking via diff-IK. + +Following trajectories with diff-IK is a valid option in well-controlled and static environments (e.g., industrial manipulators in controlled manufacturing settings), and relies on the ability to define a set of target velocities to track $[\dot {p}^*_0, \dot {p}^*_1, \dots, \dot {p}^*_k ]$--an error-prone task largely requiring human expertise. Furthermore, diff-IK relies on the ability to (1) access $J(q) \, \forall q \in \mathcal Q$ and (2) compute its pseudo-inverse at every iteration of a given control cycle--a challenging assumption in highly dynamical settings, or for complex kinematic chains. + +### Adding Feedback Loops + +While very effective when a goal trajectory has been well specified, the performance of diff-IK can degrade significantly in the presence of modeling/tracking errors, or in the presence of non-modeled dynamics in the environment. + +
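Under the same toy model, a minimal sketch of tracking a desired velocity profile with diff-IK: joint velocities are obtained from the Jacobian pseudo-inverse and configurations are recovered by forward Euler integration. A proportional correction on the pose error (discussed in the following paragraphs) is included as an optional argument; all names and constants are illustrative.

```python
import numpy as np

L = 0.1  # link length, as in the FK/IK sketch above

def _fk(q):
    t1, t12 = q[0], q[0] + q[1]
    return np.array([L * np.cos(t1) + L * np.cos(t12),
                     L * np.sin(t1) + L * np.sin(t12)])

def _jacobian(q):
    t1, t12 = q[0], q[0] + q[1]
    return np.array([[-L * np.sin(t1) - L * np.sin(t12), -L * np.sin(t12)],
                     [ L * np.cos(t1) + L * np.cos(t12),  L * np.cos(t12)]])

def track_with_diff_ik(p_dot_star, p_star, q0, dt: float = 0.02, k_p: float = 0.0):
    """Track desired end-effector velocities: q_dot = J(q)^+ (p_dot* + k_p (p* - p(q))).

    p_dot_star: (K, 2) desired end-effector velocities over time
    p_star:     (K, 2) corresponding desired positions (only used when k_p > 0)
    k_p:        proportional gain on the pose error (k_p = 0 recovers plain diff-IK)
    """
    q = np.array(q0, dtype=float)
    trajectory = [q.copy()]
    for pd_dot, pd in zip(p_dot_star, p_star):
        q_dot = np.linalg.pinv(_jacobian(q)) @ (pd_dot + k_p * (pd - _fk(q)))
        q = q + dt * q_dot            # forward Euler integration of joint velocities
        trajectory.append(q.copy())
    return np.array(trajectory)
```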
+ +r0.3 +
+
+One such case is presented in Figure [fig:planar-manipulator-box-velocity], where another rigid body other than the manipulator is moving in the environment along the horizontal axis, with velocity $\dot x_B$. Accounting analytically for the presence of this disturbance--for instance, to prevent the midpoint of the link from ever colliding with the object--requires access to $\dot x_B$ at least, to derive the equation characterizing the motion of the environment.
+
+Less predictable disturbances, however (e.g., $\dot x_B \leftarrow \dot x_B + {\varepsilon}, {\varepsilon}\sim N(0,1)$), may prove challenging to model analytically, and one could attain the same result of preventing link-object collision by adding a condition on the distance between the midpoint of $l$ and $x_B$, enforced through a feedback loop on the position of the robot and object at each control cycle.
+
+To mitigate the effect of modeling errors, sensing noise and other disturbances, classical pipelines do indeed augment diff-IK with feedback control, looping back quantities of interest. In practice, following a trajectory with a closed feedback loop might consist of feeding back the error between the target and measured pose, $\Delta p = p^*- p(q)$, thereby modifying the applied control to $\dot q = J(q)^+ (\dot {p}^*+ k_p \Delta p )$, with $k_p$ defined as the (proportional) gain.
+
+More advanced control techniques such as feedback linearization, PID control, the Linear Quadratic Regulator (LQR) or Model-Predictive Control (MPC) can be employed to stabilize tracking and reject moderate perturbations, and we refer to for in-detail explanation of these concepts, or for a simple, intuitive example in the case of a point-mass system. Nonetheless, feedback control presents its challenges as well: tuning gains remains laborious and system-specific. Further, manipulation tasks present intermittent contacts inducing hybrid dynamics (mode switches) and discontinuities in the Jacobian, challenging the stability guarantees of the controller and thus often necessitating rather conservative gains and substantial hand-tuning.
+
+We point the interested reader to , , and  for extended coverage of FK, IK, diff-IK and control for (diff-)IK.
+
+## Limitations of Dynamics-based Robotics
+
+Despite the last 60+ years of robotics research, autonomous robots are still largely incapable of performing tasks at human-level performance in the physical world while generalizing across (1) robot embodiments (different manipulators, different locomotion platforms, etc.) and (2) tasks (tying shoe-laces, manipulating a diverse set of objects). While essential in the early development of robotics, the aforementioned methods require significant human expertise to be used in practice, and are typically specific to a particular application.
+
+Dynamics-based robotics pipelines have historically been developed sequentially, engineering the different blocks now within most architectures for specific purposes. That is, sensing, state estimation, mapping, planning, (diff-)IK, and low-level control have traditionally been developed as distinct modules with fixed interfaces. Pipelining these specific modules proved error-prone, and brittleness emerges--alongside compounding errors--whenever changes occur (e.g., changes in lighting for sensing, occlusion/failure of sensors, control failures).
Adapting such a stack to new tasks or robotic platforms often entails re-specifying objectives, constraints, and heuristics at multiple stages, incurring significant engineering overhead. + +Moreover, classical planners operate on compact, assumed-sufficient state representations; extending them to reason directly over raw, heterogeneous and noisy data streams is non-trivial. This results in a limited scalability to multimodal data and multitask settings, as incorporating high-dimensional perceptual inputs (RGB, depth, tactile, audio) traditionally required extensive engineering efforts to extract meaningful features for control. Also, the large number of tasks, coupled with the adoption of *per-task* planners, goal parameterizations, and safety constraints, results in an explosion in design and validation options, with little opportunity to reuse solutions across tasks. + +Setting aside integration and scalability challenges: developing accurate modeling of contact, friction, and compliance for complicated systems remains difficult. Rigid-body approximations are often insufficient in the presence of deformable objects, and relying on approximated models hinders real-world applicability of the methods developed. In the case of complex, time-dependent and/or non-linear dynamics, even moderate mismatches in parameters, unmodeled evolutions, or grasp-induced couplings can qualitatively affect the observed dynamics. + +Lastly, dynamics-based methods (naturally) overlook the rather recent increase in availability of openly-available robotics datasets. The curation of academic datasets by large centralized groups of human experts in robotics @collaborationOpenXEmbodimentRobotic2025, @khazatskyDROIDLargeScaleInTheWild2025 is now increasingly complemented by a growing number of robotics datasets contributed in a decentralized fashion by individuals with varied expertise. If not tangentially, dynamics-based approaches are not posed to maximally benefit from this trend, which holds the premise of allowing generalization in the space of tasks and embodiments, like data was the cornerstone for advancements in vision @alayracFlamingoVisualLanguage2022 and natural-language understanding @brownLanguageModelsAre2020. + +Taken together, these limitations (Figure 10) motivate the exploration of learning-based approaches that can (1) integrate perception and control more tightly, (2) adapt across tasks and embodiments with reduced expert modeling interventions and (3) scale gracefully in performance as more robotics data becomes available. + +# Robot (Reinforcement) Learning + +

> *Approximate the solution, not the problem* \[...\]
>
> — Richard Sutton

TL;DR: The need for expensive high-fidelity simulators can be obviated by learning from real-world data, using sample-efficient algorithms that can safely train directly on hardware.

Learning-based techniques for robotics naturally address the limitations presented in Section 2 (Figure 11). Learning-based techniques typically rely on direct perception-to-action mappings (*visuomotor policies*), mapping sensorimotor inputs to predicted actions and thereby streamlining control by removing the need to interface multiple components. Mapping sensorimotor inputs to actions directly also makes it easier to add diverse input modalities, leveraging the automatic feature extraction characteristic of most modern learning systems. Further, learning-based approaches can in principle entirely bypass modeling efforts and instead rely exclusively on interaction data, proving transformative when dynamics are challenging to model or even entirely unknown. Lastly, learning for robotics (*robot learning*) is naturally well placed to leverage the growing amount of openly available robotics data, just as computer vision first, and natural language processing later, historically benefited from large-scale corpora of (possibly uncurated) data--data that dynamics-based approaches largely overlook.

Robot learning being a field at a relatively nascent stage, no single technique has yet proved distinctly better than the others. Still, two major classes of methods gained prominence: Reinforcement Learning (RL) and Behavioral Cloning (BC) (Figure 12). In this section, we provide a conceptual overview of applications of the former to robotics, and introduce practical examples of how to use RL within `lerobot`. We then present the major limitations RL suffers from, before introducing BC techniques in the next sections ([sec:learning-bc-single, sec:learning-bc-generalist]).

In Figure 12 we decided to include generalist robot models @blackp0VisionLanguageActionFlow2024, @shukorSmolVLAVisionLanguageActionModel2025 alongside task-specific BC methods. While significantly different in spirit--*generalist* models are language-conditioned and use instructions to generate motion valid across many tasks, while *task-specific* models are typically not language-conditioned and are used to perform a single task--foundation models are largely trained to reproduce trajectories contained in a large training set of input demonstrations. Thus, we argue generalist policies can indeed be grouped alongside task-specific BC methods, as they leverage similar training data and schemas.

Figure 12 illustrates this categorization graphically, explicitly listing all the robot learning policies currently available in `lerobot`: Action Chunking with Transformers (ACT) @zhaoLearningFineGrainedBimanual2023, Diffusion Policy @chiDiffusionPolicyVisuomotor2024, Vector-Quantized Behavior Transformer (VQ-BeT) @leeBehaviorGenerationLatent2024, $\pi_0$ @blackp0VisionLanguageActionFlow2024, SmolVLA @shukorSmolVLAVisionLanguageActionModel2025, Human-in-the-Loop Sample-Efficient RL (HIL-SERL) @luoPreciseDexterousRobotic2024 and TD-MPC @hansenTemporalDifferenceLearning2022.

Applications of RL to robotics have long been studied, to the point that the relationship between the two disciplines has been compared to that between physics and mathematics @koberReinforcementLearningRobotics. Indeed, due to their interactive and sequential nature, many robotics problems can be directly mapped to RL problems. Figure 13 depicts two such cases.
Reaching for an object to move it somewhere else in the scene is indeed a sequential problem, where at each cycle the controller needs to adjust the position of the robotic arm based on its current configuration and the (possibly varying) position of the object. Figure 13 also shows an example of a locomotion problem, where sequentiality is inherent in the problem formulation. While sliding to the side, the controller has to constantly keep adjusting based on the robot's proprioception to avoid failure (falling).

## A (Concise) Introduction to RL

The RL framework @suttonReinforcementLearningIntroduction2018, which we briefly introduce here, has often been used to model robotics problems @koberReinforcementLearningRobotics. RL is a subfield of ML fundamentally concerned with the development of autonomous systems (*agents*) that learn how to *continuously behave* in an evolving environment, developing (ideally, well-performing) control strategies (*policies*). Crucially for robotics, RL agents can improve via trial and error alone, entirely bypassing the need to develop explicit models of the problem dynamics and instead exploiting interaction data only. In RL, this feedback loop (Figure 14) between actions and outcomes is established through the agent sensing a scalar quantity (*reward*).

Formally, interactions between an agent and its environment are typically modeled via a Markov Decision Process (MDP) @bellmanMarkovianDecisionProcess1957. Representing robotics problems via MDPs offers several advantages, including (1) incorporating uncertainty through the MDP's inherently stochastic formulation and (2) providing a theoretically sound framework for learning *without* an explicit dynamics model. While also accommodating a continuous-time formulation, MDPs are typically considered in discrete time in RL, assuming interactions take place atomically at discrete *timesteps* $t=0,1,2,3, \dots, T$. MDPs allowing for an unbounded number of interactions ($T \to + \infty$) are typically termed *infinite-horizon*, as opposed to *finite-horizon* MDPs in which $T$ cannot grow unbounded. Unless otherwise specified, we will only be referring to discrete-time, finite-horizon (*episodic*) MDPs here.

Concretely, a length-$T$ Markov Decision Process (MDP) is a tuple $\mathcal M = \langle \mathcal S, \mathcal A, \mathcal D, r, \gamma, \rho, T \rangle$, where:

- $\mathcal S$ is the *state space*; $s_t\in \mathcal S$ denotes the (possibly non-directly observable) environment state at time $t$. In robotics, states often comprise robot configurations and velocities ($q_t, \dot q_t$), and can accommodate sensor readings such as camera or audio streams.

- $\mathcal A$ is the *action space*; $a_t\in \mathcal A$ may represent joint torques, joint velocities, or even end-effector commands. In general, actions correspond to commands intervening on the configuration of the robot.

- $\mathcal D$ represents the (possibly non-deterministic) environment dynamics, with $\mathcal D: \mathcal S\times \mathcal A\times \mathcal S\mapsto [0, 1]$ corresponding to $\mathcal D\, (s_t, a_t, s_{t+1})= \mathbb P (s_{t+1}\vert s_t, a_t)$. For instance, for a planar manipulator the dynamics could be considered deterministic when the environment is fully described (Figure 6), and stochastic when unmodeled disturbances depending on non-observable parameters intervene (Figure [fig:planar-manipulator-box-velocity]).

- $r: \mathcal S\times \mathcal A\times \mathcal S\to \mathbb R$ is the *reward function*, weighing the transition $(s_t, a_t, s_{t+1})$ in the context of the achievement of an arbitrary goal. For instance, a simple reward function for quickly moving the robot along the $x$ axis in 3D space (Figure 13) could be based on the robot's position along the $x$ axis ($p_x$), include a penalty for falling over (measured via the height $p_z$) and a bonus for speed ($\dot p_x$), e.g. $r (s_t, a_t, s_{t+1})\equiv r(s_t) = p_{x_t} \cdot \dot p_{x_t} - \tfrac{1}{p_{z_t}}$.

Lastly, $\gamma \in [0,1]$ represents the discount factor regulating the preference for immediate versus long-term reward (with an effective horizon equal to $\tfrac{1}{1-\gamma}$), and $\rho$ is the distribution, defined over $\mathcal S$, from which the MDP's *initial* state is sampled, $s_0 \sim \rho$.

A length-$T$ *trajectory* is the (random) sequence
``` math
\begin{equation}
  \tau = (s_0, a_0, r_0, s_1, a_1, r_1, \dots, s_{T-1}, a_{T-1}, r_{T-1}, s_T),
\end{equation}
```
with per-step rewards defined as $r_t = r (s_t, a_t, s_{t+1})$ for ease of notation. Interestingly, assuming both the environment dynamics and the conditional distribution over actions given states--the *policy*--to be *Markovian*,
``` math
\begin{aligned}
\mathbb P(s_{t+1}\vert s_t, a_t, s_{t-1}, a_{t-1}, \dots, s_0, a_0 ) &= \mathbb P (s_{t+1}\vert s_t, a_t), \\
\mathbb P(a_t\vert s_t, a_{t-1}, s_{t-1}, \dots, s_0, a_0) &= \mathbb P(a_t\vert s_t),
\end{aligned}
```
the probability of observing a given trajectory $\tau$ factorizes into
``` math
\begin{equation}
  \mathbb P(\tau) = \mathbb P (s_0) \prod_{t=0}^{T-1} \mathbb P (s_{t+1}\vert s_t, a_t)\ \mathbb P(a_t\vert s_t).
\end{equation}
```

Policies $\mathbb P(a_t\vert s_t)$ are typically indicated as $\pi(a_t\vert s_t)$, and often parametrized via $\theta$, yielding $\pi_\theta (a_t\vert s_t)$. Policies are trained by optimizing the (discounted) *return* associated to a given $\tau$, i.e. the (random) sum of measured rewards over the trajectory:
``` math
G(\tau) = \sum_{t=0}^{T-1} \gamma^{t} r_t.
```
In this sense, agents seek to learn control strategies (*policies*, $\pi_\theta$) maximizing the expected return $\mathbb E_{\tau \sim \pi_\theta} [G(\tau)]$. For a given dynamics $\mathcal D$--i.e., for a given problem--taking the expectation over the (possibly random) trajectories resulting from acting according to a certain policy provides a direct, goal-conditioned ordering in the space of all possible policies $\Pi$, yielding the (maximization) target $J : \Pi \mapsto \mathbb R$
``` math
\begin{aligned}
J(\pi_\theta) &= \mathbb E_{\tau \sim \mathbb P_{\theta; \mathcal D}} [G(\tau)], \\
\mathbb P_{\theta; \mathcal D} (\tau) &= \rho(s_0) \prod_{t=0}^{T-1} \mathcal D (s_t, a_t, s_{t+1})\ \pi_\theta (a_t\vert s_t).
\end{aligned}
```

Because in the RL framework the agent is assumed to only be able to observe the environment dynamics, and not to intervene on them, [eq:RL-j-function] varies exclusively with the policy followed. In turn, MDPs naturally provide a framework to optimize over the space of possible behaviors an agent might enact ($\pi \in \Pi$), searching for the *optimal policy* $\pi^* = \arg \max_{\theta} J(\pi_\theta)$, where $\theta$ is the parametrization adopted by the policy set $\Pi: \pi_\theta \in \Pi, \ \forall \theta$. Other than providing a target for policy search, $G(\tau)$ can also be used as a target to discriminate between states and state-action pairs.
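
To make these quantities concrete, here is a minimal sketch (NumPy only; the rewards are illustrative stand-ins for data logged during a rollout) computing the discounted return $G(\tau)$ of a single recorded trajectory; averaging it over many trajectories collected under $\pi_\theta$ yields a Monte-Carlo estimate of $J(\pi_\theta)$.

```python
import numpy as np

def discounted_return(rewards: np.ndarray, gamma: float = 0.99) -> float:
    """G(tau) = sum_t gamma^t * r_t, computed over a length-T reward sequence."""
    discounts = gamma ** np.arange(len(rewards))
    return float(np.sum(discounts * rewards))

# Illustrative per-step rewards from one length-5 trajectory.
rewards = np.array([0.0, 0.1, 0.2, 0.5, 1.0])
print(discounted_return(rewards))

# An MC estimate of J(pi) averages G(tau) over trajectories sampled under pi:
# J_hat = np.mean([discounted_return(r) for r in rewards_per_trajectory])
```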
Given any state $s \in \mathcal S$--e.g., a given configuration of the robot--the *state-value* function
``` math
V_\pi(s) = \mathbb E_{\tau \sim \pi} [G(\tau) \big \vert s_0 = s]
```
can be used to discriminate between desirable and undesirable states in terms of long-term (discounted) reward maximization, under a given policy $\pi$. Similarly, the *state-action* value function also conditions the cumulative discounted reward on selecting action $a$ when in $s$, and acting according to $\pi$ thereafter:
``` math
Q_\pi(s,a) = \mathbb E_{\tau \sim \pi} [G (\tau) \big \vert s_0 = s, a_0=a].
```
Crucially, the two value functions are interrelated:
``` math
\begin{aligned}
Q_\pi(s_t, a_t) &= \mathbb{E}_{s_{t+1}\sim \mathbb P(\bullet \vert s_t, a_t)} [r_t + \gamma V_\pi(s_{t+1})], \\
V_\pi(s_t) &= \mathbb E_{a_t\sim \pi(\bullet \vert s_t)} [Q_\pi (s_t, a_t)].
\end{aligned}
```
Inducing an ordering over states and state-action pairs under $\pi$, value functions are central to most RL algorithms. A variety of methods have been developed in RL as standalone attempts to find (approximate) solutions to the problem of maximizing cumulative reward (Figure 15).

Popular approaches for continuous state and action spaces--such as those studied within robotics--include @schulmanTrustRegionPolicy2017, @schulmanProximalPolicyOptimization2017, @haarnojaSoftActorCriticOffPolicy2018. Across manipulation @akkayaSolvingRubiksCube2019 and locomotion @leeLearningQuadrupedalLocomotion2020 problems, RL proved extremely effective in providing a platform to (1) adopt a unified, streamlined perception-to-action pipeline, (2) natively integrate proprioception with multi-modal, high-dimensional sensor streams, (3) disregard a description of the environment dynamics, focusing on observed interaction data rather than modeling, and (4) anchor policies in the experience collected and stored in datasets. For a more complete survey of applications of RL to robotics, we refer the reader to @koberReinforcementLearningRobotics, @tangDeepReinforcementLearning2024.

## Real-world RL for Robotics

Streamlined end-to-end control pipelines, data-driven feature extraction and a disregard for explicit modeling in favor of interaction data are all appealing features of RL for robotics. However, particularly in the context of real-world robotics, RL still suffers from limitations concerning machine safety and learning efficiency.

First, especially early in training, actions are typically explorative, and thus erratic. On physical systems, untrained policies may command high velocities, self-colliding configurations, or torques exceeding joint limits, leading to wear and potential hardware damage. Mitigating these risks requires external safeguards (e.g., watchdogs, safety monitors, emergency stops), often incurring a high degree of human supervision. Further, in the typical episodic setting considered in most robotics problems, experimentation is substantially slowed down by the need to manually reset the environment over the course of training, a time-consuming and brittle process.

Second, learning with a limited number of samples remains problematic in RL, limiting its applicability to real-world robotics due to the consequently prohibitive timescales of training. Even strong algorithms such as SAC @haarnojaSoftActorCriticOffPolicy2018 typically require a large number of transitions $\{ (s_t, a_t, r_t, s_{t+1})\}_{t=1}^N$. On hardware, generating such data is time-consuming and can even be prohibitive.

Training RL policies in simulation @tobinDomainRandomizationTransferring2017 addresses both issues: it eliminates physical risk and dramatically increases throughput. Yet, simulators require significant modeling effort, and rely on assumptions (simplified physical modeling, instantaneous actuation, static environmental conditions, etc.) that limit the transfer of policies learned in simulation, due to the discrepancy between real and simulated environments (*reality gap*, Figure 16). *Domain randomization* (DR) is a popular technique to overcome the reality gap, consisting in randomizing parameters of the simulated environment during training to induce robustness to specific disturbances. In turn, DR is employed to increase the diversity of scenarios over the course of training, improving the chances of sim-to-real transfer @akkayaSolvingRubiksCube2019, @antonovaReinforcementLearningPivoting2017, @jiDribbleBotDynamicLegged2023. In practice, DR is performed by further parametrizing the *simulator*'s dynamics $\mathcal D \equiv \mathcal D_\xi$ with a *dynamics* (random) vector $\xi$ drawn from an arbitrary distribution, $\xi \sim \Xi$. Over the course of training--typically at each episode's reset--a new $\xi$ is drawn, and used to specify the environment's dynamics for that episode. For instance, one could decide to randomize the friction coefficient of the surface in a locomotion task (Figure 17), or the center of mass of an object in a manipulation task.

While effective in transferring policies across the reality gap in real-world robotics @tobinDomainRandomizationTransferring2017, @akkayaSolvingRubiksCube2019, @jiDribbleBotDynamicLegged2023, @tiboniDomainRandomizationEntropy2024, DR often requires extensive manual engineering. First, identifying which parameters to randomize--i.e., the *support* $\text{supp} (\Xi)$ of $\Xi$--is an inherently task-specific process. When locomoting over different terrains, randomizing the friction coefficient is a reasonable choice, yet not fully resolutive, as other factors (lighting conditions, external temperature, joint fatigue, etc.) may prove just as important, making the selection of these parameters yet another source of brittleness.

Selecting the dynamics distribution $\Xi$ is also non-trivial. On the one hand, distributions with low entropy risk causing failure at transfer time, due to the limited robustness induced over the course of training. On the other hand, excessive randomization may cause over-regularization and hinder performance. Consequently, the research community has investigated approaches to automatically select the randomization distribution $\Xi$, using signals from the training process or tuning it to reproduce observed real-world trajectories. @akkayaSolvingRubiksCube2019 use a parametric uniform distribution $\mathcal U(a, b)$ as $\Xi$, widening the bounds as training progresses and the agent's performance improves (AutoDR). While effective, AutoDR requires significant tuning--the bounds are widened by a fixed, pre-specified amount $\Delta$--and may disregard data when performance *does not* improve after a distribution update @tiboniDomainRandomizationEntropy2024. @tiboniDomainRandomizationEntropy2024 propose DORAEMON, a method similar to AutoDR that evolves $\Xi$ based on training signals, but with the key difference of explicitly maximizing the entropy of parametric Beta distributions, inherently more flexible than uniform distributions.
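
Regardless of how $\Xi$ is chosen--fixed by hand, uniform as in AutoDR, or Beta as in DORAEMON--the sampling mechanism of DR itself is simple: draw a new $\xi$ at every episode reset. A minimal sketch follows; the parameter names, ranges and the `simulate_episode` rollout are illustrative placeholders, not any specific simulator's API.

```python
import numpy as np

rng = np.random.default_rng(seed=0)

# Illustrative randomization distribution Xi: independent uniform ranges over
# a hand-picked support (surface friction, payload mass, actuation delay).
XI_SUPPORT = {
    "friction": (0.4, 1.2),
    "payload_mass_kg": (0.05, 0.30),
    "actuation_delay_s": (0.00, 0.03),
}

def sample_dynamics() -> dict:
    """Draw xi ~ Xi, one value per randomized parameter."""
    return {name: rng.uniform(lo, hi) for name, (lo, hi) in XI_SUPPORT.items()}

for episode in range(1_000):
    xi = sample_dynamics()                    # new dynamics vector at every reset
    # simulate_episode(policy, dynamics=xi)   # hypothetical rollout under D_xi
```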
DORAEMON proves particularly effective at dynamically increasing the entropy of the training distribution by employing a max-entropy objective under a performance-constraint formulation. Other approaches to automatic DR consist in specifically tuning $\Xi$ to align the simulation and real-world domains as much as possible. For instance, @chebotar2019closing interleave in-simulation policy training with repeated real-world policy rollouts used to adjust $\Xi$ based on real-world data, while @tiboniDROPOSimtoRealTransfer2023 leverage a single, pre-collected set of real-world trajectories and tune $\Xi$ under a simple likelihood objective.

While DR has shown promise, it does not address the main limitation that, even assuming an ideal distribution $\Xi$ to sample from were available, many robotics problems cannot be simulated with high-enough fidelity under practical computational constraints in the first place. Simulating contact-rich manipulation of possibly deformable or soft materials--e.g., *folding a piece of clothing*--can be costly and time-intensive, limiting the benefits of in-simulation training.

A perhaps more fundamental limitation of RL for robotics is the general unavailability of *dense* reward functions for complicated tasks, the design of which is essentially based on human expertise and trial and error. In practice, *sparse* reward functions can be used to conclude whether one specific goal has been attained--*has this t-shirt been correctly folded?*--but unfortunately make learning more challenging. As a result, despite notable successes, deploying RL directly on real-world robots at scale remains challenging.

To make the most of (1) the growing number of openly available datasets and (2) relatively inexpensive robots like the SO-100, RL could (1) be anchored in already-collected trajectories--limiting erratic and dangerous exploration--and (2) train directly in the real world--bypassing the aforementioned issues with low-fidelity simulations. In such a context, sample-efficient learning is also paramount, as training in the real world is inherently time-bottlenecked.

Off-policy algorithms like Soft Actor-Critic (SAC) @haarnojaSoftActorCriticOffPolicy2018 tend to be more sample-efficient than their on-policy counterparts @schulmanProximalPolicyOptimization2017, due to the presence of a *replay buffer* used over the course of training. Besides allowing the re-use of past transitions $(s_t, a_t, r_t, s_{t+1})$, the replay buffer can also accommodate the injection of previously-collected data into the training process @ballEfficientOnlineReinforcement2023. Using expert demonstrations to guide learning, together with learned rewards, RL training can effectively be carried out in the real world @luoSERLSoftwareSuite2025. Interestingly, when complemented with in-training human interventions, real-world RL agents have been shown to learn policies with near-perfect success rates on challenging manipulation tasks in 1-2 hours @luoPreciseDexterousRobotic2024.

#### Sample-efficient RL

In an MDP, the optimal policy $\pi^*$ can be derived from its associated $Q$-function, $Q_{\pi^*}$; in particular, the optimal action(s) $\mu(s_t)$ can be selected by maximizing the optimal $Q$-function over the action space,
``` math
\mu(s_t) = \arg\max_{a_t\in \mathcal A} Q_{\pi^*}(s_t, a_t).
```
Interestingly, the $Q^*$-function satisfies a recursive relationship (*Bellman equation*) based on a very natural intuition [^2]:

> \[...\] If the optimal value $Q^*(s_{t+1}, a_{t+1})$ of the \[state\] $s_{t+1}$ was known for all possible actions $a_{t+1}$, then the optimal strategy is to select the action $a_{t+1}$ maximizing the expected value of $r_t + \gamma Q^*(s_{t+1}, a_{t+1})$
> ``` math
> Q^*(s_t, a_t) = \mathbb E_{s_{t+1} \sim \mathbb P(\bullet \vert s_t, a_t)} [r_t + \gamma \max_{a_{t+1} \in \mathcal A} Q^*(s_{t+1}, a_{t+1}) \big\vert s_t, a_t]
> ```

In turn, the optimal $Q$-function is guaranteed to be self-consistent by definition. *Value-iteration* methods exploit this relationship (and/or its state-value counterpart, $V^*(s_t)$) by iteratively updating an initial estimate $Q_i$ of $Q^*$, using the Bellman equation as the update rule (*Q-learning*):
``` math
Q_{i+1}(s_t, a_t) \leftarrow \mathbb E_{s_{t+1} \sim \mathbb P(\bullet \vert s_t, a_t)} [r_t + \gamma \max_{a_{t+1} \in \mathcal A} Q_i (s_{t+1}, a_{t+1}) \big\vert s_t, a_t], \quad i=0,1,2,\dots,K.
```
Then, one can derive the (ideally, near-optimal) policy by explicitly maximizing the final (ideally, near-optimal) estimate $Q_K \approx Q^*$ over the action space at each timestep. In fact, under certain assumptions on the MDP considered, $Q_K \to Q^* \, \text{as } K \to \infty$.

Effective in its early applications to small-scale discrete problems and theoretically sound, vanilla Q-learning proved difficult to scale to large $\mathcal S\times \mathcal A$ problems, in which merely storing $Q : \mathcal S\times \mathcal A\mapsto \mathbb R$ may be prohibitive. Also, vanilla Q-learning is not directly usable for *continuous*, unstructured state-action space MDPs, such as those considered in robotics. In their seminal work on *Deep Q-Learning* (DQN), @mnihPlayingAtariDeep2013 propose learning Q-values using deep convolutional neural networks, thereby accommodating large and even unstructured *state* spaces. DQN parametrizes the Q-function using a neural network with parameters $\theta$, updating the parameters by sequentially minimizing the expected squared temporal-difference error (TD-error, $\delta_i$):
``` math
\begin{aligned}
\mathcal L(\theta_i) &= \mathbb E_{(s_t, a_t) \sim \chi(\bullet)}
  \big[
  (\underbrace{y_i - Q_{\theta_i}(s_t, a_t)}_{\delta_i})^2
  \big], \\
y_i &= \mathbb E_{s_{t+1} \sim \mathbb P(\bullet \vert s_t, a_t)} \big[ r_t + \gamma \max_{a_{t+1}\in \mathcal A} Q_{\theta_{i-1}} (s_{t+1}, a_{t+1}) \big],
\end{aligned}
```
where $\chi$ represents a behavior distribution over state-action pairs. Crucially, $\chi$ can in principle be different from the policy being followed, effectively allowing the reuse of prior data stored in a *replay buffer* in the form of $(s_t, a_t, r_t, s_{t+1})$ transitions, used to form the TD-target $y_i$, the TD-error $\delta_i$ and the loss function [eq:dqn-loss] via Monte-Carlo (MC) estimates.

While effective in handling large, unstructured state spaces in discrete action-space problems, applying DQN to continuous control problems proved challenging. Indeed, in the case of high-capacity function approximators such as neural networks, solving $\max_{a_t \in \mathcal A} Q_\theta(s_t, a_t)$ at each timestep is simply infeasible, due to (1) the continuous nature of the action space ($\mathcal A\subset \mathbb R^n$ for some $n$) and (2) the impossibility of finding a cheap (ideally, closed-form) maximizer of $Q_\theta$.
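
As a concrete (discrete-action) illustration of [eq:dqn-loss], the sketch below computes the TD-target and the squared TD-error on a replay-buffer batch using PyTorch, with a frozen copy of the network standing in for $Q_{\theta_{i-1}}$. The network sizes, batch and hyperparameters are illustrative, and the random tensors stand in for transitions sampled from the buffer.

```python
import torch
import torch.nn as nn

# Illustrative dimensions: 8-dimensional state, 4 discrete actions.
q_net = nn.Sequential(nn.Linear(8, 64), nn.ReLU(), nn.Linear(64, 4))
target_net = nn.Sequential(nn.Linear(8, 64), nn.ReLU(), nn.Linear(64, 4))
target_net.load_state_dict(q_net.state_dict())  # frozen copy, plays Q_{theta_{i-1}}
optimizer = torch.optim.Adam(q_net.parameters(), lr=1e-3)
gamma = 0.99

# A batch of (s, a, r, s', done) transitions sampled from the replay buffer
# (random tensors here, standing in for real data).
s = torch.randn(32, 8)
a = torch.randint(0, 4, (32, 1))
r = torch.randn(32, 1)
s_next = torch.randn(32, 8)
done = torch.zeros(32, 1)

with torch.no_grad():
    # TD-target: y = r + gamma * max_a' Q_{theta_{i-1}}(s', a')
    y = r + gamma * (1 - done) * target_net(s_next).max(dim=1, keepdim=True).values

q_sa = q_net(s).gather(1, a)       # Q_theta(s, a) for the actions actually taken
loss = ((y - q_sa) ** 2).mean()    # squared TD-error, averaged over the batch

optimizer.zero_grad()
loss.backward()
optimizer.step()
```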
@silverDeterministicPolicyGradient2014 tackle this continuous-action challenge by using a *deterministic* function of the state $s_t$ as policy, $\mu_\phi(s_t) = a_t$, parametrized by $\phi$. Thus, policies can be iteratively refined by updating $\phi$ along the direction
``` math
\begin{equation}
  d_\phi = \mathbb E_{s_t \sim \mathbb P (\bullet)} [\nabla_\phi Q(s_t, a_t)\vert_{a_t = \mu_\phi(s_t)}] = \mathbb E_{s_t \sim \mathbb P(\bullet)} [\nabla_{a_t} Q(s_t, a_t) \vert_{a_t = \mu_\phi(s_t)} \cdot \nabla_\phi \mu_\phi(s_t)].
\end{equation}
```
Provably, [eq:deterministic-pg] is the *deterministic policy gradient* (DPG) of the policy $\mu_\phi$ @silverDeterministicPolicyGradient2014, so that updates $\phi_{k+1}\leftarrow \phi_k + \alpha d_\phi$ are guaranteed to increase the (deterministic) cumulative discounted reward, $J(\mu_\phi)$. @lillicrapContinuousControlDeep2019 extended DPG to the case of (1) high-dimensional unstructured observations and (2) continuous action spaces, introducing Deep Deterministic Policy Gradient (DDPG), an important algorithm in RL and its applications to robotics. DDPG adopts a modified TD-target compared to the one defined in [eq:TD-target], maintaining a policy network used to select actions, yielding
``` math
\begin{equation}
y_i = \mathbb E_{s_{t+1} \sim \mathbb P(\bullet \vert s_t, a_t)} \big[ r_t + \gamma Q_{\theta_{i-1}} (s_{t+1}, \mu_\phi(s_{t+1})) \big].
\end{equation}
```
Similarly to DQN, DDPG employs the same replay buffer mechanism to reuse past transitions over training, for increased sample efficiency, and to estimate the loss function via MC estimates.

Soft Actor-Critic (SAC) @haarnojaSoftActorCriticOffPolicy2018 is a derivation of DDPG within the maximum-entropy (MaxEnt) RL framework, in which RL agents are tasked with maximizing the discounted cumulative reward while acting as randomly as possible. MaxEnt RL @haarnojaReinforcementLearningDeep2017 has proven particularly robust thanks to the development of diverse behaviors, incentivized by its entropy-regularized formulation. To this end, MaxEnt RL revisits the RL objective $J (\pi)$ to specifically account for the policy entropy, $J(\pi) = \sum_{t=0}^T \mathbb{E}_{(s_t, a_t) \sim \chi} [r_t + \alpha \mathcal H(\pi (\bullet \vert s_t))]$. This modified objective results in the *soft* TD-target:
``` math
\begin{equation}
  y_i = \mathbb E_{s_{t+1} \sim \mathbb P( \bullet \vert s_t, a_t)} [r_t + \gamma \left( Q_{\theta_{i-1}} (s_{t+1}, a_{t+1}) - \alpha \log \pi_\phi(a_{t+1} \vert s_{t+1}) \right)], \quad a_{t+1} \sim \pi_\phi(\bullet \vert s_{t+1}).
\end{equation}
```
Similarly to DDPG, SAC also maintains an explicit policy, trained under the same MaxEnt framework for the maximization of [eq:J-soft], and updated using
``` math
\begin{equation}
  \pi_{k+1} \leftarrow \arg\min_{\pi^\prime \in \Pi} \text{D}_{\text{KL}}\left(\pi^\prime (\bullet \vert s_t) \bigg\Vert \frac{\exp(Q_{\pi_k}(s_t, \bullet))}{Z_{\pi_k}(s_t)} \right).
\end{equation}
```
The update rule in [eq:sac-policy-update] optimizes the policy while projecting it onto a set $\Pi$ of tractable distributions (e.g., Gaussians, @haarnojaReinforcementLearningDeep2017).

#### Sample-efficient, data-driven RL

Importantly, sampling $(s_t, a_t, r_t, s_{t+1})$ from the replay buffer $D$ conveniently allows approximating the previously introduced expectations for the TD-target and TD-error through Monte-Carlo (MC) estimates.
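
For instance, the sketch below forms an MC estimate of the soft TD-target above on a batch sampled from $D$, using PyTorch; the network sizes, the fixed temperature $\alpha$, and the plain Gaussian (non-squashed) actor are illustrative simplifications of a full SAC implementation.

```python
import torch
import torch.nn as nn
from torch.distributions import Normal

gamma, alpha = 0.99, 0.2  # discount and (fixed, illustrative) entropy temperature

# Illustrative target critic Q(s, a) and Gaussian actor pi(a | s):
# 8-dimensional states, 2-dimensional continuous actions.
critic_target = nn.Sequential(nn.Linear(8 + 2, 64), nn.ReLU(), nn.Linear(64, 1))
actor = nn.Sequential(nn.Linear(8, 64), nn.ReLU(), nn.Linear(64, 4))  # outputs mean, log-std

def sample_action(s):
    mean, log_std = actor(s).chunk(2, dim=-1)
    dist = Normal(mean, log_std.exp())
    a = dist.rsample()                                     # reparametrized sample
    log_prob = dist.log_prob(a).sum(dim=-1, keepdim=True)  # log pi(a | s)
    return a, log_prob

# Batch of transitions sampled from the replay buffer (random stand-ins here).
r = torch.randn(32, 1)
s_next = torch.randn(32, 8)
done = torch.zeros(32, 1)

with torch.no_grad():
    a_next, log_pi = sample_action(s_next)                       # a' ~ pi(. | s')
    q_next = critic_target(torch.cat([s_next, a_next], dim=-1))  # Q_target(s', a')
    # Soft TD-target: y = r + gamma * (Q_target(s', a') - alpha * log pi(a' | s'))
    y = r + gamma * (1 - done) * (q_next - alpha * log_pi)
```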
The replay buffer $D$ also proves extremely useful in maintaining a history of previous transitions and using it for training, improving sample efficiency. Furthermore, it naturally provides an entry point to inject offline trajectories--recorded, for instance, by a human demonstrator--into the training process.

Reinforcement Learning with Prior Data (RLPD) @ballEfficientOnlineReinforcement2023 is an offline-to-online RL algorithm leveraging prior data to effectively accelerate the training of a SAC agent. Unlike previous works on offline-to-online RL, RLPD avoids any pre-training and instead uses the available offline data $D_\text{offline}$ to improve online learning from scratch. During each training step, transitions from the offline and online replay buffers are sampled in equal proportion, and used in the underlying SAC routine.

#### Sample-efficient, data-driven, real-world RL

Despite the possibility of leveraging offline data for learning, the effectiveness of real-world RL training is still limited by the need to define a task-specific, often hard-to-specify reward function. Further, even assuming access to a well-defined reward function, typical robotics pipelines rely mostly on proprioceptive inputs augmented by camera streams of the environment. As such, even well-defined rewards would need to be derived from processed representations of unstructured observations, introducing brittleness. In their technical report, @luoSERLSoftwareSuite2025 empirically address the need (1) to define a reward function and (2) to apply it to image observations, by introducing a series of tools allowing for the streamlined training of *reward classifiers* $c$, as well as for jointly learning forward-backward controllers to speed up real-world RL. Reward classifiers are particularly useful for complex tasks--e.g., folding a t-shirt--for which a precise reward formulation is arbitrarily complex to obtain, or which would require significant shaping and are more easily specified directly from demonstrations of success ($e^+$) or failure ($e^-$) states $s \in \mathcal S$, with a natural choice for the state-conditioned reward function $r: \mathcal S \mapsto \mathbb R$ being $r(s) = \log c(e^+ \vert s)$. Further, @luoSERLSoftwareSuite2025 demonstrate the benefits of learning *forward* (executing the task from initial state to completion) and *backward* (resetting the environment from completion to the initial state) controllers, parametrized by separate policies.

Lastly, in order to improve the robustness of their approach to different goals while maintaining practical scalability, @luoSERLSoftwareSuite2025 introduce a modified state and action space, expressing proprioceptive configurations $q$ and actions $\dot q$ in the frame of the end-effector pose at $t=0$. By randomizing the initial pose of the end-effector ($s_0$), @luoSERLSoftwareSuite2025 achieve an effect similar to manually randomizing the environment between episodes, with the benefit of keeping the environment in the same condition across multiple training episodes, which makes the approach considerably more practical and scalable.
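
The buffer mechanics underpinning both RLPD and the human-in-the-loop approach described next are straightforward. A minimal sketch of the symmetric 50/50 sampling of offline (demonstration) and online transitions follows; the buffers are plain Python lists of transition tuples here, an illustrative simplification.

```python
import random

def sample_mixed_batch(offline_buffer, online_buffer, batch_size=256):
    """RLPD-style symmetric sampling: half of the batch comes from offline
    (demonstration) data, half from the online replay buffer."""
    half = batch_size // 2
    batch = random.choices(offline_buffer, k=half) + random.choices(online_buffer, k=half)
    random.shuffle(batch)
    return batch  # fed to the underlying off-policy update (e.g., SAC)

# In a HIL-SERL-like setup, human-intervention transitions would be appended to
# *both* buffers, making them more likely to be drawn than autonomous ones.
```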

Building on off-policy deep Q-learning with replay buffers, entropy regularization for better exploration and performance, expert demonstrations to guide learning, and a series of tools and recommendations for real-world training using reward classifiers (Figure 18), @luoPreciseDexterousRobotic2024 introduce human interventions during training, learning near-optimal policies on challenging real-world manipulation tasks in 1-2 hours.

Human-in-the-Loop Sample-Efficient Robot Reinforcement Learning (HIL-SERL) @luoPreciseDexterousRobotic2024 augments offline-to-online RL with targeted human corrections during training, and employs prior data to (1) train a reward classifier and (2) bootstrap RL training on expert trajectories. While demonstrations provide the initial dataset seeding learning and constraining early exploration, interactive corrections allow a human supervisor to intervene on failure modes and supply targeted interventions to aid the learning process. Crucially, human interventions are stored in both the offline and online replay buffers, unlike the autonomous transitions generated at training time, which are stored in the online buffer only. Consequently, given an intervention starting at timestep $k \in (0, T)$, length-$K$ human intervention data $\{ (s^{\text{human}}_k, a^{\text{human}}_k, r^{\text{human}}_k, s^{\text{human}}_{k+1})\}_{k=1}^K$ is more likely to be sampled for off-policy learning than the data generated online during training, providing stronger supervision to the agent while still allowing for autonomous learning. Empirically, HIL-SERL attains near-perfect success rates on diverse manipulation tasks within 1-2 hours of training @luoPreciseDexterousRobotic2024, underscoring how combining offline datasets with online RL can markedly improve stability and data efficiency, and ultimately even enable real-world RL training.

### Code Example: Real-world RL

**TODO(fracapuano): work out rl training example**

### Limitations of RL in Real-World Robotics: Simulators and Reward Design

Despite the advancements in real-world RL training, training RL agents in the real world still suffers from the following limitations:

- In those instances where real-world training experience is prohibitively expensive to gather @degraveMagneticControlTokamak2022, @bellemareAutonomousNavigationStratospheric2020, in-simulation training is often the only option. However, high-fidelity simulators for real-world problems can be difficult to build and maintain, especially for contact-rich manipulation and tasks involving deformable or soft materials.

- Reward design poses an additional source of brittleness. Dense shaping terms are often required to guide exploration in long-horizon problems, but poorly tuned terms can lead to specification gaming or local optima. Sparse rewards avoid shaping but exacerbate credit assignment and slow down learning. In practice, complex behaviors require effortful reward shaping: a brittle and error-prone process.

Advances in Behavioral Cloning (BC) from corpora of human demonstrations address both of these concerns. By learning in a supervised fashion to reproduce expert demonstrations, BC methods prove competitive while bypassing the need for simulated environments and hard-to-define reward functions.
+ +# Robot (Imitation) Learning diff --git a/app/src/content/assets/image/ch2/ch2-approaches.png b/app/src/content/assets/image/figures/ch2/ch2-approaches.png similarity index 100% rename from app/src/content/assets/image/ch2/ch2-approaches.png rename to app/src/content/assets/image/figures/ch2/ch2-approaches.png diff --git a/app/src/content/assets/image/ch2/ch2-classical-limitations.png b/app/src/content/assets/image/figures/ch2/ch2-classical-limitations.png similarity index 100% rename from app/src/content/assets/image/ch2/ch2-classical-limitations.png rename to app/src/content/assets/image/figures/ch2/ch2-classical-limitations.png diff --git a/app/src/content/assets/image/ch2/ch2-cost-accessibility.png b/app/src/content/assets/image/figures/ch2/ch2-cost-accessibility.png similarity index 100% rename from app/src/content/assets/image/ch2/ch2-cost-accessibility.png rename to app/src/content/assets/image/figures/ch2/ch2-cost-accessibility.png diff --git a/app/src/content/assets/image/ch2/ch2-planar-manipulator-floor-box.png b/app/src/content/assets/image/figures/ch2/ch2-planar-manipulator-floor-box.png similarity index 100% rename from app/src/content/assets/image/ch2/ch2-planar-manipulator-floor-box.png rename to app/src/content/assets/image/figures/ch2/ch2-planar-manipulator-floor-box.png diff --git a/app/src/content/assets/image/ch2/ch2-planar-manipulator-floor-shelf.png b/app/src/content/assets/image/figures/ch2/ch2-planar-manipulator-floor-shelf.png similarity index 100% rename from app/src/content/assets/image/ch2/ch2-planar-manipulator-floor-shelf.png rename to app/src/content/assets/image/figures/ch2/ch2-planar-manipulator-floor-shelf.png diff --git a/app/src/content/assets/image/ch2/ch2-planar-manipulator-floor.png b/app/src/content/assets/image/figures/ch2/ch2-planar-manipulator-floor.png similarity index 100% rename from app/src/content/assets/image/ch2/ch2-planar-manipulator-floor.png rename to app/src/content/assets/image/figures/ch2/ch2-planar-manipulator-floor.png diff --git a/app/src/content/assets/image/ch2/ch2-planar-manipulator-free.png b/app/src/content/assets/image/figures/ch2/ch2-planar-manipulator-free.png similarity index 100% rename from app/src/content/assets/image/ch2/ch2-planar-manipulator-free.png rename to app/src/content/assets/image/figures/ch2/ch2-planar-manipulator-free.png diff --git a/app/src/content/assets/image/ch2/ch2-platforms.png b/app/src/content/assets/image/figures/ch2/ch2-platforms.png similarity index 100% rename from app/src/content/assets/image/ch2/ch2-platforms.png rename to app/src/content/assets/image/figures/ch2/ch2-platforms.png diff --git a/app/src/content/assets/image/ch2/ch2-so100-to-planar-manipulator.png b/app/src/content/assets/image/figures/ch2/ch2-so100-to-planar-manipulator.png similarity index 100% rename from app/src/content/assets/image/ch2/ch2-so100-to-planar-manipulator.png rename to app/src/content/assets/image/figures/ch2/ch2-so100-to-planar-manipulator.png diff --git a/app/src/content/assets/image/ch3/ch3-agent-env.png b/app/src/content/assets/image/figures/ch3/ch3-agent-env.png similarity index 100% rename from app/src/content/assets/image/ch3/ch3-agent-env.png rename to app/src/content/assets/image/figures/ch3/ch3-agent-env.png diff --git a/app/src/content/assets/image/ch3/ch3-duck-sim-vs-real.png b/app/src/content/assets/image/figures/ch3/ch3-duck-sim-vs-real.png similarity index 100% rename from app/src/content/assets/image/ch3/ch3-duck-sim-vs-real.png rename to 
app/src/content/assets/image/figures/ch3/ch3-duck-sim-vs-real.png diff --git a/app/src/content/assets/image/ch3/ch3-hil-serl-examples.png b/app/src/content/assets/image/figures/ch3/ch3-hil-serl-examples.png similarity index 100% rename from app/src/content/assets/image/ch3/ch3-hil-serl-examples.png rename to app/src/content/assets/image/figures/ch3/ch3-hil-serl-examples.png diff --git a/app/src/content/assets/image/ch3/ch3-learning-atlas.png b/app/src/content/assets/image/figures/ch3/ch3-learning-atlas.png similarity index 100% rename from app/src/content/assets/image/ch3/ch3-learning-atlas.png rename to app/src/content/assets/image/figures/ch3/ch3-learning-atlas.png diff --git a/app/src/content/assets/image/ch3/ch3-learning-benefits.png b/app/src/content/assets/image/figures/ch3/ch3-learning-benefits.png similarity index 100% rename from app/src/content/assets/image/ch3/ch3-learning-benefits.png rename to app/src/content/assets/image/figures/ch3/ch3-learning-benefits.png diff --git a/app/src/content/assets/image/ch3/ch3-many-ducks.png b/app/src/content/assets/image/figures/ch3/ch3-many-ducks.png similarity index 100% rename from app/src/content/assets/image/ch3/ch3-many-ducks.png rename to app/src/content/assets/image/figures/ch3/ch3-many-ducks.png diff --git a/app/src/content/assets/image/ch3/ch3-rl-algorithms-atlas.png b/app/src/content/assets/image/figures/ch3/ch3-rl-algorithms-atlas.png similarity index 100% rename from app/src/content/assets/image/ch3/ch3-rl-algorithms-atlas.png rename to app/src/content/assets/image/figures/ch3/ch3-rl-algorithms-atlas.png diff --git a/app/src/content/assets/image/ch3/ch3-rl-examples.png b/app/src/content/assets/image/figures/ch3/ch3-rl-examples.png similarity index 100% rename from app/src/content/assets/image/ch3/ch3-rl-examples.png rename to app/src/content/assets/image/figures/ch3/ch3-rl-examples.png diff --git a/app/src/content/assets/image/ch4/ch4-act-decoder.png b/app/src/content/assets/image/figures/ch4/ch4-act-decoder.png similarity index 100% rename from app/src/content/assets/image/ch4/ch4-act-decoder.png rename to app/src/content/assets/image/figures/ch4/ch4-act-decoder.png diff --git a/app/src/content/assets/image/ch4/ch4-act-encoder.png b/app/src/content/assets/image/figures/ch4/ch4-act-encoder.png similarity index 100% rename from app/src/content/assets/image/ch4/ch4-act-encoder.png rename to app/src/content/assets/image/figures/ch4/ch4-act-encoder.png diff --git a/app/src/content/assets/image/ch4/ch4-act.png b/app/src/content/assets/image/figures/ch4/ch4-act.png similarity index 100% rename from app/src/content/assets/image/ch4/ch4-act.png rename to app/src/content/assets/image/figures/ch4/ch4-act.png diff --git a/app/src/content/assets/image/ch4/ch4-action-vs-observation-distribution.png b/app/src/content/assets/image/figures/ch4/ch4-action-vs-observation-distribution.png similarity index 100% rename from app/src/content/assets/image/ch4/ch4-action-vs-observation-distribution.png rename to app/src/content/assets/image/figures/ch4/ch4-action-vs-observation-distribution.png diff --git a/app/src/content/assets/image/ch4/ch4-async-inference.png b/app/src/content/assets/image/figures/ch4/ch4-async-inference.png similarity index 100% rename from app/src/content/assets/image/ch4/ch4-async-inference.png rename to app/src/content/assets/image/figures/ch4/ch4-async-inference.png diff --git a/app/src/content/assets/image/ch4/ch4-bc-trajectories.png b/app/src/content/assets/image/figures/ch4/ch4-bc-trajectories.png similarity index 100% rename 
from app/src/content/assets/image/ch4/ch4-bc-trajectories.png rename to app/src/content/assets/image/figures/ch4/ch4-bc-trajectories.png diff --git a/app/src/content/assets/image/ch4/ch4-diffusion-policy.png b/app/src/content/assets/image/figures/ch4/ch4-diffusion-policy.png similarity index 100% rename from app/src/content/assets/image/ch4/ch4-diffusion-policy.png rename to app/src/content/assets/image/figures/ch4/ch4-diffusion-policy.png diff --git a/app/src/content/assets/image/ch4/ch4-diffusion-robot-actions.png b/app/src/content/assets/image/figures/ch4/ch4-diffusion-robot-actions.png similarity index 100% rename from app/src/content/assets/image/ch4/ch4-diffusion-robot-actions.png rename to app/src/content/assets/image/figures/ch4/ch4-diffusion-robot-actions.png diff --git a/app/src/content/assets/image/ch4/ch4-diffusion-vs-flowmatching.png b/app/src/content/assets/image/figures/ch4/ch4-diffusion-vs-flowmatching.png similarity index 100% rename from app/src/content/assets/image/ch4/ch4-diffusion-vs-flowmatching.png rename to app/src/content/assets/image/figures/ch4/ch4-diffusion-vs-flowmatching.png diff --git a/app/src/content/assets/image/ch4/ch4-issues-with-bc.png b/app/src/content/assets/image/figures/ch4/ch4-issues-with-bc.png similarity index 100% rename from app/src/content/assets/image/ch4/ch4-issues-with-bc.png rename to app/src/content/assets/image/figures/ch4/ch4-issues-with-bc.png diff --git a/app/src/content/assets/image/ch4/ch4-latent-variable-model.png b/app/src/content/assets/image/figures/ch4/ch4-latent-variable-model.png similarity index 100% rename from app/src/content/assets/image/ch4/ch4-latent-variable-model.png rename to app/src/content/assets/image/figures/ch4/ch4-latent-variable-model.png diff --git a/app/src/content/assets/image/ch4/ch4-many-latents.png b/app/src/content/assets/image/figures/ch4/ch4-many-latents.png similarity index 100% rename from app/src/content/assets/image/ch4/ch4-many-latents.png rename to app/src/content/assets/image/figures/ch4/ch4-many-latents.png diff --git a/app/src/content/assets/image/ch4/ch4-normalizing-flows.png b/app/src/content/assets/image/figures/ch4/ch4-normalizing-flows.png similarity index 100% rename from app/src/content/assets/image/ch4/ch4-normalizing-flows.png rename to app/src/content/assets/image/figures/ch4/ch4-normalizing-flows.png diff --git a/app/src/content/assets/image/ch4/ch4-observation-action-mapping.png b/app/src/content/assets/image/figures/ch4/ch4-observation-action-mapping.png similarity index 100% rename from app/src/content/assets/image/ch4/ch4-observation-action-mapping.png rename to app/src/content/assets/image/figures/ch4/ch4-observation-action-mapping.png diff --git a/app/src/content/assets/image/ch4/ch4-queues.png b/app/src/content/assets/image/figures/ch4/ch4-queues.png similarity index 100% rename from app/src/content/assets/image/ch4/ch4-queues.png rename to app/src/content/assets/image/figures/ch4/ch4-queues.png diff --git a/app/src/content/assets/image/ch4/ch4-task-effect-on-pairs.png b/app/src/content/assets/image/figures/ch4/ch4-task-effect-on-pairs.png similarity index 100% rename from app/src/content/assets/image/ch4/ch4-task-effect-on-pairs.png rename to app/src/content/assets/image/figures/ch4/ch4-task-effect-on-pairs.png diff --git a/app/src/content/assets/image/ch5/ch5-generalist-policies-timeline.png b/app/src/content/assets/image/figures/ch5/ch5-generalist-policies-timeline.png similarity index 100% rename from app/src/content/assets/image/ch5/ch5-generalist-policies-timeline.png 
rename to app/src/content/assets/image/figures/ch5/ch5-generalist-policies-timeline.png diff --git a/app/src/content/assets/image/ch5/ch5-ml-vs-robotics-foundation.png b/app/src/content/assets/image/figures/ch5/ch5-ml-vs-robotics-foundation.png similarity index 100% rename from app/src/content/assets/image/ch5/ch5-ml-vs-robotics-foundation.png rename to app/src/content/assets/image/figures/ch5/ch5-ml-vs-robotics-foundation.png diff --git a/app/src/content/assets/image/ch5/ch5-pi0-sampling-timesteps.png b/app/src/content/assets/image/figures/ch5/ch5-pi0-sampling-timesteps.png similarity index 100% rename from app/src/content/assets/image/ch5/ch5-pi0-sampling-timesteps.png rename to app/src/content/assets/image/figures/ch5/ch5-pi0-sampling-timesteps.png diff --git a/app/src/content/assets/image/ch5/ch5-pi0.png b/app/src/content/assets/image/figures/ch5/ch5-pi0.png similarity index 100% rename from app/src/content/assets/image/ch5/ch5-pi0.png rename to app/src/content/assets/image/figures/ch5/ch5-pi0.png diff --git a/app/src/content/assets/image/ch5/ch5-smolvla.png b/app/src/content/assets/image/figures/ch5/ch5-smolvla.png similarity index 100% rename from app/src/content/assets/image/ch5/ch5-smolvla.png rename to app/src/content/assets/image/figures/ch5/ch5-smolvla.png diff --git a/app/src/content/assets/image/ch5/ch5-trends.png b/app/src/content/assets/image/figures/ch5/ch5-trends.png similarity index 100% rename from app/src/content/assets/image/ch5/ch5-trends.png rename to app/src/content/assets/image/figures/ch5/ch5-trends.png diff --git a/app/src/content/assets/data/somedata.json b/app/src/content/assets/image/figures/data/somedata.json similarity index 100% rename from app/src/content/assets/data/somedata.json rename to app/src/content/assets/image/figures/data/somedata.json diff --git a/app/src/content/assets/image/misc/lerobot-team.jpeg b/app/src/content/assets/image/misc/lerobot-team.jpeg deleted file mode 100644 index 330c9a79b9751bf86ffe5ce84a9aaac88ac5d7e6..0000000000000000000000000000000000000000 --- a/app/src/content/assets/image/misc/lerobot-team.jpeg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7b79149533fb8602ee423c91c068100657745045bfd1507a6a61e30d58c65877 -size 170202 diff --git a/assets/image/figures/ch1/ch1-lerobot-figure1.png b/assets/image/figures/ch1/ch1-lerobot-figure1.png new file mode 100644 index 0000000000000000000000000000000000000000..9a43981b7d60df842224ee6bff9be820809b36b6 --- /dev/null +++ b/assets/image/figures/ch1/ch1-lerobot-figure1.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a850d2b9170736a42366d65dd858408dcffafa3420a0c6cfd678bbdd29a196fa +size 2861318 diff --git a/assets/image/figures/ch2/ch2-approaches.png b/assets/image/figures/ch2/ch2-approaches.png new file mode 100644 index 0000000000000000000000000000000000000000..161aac09e5cae1c51d7a24deb2038ad80358e8cb --- /dev/null +++ b/assets/image/figures/ch2/ch2-approaches.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d07f3166fd9efe5b0823ecca63166c019b6fb9dcc912f7b1ae0fd209a25ba274 +size 93262 diff --git a/assets/image/figures/ch2/ch2-classical-limitations.png b/assets/image/figures/ch2/ch2-classical-limitations.png new file mode 100644 index 0000000000000000000000000000000000000000..969684eb34a3f473e0a0df8ec491c27144d69613 --- /dev/null +++ b/assets/image/figures/ch2/ch2-classical-limitations.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:85742a774d8d1ad3e36fc50d89c5a69409bce98ebe6bdba734896156ba668aa8 +size 4739243 diff --git a/assets/image/figures/ch2/ch2-cost-accessibility.png b/assets/image/figures/ch2/ch2-cost-accessibility.png new file mode 100644 index 0000000000000000000000000000000000000000..17aa82045475dc0e0537649285e4abd0a9aefd2b --- /dev/null +++ b/assets/image/figures/ch2/ch2-cost-accessibility.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:606cbb89fda90a2ddb22dc721ea978ffa9fe34a7f9f0bf1614b6ae53b4117411 +size 1962263 diff --git a/assets/image/figures/ch2/ch2-planar-manipulator-floor-box.png b/assets/image/figures/ch2/ch2-planar-manipulator-floor-box.png new file mode 100644 index 0000000000000000000000000000000000000000..608b518385558b273d591d7f76d1d2804ece01b8 --- /dev/null +++ b/assets/image/figures/ch2/ch2-planar-manipulator-floor-box.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c856918ffb061c235d05e74df6310412f5b41ea907f0f12f55fed5c8b45590b +size 93114 diff --git a/assets/image/figures/ch2/ch2-planar-manipulator-floor-shelf.png b/assets/image/figures/ch2/ch2-planar-manipulator-floor-shelf.png new file mode 100644 index 0000000000000000000000000000000000000000..47c539881d7b58df4b4493093ab6b780c349a476 --- /dev/null +++ b/assets/image/figures/ch2/ch2-planar-manipulator-floor-shelf.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4abb239c45a576a02fc2cbd0d87f877b2c5f61dcac74e1b8c79a70ebacaca3e +size 83589 diff --git a/assets/image/figures/ch2/ch2-planar-manipulator-floor.png b/assets/image/figures/ch2/ch2-planar-manipulator-floor.png new file mode 100644 index 0000000000000000000000000000000000000000..1f19ca65db5de85acc43ca8240987b99fd298231 --- /dev/null +++ b/assets/image/figures/ch2/ch2-planar-manipulator-floor.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a2c70f2d7c903d9f16433a9ca44c10892fd0e10ca90e2d9b8438c3d25fa623a +size 58946 diff --git a/assets/image/figures/ch2/ch2-planar-manipulator-free.png b/assets/image/figures/ch2/ch2-planar-manipulator-free.png new file mode 100644 index 0000000000000000000000000000000000000000..42d6dc9662903b2563663a9b409a8dc83f69906f --- /dev/null +++ b/assets/image/figures/ch2/ch2-planar-manipulator-free.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d860153a76720749a50a6d06c7bcb9886f5605a867f130f66810597ca3f5299 +size 44656 diff --git a/assets/image/figures/ch2/ch2-platforms.png b/assets/image/figures/ch2/ch2-platforms.png new file mode 100644 index 0000000000000000000000000000000000000000..4ccc153ed092d5493052d1ddede64094ae6b4068 --- /dev/null +++ b/assets/image/figures/ch2/ch2-platforms.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:baf76deb1a68b859d1e702bc7d0b4173a6b34b56d4bdf75c4748e80eb1934aad +size 3616534 diff --git a/assets/image/figures/ch2/ch2-so100-to-planar-manipulator.png b/assets/image/figures/ch2/ch2-so100-to-planar-manipulator.png new file mode 100644 index 0000000000000000000000000000000000000000..d4bc70f800df876a10b6fdb4ac51c2544b2977fb --- /dev/null +++ b/assets/image/figures/ch2/ch2-so100-to-planar-manipulator.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:731806e912421ee3f3fcd10c24b5f5e9f4dd448f859e8213f8f11c0821fcbf59 +size 1555756 diff --git a/assets/image/figures/ch3/ch3-agent-env.png b/assets/image/figures/ch3/ch3-agent-env.png new file mode 100644 index 0000000000000000000000000000000000000000..9d3ac5a9b05c8c48faf8660a5cac80737392110f 
--- /dev/null
+++ b/assets/image/figures/ch3/ch3-agent-env.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:43c8641128f72b994a7269561fd6beaf2fbe0d73bb19f58ade559e271de1de31
+size 42614
diff --git a/assets/image/figures/ch3/ch3-duck-sim-vs-real.png b/assets/image/figures/ch3/ch3-duck-sim-vs-real.png
new file mode 100644
index 0000000000000000000000000000000000000000..142a5ea15f01aee271c1775e26a6a2c7bc4aedcc
--- /dev/null
+++ b/assets/image/figures/ch3/ch3-duck-sim-vs-real.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c682cfebec3bf21f579a687d4f6a34d6f7cff225397e081188c39ca3b3def1e7
+size 1762155
diff --git a/assets/image/figures/ch3/ch3-hil-serl-examples.png b/assets/image/figures/ch3/ch3-hil-serl-examples.png
new file mode 100644
index 0000000000000000000000000000000000000000..d665f43d5ed8972fc76399ed8caedd9fee4b373e
--- /dev/null
+++ b/assets/image/figures/ch3/ch3-hil-serl-examples.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ae41b09a8a8412b28994425565438a897f827b3a2048d6832c2be7884b40a2af
+size 7216604
diff --git a/assets/image/figures/ch3/ch3-learning-atlas.png b/assets/image/figures/ch3/ch3-learning-atlas.png
new file mode 100644
index 0000000000000000000000000000000000000000..6aceb0b7ccaefebf0bb854ab012eca0cc3ac5da2
--- /dev/null
+++ b/assets/image/figures/ch3/ch3-learning-atlas.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:124d586210aa9b3a110c712c4eff3629d0064a507c9c77bf937dd00cc959428c
+size 178001
diff --git a/assets/image/figures/ch3/ch3-learning-benefits.png b/assets/image/figures/ch3/ch3-learning-benefits.png
new file mode 100644
index 0000000000000000000000000000000000000000..89684d039e24b897517612c222ef6e979f42a7c2
--- /dev/null
+++ b/assets/image/figures/ch3/ch3-learning-benefits.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c23f98c050afb75098f34a2bca49fa30ebb4a2b373447c36ba62612854253ff3
+size 6936585
diff --git a/assets/image/figures/ch3/ch3-many-ducks.png b/assets/image/figures/ch3/ch3-many-ducks.png
new file mode 100644
index 0000000000000000000000000000000000000000..7605bcb2ba0f2abcd7213a4ca092e792db08c504
--- /dev/null
+++ b/assets/image/figures/ch3/ch3-many-ducks.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:418bdeff168978207fcc623db74d25b86d11f27d1100a28238bc1591901b93de
+size 4872198
diff --git a/assets/image/figures/ch3/ch3-rl-algorithms-atlas.png b/assets/image/figures/ch3/ch3-rl-algorithms-atlas.png
new file mode 100644
index 0000000000000000000000000000000000000000..95e818db1704eb52f601c8d5a32f215b7cf7620c
--- /dev/null
+++ b/assets/image/figures/ch3/ch3-rl-algorithms-atlas.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2aa853e6067e7bd06cfa0d12250d4277fbe2020b8a2b817c005b084c49c905d5
+size 194522
diff --git a/assets/image/figures/ch3/ch3-rl-examples.png b/assets/image/figures/ch3/ch3-rl-examples.png
new file mode 100644
index 0000000000000000000000000000000000000000..06de5007b9f0c10c23f79a2af13865a701916662
--- /dev/null
+++ b/assets/image/figures/ch3/ch3-rl-examples.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:edb1fa24ee3d279302980016809eab038fc43037156b8d7cadae7fa5b9dddbba
+size 9051359
diff --git a/assets/image/figures/ch4/ch4-act-decoder.png b/assets/image/figures/ch4/ch4-act-decoder.png
new file mode 100644
index 0000000000000000000000000000000000000000..9a09fcb99bb717287ca74d165a3ca5d6983febba
--- /dev/null
+++ b/assets/image/figures/ch4/ch4-act-decoder.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:578074c47e65992422e9cb991949b1d63598aded2098dfde3925a33dfd55e481
+size 3180391
diff --git a/assets/image/figures/ch4/ch4-act-encoder.png b/assets/image/figures/ch4/ch4-act-encoder.png
new file mode 100644
index 0000000000000000000000000000000000000000..f587680a13512bae2fe83b3b472ea54a273293e5
--- /dev/null
+++ b/assets/image/figures/ch4/ch4-act-encoder.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7ceeeccb9dd7e791f215f71ee422d9adfb8c2ff1d2417a851e31ba6a6715aaf7
+size 874336
diff --git a/assets/image/figures/ch4/ch4-act.png b/assets/image/figures/ch4/ch4-act.png
new file mode 100644
index 0000000000000000000000000000000000000000..1f884e4a57994ca4a50e979ce8a7595bd02afc6f
--- /dev/null
+++ b/assets/image/figures/ch4/ch4-act.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:318b6f77277c5e8fcf51e2aba63154ee99052e2bcff2af0387fb3cfd1d07cff7
+size 1517348
diff --git a/assets/image/figures/ch4/ch4-action-vs-observation-distribution.png b/assets/image/figures/ch4/ch4-action-vs-observation-distribution.png
new file mode 100644
index 0000000000000000000000000000000000000000..fc82dc6c86ce40126b00697f13a43cc563fe4b4d
--- /dev/null
+++ b/assets/image/figures/ch4/ch4-action-vs-observation-distribution.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7db4ecc0d54d9cab6b8a16017c81bfd9b7fd5d7997bcdd645ccf57167f7efcf2
+size 274240
diff --git a/assets/image/figures/ch4/ch4-async-inference.png b/assets/image/figures/ch4/ch4-async-inference.png
new file mode 100644
index 0000000000000000000000000000000000000000..73aae17126c70f3fca8651ef62b7d519c81e6f58
--- /dev/null
+++ b/assets/image/figures/ch4/ch4-async-inference.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:850ebb6e6ad809edc48597a89cf8e25b2664b9137ca4602ae14f164524f8d232
+size 282300
diff --git a/assets/image/figures/ch4/ch4-bc-trajectories.png b/assets/image/figures/ch4/ch4-bc-trajectories.png
new file mode 100644
index 0000000000000000000000000000000000000000..d577a6966244c54eb3738bd61af13232a603145a
--- /dev/null
+++ b/assets/image/figures/ch4/ch4-bc-trajectories.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0ede85dbb8f12b3cced4dc0e12f97e3713d8432953183840f99e8534998d7f3b
+size 2253030
diff --git a/assets/image/figures/ch4/ch4-diffusion-policy.png b/assets/image/figures/ch4/ch4-diffusion-policy.png
new file mode 100644
index 0000000000000000000000000000000000000000..56da7917d95a1592faafde62702170fac438f903
--- /dev/null
+++ b/assets/image/figures/ch4/ch4-diffusion-policy.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c3cb644c79fd016e77c78bd7fcf185908b18fb127f656003eb577349cfb6da40
+size 2805702
diff --git a/assets/image/figures/ch4/ch4-diffusion-robot-actions.png b/assets/image/figures/ch4/ch4-diffusion-robot-actions.png
new file mode 100644
index 0000000000000000000000000000000000000000..43d8ce2193bdaeecb172de160290392aaf4000c0
--- /dev/null
+++ b/assets/image/figures/ch4/ch4-diffusion-robot-actions.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a59b816b60a53784127e3dcf0aad612ba14474bde57e1c2b73b670665d1b70ec
+size 8927638
diff --git a/assets/image/figures/ch4/ch4-diffusion-vs-flowmatching.png b/assets/image/figures/ch4/ch4-diffusion-vs-flowmatching.png
new file mode 100644
index 0000000000000000000000000000000000000000..2f4898e0c4db3a001354cc9a78d40e7537b34359
--- /dev/null
+++ b/assets/image/figures/ch4/ch4-diffusion-vs-flowmatching.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aef138f5120025b0bad73788bc8b3af91f27331af3b49bafb09b15037944fa12
+size 189022
diff --git a/assets/image/figures/ch4/ch4-issues-with-bc.png b/assets/image/figures/ch4/ch4-issues-with-bc.png
new file mode 100644
index 0000000000000000000000000000000000000000..789283d5085bae36ebaf062bd157007988e2dd23
--- /dev/null
+++ b/assets/image/figures/ch4/ch4-issues-with-bc.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7b726d8aa64534e8cbec4a0084fd86e4dfcc0b17685559970006a573dd326459
+size 1560808
diff --git a/assets/image/figures/ch4/ch4-latent-variable-model.png b/assets/image/figures/ch4/ch4-latent-variable-model.png
new file mode 100644
index 0000000000000000000000000000000000000000..62a7ade0557696ee25c61d10ef323ca1ec9bb077
--- /dev/null
+++ b/assets/image/figures/ch4/ch4-latent-variable-model.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e5b1f48d4dc011d5a20b1d5bccc5cde750f4ffab4b8c48bb5b04529a18aa0390
+size 983775
diff --git a/assets/image/figures/ch4/ch4-many-latents.png b/assets/image/figures/ch4/ch4-many-latents.png
new file mode 100644
index 0000000000000000000000000000000000000000..d972eb9694fe47d81d7a5bff66f78edd80c83e57
--- /dev/null
+++ b/assets/image/figures/ch4/ch4-many-latents.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1f5421aae5c9e9735de598fca1a5c68ef7fd28c8b31112c4675356f6deda9b29
+size 222323
diff --git a/assets/image/figures/ch4/ch4-normalizing-flows.png b/assets/image/figures/ch4/ch4-normalizing-flows.png
new file mode 100644
index 0000000000000000000000000000000000000000..cf51b8de51af38c0ea807889d8056d41c524c2d5
--- /dev/null
+++ b/assets/image/figures/ch4/ch4-normalizing-flows.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:51f73d09b35b8ccd5685c6b26f7615f8d6ab3df7d045b2502e9232bfe33beace
+size 278482
diff --git a/assets/image/figures/ch4/ch4-observation-action-mapping.png b/assets/image/figures/ch4/ch4-observation-action-mapping.png
new file mode 100644
index 0000000000000000000000000000000000000000..6206870edf17a28bafe36ca0c5631a62b14f5a6a
--- /dev/null
+++ b/assets/image/figures/ch4/ch4-observation-action-mapping.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f1a4a70971ea4c7cf73c089a70e4bc9dd1b5aba43021016fea8b323ad2642c53
+size 2081981
diff --git a/assets/image/figures/ch4/ch4-queues.png b/assets/image/figures/ch4/ch4-queues.png
new file mode 100644
index 0000000000000000000000000000000000000000..c1e912ba8a2d5b254ea9d990ba8dbab491cb22ed
--- /dev/null
+++ b/assets/image/figures/ch4/ch4-queues.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8d3072c26d0419ee4b19f4ebd10c66e117e113514326eb3e7864057644c305d7
+size 1971787
diff --git a/assets/image/figures/ch4/ch4-task-effect-on-pairs.png b/assets/image/figures/ch4/ch4-task-effect-on-pairs.png
new file mode 100644
index 0000000000000000000000000000000000000000..6fa47c83e5ba456655b025bd651aea0fc6feeeaa
--- /dev/null
+++ b/assets/image/figures/ch4/ch4-task-effect-on-pairs.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0423b4760f661afa6b81a896a473a4bfc50737b0ecef76fa75051eb6ccf69896
+size 1186204
diff --git a/assets/image/figures/ch5/ch5-generalist-policies-timeline.png b/assets/image/figures/ch5/ch5-generalist-policies-timeline.png
new file mode 100644
index 0000000000000000000000000000000000000000..d85a308d7665bd9c6fab4b0f59f622b0e1599745
--- /dev/null
+++ b/assets/image/figures/ch5/ch5-generalist-policies-timeline.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:98f0efdb30302f2fd582bbec379007ef3d2188171f0d700014539560b5d29a9f
+size 121521
diff --git a/assets/image/figures/ch5/ch5-ml-vs-robotics-foundation.png b/assets/image/figures/ch5/ch5-ml-vs-robotics-foundation.png
new file mode 100644
index 0000000000000000000000000000000000000000..0327c71faf9a48c757b6a6f3027f7e54cac6f0e7
--- /dev/null
+++ b/assets/image/figures/ch5/ch5-ml-vs-robotics-foundation.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e858e0c5c2d7246e097c8e048d7c378c0ce20c922e66ceac8db8dbb2c5598e79
+size 3389240
diff --git a/assets/image/figures/ch5/ch5-pi0-sampling-timesteps.png b/assets/image/figures/ch5/ch5-pi0-sampling-timesteps.png
new file mode 100644
index 0000000000000000000000000000000000000000..84401c9e5468cef66fcd2cdf2014f0c103003c93
--- /dev/null
+++ b/assets/image/figures/ch5/ch5-pi0-sampling-timesteps.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2c27d0d34e08154b42692d1a3ea142ef7742ab50547211e9b22f16d79d14fbb3
+size 186917
diff --git a/assets/image/figures/ch5/ch5-pi0.png b/assets/image/figures/ch5/ch5-pi0.png
new file mode 100644
index 0000000000000000000000000000000000000000..4ea364ceb9691e4ea9928caac2ee6a32860a52d3
--- /dev/null
+++ b/assets/image/figures/ch5/ch5-pi0.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:689a7d0a94d116edce122d8c9010aa456ae7d1d816f5684513711d36c94ebb89
+size 1242717
diff --git a/assets/image/figures/ch5/ch5-smolvla.png b/assets/image/figures/ch5/ch5-smolvla.png
new file mode 100644
index 0000000000000000000000000000000000000000..488341b99047ecfad012127baa3a759354577853
--- /dev/null
+++ b/assets/image/figures/ch5/ch5-smolvla.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:49575d51c64eb320c588673fb9b33d1d0a3de7f6af7165a18c35ffb40af93e7a
+size 1333430
diff --git a/assets/image/figures/ch5/ch5-trends.png b/assets/image/figures/ch5/ch5-trends.png
new file mode 100644
index 0000000000000000000000000000000000000000..b399968a1d56a98ce0f4af3d1458cf903a1e1471
--- /dev/null
+++ b/assets/image/figures/ch5/ch5-trends.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:357708ec69852658d69c5f3ec3d9c5805939fdaa0d13150f6777731579db09fe
+size 636731